omg-llmkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llmkit/__init__.py ADDED
@@ -0,0 +1,76 @@
1
+ """LLM client with multi-provider support.
2
+
3
+ Provides a thin, opinionated layer over **LiteLLM** (with ``instructor``
4
+ for structured output) that gives the application a unified, provider-
5
+ agnostic call surface across cloud providers (OpenRouter, Google,
6
+ Anthropic) and local Ollama.
7
+
8
+ This package provides:
9
+ - The structured / plain-text / streaming call functions
10
+ - Provider switching based on a host-supplied config
11
+ - A process-global async rate limiter shared across all calls
12
+ - Per-call invocation logging via a pluggable sink (with approximate cost)
13
+ """
14
+
15
+ from llmkit.exceptions import LLM_RECOVERABLE_ERRORS
16
+ from llmkit.logging import (
17
+ LLMCallRecord,
18
+ LocalYamlLogSink,
19
+ LogSink,
20
+ configure_llm_logging,
21
+ )
22
+ from llmkit.providers import (
23
+ AnthropicProvider,
24
+ GoogleProvider,
25
+ LLMClientConfig,
26
+ LLMInfo,
27
+ LLMProviderInterface,
28
+ OllamaProvider,
29
+ OpenRouterProvider,
30
+ Provider,
31
+ configure_llm_client,
32
+ get_llm_config,
33
+ get_provider,
34
+ )
35
+ from llmkit.rate_limiting import (
36
+ GlobalRateLimiter,
37
+ configure_rate_limit,
38
+ )
39
+ from llmkit.structured_output import (
40
+ capture_llm_log_paths,
41
+ stream_text_with_log,
42
+ structured_llm_call,
43
+ structured_llm_call_sync,
44
+ text_llm_call,
45
+ )
46
+
47
# Public API of the package. Every name listed here is imported at the top of
# this module from one of the ``llmkit`` submodules and re-exported unchanged.
__all__ = [
    # Providers + config: provider classes, the config/info types, and the
    # configure/get accessors.
    "LLMProviderInterface",
    "OpenRouterProvider",
    "OllamaProvider",
    "GoogleProvider",
    "AnthropicProvider",
    "Provider",
    "LLMClientConfig",
    "LLMInfo",
    "configure_llm_client",
    "get_provider",
    "get_llm_config",
    # Logging: the per-call record, the sink protocol, the default YAML sink,
    # and the function that installs a sink.
    "LLMCallRecord",
    "LogSink",
    "LocalYamlLogSink",
    "configure_llm_logging",
    # Rate limiting
    "GlobalRateLimiter",
    "configure_rate_limit",
    # Structured + plain-text call functions (the public call surface)
    "structured_llm_call",
    "structured_llm_call_sync",
    "text_llm_call",
    "stream_text_with_log",
    "capture_llm_log_paths",
    # Exception handling: tuple of exception types intended for ``except``.
    "LLM_RECOVERABLE_ERRORS",
]
llmkit/_litellm.py ADDED
@@ -0,0 +1,145 @@
1
+ """Internal LiteLLM call layer.
2
+
3
+ The single place that talks to LiteLLM (and, for structured output,
4
+ ``instructor`` over LiteLLM). The public call functions in
5
+ :mod:`llmkit.structured_output` build/log :class:`LLMCallRecord`s
6
+ around these helpers; this module owns provider routing, the rate-limit
7
+ semaphore, structured-output mode pinning, and best-effort cost extraction.
8
+
9
+ It is also the **test seam**: unit tests patch these three coroutines
10
+ (``acompletion_structured`` / ``acompletion_text`` / ``astream_text``) so
11
+ the real call-function bodies — logging, retry, content coercion — still
12
+ run over a faked provider response (see ``tests/_support`` ``patch_llm``).
13
+
14
+ LiteLLM's ``acompletion`` and instructor's ``create_with_completion`` carry
15
+ very strict, heavily-overloaded type stubs that reject this module's generic
16
+ ``**credential-kwargs`` and ``list[dict[str, str]]`` message shapes. Those
17
+ call expressions therefore carry a single ``reportArgumentType`` suppression
18
+ each, tagged ``raw-llm`` — the boundary where our thin wrapper meets the
19
+ provider SDK's exhaustive parameter surface.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import logging
25
+ from collections.abc import AsyncIterator
26
+
27
+ import instructor
28
+ import litellm
29
+ from pydantic import BaseModel
30
+
31
+ from llmkit.providers import BaseProvider, get_provider
32
+ from llmkit.rate_limiting import GlobalRateLimiter
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ def _messages(prompt: str | list[dict[str, str]]) -> list[dict[str, str]]:
38
+ """Normalise a prompt into LiteLLM's message-list shape."""
39
+ return [{"role": "user", "content": prompt}] if isinstance(prompt, str) else prompt
40
+
41
+
42
+ def _response_cost(
43
+ raw: object,
44
+ ) -> float | None:
45
+ """Best-effort USD cost for a completion from its ``_hidden_params``.
46
+
47
+ LiteLLM stamps ``response_cost`` onto the completion's
48
+ ``_hidden_params`` (token usage x model pricing). Best-effort: any
49
+ missing/odd shape degrades to ``None`` rather than breaking the call.
50
+ """
51
+ hidden = getattr(raw, "_hidden_params", None)
52
+ if isinstance(hidden, dict):
53
+ cost = hidden.get("response_cost") # pyright: ignore[reportUnknownMemberType] # raw-llm — litellm hidden-params dict
54
+ if isinstance(cost, (int, float)):
55
+ return float(cost)
56
+ return None
57
+
58
+
59
+ async def acompletion_structured[T: BaseModel](
60
+ prompt: str | list[dict[str, str]],
61
+ output_schema: type[T],
62
+ *,
63
+ temperature: float,
64
+ model: str | None,
65
+ validation_retries: int = 1,
66
+ ) -> tuple[T, float | None]:
67
+ """Structured completion via instructor pinned to the provider's mode.
68
+
69
+ Uses ``create_with_completion`` so the parsed model *and* the raw
70
+ completion (for cost) are both in hand. ``validation_retries`` is
71
+ instructor's in-call schema-repair budget — deliberately low and kept
72
+ separate from the transient-error retry layer (``with_retries`` in
73
+ :mod:`llmkit.retry`), which handles 429/503/5xx.
74
+
75
+ Returns ``(parsed, approximate_cost)``.
76
+ """
77
+ provider: BaseProvider = get_provider()
78
+ creds = provider.completion_kwargs()
79
+ client = instructor.from_litellm(litellm.acompletion, mode=provider.instructor_mode)
80
+ async with GlobalRateLimiter.acquire_async():
81
+ parsed, completion = await client.chat.completions.create_with_completion(
82
+ model=provider.litellm_model(model),
83
+ messages=_messages(prompt), # pyright: ignore[reportArgumentType] # raw-llm — instructor over-strict ChatCompletionMessageParam
84
+ response_model=output_schema,
85
+ temperature=temperature,
86
+ max_retries=validation_retries,
87
+ api_key=creds.get("api_key"),
88
+ api_base=creds.get("api_base"),
89
+ )
90
+ return parsed, _response_cost(completion)
91
+
92
+
93
async def acompletion_text(
    prompt: str | list[dict[str, str]],
    *,
    temperature: float,
    model: str | None,
    max_tokens: int | None = None,
) -> tuple[str, float | None]:
    """Run one plain-text completion through LiteLLM.

    Args:
        prompt: A plain string (wrapped as one ``user`` message) or a
            ready-made message list.
        temperature: Sampling temperature forwarded to the call.
        model: Model name handed to ``provider.litellm_model``; ``None`` is
            forwarded as-is for the provider to resolve.
        max_tokens: Optional completion-length cap, forwarded unchanged
            (including ``None``).

    Returns:
        ``(text, approximate_cost)``. ``text`` is the first choice's message
        content, or ``""`` when that content is empty or ``None``. The cost
        is ``None`` when it cannot be read from the response.
    """
    active: BaseProvider = get_provider()
    credentials = active.completion_kwargs()

    # The provider request is issued while the global rate-limit context is held.
    async with GlobalRateLimiter.acquire_async():
        response = await litellm.acompletion(  # pyright: ignore[reportArgumentType]  # raw-llm — litellm over-strict signature
            model=active.litellm_model(model),
            messages=_messages(prompt),
            temperature=temperature,
            max_tokens=max_tokens,
            api_key=credentials.get("api_key"),
            api_base=credentials.get("api_base"),
        )

    text = response.choices[0].message.content  # pyright: ignore[reportAttributeAccessIssue]  # raw-llm — litellm ModelResponse
    if not text:
        text = ""
    return text, _response_cost(response)
118
+
119
+
120
async def astream_text(
    prompt: str | list[dict[str, str]],
    *,
    temperature: float,
    model: str | None,
) -> AsyncIterator[str]:
    """Stream plain-text deltas via LiteLLM.

    Yields each chunk's non-empty textual delta as it arrives. The
    rate-limit context is entered before the request is made and is only
    exited when the stream is exhausted (or the generator is closed), so the
    slot is held for the lifetime of the stream.

    Chunks that carry no text are skipped rather than treated as errors:
    a chunk whose ``choices`` list is empty (some providers emit such
    chunks, e.g. a trailing usage-only chunk), a choice without a ``delta``,
    and a delta whose ``content`` is empty or ``None``.

    Args:
        prompt: A plain string (wrapped as one ``user`` message) or a
            ready-made message list.
        temperature: Sampling temperature forwarded to the call.
        model: Model name handed to ``provider.litellm_model``; ``None`` is
            forwarded as-is for the provider to resolve.

    Yields:
        Successive non-empty text fragments of the response.
    """
    provider: BaseProvider = get_provider()
    creds = provider.completion_kwargs()
    async with GlobalRateLimiter.acquire_async():
        stream = await litellm.acompletion(  # pyright: ignore[reportArgumentType]  # raw-llm — litellm over-strict signature
            model=provider.litellm_model(model),
            messages=_messages(prompt),
            temperature=temperature,
            stream=True,
            api_key=creds.get("api_key"),
            api_base=creds.get("api_base"),
        )
        async for chunk in stream:  # pyright: ignore[reportGeneralTypeIssues]  # raw-llm — litellm stream wrapper is async-iterable
            choices = chunk.choices  # pyright: ignore[reportAttributeAccessIssue]  # raw-llm — litellm stream chunk
            if not choices:
                # Nothing to index into; indexing [0] here would raise
                # IndexError and abort an otherwise healthy stream.
                continue
            delta = getattr(choices[0], "delta", None)
            text = getattr(delta, "content", None)
            if text:
                yield text
llmkit/exceptions.py ADDED
@@ -0,0 +1,27 @@
1
+ """Recoverable exception types for LLM service calls.
2
+
3
+ Use ``LLM_RECOVERABLE_ERRORS`` in ``except`` clauses to catch expected LLM
4
+ operational failures (network errors, rate limits, transient provider
5
+ errors, schema-validation/parsing failures, timeouts) while letting
6
+ programming errors (TypeError, AttributeError) propagate.
7
+
8
+ ``with_retries()`` (see :mod:`llmkit.retry`) is the transient-retry
9
+ layer; instructor's own ``max_retries`` handles schema-repair separately.
10
+ """
11
+
12
+ import httpx
13
+ import openai
14
+ from instructor.core import InstructorRetryException
15
+ from pydantic import ValidationError
16
+
17
# Exception types meant to be caught together, e.g.
# ``except LLM_RECOVERABLE_ERRORS:``. Anything not listed (TypeError,
# AttributeError, ...) is deliberately left to propagate.
#
# LiteLLM's transient errors (RateLimitError, Timeout, APIConnectionError,
# ServiceUnavailableError, InternalServerError, ...) all subclass
# ``openai.APIError``, so it covers them in one entry. ``InstructorRetryException``
# is raised when instructor exhausts its in-call schema-validation retries.
LLM_RECOVERABLE_ERRORS: tuple[type[Exception], ...] = (
    openai.APIError,  # provider / LiteLLM errors (see note above)
    InstructorRetryException,  # instructor ran out of schema-repair retries
    httpx.RequestError,  # httpx request-level errors (network failures)
    ValidationError,  # pydantic schema-validation failure
    TimeoutError,  # built-in timeout
)
llmkit/logging.py ADDED
@@ -0,0 +1,221 @@
1
+ """Per-call LLM invocation logging via a pluggable sink.
2
+
3
+ Every LLM round-trip is recorded as an :class:`LLMCallRecord` and handed
4
+ to the configured :class:`LogSink`. The default sink writes one YAML file
5
+ per call to a directory (``data/llm-logs/`` by default), preserving the
6
+ historical log shape so existing analysis tooling keeps working.
7
+
8
+ Logging is unconditional and best-effort — a sink that raises is swallowed
9
+ so the LLM call itself never breaks because logging did. The host
10
+ application points the sink at its chosen directory once at startup via
11
+ :func:`configure_llm_logging`, mirroring the ``configure_rate_limit``
12
+ module-level pattern.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ import logging
19
+ from dataclasses import dataclass
20
+ from datetime import datetime
21
+ from pathlib import Path
22
+ from typing import Any, Protocol
23
+
24
+ import yaml
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
# Directory used by ``LocalYamlLogSink`` when no ``log_dir`` is given. It is a
# relative path, so it resolves against the process's working directory.
DEFAULT_LOG_DIR = Path("data/llm-logs")

# Compact append-only summary sibling to the per-call YAML files: one JSON
# line per call, so cross-call scans don't have to glob + parse every YAML.
# The file lives directly inside the sink's log directory.
INDEX_FILENAME = "index.jsonl"
33
+
34
+
35
@dataclass(frozen=True)
class LLMCallRecord:
    """Immutable description of one LLM round-trip, as handed to a log sink.

    The record is a plain frozen dataclass; it performs no validation and
    simply carries what the caller measured.

    Field contract:

    * ``model`` / ``provider`` — the *resolved effective* model (the provider
      default substituted when the caller passed ``None``) and the name of
      the active provider, so cost attribution is a ``grep`` over the logs
      rather than a code trace.
    * ``schema`` — the output schema name, or the literal ``"stream"`` for
      streamed plain-text calls.
    * ``response`` — the Pydantic-dumped result, the accumulated stream
      text, or ``None``.
    * ``approximate_cost`` — a best-effort USD estimate for budget
      visibility, NOT a billing figure. It is sourced from LiteLLM's
      per-response cost (no local price table) and is ``None`` when the
      provider does not report it (e.g. streamed calls).
    """

    # When the call started; used for the log filename and timestamps.
    started_at: datetime
    # Caller-supplied feature name and optional finer-grained label.
    feature: str
    label: str | None
    # Resolved model and active provider (either may be unknown).
    model: str | None
    provider: str | None
    # Sampling temperature used for the call.
    temperature: float
    # Wall-clock duration of the call, in milliseconds.
    duration_ms: float
    # Output schema name, or "stream".
    schema: str
    # The prompt exactly as supplied: raw string or message list.
    prompt: str | list[dict[str, str]]
    response: Any  # pyright: ignore[reportExplicitAny]  # raw-llm — Pydantic dump or accumulated text
    # Error description when the call failed; ``None`` on success.
    error: str | None
    # Optional best-effort USD estimate (see class docstring).
    approximate_cost: float | None = None
64
+
65
+
66
class LogSink(Protocol):
    """Structural interface for a destination of :class:`LLMCallRecord`s.

    Any object exposing a compatible ``write`` method satisfies this
    protocol; no inheritance is required.
    """

    def write(self, record: LLMCallRecord) -> Path | None:
        """Persist *record*.

        Returns the path that was written (so callers tracking log paths can
        cross-reference), or ``None`` if nothing was persisted.
        """
        ...
74
+
75
+
76
class LocalYamlLogSink:
    """Default sink: one YAML file per call under ``log_dir``, plus a
    compact append-only ``index.jsonl`` summarising every call.

    The per-call YAML is laid out **verdict-first** — a one-line summary
    comment header (status / feature / model / schema / duration / cost),
    then the small metadata fields, with the large ``response`` and
    ``prompt`` blobs last — so a reader (a human, but in practice mostly a
    coding agent) learns what happened from the head of the file without
    paying to scan the whole prompt. ``index.jsonl`` carries one short
    line per call (file, timestamp, feature, label, model, provider,
    schema, duration, cost, error) so cross-call questions — "which calls
    errored / were slowest / most expensive / the last call for feature X"
    — are a single small scan instead of globbing and parsing every YAML.

    Both files are written as UTF-8 regardless of the platform's locale
    encoding.
    """

    def __init__(self, log_dir: Path = DEFAULT_LOG_DIR) -> None:
        """Remember the target directory; it is created lazily on first write."""
        self.log_dir = log_dir

    def write(self, record: LLMCallRecord) -> Path | None:
        """Write *record* as a YAML file and append it to the index.

        Returns:
            The path of the YAML file, or ``None`` when the file could not
            be written (``OSError`` / ``yaml.YAMLError`` are logged as a
            warning and swallowed). An index failure does not affect the
            return value.
        """
        try:
            self.log_dir.mkdir(parents=True, exist_ok=True)
            ts = record.started_at.strftime("%Y-%m-%dT%H-%M-%S-%f")
            safe_label = (record.label or "unlabeled").replace(".", "_").replace("/", "_")
            # A "/" in the feature name would point the file at a
            # sub-directory that does not exist, so the log would be lost.
            # Only "/" is replaced: feature names that already produced a
            # valid filename keep exactly the same filename.
            safe_feature = record.feature.replace("/", "_")
            filepath = self.log_dir / f"{ts}_{safe_feature}_{safe_label}.yaml"

            # Verdict-first order: cheap, high-signal metadata up top; the
            # large ``response``/``prompt`` blobs last (``response`` first —
            # it's what a debugger usually wants), so the head of the file
            # is the whole story for most reads.
            doc: dict[str, Any] = {  # pyright: ignore[reportExplicitAny]  # raw-llm — YAML log body dict
                "timestamp": record.started_at.isoformat(),
                "feature": record.feature,
                "label": record.label,
                "model": record.model,
                "provider": record.provider,
                "schema": record.schema,
                "temperature": record.temperature,
                "duration_ms": round(record.duration_ms, 1),
                "approximate_cost": record.approximate_cost,
                "error": record.error,
                "response": record.response,
                "prompt": record.prompt,
            }

            # Explicit UTF-8: ``allow_unicode=True`` makes yaml emit
            # non-ASCII text verbatim, which a non-UTF-8 locale default
            # encoding cannot necessarily encode. This also matches the
            # encoding used for the index file.
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(self._summary_header(record))
                yaml.dump(
                    doc,
                    f,
                    default_flow_style=False,
                    sort_keys=False,
                    allow_unicode=True,
                    width=120,
                )
        except (OSError, yaml.YAMLError):
            logger.warning(
                "Failed to write LLM invocation log for %s/%s",
                record.feature,
                record.label,
                exc_info=True,
            )
            return None

        # Best-effort index append, kept separate so an index failure can
        # never lose the per-call record that was just written successfully.
        self._append_index(record, filepath)
        return filepath

    @staticmethod
    def _summary_header(record: LLMCallRecord) -> str:
        """Build the two-line ``#`` comment that opens each per-call YAML.

        The first line is a single-glance verdict — ``ok``/``ERROR``,
        feature/label, resolved model, schema, duration, approximate cost —
        so ``head -1`` across the directory triages a whole run. The second
        line is the ISO start timestamp, followed by a blank line.
        """
        status = "ERROR" if record.error else "ok"
        # "$?" marks an unknown cost; known costs use 3 significant digits.
        cost = f"${record.approximate_cost:.3g}" if record.approximate_cost is not None else "$?"
        return (
            f"# {status} | {record.feature}/{record.label or 'unlabeled'} | "
            f"{record.model or '?'} | {record.schema} | "
            f"{round(record.duration_ms)}ms | {cost}\n"
            f"# {record.started_at.isoformat()}\n\n"
        )

    def _append_index(self, record: LLMCallRecord, filepath: Path) -> None:
        """Append one compact JSON line for *record* to ``index.jsonl``.

        Best-effort and swallowed on failure (logging must never break the
        call): an ``OSError`` is logged as a warning and otherwise ignored.
        The design relies on a single ``write`` of a sub-4KB line under
        ``O_APPEND`` being atomic on POSIX, so that concurrent calls don't
        interleave lines.

        Args:
            record: The call being indexed.
            filepath: Path of the per-call YAML; only its file name is
                stored, so the index stays valid if the directory moves.
        """
        line: dict[str, str | float | None] = {
            "file": filepath.name,
            "timestamp": record.started_at.isoformat(),
            "feature": record.feature,
            "label": record.label,
            "model": record.model,
            "provider": record.provider,
            "schema": record.schema,
            "duration_ms": round(record.duration_ms, 1),
            "approximate_cost": record.approximate_cost,
            "error": record.error,
        }
        try:
            with open(self.log_dir / INDEX_FILENAME, "a", encoding="utf-8") as f:
                f.write(json.dumps(line, ensure_ascii=False) + "\n")
        except OSError:
            logger.warning(
                "Failed to append LLM log index for %s/%s",
                record.feature,
                record.label,
                exc_info=True,
            )
191
+
192
+
193
# Module-level configured sink, defaulting to the local-YAML sink at the
# default directory. The host overrides it once at startup; tests typically
# point it at a tmp directory. ``None`` means logging is disabled. Constructing
# the default sink does not touch the filesystem: the directory is only
# created when a record is written.
_sink: LogSink | None = LocalYamlLogSink()
197
+
198
+
199
def configure_llm_logging(sink: LogSink | None) -> None:
    """Install *sink* as the module-wide destination for call records.

    The sink is stored in a module-level variable and replaces whatever was
    configured before.

    Args:
        sink: The object whose ``write`` will receive every
            :class:`LLMCallRecord`, or ``None`` to disable logging entirely
            (writes become no-ops).
    """
    global _sink
    _sink = sink
206
+
207
+
208
def write_llm_log(record: LLMCallRecord) -> Path | None:
    """Pass *record* to the configured sink without ever raising.

    Logging must never break the LLM call, so any exception escaping the
    sink's ``write`` is caught here (on top of whatever best-effort handling
    the sink does itself), reported as a warning with traceback, and turned
    into a ``None`` result.

    Returns:
        Whatever the sink's ``write`` returned (the written path, or
        ``None`` if it persisted nothing); ``None`` when no sink is
        configured or the sink raised.
    """
    if _sink is not None:
        try:
            return _sink.write(record)
        except Exception:
            logger.warning("LLM log sink raised for %s/%s", record.feature, record.label, exc_info=True)
    return None