PyPI - omg-llmkit - Versions diffs - 0.1.0__py3-none-any.whl - Mend

omg-llmkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

llmkit/__init__.py +76 -0
llmkit/_litellm.py +145 -0
llmkit/exceptions.py +27 -0
llmkit/logging.py +221 -0
llmkit/providers.py +327 -0
llmkit/rate_limiting.py +130 -0
llmkit/retry.py +108 -0
llmkit/structured_output.py +357 -0
llmkit/sync.py +29 -0
omg_llmkit-0.1.0.dist-info/METADATA +226 -0
omg_llmkit-0.1.0.dist-info/RECORD +13 -0
omg_llmkit-0.1.0.dist-info/WHEEL +4 -0
omg_llmkit-0.1.0.dist-info/licenses/LICENSE +21 -0

llmkit/__init__.py ADDED Viewed

@@ -0,0 +1,76 @@
+"""LLM client with multi-provider support.
+Provides a thin, opinionated layer over **LiteLLM** (with ``instructor``
+for structured output) that gives the application a unified, provider-
+agnostic call surface across cloud providers (OpenRouter, Google,
+Anthropic) and local Ollama.
+This package provides:
+- The structured / plain-text / streaming call functions
+- Provider switching based on a host-supplied config
+- A process-global async rate limiter shared across all calls
+- Per-call invocation logging via a pluggable sink (with approximate cost)
+"""
+from llmkit.exceptions import LLM_RECOVERABLE_ERRORS
+from llmkit.logging import (
+    LLMCallRecord,
+    LocalYamlLogSink,
+    LogSink,
+    configure_llm_logging,
+)
+from llmkit.providers import (
+    AnthropicProvider,
+    GoogleProvider,
+    LLMClientConfig,
+    LLMInfo,
+    LLMProviderInterface,
+    OllamaProvider,
+    OpenRouterProvider,
+    Provider,
+    configure_llm_client,
+    get_llm_config,
+    get_provider,
+)
+from llmkit.rate_limiting import (
+    GlobalRateLimiter,
+    configure_rate_limit,
+)
+from llmkit.structured_output import (
+    capture_llm_log_paths,
+    stream_text_with_log,
+    structured_llm_call,
+    structured_llm_call_sync,
+    text_llm_call,
+)
+# Public API of the package: exactly the names re-exported by the imports
+# above, grouped by area.
+__all__ = [
+    # Providers + config
+    "LLMProviderInterface",
+    "OpenRouterProvider",
+    "OllamaProvider",
+    "GoogleProvider",
+    "AnthropicProvider",
+    "Provider",
+    "LLMClientConfig",
+    "LLMInfo",
+    "configure_llm_client",
+    "get_provider",
+    "get_llm_config",
+    # Logging
+    "LLMCallRecord",
+    "LogSink",
+    "LocalYamlLogSink",
+    "configure_llm_logging",
+    # Rate limiting
+    "GlobalRateLimiter",
+    "configure_rate_limit",
+    # Structured + plain-text call functions (the public call surface)
+    "structured_llm_call",
+    "structured_llm_call_sync",
+    "text_llm_call",
+    "stream_text_with_log",
+    "capture_llm_log_paths",
+    # Exception handling
+    "LLM_RECOVERABLE_ERRORS",
+]

llmkit/_litellm.py ADDED Viewed

@@ -0,0 +1,145 @@
+"""Internal LiteLLM call layer.
+The single place that talks to LiteLLM (and, for structured output,
+``instructor`` over LiteLLM). The public call functions in
+:mod:`llmkit.structured_output` build/log :class:`LLMCallRecord`s
+around these helpers; this module owns provider routing, the rate-limit
+semaphore, structured-output mode pinning, and best-effort cost extraction.
+It is also the **test seam**: unit tests patch these three coroutines
+(``acompletion_structured`` / ``acompletion_text`` / ``astream_text``) so
+the real call-function bodies — logging, retry, content coercion — still
+run over a faked provider response (see ``tests/_support`` ``patch_llm``).
+LiteLLM's ``acompletion`` and instructor's ``create_with_completion`` carry
+very strict, heavily-overloaded type stubs that reject this module's generic
+``**credential-kwargs`` and ``list[dict[str, str]]`` message shapes. Those
+call expressions therefore carry a single ``reportArgumentType`` suppression
+each, tagged ``raw-llm`` — the boundary where our thin wrapper meets the
+provider SDK's exhaustive parameter surface.
+"""
+from __future__ import annotations
+import logging
+from collections.abc import AsyncIterator
+import instructor
+import litellm
+from pydantic import BaseModel
+from llmkit.providers import BaseProvider, get_provider
+from llmkit.rate_limiting import GlobalRateLimiter
+# Module-level logger, named after this module via ``__name__``.
+logger = logging.getLogger(__name__)
+def _messages(prompt: str | list[dict[str, str]]) -> list[dict[str, str]]:
+    """Normalise a prompt into LiteLLM's message-list shape."""
+    return [{"role": "user", "content": prompt}] if isinstance(prompt, str) else prompt
+def _response_cost(
+    raw: object,
+) -> float | None:
+    """Best-effort USD cost for a completion from its ``_hidden_params``.
+    LiteLLM stamps ``response_cost`` onto the completion's
+    ``_hidden_params`` (token usage x model pricing). Best-effort: any
+    missing/odd shape degrades to ``None`` rather than breaking the call.
+    """
+    hidden = getattr(raw, "_hidden_params", None)
+    if isinstance(hidden, dict):
+        cost = hidden.get("response_cost")  # pyright: ignore[reportUnknownMemberType]  # raw-llm — litellm hidden-params dict
+        if isinstance(cost, (int, float)):
+            return float(cost)
+    return None
+async def acompletion_structured[T: BaseModel](
+    prompt: str | list[dict[str, str]],
+    output_schema: type[T],
+    *,
+    temperature: float,
+    model: str | None,
+    validation_retries: int = 1,
+) -> tuple[T, float | None]:
+    """Structured completion via instructor pinned to the provider's mode.
+    Uses ``create_with_completion`` so the parsed model *and* the raw
+    completion (for cost) are both in hand. ``validation_retries`` is
+    instructor's in-call schema-repair budget — deliberately low and kept
+    separate from the transient-error retry layer (``with_retries`` in
+    :mod:`llmkit.retry`), which handles 429/503/5xx.
+    Returns ``(parsed, approximate_cost)``.
+    """
+    provider: BaseProvider = get_provider()
+    creds = provider.completion_kwargs()
+    client = instructor.from_litellm(litellm.acompletion, mode=provider.instructor_mode)
+    async with GlobalRateLimiter.acquire_async():
+        parsed, completion = await client.chat.completions.create_with_completion(
+            model=provider.litellm_model(model),
+            messages=_messages(prompt),  # pyright: ignore[reportArgumentType]  # raw-llm — instructor over-strict ChatCompletionMessageParam
+            response_model=output_schema,
+            temperature=temperature,
+            max_retries=validation_retries,
+            api_key=creds.get("api_key"),
+            api_base=creds.get("api_base"),
+        )
+    return parsed, _response_cost(completion)
+async def acompletion_text(
+    prompt: str | list[dict[str, str]],
+    *,
+    temperature: float,
+    model: str | None,
+    max_tokens: int | None = None,
+) -> tuple[str, float | None]:
+    """Plain-text completion via LiteLLM.
+    Returns ``(text, approximate_cost)``. The text is the first choice's
+    message content (an empty string when the provider returns none).
+    """
+    provider: BaseProvider = get_provider()
+    creds = provider.completion_kwargs()
+    async with GlobalRateLimiter.acquire_async():
+        resp = await litellm.acompletion(  # pyright: ignore[reportArgumentType]  # raw-llm — litellm over-strict signature
+            model=provider.litellm_model(model),
+            messages=_messages(prompt),
+            temperature=temperature,
+            max_tokens=max_tokens,
+            api_key=creds.get("api_key"),
+            api_base=creds.get("api_base"),
+        )
+    content = resp.choices[0].message.content  # pyright: ignore[reportAttributeAccessIssue]  # raw-llm — litellm ModelResponse
+    return (content or ""), _response_cost(resp)
+async def astream_text(
+    prompt: str | list[dict[str, str]],
+    *,
+    temperature: float,
+    model: str | None,
+) -> AsyncIterator[str]:
+    """Stream plain-text deltas via LiteLLM.
+    Yields each chunk's textual delta as it arrives. The rate-limit slot
+    is held for the lifetime of the stream.
+    """
+    provider: BaseProvider = get_provider()
+    creds = provider.completion_kwargs()
+    async with GlobalRateLimiter.acquire_async():
+        stream = await litellm.acompletion(  # pyright: ignore[reportArgumentType]  # raw-llm — litellm over-strict signature
+            model=provider.litellm_model(model),
+            messages=_messages(prompt),
+            temperature=temperature,
+            stream=True,
+            api_key=creds.get("api_key"),
+            api_base=creds.get("api_base"),
+        )
+        async for chunk in stream:  # pyright: ignore[reportGeneralTypeIssues]  # raw-llm — litellm stream wrapper is async-iterable
+            delta = chunk.choices[0].delta.content  # pyright: ignore[reportAttributeAccessIssue]  # raw-llm — litellm stream chunk
+            if delta:
+                yield delta

llmkit/exceptions.py ADDED Viewed

@@ -0,0 +1,27 @@
+"""Recoverable exception types for LLM service calls.
+Use ``LLM_RECOVERABLE_ERRORS`` in ``except`` clauses to catch expected LLM
+operational failures (network errors, rate limits, transient provider
+errors, schema-validation/parsing failures, timeouts) while letting
+programming errors (TypeError, AttributeError) propagate.
+``with_retries()`` (see :mod:`llmkit.retry`) is the transient-retry
+layer; instructor's own ``max_retries`` handles schema-repair separately.
+"""
+import httpx
+import openai
+from instructor.core import InstructorRetryException
+from pydantic import ValidationError
+# LiteLLM's transient errors (RateLimitError, Timeout, APIConnectionError,
+# ServiceUnavailableError, InternalServerError, ...) all subclass
+# ``openai.APIError``, so it covers them in one entry. ``InstructorRetryException``
+# is raised when instructor exhausts its in-call schema-validation retries.
+# The remaining entries cover the other failure kinds named in the module
+# docstring: ``httpx.RequestError`` (request/network failures),
+# ``ValidationError`` (schema-validation/parsing failures) and the builtin
+# ``TimeoutError`` (timeouts).
+LLM_RECOVERABLE_ERRORS: tuple[type[Exception], ...] = (
+    openai.APIError,
+    InstructorRetryException,
+    httpx.RequestError,
+    ValidationError,
+    TimeoutError,
+)

llmkit/logging.py ADDED Viewed

@@ -0,0 +1,221 @@
+"""Per-call LLM invocation logging via a pluggable sink.
+Every LLM round-trip is recorded as an :class:`LLMCallRecord` and handed
+to the configured :class:`LogSink`. The default sink writes one YAML file
+per call to a directory (``data/llm-logs/`` by default), preserving the
+historical log shape so existing analysis tooling keeps working.
+Logging is unconditional and best-effort — a sink that raises is swallowed
+so the LLM call itself never breaks because logging did. The host
+application points the sink at its chosen directory once at startup via
+:func:`configure_llm_logging`, mirroring the ``configure_rate_limit``
+module-level pattern.
+"""
+from __future__ import annotations
+import json
+import logging
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Protocol
+import yaml
+# Module-level logger; used for the warnings emitted when a log write fails.
+logger = logging.getLogger(__name__)
+# Default directory for the per-call log files. It is a relative path, so it
+# resolves against the process's current working directory.
+DEFAULT_LOG_DIR = Path("data/llm-logs")
+# Compact append-only summary sibling to the per-call YAML files: one JSON
+# line per call, so cross-call scans don't have to glob + parse every YAML.
+INDEX_FILENAME = "index.jsonl"
+@dataclass(frozen=True)
+class LLMCallRecord:
+    """A single LLM round-trip, as written to the log sink.
+    ``model`` is the *resolved effective* model (the provider default
+    substituted when the caller passed ``None``) and ``provider`` names
+    the active provider, so cost attribution is a ``grep`` over the logs
+    rather than a code trace. ``schema`` is the output schema name, or the
+    literal ``"stream"`` for streamed plain-text calls. ``response`` is the
+    Pydantic-dumped result, the accumulated stream text, or ``None``.
+    ``approximate_cost`` is a best-effort USD estimate for budget
+    visibility — NOT a billing figure. It is sourced from LiteLLM's
+    per-response cost (no local price table) and is ``None`` when the
+    provider does not report it (e.g. streamed calls).
+    """
+    started_at: datetime
+    feature: str
+    label: str | None
+    model: str | None
+    provider: str | None
+    temperature: float
+    duration_ms: float
+    schema: str
+    prompt: str | list[dict[str, str]]
+    response: Any  # pyright: ignore[reportExplicitAny]  # raw-llm — Pydantic dump or accumulated text
+    error: str | None
+    approximate_cost: float | None = None
+class LogSink(Protocol):
+    """Destination for :class:`LLMCallRecord`s.
+    ``write`` returns the path it wrote (so callers tracking log paths can
+    cross-reference), or ``None`` if nothing was persisted.
+    """
+    def write(self, record: LLMCallRecord) -> Path | None: ...
+class LocalYamlLogSink:
+    """Default sink: one YAML file per call under ``log_dir``, plus a
+    compact append-only ``index.jsonl`` summarising every call.
+    The per-call YAML is laid out **verdict-first** — a one-line summary
+    comment header (status / feature / model / schema / duration / cost),
+    then the small metadata fields, with the large ``response`` and
+    ``prompt`` blobs last — so a reader (a human, but in practice mostly a
+    coding agent) learns what happened from the head of the file without
+    paying to scan the whole prompt. ``index.jsonl`` carries one short
+    line per call (file, timestamp, feature, label, model, schema,
+    duration, cost, error) so cross-call questions — "which calls errored
+    / were slowest / most expensive / the last call for feature X" — are a
+    single small scan instead of globbing and parsing every YAML.
+    """
+    def __init__(self, log_dir: Path = DEFAULT_LOG_DIR) -> None:
+        self.log_dir = log_dir
+    def write(self, record: LLMCallRecord) -> Path | None:
+        try:
+            self.log_dir.mkdir(parents=True, exist_ok=True)
+            ts = record.started_at.strftime("%Y-%m-%dT%H-%M-%S-%f")
+            safe_label = (record.label or "unlabeled").replace(".", "_").replace("/", "_")
+            filepath = self.log_dir / f"{ts}_{record.feature}_{safe_label}.yaml"
+            # Verdict-first order: cheap, high-signal metadata up top; the
+            # large ``response``/``prompt`` blobs last (``response`` first —
+            # it's what a debugger usually wants), so the head of the file
+            # is the whole story for most reads.
+            doc: dict[str, Any] = {  # pyright: ignore[reportExplicitAny]  # raw-llm — YAML log body dict
+                "timestamp": record.started_at.isoformat(),
+                "feature": record.feature,
+                "label": record.label,
+                "model": record.model,
+                "provider": record.provider,
+                "schema": record.schema,
+                "temperature": record.temperature,
+                "duration_ms": round(record.duration_ms, 1),
+                "approximate_cost": record.approximate_cost,
+                "error": record.error,
+                "response": record.response,
+                "prompt": record.prompt,
+            }
+            with open(filepath, "w") as f:
+                f.write(self._summary_header(record))
+                yaml.dump(
+                    doc,
+                    f,
+                    default_flow_style=False,
+                    sort_keys=False,
+                    allow_unicode=True,
+                    width=120,
+                )
+        except (OSError, yaml.YAMLError):
+            logger.warning(
+                "Failed to write LLM invocation log for %s/%s",
+                record.feature,
+                record.label,
+                exc_info=True,
+            )
+            return None
+        # Best-effort index append, kept separate so an index failure can
+        # never lose the per-call record that was just written successfully.
+        self._append_index(record, filepath)
+        return filepath
+    @staticmethod
+    def _summary_header(record: LLMCallRecord) -> str:
+        """Build the two-line ``#`` comment that opens each per-call YAML.
+        The first line is a single-glance verdict — ``ok``/``ERROR``,
+        feature/label, resolved model, schema, duration, approximate cost —
+        so ``head -1`` across the directory triages a whole run.
+        """
+        status = "ERROR" if record.error else "ok"
+        cost = f"${record.approximate_cost:.3g}" if record.approximate_cost is not None else "$?"
+        return (
+            f"# {status} | {record.feature}/{record.label or 'unlabeled'} | "
+            f"{record.model or '?'} | {record.schema} | "
+            f"{round(record.duration_ms)}ms | {cost}\n"
+            f"# {record.started_at.isoformat()}\n\n"
+        )
+    def _append_index(self, record: LLMCallRecord, filepath: Path) -> None:
+        """Append one compact JSON line for *record* to ``index.jsonl``.
+        Best-effort and swallowed on failure (logging must never break the
+        call). A single ``write`` of a sub-4KB line under ``O_APPEND`` is
+        atomic on POSIX, so concurrent calls don't interleave lines.
+        """
+        line: dict[str, str | float | None] = {
+            "file": filepath.name,
+            "timestamp": record.started_at.isoformat(),
+            "feature": record.feature,
+            "label": record.label,
+            "model": record.model,
+            "provider": record.provider,
+            "schema": record.schema,
+            "duration_ms": round(record.duration_ms, 1),
+            "approximate_cost": record.approximate_cost,
+            "error": record.error,
+        }
+        try:
+            with open(self.log_dir / INDEX_FILENAME, "a", encoding="utf-8") as f:
+                f.write(json.dumps(line, ensure_ascii=False) + "\n")
+        except OSError:
+            logger.warning(
+                "Failed to append LLM log index for %s/%s",
+                record.feature,
+                record.label,
+                exc_info=True,
+            )
+# Module-level configured sink, defaulting to the local-YAML sink at the
+# default directory. The host overrides it once at startup; tests typically
+# point it at a tmp directory. ``None`` means logging is disabled.
+_sink: LogSink | None = LocalYamlLogSink()
+def configure_llm_logging(sink: LogSink | None) -> None:
+    """Set the sink that receives every :class:`LLMCallRecord`.
+    Pass ``None`` to disable logging entirely (writes become no-ops).
+    """
+    global _sink
+    _sink = sink
+def write_llm_log(record: LLMCallRecord) -> Path | None:
+    """Hand ``record`` to the configured sink, swallowing any failure.
+    Logging must never break the LLM call, so a sink that raises is
+    caught here in addition to the sink's own best-effort handling.
+    Returns the written path, or ``None`` when nothing was persisted.
+    """
+    if _sink is None:
+        return None
+    try:
+        return _sink.write(record)
+    except Exception:
+        logger.warning("LLM log sink raised for %s/%s", record.feature, record.label, exc_info=True)
+        return None