debugerai 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debugai/metrics.py ADDED
@@ -0,0 +1,139 @@
1
+ """Per-model metrics ledger — lightweight thread-safe counters for tokens,
2
+ cost, latency, requests, and failures.
3
+
4
+ import debugai
5
+ debugai.metrics.snapshot() # full dict
6
+ debugai.metrics.by_model # per-model breakdown
7
+ debugai.metrics.reset() # clear all counters
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import threading
13
+ from dataclasses import dataclass, field
14
+
15
+
16
@dataclass
class _ModelStats:
    """Running totals for one model (or for the cross-model aggregate).

    The class does no locking of its own; callers must serialise access.

    Latency samples are kept in a bounded sliding window so that a long-lived
    process does not accumulate one float per request forever. Percentiles are
    therefore computed over the most recent ``_MAX_LATENCY_SAMPLES`` non-cached
    requests.
    """

    # Upper bound on retained latency samples. Deliberately unannotated so the
    # dataclass machinery treats it as a class constant rather than a field.
    _MAX_LATENCY_SAMPLES = 10_000

    requests: int = 0           # every recorded call, cached or not
    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0       # prompt + completion
    cost_usd: float = 0.0       # kept rounded to 8 decimal places
    failures: int = 0
    cache_hits: int = 0
    cache_misses: int = 0
    _latencies: list[float] = field(default_factory=list)  # ms, non-cached only

    def record(self, prompt: int, completion: int, cost: float,
               latency_ms: float, failed: bool, from_cache: bool = False) -> None:
        """Fold one request into the counters.

        Args:
            prompt: Prompt token count.
            completion: Completion token count.
            cost: Cost of the request in USD.
            latency_ms: Request latency in milliseconds; ignored for cache hits.
            failed: Whether the request failed.
            from_cache: Whether the response was served from cache.
        """
        self.requests += 1
        self.prompt_tokens += prompt
        self.completion_tokens += completion
        self.total_tokens += prompt + completion
        # Re-round after each addition to stop float error from accumulating.
        self.cost_usd = round(self.cost_usd + cost, 8)
        if failed:
            self.failures += 1
        if from_cache:
            self.cache_hits += 1
        else:
            self.cache_misses += 1
            self._latencies.append(latency_ms)
            # Drop the oldest samples once the window is full.
            overflow = len(self._latencies) - self._MAX_LATENCY_SAMPLES
            if overflow > 0:
                del self._latencies[:overflow]

    def _pct(self, p: float) -> float:
        """Return the latency at fraction ``p`` (0..1) of the sorted samples.

        Uses index ``int(len * p)`` clamped to the valid range, rounded to two
        decimals. Returns 0.0 when no samples have been recorded.
        """
        if not self._latencies:
            return 0.0
        s = sorted(self._latencies)
        i = max(0, min(int(len(s) * p), len(s) - 1))
        return round(s[i], 2)

    def to_dict(self) -> dict:
        """Return the counters plus p50/p95 latency as a plain dict."""
        return {
            "requests": self.requests,
            "prompt_tokens": self.prompt_tokens,
            "completion_tokens": self.completion_tokens,
            "total_tokens": self.total_tokens,
            "cost_usd": round(self.cost_usd, 6),
            "failures": self.failures,
            "cache_hits": self.cache_hits,
            "cache_misses": self.cache_misses,
            "latency_p50_ms": self._pct(0.50),
            "latency_p95_ms": self._pct(0.95),
        }
64
+
65
+
66
class MetricsLedger:
    """Aggregate usage counters, tracked per model and across all models.

    Every method and property below takes the internal lock, so the ledger can
    be written by the SDK worker and read from other threads.
    """

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._models: dict[str, _ModelStats] = {}
        self._global = _ModelStats()

    # ── Recording (called by background worker) ─────────────────────────────
    def record(self, model: str, prompt_tokens: int, completion_tokens: int,
               cost_usd: float, latency_ms: float, failed: bool,
               from_cache: bool = False) -> None:
        """Add one request to both the per-model and the global counters."""
        sample = (prompt_tokens, completion_tokens, cost_usd,
                  latency_ms, failed, from_cache)
        with self._lock:
            stats = self._models.get(model)
            if stats is None:
                # First time this model is seen: start a fresh counter set.
                stats = self._models[model] = _ModelStats()
            stats.record(*sample)
            self._global.record(*sample)

    # ── Read properties (safe from any thread) ──────────────────────────────
    @property
    def requests(self) -> int:
        """Total number of recorded requests."""
        with self._lock:
            return self._global.requests

    @property
    def failures(self) -> int:
        """Total number of failed requests."""
        with self._lock:
            return self._global.failures

    @property
    def total_tokens(self) -> int:
        """Prompt plus completion tokens across all models."""
        with self._lock:
            return self._global.total_tokens

    @property
    def cost_usd(self) -> float:
        """Accumulated cost in USD across all models."""
        with self._lock:
            return self._global.cost_usd

    @property
    def latency_p50(self) -> float:
        """Global p50 latency in milliseconds."""
        with self._lock:
            return self._global._pct(0.50)

    @property
    def latency_p95(self) -> float:
        """Global p95 latency in milliseconds."""
        with self._lock:
            return self._global._pct(0.95)

    @property
    def by_model(self) -> dict[str, dict]:
        """Per-model counters as plain dicts, keyed by model name."""
        with self._lock:
            breakdown = {}
            for name, stats in self._models.items():
                breakdown[name] = stats.to_dict()
            return breakdown

    def snapshot(self) -> dict:
        """Return a complete, JSON-serialisable snapshot of all counters."""
        with self._lock:
            result = dict(self._global.to_dict())
            result["by_model"] = {
                name: stats.to_dict() for name, stats in self._models.items()
            }
            return result

    def reset(self) -> None:
        """Clear all counters (useful between test runs or reporting windows)."""
        with self._lock:
            self._global = _ModelStats()
            self._models.clear()


# Module-level singleton — `import debugai; debugai.metrics`
metrics = MetricsLedger()
debugai/models.py ADDED
@@ -0,0 +1,92 @@
1
+ """Lazy-loaded small ML models (Architecture §8.2).
2
+
3
+ These are NOT LLMs — they are tiny, fast, task-specific models that run on CPU:
4
+ - sentence-transformers/all-MiniLM-L6-v2 (embeddings, ~80MB)
5
+ - spaCy en_core_web_sm (NER, ~12MB)
6
+ - cross-encoder/nli-deberta-v3-base (NLI, ~500MB) — or cross-encoder/nli-MiniLM2-L6-H768 (~120MB) when DEBUGAI_LITE=1
7
+
8
+ Each loader is a cached singleton so model weights load once per process. If a
9
+ model is unavailable, the loader returns ``None`` and signal computations fall
10
+ back to their deterministic pure-Python methods (per the doc's layered design).
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import functools
16
+ import logging
17
+ import os
18
+
19
log = logging.getLogger("debugai.models")

EMBED_MODEL = "all-MiniLM-L6-v2"
SPACY_MODEL = "en_core_web_sm"


def _env_flag(name: str) -> bool:
    """Return True when environment variable ``name`` is set to a truthy value.

    Unset, empty/whitespace-only, and the explicit "off" spellings ``0``,
    ``false``, ``no`` and ``off`` (case-insensitive) count as False. Any other
    value counts as True. A plain ``bool(os.environ.get(name))`` would treat
    ``NAME=0`` as enabled, which is the opposite of what the user asked for.
    """
    value = os.environ.get(name, "").strip().lower()
    return value not in ("", "0", "false", "no", "off")


# Three NLI modes (set via env var):
#
#   default            → cross-encoder/nli-deberta-v3-base  local (~500 MB RAM)
#                        Most accurate. Best for self-hosted VPS with ≥1 GB RAM.
#
#   DEBUGAI_LITE=1     → cross-encoder/nli-MiniLM2-L6-H768  local (~120 MB RAM)
#                        Fits in free-tier PaaS (512 MB RAM). Slightly more
#                        false-positive contradictions but good enough.
#
#   DEBUGAI_NLI_API=1  → HuggingFace Inference API           zero local RAM
#                        Sends (premise, hypothesis) to api-inference.huggingface.co.
#                        Set HF_TOKEN for higher rate limits (free account works).
#                        Best choice for Render / Railway free tier.
_LITE = _env_flag("DEBUGAI_LITE")
_NLI_API = _env_flag("DEBUGAI_NLI_API")
NLI_MODEL = ("cross-encoder/nli-MiniLM2-L6-H768" if _LITE
             else "cross-encoder/nli-deberta-v3-base")
NLI_HF_MODEL_ID = "cross-encoder/nli-deberta-v3-base"  # used by the API path
42
+
43
+
44
@functools.lru_cache(maxsize=1)
def embedder():
    """Load the sentence-embedding model, or return None if that fails.

    The result is cached by ``lru_cache``, so the load is attempted once per
    process; that includes a ``None`` result after a failed attempt.
    """
    model = None
    try:
        from sentence_transformers import SentenceTransformer

        log.info("loading embedding model %s", EMBED_MODEL)
        model = SentenceTransformer(EMBED_MODEL)
    except Exception as exc:  # pragma: no cover - environment dependent
        # Best-effort: any import or load failure falls back to None.
        log.warning("embedder unavailable (%s); using token-overlap fallback", exc)
    return model
55
+
56
+
57
@functools.lru_cache(maxsize=1)
def nli_model():
    """Return the NLI backend: a CrossEncoder, an API sentinel, or None.

    With DEBUGAI_NLI_API enabled no local model is loaded; a
    ``_HFNLISentinel`` instance is returned instead so the caller can route
    the request to the HuggingFace Inference API. Otherwise the CrossEncoder
    named by ``NLI_MODEL`` is loaded, and ``None`` is returned if that fails.
    The result is cached by ``lru_cache``.
    """
    if _NLI_API:
        return _HFNLISentinel()

    model = None
    try:
        from sentence_transformers import CrossEncoder

        log.info("loading NLI model %s", NLI_MODEL)
        model = CrossEncoder(NLI_MODEL)
    except Exception as exc:  # pragma: no cover - environment dependent
        # Best-effort: any import or load failure falls back to None.
        log.warning("NLI model unavailable (%s); contradiction set to 0.0", exc)
    return model
74
+
75
+
76
class _HFNLISentinel:
    """Placeholder that nli_model() hands back when DEBUGAI_NLI_API=1.

    It carries no model; callers check ``is_hf_api`` and use the HuggingFace
    Inference API instead of a local CrossEncoder.
    """

    # Class-level flag so both the class and its instances expose it.
    is_hf_api = True
80
+
81
+
82
@functools.lru_cache(maxsize=1)
def ner():
    """Load the spaCy pipeline named by ``SPACY_MODEL``, or return None.

    The result is cached by ``lru_cache``, so the load is attempted once per
    process; that includes a ``None`` result after a failed attempt.
    """
    pipeline = None
    try:
        import spacy

        log.info("loading spaCy model %s", SPACY_MODEL)
        pipeline = spacy.load(SPACY_MODEL)
    except Exception as exc:  # pragma: no cover - environment dependent
        # Best-effort: any import or load failure falls back to None.
        log.warning("spaCy model unavailable (%s); using regex NER fallback", exc)
    return pipeline
debugai/providers.py ADDED
@@ -0,0 +1,179 @@
1
+ """Provider routing table — maps model name prefixes to (base_url, api_key_env,
2
+ adapter_class). Called by ``debugai.completion()`` / ``debugai.acompletion()``.
3
+
4
+ The key architectural insight: most modern providers speak the OpenAI REST API spec
5
+ (same endpoint shape, same response format). A single ``_OpenAICompatAdapter`` covers:
6
+ - Google Gemini (via Google's official OpenAI-compat endpoint)
7
+ - Ollama + any local model (Qwen, Llama, Phi, DeepSeek…)
8
+ - Groq, Together AI, Mistral AI, OpenRouter, Azure OpenAI, LM Studio, vLLM
9
+ - Any custom server that accepts POST /v1/chat/completions
10
+
11
+ Besides OpenAI itself, only Anthropic and Cohere use their own adapters (different API shapes).
12
+
13
+ All routing is prefix-based on the model name. Users can extend or override the table:
14
+ from debugai import register_provider
15
+ register_provider(matches=lambda m: m.startswith("my-"), adapter=MyAdapter,
16
+ client_factory=lambda cfg: MyClient(...))
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import os
22
+ from dataclasses import dataclass
23
+ from typing import TYPE_CHECKING, Any, Callable
24
+
25
+ if TYPE_CHECKING:
26
+ from debugai.config import DebugAIConfig
27
+
28
+
29
@dataclass(frozen=True)
class ProviderRoute:
    """One immutable row of the provider routing table.

    Attributes:
        prefix: Model-name prefix this route matches, e.g. "gemini-", "ollama/".
        name: Human-readable provider name, e.g. "Google Gemini".
        base_url: Endpoint override; None means use the SDK default.
        api_key_env: Environment variable holding the API key; None means no
            key is needed (local server).
        adapter: Adapter tag: "openai", "openai_compat", "anthropic", "cohere"
            or "ollama".
        notes: Free-form setup hints.
    """

    prefix: str
    name: str
    base_url: str | None
    api_key_env: str | None
    adapter: str
    notes: str = ""
37
+
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # The routing table. Checked in order — first prefix match wins.
41
+ # ---------------------------------------------------------------------------
42
PROVIDER_ROUTES: list[ProviderRoute] = [
    # ── OpenAI (base_url=None → SDK default endpoint) ───────────────────────
    *(
        ProviderRoute(prefix, "OpenAI", None, "OPENAI_API_KEY", "openai")
        for prefix in ("gpt-", "o1-", "o3-", "o4-", "text-")
    ),

    # ── Anthropic ───────────────────────────────────────────────────────────
    ProviderRoute(prefix="claude-", name="Anthropic", base_url=None,
                  api_key_env="ANTHROPIC_API_KEY", adapter="anthropic"),

    # ── Google Gemini (OpenAI-compat endpoint) ──────────────────────────────
    *(
        ProviderRoute(prefix, "Google Gemini",
                      "https://generativelanguage.googleapis.com/v1beta/openai/",
                      "GEMINI_API_KEY", "openai_compat")
        for prefix in ("gemini-", "google/")
    ),

    # ── Groq (fast inference, OpenAI-compat) ────────────────────────────────
    ProviderRoute(prefix="groq/", name="Groq",
                  base_url="https://api.groq.com/openai/v1",
                  api_key_env="GROQ_API_KEY", adapter="openai_compat"),

    # ── Together AI (OpenAI-compat) ─────────────────────────────────────────
    ProviderRoute(prefix="together/", name="Together AI",
                  base_url="https://api.together.xyz/v1",
                  api_key_env="TOGETHER_API_KEY", adapter="openai_compat"),

    # ── Mistral AI (OpenAI-compat) ──────────────────────────────────────────
    ProviderRoute(prefix="mistral/", name="Mistral AI",
                  base_url="https://api.mistral.ai/v1",
                  api_key_env="MISTRAL_API_KEY", adapter="openai_compat"),

    # ── OpenRouter (multi-provider proxy, OpenAI-compat) ────────────────────
    ProviderRoute(prefix="openrouter/", name="OpenRouter",
                  base_url="https://openrouter.ai/api/v1",
                  api_key_env="OPENROUTER_API_KEY", adapter="openai_compat"),

    # ── Azure OpenAI (OpenAI-compat + endpoint from env) ────────────────────
    ProviderRoute(prefix="azure/", name="Azure OpenAI", base_url=None,
                  api_key_env="AZURE_OPENAI_API_KEY", adapter="openai_compat",
                  notes="Set AZURE_OPENAI_ENDPOINT. Model name: 'azure/<deployment-name>'"),

    # ── Cohere (native adapter) ─────────────────────────────────────────────
    *(
        ProviderRoute(prefix, "Cohere", None, "COHERE_API_KEY", "cohere")
        for prefix in ("cohere/", "command-")
    ),

    # ── Ollama — local models via the Ollama OpenAI-compat server ───────────
    # Any model served by Ollama: qwen2.5, llama3, phi3, deepseek-coder, codellama…
    # Default: http://localhost:11434/v1 (override with OLLAMA_BASE_URL or config.ollama_base_url)
    ProviderRoute(prefix="ollama/", name="Ollama (local)", base_url=None,
                  api_key_env=None, adapter="ollama",
                  notes="Requires Ollama running. Set OLLAMA_BASE_URL if not localhost:11434."),
    # Common local model families also route to Ollama when no prefix is given.
    *(
        ProviderRoute(prefix, f"Ollama ({family})", None, None, "ollama")
        for prefix, family in (
            ("qwen", "Qwen"),
            ("llama", "Llama"),
            ("phi", "Phi"),
            ("deepseek", "DeepSeek"),
            ("codellama", "CodeLlama"),
            ("gemma", "Gemma"),
            ("mixtral", "Mixtral"),
            ("vicuna", "Vicuna"),
        )
    ),
]
92
+
93
+
94
def route_for(model: str) -> ProviderRoute | None:
    """Look up the routing entry for ``model``.

    Matching is a case-insensitive prefix test against each entry of
    ``PROVIDER_ROUTES`` in table order; the first hit wins. Returns ``None``
    when no prefix matches.
    """
    needle = model.lower()
    return next(
        (candidate for candidate in PROVIDER_ROUTES
         if needle.startswith(candidate.prefix.lower())),
        None,
    )
101
+
102
+
103
def _get_adapter_map():
    """Build the adapter-tag → adapter-class mapping.

    The SDK import happens inside the function rather than at module load,
    which avoids a circular import between this module and ``debugai.sdk``.
    """
    from debugai.sdk import (
        _AnthropicAdapter,
        _CohereAdapter,
        _OpenAIAdapter,
        _OpenAICompatAdapter,
    )

    # Ollama is served through the same OpenAI-compatible adapter.
    compat = _OpenAICompatAdapter
    return dict(
        openai=_OpenAIAdapter,
        anthropic=_AnthropicAdapter,
        openai_compat=compat,
        ollama=compat,
        cohere=_CohereAdapter,
    )
116
+
117
+
118
# Lazily evaluated so there's no circular import at load time.
class _AdapterMapProxy(dict):
    """dict of adapter tag → adapter class, populated on first read.

    Every read path (lookup, ``get``, ``in``, ``len``, iteration and the
    ``keys``/``values``/``items`` views) triggers the one-time load, so the
    mapping never looks empty just because nothing has been looked up yet.
    Entries stored before the first load are treated as explicit overrides and
    are not replaced by the built-in defaults.
    """

    _loaded = False

    def _load_defaults(self):
        """Return the built-in adapter mapping (performs the deferred import)."""
        return _get_adapter_map()

    def _ensure(self):
        """Populate the dict with the defaults once; later calls are no-ops."""
        if self._loaded:
            return
        for key, adapter in self._load_defaults().items():
            # setdefault keeps anything the caller registered before the load.
            dict.setdefault(self, key, adapter)
        # Only flag as loaded after success so a failed import can be retried.
        self._loaded = True

    def get(self, key, default=None):
        self._ensure()
        return super().get(key, default)

    def __getitem__(self, key):
        self._ensure()
        return super().__getitem__(key)

    def __contains__(self, key):
        self._ensure()
        return super().__contains__(key)

    def __iter__(self):
        self._ensure()
        return super().__iter__()

    def __len__(self):
        self._ensure()
        return super().__len__()

    def keys(self):
        self._ensure()
        return super().keys()

    def values(self):
        self._ensure()
        return super().values()

    def items(self):
        self._ensure()
        return super().items()


_ADAPTER_MAP = _AdapterMapProxy()
134
+
135
+
136
def make_client(route: ProviderRoute, config: "DebugAIConfig") -> Any:
    """Build the provider client for ``route``.

    OpenAI-compatible providers (including Ollama) are instantiated through
    the OpenAI SDK, so no per-provider package is needed for them.

    Args:
        route: Routing entry selecting the adapter, endpoint and key variable.
        config: SDK configuration; only ``ollama_base_url`` is read here, and
            only for the "ollama" adapter.

    Returns:
        An Anthropic, Cohere or OpenAI client object, depending on the adapter.

    Raises:
        ImportError: The provider's SDK package is not installed.
        ValueError: ``route.adapter`` is not a known adapter tag, or an Azure
            route is used without AZURE_OPENAI_ENDPOINT being set.
    """
    adapter = route.adapter

    if adapter == "anthropic":
        from anthropic import Anthropic
        # No api_key passed: the Anthropic SDK resolves it itself.
        return Anthropic(timeout=60.0, max_retries=2)

    if adapter == "cohere":
        # Keep the try body to the import alone so that an ImportError raised
        # while constructing the client is not mislabelled as "not installed".
        try:
            import cohere  # optional dependency
        except ImportError as err:
            raise ImportError(
                "Cohere models require the 'cohere' package: pip install cohere"
            ) from err
        return cohere.ClientV2(
            api_key=os.environ.get(route.api_key_env or "COHERE_API_KEY", "")
        )

    if adapter == "ollama":
        from openai import OpenAI
        # Precedence: explicit config, then environment, then the local default.
        base_url = (getattr(config, "ollama_base_url", None)
                    or os.environ.get("OLLAMA_BASE_URL")
                    or "http://localhost:11434/v1")
        # The key is a placeholder string; the route table declares no key for Ollama.
        return OpenAI(base_url=base_url, api_key="ollama", timeout=120.0)

    if adapter in ("openai_compat", "openai"):
        base_url = route.base_url

        if adapter == "openai_compat" and route.prefix.startswith("azure/"):
            endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT", "")
            if not endpoint:
                # Without this check the client would be built with the
                # meaningless base URL "/openai" and fail on first request.
                raise ValueError(
                    "Azure OpenAI models require the AZURE_OPENAI_ENDPOINT "
                    "environment variable to be set."
                )
            base_url = endpoint.rstrip("/") + "/openai"

        if route.api_key_env:
            api_key = os.environ.get(route.api_key_env, "")
        elif adapter == "openai_compat":
            # Compat servers without a key variable still get a non-empty key.
            api_key = "no-key-needed"
        else:
            api_key = ""

        # Imported after validation so configuration errors surface first.
        from openai import OpenAI
        return OpenAI(
            base_url=base_url,
            api_key=api_key,
            timeout=60.0,
            max_retries=2,
        )

    raise ValueError(f"Unknown adapter type {route.adapter!r} in routing table.")
debugai/schema.py ADDED
@@ -0,0 +1,66 @@
1
+ """SDK capture schema (Architecture §3).
2
+
3
+ The unified payload every integration level produces. Only the Core IO group is
4
+ strictly required; retrieval and runtime groups unlock RAG-specific and
5
+ capacity signals respectively.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+ from typing import Any
12
+
13
+
14
@dataclass
class CaptureRecord:
    """One captured LLM interaction: prompt, context, output and metadata.

    Fields fall into four groups (§3.1). Only Core IO is mandatory; the
    others default to empty/None when not supplied:
        Core IO    — the minimum viable input.
        Retrieval  — RAG-specific signals.
        Metadata   — pipeline configuration.
        Runtime    — auto-captured metrics.
    """

    # Core IO: the first two have no default and must be supplied.
    user_prompt: str
    llm_output: str
    system_prompt: str = ""
    expected_output: str | None = None

    # Retrieval context (RAG).
    retrieved_chunks: list[str] = field(default_factory=list)
    similarity_scores: list[float] = field(default_factory=list)
    retrieval_query: str | None = None
    chunk_sources: list[dict[str, Any]] = field(default_factory=list)

    # Pipeline metadata.
    model_name: str | None = None
    temperature: float | None = None
    max_tokens: int | None = None
    tool_calls: list[dict[str, Any]] = field(default_factory=list)

    # Runtime metrics.
    latency_ms: int | None = None
    token_usage: dict[str, int] = field(default_factory=dict)
    timestamp: str | None = None
    error_code: str | None = None

    # Optional capacity hint (used by ratio signals): the model's maximum
    # context window, in tokens.
    context_window: int | None = None

    def __post_init__(self) -> None:
        """Validate the Core IO fields.

        Raises:
            ValueError: ``user_prompt`` is empty/falsy or ``llm_output`` is None.
        """
        # An empty output string is accepted; only None is rejected.
        if not self.user_prompt:
            raise ValueError("user_prompt is required (Core IO)")
        if self.llm_output is None:
            raise ValueError("llm_output is required (Core IO)")

    @property
    def context_text(self) -> str:
        """All retrieved chunks joined by newlines (the grounding text)."""
        separator = "\n"
        return separator.join(self.retrieved_chunks)

    @property
    def full_prompt(self) -> str:
        """System and user prompt joined by a newline, outer whitespace stripped."""
        combined = "\n".join((self.system_prompt, self.user_prompt))
        return combined.strip()