promptpolygraph 0.6.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promptpolygraph/__init__.py +37 -0
- promptpolygraph/__main__.py +10 -0
- promptpolygraph/adapters/__init__.py +60 -0
- promptpolygraph/adapters/base.py +40 -0
- promptpolygraph/adapters/callable.py +64 -0
- promptpolygraph/adapters/demo.py +121 -0
- promptpolygraph/adapters/http.py +143 -0
- promptpolygraph/adapters/llm.py +198 -0
- promptpolygraph/analyze/__init__.py +33 -0
- promptpolygraph/analyze/analyzer.py +396 -0
- promptpolygraph/analyze/assertions.py +488 -0
- promptpolygraph/analyze/baseline.py +108 -0
- promptpolygraph/analyze/embedders.py +121 -0
- promptpolygraph/analyze/gate.py +289 -0
- promptpolygraph/analyze/rubric.py +177 -0
- promptpolygraph/audit/__init__.py +24 -0
- promptpolygraph/audit/code_context.py +296 -0
- promptpolygraph/audit/engine.py +109 -0
- promptpolygraph/audit/forensic.py +404 -0
- promptpolygraph/audit/persona.py +406 -0
- promptpolygraph/cli.py +974 -0
- promptpolygraph/compare/__init__.py +15 -0
- promptpolygraph/compare/matrix.py +401 -0
- promptpolygraph/compare/pairwise.py +103 -0
- promptpolygraph/compare/trend.py +81 -0
- promptpolygraph/config.py +182 -0
- promptpolygraph/configdesign.py +120 -0
- promptpolygraph/corpus/__init__.py +15 -0
- promptpolygraph/corpus/generator.py +467 -0
- promptpolygraph/corpus/loader.py +90 -0
- promptpolygraph/data/__init__.py +0 -0
- promptpolygraph/data/personas/__init__.py +0 -0
- promptpolygraph/data/personas/accessibility_user.yaml +9 -0
- promptpolygraph/data/personas/budget_conscious.yaml +10 -0
- promptpolygraph/data/personas/enthusiast.yaml +10 -0
- promptpolygraph/data/personas/first_timer.yaml +9 -0
- promptpolygraph/data/personas/frustrated_returning_customer.yaml +10 -0
- promptpolygraph/data/personas/non_native_speaker.yaml +9 -0
- promptpolygraph/data/personas/plain_language_needer.yaml +10 -0
- promptpolygraph/data/personas/power_user.yaml +9 -0
- promptpolygraph/data/personas/privacy_skeptic.yaml +10 -0
- promptpolygraph/data/personas/the_journalist.yaml +10 -0
- promptpolygraph/data/personas/the_skeptic.yaml +9 -0
- promptpolygraph/data/personas/the_troll.yaml +10 -0
- promptpolygraph/data/personas/time_pressed_exec.yaml +8 -0
- promptpolygraph/discovery.py +83 -0
- promptpolygraph/elicit.py +338 -0
- promptpolygraph/llm.py +215 -0
- promptpolygraph/models.py +228 -0
- promptpolygraph/persona/__init__.py +20 -0
- promptpolygraph/persona/library.py +79 -0
- promptpolygraph/persona/new.py +160 -0
- promptpolygraph/pipeline.py +210 -0
- promptpolygraph/redteam/__init__.py +38 -0
- promptpolygraph/redteam/catalog.py +84 -0
- promptpolygraph/redteam/codetrace.py +180 -0
- promptpolygraph/redteam/converters.py +216 -0
- promptpolygraph/redteam/design.py +148 -0
- promptpolygraph/redteam/guard.py +239 -0
- promptpolygraph/redteam/judge.py +73 -0
- promptpolygraph/redteam/models.py +120 -0
- promptpolygraph/redteam/multiturn.py +435 -0
- promptpolygraph/redteam/orchestrator.py +235 -0
- promptpolygraph/redteam/profiles.py +143 -0
- promptpolygraph/redteam/report.py +394 -0
- promptpolygraph/redteam/rootcause.py +148 -0
- promptpolygraph/redteam/roster.py +231 -0
- promptpolygraph/redteam/scrub.py +56 -0
- promptpolygraph/redteam/sources/__init__.py +49 -0
- promptpolygraph/redteam/sources/base.py +64 -0
- promptpolygraph/redteam/sources/catalog_source.py +31 -0
- promptpolygraph/redteam/sources/dataset_source.py +343 -0
- promptpolygraph/redteam/sources/deepteam_source.py +428 -0
- promptpolygraph/redteam/sources/garak_source.py +331 -0
- promptpolygraph/redteam/sources/pyrit_source.py +382 -0
- promptpolygraph/redteam/strategies.py +143 -0
- promptpolygraph/report/__init__.py +104 -0
- promptpolygraph/report/_env.py +51 -0
- promptpolygraph/report/charts.py +415 -0
- promptpolygraph/report/context.py +580 -0
- promptpolygraph/report/docx.py +298 -0
- promptpolygraph/report/html.py +414 -0
- promptpolygraph/report/markdown.py +360 -0
- promptpolygraph/report/pdf.py +101 -0
- promptpolygraph/report/templates/default/report.html.j2 +349 -0
- promptpolygraph/report/templates/default/report.md.j2 +121 -0
- promptpolygraph/report/templates/minimal/report.html.j2 +34 -0
- promptpolygraph/report/templates/minimal/report.md.j2 +13 -0
- promptpolygraph/runner/__init__.py +16 -0
- promptpolygraph/runner/runner.py +144 -0
- promptpolygraph/runner/store.py +188 -0
- promptpolygraph/service/__init__.py +22 -0
- promptpolygraph/service/app.py +465 -0
- promptpolygraph/service/auth.py +27 -0
- promptpolygraph/service/dashboard.py +720 -0
- promptpolygraph/service/db.py +284 -0
- promptpolygraph/service/jobspec.py +62 -0
- promptpolygraph/service/scheduler.py +67 -0
- promptpolygraph/service/schemas.py +81 -0
- promptpolygraph/service/server.py +25 -0
- promptpolygraph/service/settings.py +60 -0
- promptpolygraph/service/webhooks.py +24 -0
- promptpolygraph/service/worker.py +108 -0
- promptpolygraph/tune.py +124 -0
- promptpolygraph/ui/__init__.py +12 -0
- promptpolygraph/ui/arena.py +1369 -0
- promptpolygraph/ui/chrome.py +634 -0
- promptpolygraph/ui/page.py +2635 -0
- promptpolygraph/ui/server.py +1738 -0
- promptpolygraph-0.6.6.dist-info/METADATA +380 -0
- promptpolygraph-0.6.6.dist-info/RECORD +114 -0
- promptpolygraph-0.6.6.dist-info/WHEEL +4 -0
- promptpolygraph-0.6.6.dist-info/entry_points.txt +4 -0
- promptpolygraph-0.6.6.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""PromptPolygraph — synthetic-prompt evaluation and persona-audit harness.
|
|
2
|
+
|
|
3
|
+
A local-first, cloud-agnostic harness for pushing thousands of synthetic prompts
|
|
4
|
+
through any web/API/LLM system, scoring the responses against a pluggable rubric
|
|
5
|
+
(plus deterministic assertions and an optional multi-judge ensemble), reacting to
|
|
6
|
+
them through a panel of personas, tracing low scores with a forensic audit, and
|
|
7
|
+
rendering the result as a markdown / docx / pdf / html report.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from .models import (
|
|
11
|
+
AssertionResult,
|
|
12
|
+
AssertionSpec,
|
|
13
|
+
Case,
|
|
14
|
+
Dimension,
|
|
15
|
+
Persona,
|
|
16
|
+
Response,
|
|
17
|
+
Rubric,
|
|
18
|
+
RunMeta,
|
|
19
|
+
Score,
|
|
20
|
+
fingerprint,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
__version__ = "0.1.0"
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"AssertionResult",
|
|
27
|
+
"AssertionSpec",
|
|
28
|
+
"Case",
|
|
29
|
+
"Dimension",
|
|
30
|
+
"Persona",
|
|
31
|
+
"Response",
|
|
32
|
+
"Rubric",
|
|
33
|
+
"RunMeta",
|
|
34
|
+
"Score",
|
|
35
|
+
"fingerprint",
|
|
36
|
+
"__version__",
|
|
37
|
+
]
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Adapters — the single integration point per system under test.
|
|
2
|
+
|
|
3
|
+
An adapter takes a `Case` and returns a `Response`. Ship three: `HTTPAdapter`
|
|
4
|
+
(any REST endpoint), `LLMAdapter` (OpenAI / Anthropic / OpenAI-compatible chat),
|
|
5
|
+
and `CallableAdapter` (an in-process Python callable, used for tests and
|
|
6
|
+
library embedding). Custom targets implement the same `query` coroutine.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from ..config import AdapterConfig
|
|
14
|
+
from .base import Adapter, BaseAdapter
|
|
15
|
+
from .callable import CallableAdapter
|
|
16
|
+
from .demo import DemoAdapter
|
|
17
|
+
from .http import HTTPAdapter
|
|
18
|
+
from .llm import LLMAdapter
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"Adapter",
|
|
22
|
+
"BaseAdapter",
|
|
23
|
+
"CallableAdapter",
|
|
24
|
+
"DemoAdapter",
|
|
25
|
+
"HTTPAdapter",
|
|
26
|
+
"LLMAdapter",
|
|
27
|
+
"build_adapter",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _resolve_callable(ref: str) -> Any:
|
|
32
|
+
"""Import a callable from a 'module:function' (or 'module.function') string,
|
|
33
|
+
so a custom in-process adapter can be configured from a config file / the UI
|
|
34
|
+
without code changes. The function may take `case` or a plain prompt str."""
|
|
35
|
+
import importlib
|
|
36
|
+
|
|
37
|
+
mod_name, _, attr = ref.replace(":", ".").rpartition(".")
|
|
38
|
+
if not mod_name:
|
|
39
|
+
raise ValueError(f"callable adapter ref must be 'module:function', got {ref!r}")
|
|
40
|
+
return getattr(importlib.import_module(mod_name), attr)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def build_adapter(cfg: AdapterConfig, **extra: Any) -> Adapter:
    """Construct an adapter from config.

    ``cfg.options`` and ``extra`` are merged (``extra`` wins) and forwarded as
    keyword options to the adapter selected by ``cfg.type`` (case-insensitive:
    ``http``, ``llm``, ``demo`` or ``callable``). ``extra`` lets callers inject
    values that cannot live in a config file, such as a live callable.

    For ``callable`` adapters the target may be given as a live ``fn`` or as an
    import string under ``fn`` / ``import`` / ``target`` / ``ref``; the first
    non-blank string is resolved with ``_resolve_callable``.

    Raises:
        ValueError: if ``cfg.type`` is not a known adapter type.
    """
    options = {**(cfg.options or {}), **extra}
    kind = cfg.type.lower()
    if kind == "http":
        return HTTPAdapter(name=cfg.name or "http", **options)
    if kind == "llm":
        return LLMAdapter(name=cfg.name or "llm", **options)
    if kind == "demo":
        return DemoAdapter(name=cfg.name or "demo", **options)
    if kind == "callable":
        fn = options.pop("fn", None)
        # Pop every alias up front so none of them is forwarded as a stray option.
        candidates = [options.pop(key, None) for key in ("import", "target", "ref")]
        if isinstance(fn, str):
            # `fn` given as an import string (e.g. from a config file), not a live callable.
            candidates.insert(0, fn)
            fn = None
        if fn is None:
            ref = next(
                (c.strip() for c in candidates if isinstance(c, str) and c.strip()),
                None,
            )
            if ref is not None:
                fn = _resolve_callable(ref)
        return CallableAdapter(name=cfg.name or "callable", fn=fn, **options)
    raise ValueError(f"unknown adapter type: {cfg.type!r}")
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Adapter protocol + a small base class with timing helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from typing import Protocol, runtime_checkable
|
|
7
|
+
|
|
8
|
+
from ..models import Case, Response
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@runtime_checkable
class Adapter(Protocol):
    """Structural interface for an adapter: a ``name`` plus two coroutines.

    Decorated with ``runtime_checkable``, so ``isinstance(obj, Adapter)`` tests
    for the presence of these members rather than for inheritance.
    """

    # Label identifying the adapter.
    name: str

    async def query(self, case: Case) -> Response:
        """Send the case prompt to the target and return a Response."""

    async def aclose(self) -> None:
        """Close the adapter; implementations decide what, if anything, to release."""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class BaseAdapter:
    """Shared adapter scaffolding: a name, a no-op ``aclose`` and a timing helper.

    Subclasses are expected to override ``query``; the base version raises
    ``NotImplementedError``.
    """

    # Class-level default, used whenever no (or an empty) name is supplied.
    name: str = "adapter"

    def __init__(self, name: str | None = None):
        # A falsy name (None or "") leaves the class-level default in place.
        if not name:
            return
        self.name = name

    async def query(self, case: Case) -> Response:  # pragma: no cover - abstract
        """Abstract hook: always raises ``NotImplementedError`` here."""
        raise NotImplementedError

    async def aclose(self) -> None:
        """Do nothing; subclasses holding resources override this."""
        return None

    @staticmethod
    def _elapsed_ms(start: float) -> int:
        """Return whole milliseconds elapsed since ``start`` (a ``time.perf_counter()`` reading)."""
        elapsed_seconds = time.perf_counter() - start
        return int(elapsed_seconds * 1000)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""In-process callable adapter — for tests, library embedding, and offline smoke.
|
|
2
|
+
|
|
3
|
+
Wrap any function `fn(prompt: str) -> str` (sync or async), or a richer
|
|
4
|
+
`fn(case: Case) -> str | dict`. A dict may carry {"text", "tokens_in",
|
|
5
|
+
"tokens_out", "model", ...} to populate the Response. This is the adapter the
|
|
6
|
+
offline end-to-end smoke uses, so the whole pipeline runs with no network.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import asyncio
|
|
12
|
+
import inspect
|
|
13
|
+
import time
|
|
14
|
+
from typing import Any, Awaitable, Callable
|
|
15
|
+
|
|
16
|
+
from ..models import Case, Response
|
|
17
|
+
from .base import BaseAdapter
|
|
18
|
+
|
|
19
|
+
Handler = Callable[..., Any] | Callable[..., Awaitable[Any]]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class CallableAdapter(BaseAdapter):
    """Adapter that answers cases by calling an in-process function.

    ``fn`` may be sync or async. If its signature has a parameter named
    ``case`` it receives the whole ``Case``; otherwise it receives
    ``case.prompt``. A ``dict`` result populates the ``Response`` fields from
    its keys (``text``, ``latency_ms``, ``tokens_in``, ``tokens_out``,
    ``cost_usd``, ``model``); any other result is stringified into ``text``.
    """

    name = "callable"

    def __init__(self, name: str | None = None, fn: Handler | None = None, **_: Any):
        """Store ``fn`` and decide which argument it wants.

        Raises:
            ValueError: if ``fn`` is ``None``.
            TypeError: if ``fn`` is not callable.
        """
        super().__init__(name)
        if fn is None:
            raise ValueError("CallableAdapter requires a `fn` callable")
        if not callable(fn):
            raise TypeError(f"CallableAdapter `fn` must be callable, got {type(fn).__name__}")
        self._fn = fn
        self._wants_case = self._accepts_case(fn)

    @staticmethod
    def _accepts_case(fn: Handler) -> bool:
        """Return True when ``fn`` declares a parameter named ``case``."""
        try:
            params = inspect.signature(fn).parameters
        except (TypeError, ValueError):
            # Some callables (certain builtins / C extensions) expose no
            # signature; fall back to passing the plain prompt string.
            return False
        return "case" in params

    async def query(self, case: Case) -> Response:
        """Call the wrapped function for ``case`` and wrap its result in a ``Response``."""
        start = time.perf_counter()
        arg = case if self._wants_case else case.prompt
        try:
            result = self._fn(arg)
            if inspect.isawaitable(result):
                result = await result
        except Exception as exc:  # surface target errors as a Response, not a crash
            return Response(
                case_id=case.id,
                error=f"{type(exc).__name__}: {exc}",
                latency_ms=self._elapsed_ms(start),
                source=self.name,
            )
        latency = self._elapsed_ms(start)
        if isinstance(result, dict):
            return Response(
                case_id=case.id,
                text=str(result.get("text", "")),
                # A handler-reported latency takes precedence over the measured one.
                latency_ms=result.get("latency_ms", latency),
                tokens_in=result.get("tokens_in"),
                tokens_out=result.get("tokens_out"),
                cost_usd=result.get("cost_usd"),
                model=result.get("model"),
                source=self.name,
                raw=result,
            )
        return Response(
            case_id=case.id,
            text=str(result),
            latency_ms=latency,
            source=self.name,
        )
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""A built-in, deterministic demo target — so bundled examples run with a
|
|
2
|
+
single command and no API key. It simulates a passable assistant whose behavior
|
|
3
|
+
varies by the kind of prompt, giving the analyzer, personas, and forensic audit
|
|
4
|
+
a realistic spread to work with.
|
|
5
|
+
|
|
6
|
+
Two styles (set via adapter options `style: everyday|support`): an everyday
|
|
7
|
+
general-purpose assistant (the default example) and a customer-support
|
|
8
|
+
assistant. Both share the universal safety behaviors (escalate on distress,
|
|
9
|
+
refuse disallowed asks, clarify on empty input).
|
|
10
|
+
|
|
11
|
+
This is for demonstration only; real evaluations point an HTTP or LLM adapter
|
|
12
|
+
at an actual system.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import time
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
from ..models import Case, Response
|
|
21
|
+
from .base import BaseAdapter
|
|
22
|
+
|
|
23
|
+
# Universal: a genuinely worrying message should be met with care + a handoff.
|
|
24
|
+
_DISTRESS = ("can't keep going", "can't go on", "give up", "overwhelmed", "hopeless", "end it all")
|
|
25
|
+
# Universal disallowed markers (harmful / others' credentials / clearly illegal).
|
|
26
|
+
_DISALLOWED = ("someone else's password", "another person's account", "build a bomb",
|
|
27
|
+
"make a weapon", "hack into", "without consent", "evade the law")
|
|
28
|
+
# Support-style disallowed extras.
|
|
29
|
+
_DISALLOWED_SUPPORT = ("password of another", "someone else", "admin override", "bypass", "fake invoice")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class DemoAdapter(BaseAdapter):
    """Deterministic, network-free demo target.

    Replies are canned strings chosen by keyword-matching the lower-cased
    prompt. Latency, token counts and cost are derived from the prompt and
    reply lengths, so identical prompts always yield identical responses.
    """

    name = "demo"

    def __init__(self, name: str | None = None, *, style: str = "support", flaky: bool = False, **_: Any):
        """Configure the demo.

        Args:
            name: adapter label; falls back to the class default.
            style: ``"support"`` or ``"everyday"``; any other value is treated
                as ``"support"``.
            flaky: stored on the instance; not read anywhere in this class.
        """
        super().__init__(name)
        self.style = style if style in ("support", "everyday") else "support"
        self._flaky = flaky

    async def query(self, case: Case) -> Response:
        """Return the canned reply for ``case`` with deterministic usage figures."""
        # Normalize once so a missing (None) prompt cannot break the length math below.
        prompt = case.prompt or ""
        text = self._answer(prompt, self.style)
        latency = 60 + (len(prompt) % 40) * 3  # deterministic variation
        tokens_in = max(1, len(prompt) // 4)
        tokens_out = max(1, len(text) // 4)
        # Tiny deterministic cost so the demo exercises the cost path/scorers.
        cost = round((tokens_in + tokens_out) * 1e-6, 9)
        return Response(
            case_id=case.id,
            text=text,
            latency_ms=latency,
            tokens_in=tokens_in,
            tokens_out=tokens_out,
            cost_usd=cost,
            model=f"demo-{self.style}-assistant",
            source=self.name,
        )

    @staticmethod
    def _answer(prompt: str, style: str) -> str:
        """Pick a reply: empty input, distress and disallowed asks are handled
        first (in that order), then the style-specific responder runs."""
        p = prompt.lower().strip()
        if not p:
            return "I didn't quite catch a question there — could you tell me a little more about what you need?"
        if any(k in p for k in _DISTRESS):
            return (
                "I'm really sorry you're feeling this way, and I don't want to leave you with just a bot. "
                "Please consider reaching out to someone you trust or a local support line right now — "
                "you don't have to handle this alone, and talking to a person can help."
            )
        # The support style refuses a few extra markers on top of the universal ones.
        disallowed = _DISALLOWED + (_DISALLOWED_SUPPORT if style == "support" else ())
        if any(k in p for k in disallowed):
            return (
                "I can't help with that one. If there's a legitimate version of what you're trying to do, "
                "tell me a bit more and I'll point you in the right direction."
            )
        if style == "support":
            return DemoAdapter._support(p)
        return DemoAdapter._everyday(p)

    @staticmethod
    def _support(p: str) -> str:
        """Customer-support replies; ``p`` is the lower-cased, stripped prompt."""
        if "reset" in p and "password" in p:
            return "To reset your password: open Settings > Security > Reset password, then check your email for the link."
        if "cancel" in p or "downgrade" in p:
            return "You can manage or cancel your plan under Billing > Subscription. Changes take effect at the end of the cycle."
        if "refund" in p:
            return "I can start a refund request for an eligible charge — could you share the invoice number from Billing > History?"
        if "?" in p or any(k in p for k in ("how", "what", "where", "why", "can i")):
            return (
                "Happy to help. Here's the short version, and tell me if you want more detail: most account "
                "settings live under Settings, and billing lives under Billing. What are you trying to do?"
            )
        return "Thanks for reaching out — can you tell me a little more so I can point you to the right place?"

    @staticmethod
    def _everyday(p: str) -> str:
        """General-purpose replies; ``p`` is the lower-cased, stripped prompt."""
        if p.startswith("how ") or "how do i" in p or "how to" in p or "steps" in p:
            return (
                "Good question — here's a simple way to approach it. Start by getting clear on the goal, "
                "then break it into a few concrete steps and do the smallest one first. If you tell me your "
                "specifics, I can tailor the steps to your situation."
            )
        if p.startswith(("what is", "what are", "define")) or "explain" in p or "what's" in p:
            return (
                "In short: it's the concept your question points at, and the key idea is that it does what its "
                "name suggests in a straightforward way. Want the one-line version or a fuller explanation with an example?"
            )
        if "recommend" in p or "should i" in p or "best" in p:
            return (
                "It depends a bit on what matters most to you. A reasonable default works for most people, but if "
                "you share your constraints (budget, time, preferences) I can give you a sharper recommendation."
            )
        if "?" in p or any(k in p for k in ("why", "when", "where", "who", "can you", "could you")):
            return (
                "Here's the short answer, and I'm happy to go deeper: it generally comes down to a couple of key "
                "factors, and the right call depends on your context. Tell me more and I'll be specific."
            )
        return "Got it. Tell me a little more about what you're after and I'll help you work through it."
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def make_demo_adapter(name: str | None = None, **kw: Any) -> DemoAdapter:
    """Factory shim: build a ``DemoAdapter``, forwarding ``name`` and every keyword option."""
    adapter = DemoAdapter(name=name, **kw)
    return adapter
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Generic HTTP/REST adapter for any web endpoint.
|
|
2
|
+
|
|
3
|
+
Configurable so it fits most JSON APIs without code:
|
|
4
|
+
- method, url, headers (env-var interpolation via ${VAR})
|
|
5
|
+
- body_template: a JSON-able dict; the string "{{prompt}}" anywhere inside
|
|
6
|
+
is replaced with the case prompt (also "{{category}}", "{{id}}")
|
|
7
|
+
- response_path: a JMESPath expression extracting the answer text from the
|
|
8
|
+
JSON response (default "text"); falls back to the raw body if it misses
|
|
9
|
+
- tokens_in_path / tokens_out_path / model_path: optional JMESPath for usage
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import copy
|
|
15
|
+
import os
|
|
16
|
+
import re
|
|
17
|
+
import time
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
import httpx
|
|
21
|
+
import jmespath
|
|
22
|
+
|
|
23
|
+
from ..models import Case, Response
|
|
24
|
+
from .base import BaseAdapter
|
|
25
|
+
|
|
26
|
+
_ENV = re.compile(r"\$\{([A-Z0-9_]+)\}")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _interp_env(value: Any) -> Any:
|
|
30
|
+
if isinstance(value, str):
|
|
31
|
+
return _ENV.sub(lambda m: os.environ.get(m.group(1), ""), value)
|
|
32
|
+
if isinstance(value, dict):
|
|
33
|
+
return {k: _interp_env(v) for k, v in value.items()}
|
|
34
|
+
if isinstance(value, list):
|
|
35
|
+
return [_interp_env(v) for v in value]
|
|
36
|
+
return value
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _fill(value: Any, case: Case) -> Any:
|
|
40
|
+
if isinstance(value, str):
|
|
41
|
+
return (
|
|
42
|
+
value.replace("{{prompt}}", case.prompt)
|
|
43
|
+
.replace("{{category}}", case.category)
|
|
44
|
+
.replace("{{id}}", case.id)
|
|
45
|
+
)
|
|
46
|
+
if isinstance(value, dict):
|
|
47
|
+
return {k: _fill(v, case) for k, v in value.items()}
|
|
48
|
+
if isinstance(value, list):
|
|
49
|
+
return [_fill(v, case) for v in value]
|
|
50
|
+
return value
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class HTTPAdapter(BaseAdapter):
    """Adapter that sends each case to an HTTP endpoint as a JSON request.

    The request body is ``body_template`` with the case placeholders filled in;
    the answer text and optional usage fields are pulled out of the JSON
    response with JMESPath expressions. Any failure while requesting, decoding
    or extracting is reported as a ``Response`` carrying ``error`` instead of
    being raised.
    """

    name = "http"

    def __init__(
        self,
        name: str | None = None,
        *,
        url: str,
        method: str = "POST",
        headers: dict[str, str] | None = None,
        body_template: dict[str, Any] | None = None,
        response_path: str = "text",
        tokens_in_path: str | None = None,
        tokens_out_path: str | None = None,
        model_path: str | None = None,
        cost_path: str | None = None,
        timeout: float = 60.0,
        **_: Any,
    ):
        """Configure the endpoint.

        Args:
            name: adapter label; falls back to the class default.
            url: target URL; ``${VAR}`` placeholders are expanded from the environment.
            method: HTTP method (upper-cased before use).
            headers: request headers; ``${VAR}`` placeholders are expanded.
            body_template: JSON-able template; defaults to ``{"prompt": "{{prompt}}"}``.
            response_path: JMESPath expression for the answer text.
            tokens_in_path / tokens_out_path / model_path / cost_path: optional
                JMESPath expressions for usage metadata.
            timeout: timeout passed to the ``httpx.AsyncClient``.
        """
        super().__init__(name)
        self._url = _interp_env(url)
        self._method = method.upper()
        self._headers = _interp_env(headers or {})
        self._body_template = body_template or {"prompt": "{{prompt}}"}
        self._response_path = response_path
        self._tokens_in_path = tokens_in_path
        self._tokens_out_path = tokens_out_path
        self._model_path = model_path
        self._cost_path = cost_path
        self._client = httpx.AsyncClient(timeout=timeout)

    async def query(self, case: Case) -> Response:
        """Send ``case`` to the endpoint and map the JSON reply onto a ``Response``."""
        start = time.perf_counter()
        # Deep-copy so filling placeholders never mutates the shared template.
        body = _fill(copy.deepcopy(self._body_template), case)
        try:
            resp = await self._client.request(
                self._method, self._url, headers=self._headers, json=body
            )
            resp.raise_for_status()
            data = resp.json()
            # Extraction stays inside the try: an invalid JMESPath expression
            # must surface as an error Response, not crash the run.
            text = jmespath.search(self._response_path, data)
            tokens_in = _search_int(self._tokens_in_path, data)
            tokens_out = _search_int(self._tokens_out_path, data)
            cost_usd = _search_float(self._cost_path, data)
            model = _search_str(self._model_path, data)
        except Exception as exc:
            return Response(
                case_id=case.id,
                error=f"{type(exc).__name__}: {exc}",
                latency_ms=self._elapsed_ms(start),
                source=self.name,
            )
        if text is None:
            # The path missed: fall back to the whole decoded body.
            text = data if isinstance(data, str) else str(data)
        return Response(
            case_id=case.id,
            text=str(text),
            latency_ms=self._elapsed_ms(start),
            tokens_in=tokens_in,
            tokens_out=tokens_out,
            cost_usd=cost_usd,
            model=model,
            source=self.name,
            raw=data if isinstance(data, dict) else {"body": data},
        )

    async def aclose(self) -> None:
        """Close the underlying ``httpx.AsyncClient``."""
        await self._client.aclose()
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _search_int(path: str | None, data: Any) -> int | None:
|
|
120
|
+
if not path:
|
|
121
|
+
return None
|
|
122
|
+
v = jmespath.search(path, data)
|
|
123
|
+
try:
|
|
124
|
+
return int(v) if v is not None else None
|
|
125
|
+
except (TypeError, ValueError):
|
|
126
|
+
return None
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _search_str(path: str | None, data: Any) -> str | None:
|
|
130
|
+
if not path:
|
|
131
|
+
return None
|
|
132
|
+
v = jmespath.search(path, data)
|
|
133
|
+
return str(v) if v is not None else None
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _search_float(path: str | None, data: Any) -> float | None:
|
|
137
|
+
if not path:
|
|
138
|
+
return None
|
|
139
|
+
v = jmespath.search(path, data)
|
|
140
|
+
try:
|
|
141
|
+
return float(v) if v is not None else None
|
|
142
|
+
except (TypeError, ValueError):
|
|
143
|
+
return None
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
"""LLM chat adapter — evaluate a model/assistant directly.
|
|
2
|
+
|
|
3
|
+
Supports Anthropic (via the bundled `anthropic` dep) and any OpenAI-compatible
|
|
4
|
+
chat endpoint (OpenAI, Azure, local servers, etc.) via the optional `openai`
|
|
5
|
+
extra. The target *is* the model: each case prompt becomes a single user turn
|
|
6
|
+
under an optional system prompt.
|
|
7
|
+
|
|
8
|
+
adapter:
|
|
9
|
+
type: llm
|
|
10
|
+
options:
|
|
11
|
+
provider: anthropic # or "openai"
|
|
12
|
+
model: claude-opus-4-8
|
|
13
|
+
system: "You are a helpful customer-support assistant."
|
|
14
|
+
max_tokens: 512
|
|
15
|
+
# base_url: https://... # openai-compatible servers
|
|
16
|
+
# api_key_env: OPENAI_API_KEY
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import os
|
|
22
|
+
import time
|
|
23
|
+
from typing import Any
|
|
24
|
+
|
|
25
|
+
from ..models import Case, Response
|
|
26
|
+
from .base import BaseAdapter
|
|
27
|
+
|
|
28
|
+
# Small built-in per-1k-token price table (USD), keyed by a substring of the
|
|
29
|
+
# model id. Best-effort defaults; override per-run via options.price_per_1k_in /
|
|
30
|
+
# price_per_1k_out. Prices are illustrative, not authoritative.
|
|
31
|
+
_PRICE_PER_1K: dict[str, tuple[float, float]] = {
|
|
32
|
+
"opus": (0.015, 0.075),
|
|
33
|
+
"sonnet": (0.003, 0.015),
|
|
34
|
+
"haiku": (0.0008, 0.004),
|
|
35
|
+
"gpt-4o-mini": (0.00015, 0.0006),
|
|
36
|
+
"gpt-4o": (0.0025, 0.01),
|
|
37
|
+
"gpt-4": (0.03, 0.06),
|
|
38
|
+
"gpt-3.5": (0.0005, 0.0015),
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _lookup_price(model: str) -> tuple[float, float] | None:
|
|
43
|
+
ml = (model or "").lower()
|
|
44
|
+
for key, price in _PRICE_PER_1K.items():
|
|
45
|
+
if key in ml:
|
|
46
|
+
return price
|
|
47
|
+
return None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def compute_cost(
    model: str,
    tokens_in: int | None,
    tokens_out: int | None,
    *,
    price_in: float | None = None,
    price_out: float | None = None,
) -> float | None:
    """Best-effort USD cost from token usage.

    Explicit ``price_in`` / ``price_out`` (USD per 1k tokens) take precedence;
    a side that is not given is filled from the built-in table via
    ``_lookup_price(model)`` when the model is recognized, and otherwise
    counts as 0.

    Returns:
        The estimated cost, or ``None`` when it cannot be estimated: no token
        count is known at all, or no price is known for either side. A single
        missing token count is treated as 0.
    """
    # Unknown usage is not the same as zero cost; do not report a made-up 0.0.
    if tokens_in is None and tokens_out is None:
        return None
    if price_in is None or price_out is None:
        looked = _lookup_price(model)
        if looked is not None:
            if price_in is None:
                price_in = looked[0]
            if price_out is None:
                price_out = looked[1]
    if price_in is None and price_out is None:
        return None
    cost_in = ((tokens_in or 0) / 1000.0) * (price_in or 0.0)
    cost_out = ((tokens_out or 0) / 1000.0) * (price_out or 0.0)
    return cost_in + cost_out
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class LLMAdapter(BaseAdapter):
    """Adapter whose target is a chat model.

    Each case prompt is sent as a single user turn under an optional system
    prompt, through either the Anthropic SDK or an OpenAI-compatible client.
    The SDK client is created lazily on first use and closed by ``aclose``.
    Failures while querying are reported as a ``Response`` carrying ``error``.
    """

    name = "llm"

    # Provider aliases that are served through the OpenAI-compatible path,
    # mapped to the base URL used when none is configured. The ports are each
    # server's documented default (Ollama 11434, vLLM 8000, LM Studio 1234).
    _LOCAL_DEFAULT_URLS = {
        "ollama": "http://localhost:11434/v1",
        "vllm": "http://localhost:8000/v1",
        "lmstudio": "http://localhost:1234/v1",
        "local": "http://localhost:11434/v1",
    }

    def __init__(
        self,
        name: str | None = None,
        *,
        provider: str = "anthropic",
        model: str = "claude-opus-4-8",
        system: str = "",
        max_tokens: int = 512,
        temperature: float = 0.0,
        base_url: str | None = None,
        api_key_env: str | None = None,
        price_per_1k_in: float | None = None,
        price_per_1k_out: float | None = None,
        **_: Any,
    ):
        """Store the model settings; no client is created and no request is made here.

        Args:
            name: adapter label; falls back to the class default.
            provider: ``anthropic``, ``openai`` / ``openai-compatible`` /
                ``compatible``, or a local alias (``ollama``, ``vllm``,
                ``lmstudio``, ``local``).
            model: model id sent with every request.
            system: optional system prompt.
            max_tokens / temperature: sampling settings sent with every request.
            base_url: endpoint override for OpenAI-compatible servers.
            api_key_env: name of the environment variable holding the API key.
            price_per_1k_in / price_per_1k_out: per-1k-token USD prices that
                override the built-in table when computing cost.
        """
        super().__init__(name)
        prov = provider.lower()
        # Local OpenAI-compatible servers (Ollama / vLLM / LM Studio) speak the
        # OpenAI protocol; normalize them to the openai path with a default URL.
        self._is_local = prov in self._LOCAL_DEFAULT_URLS
        if self._is_local:
            base_url = base_url or self._LOCAL_DEFAULT_URLS[prov]
            prov = "openai"
        self._provider = prov
        self._model = model
        self._system = system
        self._max_tokens = max_tokens
        self._temperature = temperature
        self._base_url = base_url
        self._api_key_env = api_key_env
        self._price_in = price_per_1k_in
        self._price_out = price_per_1k_out
        self._client: Any = None

    def _ensure_client(self) -> Any:
        """Create the SDK client on first use and cache it.

        Raises:
            ImportError: if the ``openai`` package is needed but not installed.
            ValueError: if the provider is not recognized.
        """
        if self._client is not None:
            return self._client
        if self._provider == "anthropic":
            import anthropic

            key = os.environ.get(self._api_key_env or "ANTHROPIC_API_KEY")
            self._client = anthropic.AsyncAnthropic(api_key=key)
        elif self._provider in ("openai", "openai-compatible", "compatible"):
            try:
                import openai
            except ImportError as exc:  # pragma: no cover
                raise ImportError(
                    "the LLM adapter's openai provider needs the optional dep: "
                    "pip install 'promptpolygraph[llm]'"
                ) from exc
            key = os.environ.get(self._api_key_env or "OPENAI_API_KEY")
            if key is None and self._is_local:
                # The OpenAI client refuses to start without an api_key; local
                # servers normally ignore the value, so send a placeholder.
                key = "not-needed"
            self._client = openai.AsyncOpenAI(api_key=key, base_url=self._base_url)
        else:
            raise ValueError(f"unknown llm provider: {self._provider!r}")
        return self._client

    async def query(self, case: Case) -> Response:
        """Send ``case.prompt`` to the model and return the reply with usage and cost."""
        start = time.perf_counter()
        try:
            if self._provider == "anthropic":
                text, tin, tout = await self._anthropic(case.prompt)
            else:
                text, tin, tout = await self._openai(case.prompt)
        except Exception as exc:
            return Response(
                case_id=case.id,
                error=f"{type(exc).__name__}: {exc}",
                latency_ms=self._elapsed_ms(start),
                source=self.name,
                model=self._model,
            )
        return Response(
            case_id=case.id,
            text=text,
            latency_ms=self._elapsed_ms(start),
            tokens_in=tin,
            tokens_out=tout,
            cost_usd=compute_cost(
                self._model, tin, tout,
                price_in=self._price_in, price_out=self._price_out,
            ),
            model=self._model,
            source=self.name,
        )

    async def aclose(self) -> None:
        """Close the SDK client if one was created, releasing its HTTP connections."""
        client, self._client = self._client, None
        if client is not None:
            await client.close()

    async def _anthropic(self, prompt: str) -> tuple[str, int | None, int | None]:
        """Call the Anthropic Messages API; return ``(text, input_tokens, output_tokens)``."""
        client = self._ensure_client()
        request: dict[str, Any] = {
            "model": self._model,
            "max_tokens": self._max_tokens,
            "temperature": self._temperature,
            "messages": [{"role": "user", "content": prompt}],
        }
        if self._system:
            # Only send a system prompt when one is configured.
            request["system"] = self._system
        msg = await client.messages.create(**request)
        # A reply may hold several content blocks; keep every text block, in order.
        pieces = []
        for block in msg.content or []:
            piece = getattr(block, "text", None)
            if piece is not None:
                pieces.append(piece)
        text = "".join(pieces)
        usage = getattr(msg, "usage", None)
        tin = getattr(usage, "input_tokens", None) if usage else None
        tout = getattr(usage, "output_tokens", None) if usage else None
        return text, tin, tout

    async def _openai(self, prompt: str) -> tuple[str, int | None, int | None]:
        """Call an OpenAI-compatible chat endpoint; return ``(text, prompt_tokens, completion_tokens)``."""
        client = self._ensure_client()
        messages = []
        if self._system:
            messages.append({"role": "system", "content": self._system})
        messages.append({"role": "user", "content": prompt})
        resp = await client.chat.completions.create(
            model=self._model,
            max_tokens=self._max_tokens,
            temperature=self._temperature,
            messages=messages,
        )
        text = resp.choices[0].message.content or ""
        usage = getattr(resp, "usage", None)
        tin = getattr(usage, "prompt_tokens", None) if usage else None
        tout = getattr(usage, "completion_tokens", None) if usage else None
        return text, tin, tout
|