promptpolygraph 0.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. promptpolygraph/__init__.py +37 -0
  2. promptpolygraph/__main__.py +10 -0
  3. promptpolygraph/adapters/__init__.py +60 -0
  4. promptpolygraph/adapters/base.py +40 -0
  5. promptpolygraph/adapters/callable.py +64 -0
  6. promptpolygraph/adapters/demo.py +121 -0
  7. promptpolygraph/adapters/http.py +143 -0
  8. promptpolygraph/adapters/llm.py +198 -0
  9. promptpolygraph/analyze/__init__.py +33 -0
  10. promptpolygraph/analyze/analyzer.py +396 -0
  11. promptpolygraph/analyze/assertions.py +488 -0
  12. promptpolygraph/analyze/baseline.py +108 -0
  13. promptpolygraph/analyze/embedders.py +121 -0
  14. promptpolygraph/analyze/gate.py +289 -0
  15. promptpolygraph/analyze/rubric.py +177 -0
  16. promptpolygraph/audit/__init__.py +24 -0
  17. promptpolygraph/audit/code_context.py +296 -0
  18. promptpolygraph/audit/engine.py +109 -0
  19. promptpolygraph/audit/forensic.py +404 -0
  20. promptpolygraph/audit/persona.py +406 -0
  21. promptpolygraph/cli.py +974 -0
  22. promptpolygraph/compare/__init__.py +15 -0
  23. promptpolygraph/compare/matrix.py +401 -0
  24. promptpolygraph/compare/pairwise.py +103 -0
  25. promptpolygraph/compare/trend.py +81 -0
  26. promptpolygraph/config.py +182 -0
  27. promptpolygraph/configdesign.py +120 -0
  28. promptpolygraph/corpus/__init__.py +15 -0
  29. promptpolygraph/corpus/generator.py +467 -0
  30. promptpolygraph/corpus/loader.py +90 -0
  31. promptpolygraph/data/__init__.py +0 -0
  32. promptpolygraph/data/personas/__init__.py +0 -0
  33. promptpolygraph/data/personas/accessibility_user.yaml +9 -0
  34. promptpolygraph/data/personas/budget_conscious.yaml +10 -0
  35. promptpolygraph/data/personas/enthusiast.yaml +10 -0
  36. promptpolygraph/data/personas/first_timer.yaml +9 -0
  37. promptpolygraph/data/personas/frustrated_returning_customer.yaml +10 -0
  38. promptpolygraph/data/personas/non_native_speaker.yaml +9 -0
  39. promptpolygraph/data/personas/plain_language_needer.yaml +10 -0
  40. promptpolygraph/data/personas/power_user.yaml +9 -0
  41. promptpolygraph/data/personas/privacy_skeptic.yaml +10 -0
  42. promptpolygraph/data/personas/the_journalist.yaml +10 -0
  43. promptpolygraph/data/personas/the_skeptic.yaml +9 -0
  44. promptpolygraph/data/personas/the_troll.yaml +10 -0
  45. promptpolygraph/data/personas/time_pressed_exec.yaml +8 -0
  46. promptpolygraph/discovery.py +83 -0
  47. promptpolygraph/elicit.py +338 -0
  48. promptpolygraph/llm.py +215 -0
  49. promptpolygraph/models.py +228 -0
  50. promptpolygraph/persona/__init__.py +20 -0
  51. promptpolygraph/persona/library.py +79 -0
  52. promptpolygraph/persona/new.py +160 -0
  53. promptpolygraph/pipeline.py +210 -0
  54. promptpolygraph/redteam/__init__.py +38 -0
  55. promptpolygraph/redteam/catalog.py +84 -0
  56. promptpolygraph/redteam/codetrace.py +180 -0
  57. promptpolygraph/redteam/converters.py +216 -0
  58. promptpolygraph/redteam/design.py +148 -0
  59. promptpolygraph/redteam/guard.py +239 -0
  60. promptpolygraph/redteam/judge.py +73 -0
  61. promptpolygraph/redteam/models.py +120 -0
  62. promptpolygraph/redteam/multiturn.py +435 -0
  63. promptpolygraph/redteam/orchestrator.py +235 -0
  64. promptpolygraph/redteam/profiles.py +143 -0
  65. promptpolygraph/redteam/report.py +394 -0
  66. promptpolygraph/redteam/rootcause.py +148 -0
  67. promptpolygraph/redteam/roster.py +231 -0
  68. promptpolygraph/redteam/scrub.py +56 -0
  69. promptpolygraph/redteam/sources/__init__.py +49 -0
  70. promptpolygraph/redteam/sources/base.py +64 -0
  71. promptpolygraph/redteam/sources/catalog_source.py +31 -0
  72. promptpolygraph/redteam/sources/dataset_source.py +343 -0
  73. promptpolygraph/redteam/sources/deepteam_source.py +428 -0
  74. promptpolygraph/redteam/sources/garak_source.py +331 -0
  75. promptpolygraph/redteam/sources/pyrit_source.py +382 -0
  76. promptpolygraph/redteam/strategies.py +143 -0
  77. promptpolygraph/report/__init__.py +104 -0
  78. promptpolygraph/report/_env.py +51 -0
  79. promptpolygraph/report/charts.py +415 -0
  80. promptpolygraph/report/context.py +580 -0
  81. promptpolygraph/report/docx.py +298 -0
  82. promptpolygraph/report/html.py +414 -0
  83. promptpolygraph/report/markdown.py +360 -0
  84. promptpolygraph/report/pdf.py +101 -0
  85. promptpolygraph/report/templates/default/report.html.j2 +349 -0
  86. promptpolygraph/report/templates/default/report.md.j2 +121 -0
  87. promptpolygraph/report/templates/minimal/report.html.j2 +34 -0
  88. promptpolygraph/report/templates/minimal/report.md.j2 +13 -0
  89. promptpolygraph/runner/__init__.py +16 -0
  90. promptpolygraph/runner/runner.py +144 -0
  91. promptpolygraph/runner/store.py +188 -0
  92. promptpolygraph/service/__init__.py +22 -0
  93. promptpolygraph/service/app.py +465 -0
  94. promptpolygraph/service/auth.py +27 -0
  95. promptpolygraph/service/dashboard.py +720 -0
  96. promptpolygraph/service/db.py +284 -0
  97. promptpolygraph/service/jobspec.py +62 -0
  98. promptpolygraph/service/scheduler.py +67 -0
  99. promptpolygraph/service/schemas.py +81 -0
  100. promptpolygraph/service/server.py +25 -0
  101. promptpolygraph/service/settings.py +60 -0
  102. promptpolygraph/service/webhooks.py +24 -0
  103. promptpolygraph/service/worker.py +108 -0
  104. promptpolygraph/tune.py +124 -0
  105. promptpolygraph/ui/__init__.py +12 -0
  106. promptpolygraph/ui/arena.py +1369 -0
  107. promptpolygraph/ui/chrome.py +634 -0
  108. promptpolygraph/ui/page.py +2635 -0
  109. promptpolygraph/ui/server.py +1738 -0
  110. promptpolygraph-0.6.6.dist-info/METADATA +380 -0
  111. promptpolygraph-0.6.6.dist-info/RECORD +114 -0
  112. promptpolygraph-0.6.6.dist-info/WHEEL +4 -0
  113. promptpolygraph-0.6.6.dist-info/entry_points.txt +4 -0
  114. promptpolygraph-0.6.6.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,37 @@
1
+ """PromptPolygraph — synthetic-prompt evaluation and persona-audit harness.
2
+
3
+ A local-first, cloud-agnostic harness for pushing thousands of synthetic prompts
4
+ through any web/API/LLM system, scoring the responses against a pluggable rubric
5
+ (plus deterministic assertions and an optional multi-judge ensemble), reacting to
6
+ them through a panel of personas, tracing low scores with a forensic audit, and
7
+ rendering the result as a markdown / docx / pdf / html report.
8
+ """
9
+
10
+ from .models import (
11
+ AssertionResult,
12
+ AssertionSpec,
13
+ Case,
14
+ Dimension,
15
+ Persona,
16
+ Response,
17
+ Rubric,
18
+ RunMeta,
19
+ Score,
20
+ fingerprint,
21
+ )
22
+
23
# Must track the released distribution version (this wheel and its dist-info
# are 0.6.6); the previous literal "0.1.0" was stale.
__version__ = "0.6.6"

# Public re-exports of the core data models plus the version string.
__all__ = [
    "AssertionResult",
    "AssertionSpec",
    "Case",
    "Dimension",
    "Persona",
    "Response",
    "Rubric",
    "RunMeta",
    "Score",
    "fingerprint",
    "__version__",
]
@@ -0,0 +1,10 @@
1
+ """Enable `python -m promptpolygraph` as an alias for the `polygraph` CLI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+
7
+ from .cli import main
8
+
9
# Executed only for `python -m promptpolygraph`: run the CLI and hand its
# return value to the interpreter as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
@@ -0,0 +1,60 @@
1
+ """Adapters — the single integration point per system under test.
2
+
3
+ An adapter takes a `Case` and returns a `Response`. Ship three: `HTTPAdapter`
4
+ (any REST endpoint), `LLMAdapter` (OpenAI / Anthropic / OpenAI-compatible chat),
5
+ and `CallableAdapter` (an in-process Python callable, used for tests and
6
+ library embedding). Custom targets implement the same `query` coroutine.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any
12
+
13
+ from ..config import AdapterConfig
14
+ from .base import Adapter, BaseAdapter
15
+ from .callable import CallableAdapter
16
+ from .demo import DemoAdapter
17
+ from .http import HTTPAdapter
18
+ from .llm import LLMAdapter
19
+
20
+ __all__ = [
21
+ "Adapter",
22
+ "BaseAdapter",
23
+ "CallableAdapter",
24
+ "DemoAdapter",
25
+ "HTTPAdapter",
26
+ "LLMAdapter",
27
+ "build_adapter",
28
+ ]
29
+
30
+
31
+ def _resolve_callable(ref: str) -> Any:
32
+ """Import a callable from a 'module:function' (or 'module.function') string,
33
+ so a custom in-process adapter can be configured from a config file / the UI
34
+ without code changes. The function may take `case` or a plain prompt str."""
35
+ import importlib
36
+
37
+ mod_name, _, attr = ref.replace(":", ".").rpartition(".")
38
+ if not mod_name:
39
+ raise ValueError(f"callable adapter ref must be 'module:function', got {ref!r}")
40
+ return getattr(importlib.import_module(mod_name), attr)
41
+
42
+
43
def build_adapter(cfg: AdapterConfig, **extra: Any) -> Adapter:
    """Construct an adapter from config.

    ``cfg.options`` is merged with ``extra`` (``extra`` wins on conflicts), so
    callers can inject values such as a live callable. ``cfg.type`` selects the
    adapter class case-insensitively; ``cfg.name`` falls back to the type name.

    For ``type: callable`` the target may be given as a live callable under
    ``fn``, or as an import string under ``fn``, ``import``, ``target`` or
    ``ref`` (first non-empty of the last three wins, in that order).

    Raises:
        ValueError: if ``cfg.type`` is not a known adapter type.
    """
    options = {**cfg.options, **extra}
    kind = cfg.type.lower()
    if kind == "http":
        return HTTPAdapter(name=cfg.name or "http", **options)
    if kind == "llm":
        return LLMAdapter(name=cfg.name or "llm", **options)
    if kind == "demo":
        return DemoAdapter(name=cfg.name or "demo", **options)
    if kind == "callable":
        fn = options.pop("fn", None)
        # Pop every alias (not just up to the first hit) so no alias key is
        # forwarded to the adapter constructor as a stray option.
        aliases = [options.pop(key, None) for key in ("import", "target", "ref")]
        ref = next((alias for alias in aliases if alias), None)
        if isinstance(fn, str):
            # An import string supplied under `fn` is resolved like the aliases
            # instead of being handed over as a non-callable.
            ref = fn.strip() or ref
            fn = None
        if fn is None and isinstance(ref, str) and ref.strip():
            fn = _resolve_callable(ref.strip())
        return CallableAdapter(name=cfg.name or "callable", fn=fn, **options)
    raise ValueError(f"unknown adapter type: {cfg.type!r}")
@@ -0,0 +1,40 @@
1
+ """Adapter protocol + a small base class with timing helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+ from typing import Protocol, runtime_checkable
7
+
8
+ from ..models import Case, Response
9
+
10
+
11
@runtime_checkable
class Adapter(Protocol):
    """Structural interface for a system-under-test adapter.

    Anything exposing a ``name`` attribute plus ``query`` and ``aclose``
    coroutines satisfies it; no inheritance is required.
    """

    name: str

    async def query(self, case: Case) -> Response:
        """Send the case prompt to the target and return a Response."""
        ...

    async def aclose(self) -> None:
        """Asynchronous shutdown hook; what it releases is up to the implementation."""
        ...
21
+
22
+
23
class BaseAdapter:
    """Shared adapter plumbing: a display name, a do-nothing ``aclose`` and a
    millisecond stopwatch helper. Subclasses supply ``query``."""

    name: str = "adapter"

    def __init__(self, name: str | None = None):
        # A falsy name (None or "") leaves the class-level default in place.
        if not name:
            return
        self.name = name

    async def query(self, case: Case) -> Response:  # pragma: no cover - abstract
        raise NotImplementedError

    async def aclose(self) -> None:
        # Nothing to release at this level.
        return None

    @staticmethod
    def _elapsed_ms(start: float) -> int:
        """Whole milliseconds elapsed since ``start`` (a ``time.perf_counter()`` reading)."""
        seconds = time.perf_counter() - start
        return int(seconds * 1000)
@@ -0,0 +1,64 @@
1
+ """In-process callable adapter — for tests, library embedding, and offline smoke.
2
+
3
+ Wrap any function `fn(prompt: str) -> str` (sync or async), or a richer
4
+ `fn(case: Case) -> str | dict`. A dict may carry {"text", "tokens_in",
5
+ "tokens_out", "model", ...} to populate the Response. This is the adapter the
6
+ offline end-to-end smoke uses, so the whole pipeline runs with no network.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import asyncio
12
+ import inspect
13
+ import time
14
+ from typing import Any, Awaitable, Callable
15
+
16
+ from ..models import Case, Response
17
+ from .base import BaseAdapter
18
+
19
+ Handler = Callable[..., Any] | Callable[..., Awaitable[Any]]
20
+
21
+
22
class CallableAdapter(BaseAdapter):
    """Adapter that answers each case by calling an in-process Python callable.

    ``fn`` may be sync or async. If its signature has a parameter named
    ``case`` it receives the whole ``Case``; otherwise it receives the prompt
    string. It may return plain text, or a dict carrying ``text`` and optional
    ``latency_ms`` / ``tokens_in`` / ``tokens_out`` / ``cost_usd`` / ``model``.
    """

    name = "callable"

    def __init__(self, name: str | None = None, fn: Handler | None = None, **_: Any):
        """Store ``fn`` and decide how it should be called.

        Raises:
            ValueError: if ``fn`` is missing.
            TypeError: if ``fn`` is not callable (raised by ``inspect.signature``).
        """
        super().__init__(name)
        if fn is None:
            raise ValueError("CallableAdapter requires a `fn` callable")
        self._fn = fn
        try:
            params = inspect.signature(fn).parameters
        except ValueError:
            # Some builtins / C callables expose no introspectable signature;
            # treat them as the simple `fn(prompt)` form instead of failing.
            params = {}
        self._wants_case = "case" in params

    async def query(self, case: Case) -> Response:
        """Call the wrapped function for ``case`` and wrap the outcome in a Response.

        Exceptions raised by the function are reported through
        ``Response.error`` rather than propagated.
        """
        start = time.perf_counter()
        arg = case if self._wants_case else case.prompt
        try:
            result = self._fn(arg)
            if inspect.isawaitable(result):
                result = await result
        except Exception as exc:  # surface target errors as a Response, not a crash
            return Response(
                case_id=case.id,
                error=f"{type(exc).__name__}: {exc}",
                latency_ms=self._elapsed_ms(start),
                source=self.name,
            )
        latency = self._elapsed_ms(start)
        if isinstance(result, dict):
            body = result.get("text", "")
            return Response(
                case_id=case.id,
                # A missing or None text is an empty answer, not the string "None".
                text="" if body is None else str(body),
                latency_ms=result.get("latency_ms", latency),
                tokens_in=result.get("tokens_in"),
                tokens_out=result.get("tokens_out"),
                cost_usd=result.get("cost_usd"),
                model=result.get("model"),
                source=self.name,
                raw=result,
            )
        return Response(
            case_id=case.id,
            text="" if result is None else str(result),
            latency_ms=latency,
            source=self.name,
        )
@@ -0,0 +1,121 @@
1
+ """A built-in, deterministic demo target — so bundled examples run with a
2
+ single command and no API key. It simulates a passable assistant whose behavior
3
+ varies by the kind of prompt, giving the analyzer, personas, and forensic audit
4
+ a realistic spread to work with.
5
+
6
+ Two styles (set via adapter options `style: everyday|support`): an everyday
7
+ general-purpose assistant (the default example) and a customer-support
8
+ assistant. Both share the universal safety behaviors (escalate on distress,
9
+ refuse disallowed asks, clarify on empty input).
10
+
11
+ This is for demonstration only; real evaluations point an HTTP or LLM adapter
12
+ at an actual system.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import time
18
+ from typing import Any
19
+
20
+ from ..models import Case, Response
21
+ from .base import BaseAdapter
22
+
23
+ # Universal: a genuinely worrying message should be met with care + a handoff.
24
+ _DISTRESS = ("can't keep going", "can't go on", "give up", "overwhelmed", "hopeless", "end it all")
25
+ # Universal disallowed markers (harmful / others' credentials / clearly illegal).
26
+ _DISALLOWED = ("someone else's password", "another person's account", "build a bomb",
27
+ "make a weapon", "hack into", "without consent", "evade the law")
28
+ # Support-style disallowed extras.
29
+ _DISALLOWED_SUPPORT = ("password of another", "someone else", "admin override", "bypass", "fake invoice")
30
+
31
+
32
class DemoAdapter(BaseAdapter):
    """Deterministic, offline stand-in for a real assistant.

    Replies are chosen by keyword rules over the lower-cased prompt. Latency,
    token counts and cost are derived from text lengths, so repeated runs give
    identical results.
    """

    name = "demo"

    def __init__(self, name: str | None = None, *, style: str = "support", flaky: bool = False, **_: Any):
        """``style`` is "support" or "everyday"; any other value falls back to
        "support". ``flaky`` is stored but not consulted anywhere in this class."""
        super().__init__(name)
        self.style = style if style in ("support", "everyday") else "support"
        self._flaky = flaky

    async def query(self, case: Case) -> Response:
        """Answer ``case`` with a canned reply and synthetic usage figures."""
        # Normalize once so a missing (None) prompt is treated as empty
        # everywhere, not only when picking the answer.
        prompt = case.prompt or ""
        text = self._answer(prompt, self.style)
        latency = 60 + (len(prompt) % 40) * 3  # deterministic variation
        tokens_in = max(1, len(prompt) // 4)
        tokens_out = max(1, len(text) // 4)
        # Tiny deterministic cost so the demo exercises the cost path/scorers.
        cost = round((tokens_in + tokens_out) * 1e-6, 9)
        return Response(
            case_id=case.id,
            text=text,
            latency_ms=latency,
            tokens_in=tokens_in,
            tokens_out=tokens_out,
            cost_usd=cost,
            model=f"demo-{self.style}-assistant",
            source=self.name,
        )

    @staticmethod
    def _answer(prompt: str, style: str) -> str:
        """Pick the reply: empty-input clarification, then distress handoff,
        then refusal, then the style-specific rules."""
        p = prompt.lower().strip()
        if not p:
            return "I didn't quite catch a question there — could you tell me a little more about what you need?"
        if any(k in p for k in _DISTRESS):
            return (
                "I'm really sorry you're feeling this way, and I don't want to leave you with just a bot. "
                "Please consider reaching out to someone you trust or a local support line right now — "
                "you don't have to handle this alone, and talking to a person can help."
            )
        # The support style refuses a few extra markers on top of the universal ones.
        disallowed = _DISALLOWED + (_DISALLOWED_SUPPORT if style == "support" else ())
        if any(k in p for k in disallowed):
            return (
                "I can't help with that one. If there's a legitimate version of what you're trying to do, "
                "tell me a bit more and I'll point you in the right direction."
            )
        if style == "support":
            return DemoAdapter._support(p)
        return DemoAdapter._everyday(p)

    @staticmethod
    def _support(p: str) -> str:
        """Customer-support replies; ``p`` is the lower-cased, stripped prompt."""
        if "reset" in p and "password" in p:
            return "To reset your password: open Settings > Security > Reset password, then check your email for the link."
        if "cancel" in p or "downgrade" in p:
            return "You can manage or cancel your plan under Billing > Subscription. Changes take effect at the end of the cycle."
        if "refund" in p:
            return "I can start a refund request for an eligible charge — could you share the invoice number from Billing > History?"
        if "?" in p or any(k in p for k in ("how", "what", "where", "why", "can i")):
            return (
                "Happy to help. Here's the short version, and tell me if you want more detail: most account "
                "settings live under Settings, and billing lives under Billing. What are you trying to do?"
            )
        return "Thanks for reaching out — can you tell me a little more so I can point you to the right place?"

    @staticmethod
    def _everyday(p: str) -> str:
        """General-assistant replies; ``p`` is the lower-cased, stripped prompt."""
        if p.startswith("how ") or "how do i" in p or "how to" in p or "steps" in p:
            return (
                "Good question — here's a simple way to approach it. Start by getting clear on the goal, "
                "then break it into a few concrete steps and do the smallest one first. If you tell me your "
                "specifics, I can tailor the steps to your situation."
            )
        if any(p.startswith(k) for k in ("what is", "what are", "define")) or "explain" in p or "what's" in p:
            return (
                "In short: it's the concept your question points at, and the key idea is that it does what its "
                "name suggests in a straightforward way. Want the one-line version or a fuller explanation with an example?"
            )
        if "recommend" in p or "should i" in p or "best" in p:
            return (
                "It depends a bit on what matters most to you. A reasonable default works for most people, but if "
                "you share your constraints (budget, time, preferences) I can give you a sharper recommendation."
            )
        if "?" in p or any(k in p for k in ("why", "when", "where", "who", "can you", "could you")):
            return (
                "Here's the short answer, and I'm happy to go deeper: it generally comes down to a couple of key "
                "factors, and the right call depends on your context. Tell me more and I'll be specific."
            )
        return "Got it. Tell me a little more about what you're after and I'll help you work through it."
118
+
119
+
120
def make_demo_adapter(name: str | None = None, **kw: Any) -> DemoAdapter:
    """Factory shim: build a ``DemoAdapter``, forwarding ``name`` and any options."""
    adapter = DemoAdapter(name=name, **kw)
    return adapter
@@ -0,0 +1,143 @@
1
+ """Generic HTTP/REST adapter for any web endpoint.
2
+
3
+ Configurable so it fits most JSON APIs without code:
4
+ - method, url, headers (env-var interpolation via ${VAR})
5
+ - body_template: a JSON-able dict; the string "{{prompt}}" anywhere inside
6
+ is replaced with the case prompt (also "{{category}}", "{{id}}")
7
+ - response_path: a JMESPath expression extracting the answer text from the
8
+ JSON response (default "text"); falls back to the raw body if it misses
9
+ - tokens_in_path / tokens_out_path / model_path: optional JMESPath for usage
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import copy
15
+ import os
16
+ import re
17
+ import time
18
+ from typing import Any
19
+
20
+ import httpx
21
+ import jmespath
22
+
23
+ from ..models import Case, Response
24
+ from .base import BaseAdapter
25
+
26
+ _ENV = re.compile(r"\$\{([A-Z0-9_]+)\}")
27
+
28
+
29
+ def _interp_env(value: Any) -> Any:
30
+ if isinstance(value, str):
31
+ return _ENV.sub(lambda m: os.environ.get(m.group(1), ""), value)
32
+ if isinstance(value, dict):
33
+ return {k: _interp_env(v) for k, v in value.items()}
34
+ if isinstance(value, list):
35
+ return [_interp_env(v) for v in value]
36
+ return value
37
+
38
+
39
+ def _fill(value: Any, case: Case) -> Any:
40
+ if isinstance(value, str):
41
+ return (
42
+ value.replace("{{prompt}}", case.prompt)
43
+ .replace("{{category}}", case.category)
44
+ .replace("{{id}}", case.id)
45
+ )
46
+ if isinstance(value, dict):
47
+ return {k: _fill(v, case) for k, v in value.items()}
48
+ if isinstance(value, list):
49
+ return [_fill(v, case) for v in value]
50
+ return value
51
+
52
+
53
class HTTPAdapter(BaseAdapter):
    """Adapter that sends each case to an HTTP endpoint as a JSON request.

    ``url`` and ``headers`` have ``${VAR}`` references expanded once at
    construction. ``body_template`` is deep-copied and filled per case. The
    ``*_path`` options are JMESPath expressions evaluated against the decoded
    JSON response.
    """

    name = "http"

    def __init__(
        self,
        name: str | None = None,
        *,
        url: str,
        method: str = "POST",
        headers: dict[str, str] | None = None,
        body_template: dict[str, Any] | None = None,
        response_path: str = "text",
        tokens_in_path: str | None = None,
        tokens_out_path: str | None = None,
        model_path: str | None = None,
        cost_path: str | None = None,
        timeout: float = 60.0,
        **_: Any,
    ):
        super().__init__(name)
        # Request shape.
        self._url = _interp_env(url)
        self._method = method.upper()
        self._headers = _interp_env(headers or {})
        if not body_template:
            body_template = {"prompt": "{{prompt}}"}
        self._body_template = body_template
        # Where to find things in the response.
        self._response_path = response_path
        self._tokens_in_path = tokens_in_path
        self._tokens_out_path = tokens_out_path
        self._model_path = model_path
        self._cost_path = cost_path
        # One client is created here and reused for every query.
        self._client = httpx.AsyncClient(timeout=timeout)

    async def query(self, case: Case) -> Response:
        """POST (or ``method``) the filled template and map the JSON reply to a Response.

        Any exception from the request, a non-success status, or JSON decoding
        is reported through ``Response.error`` instead of being raised.
        """
        start = time.perf_counter()
        payload = _fill(copy.deepcopy(self._body_template), case)
        try:
            reply = await self._client.request(
                self._method,
                self._url,
                headers=self._headers,
                json=payload,
            )
            reply.raise_for_status()
            data = reply.json()
        except Exception as exc:
            failure = f"{type(exc).__name__}: {exc}"
            return Response(
                case_id=case.id,
                error=failure,
                latency_ms=self._elapsed_ms(start),
                source=self.name,
            )

        answer = jmespath.search(self._response_path, data)
        if answer is None:
            # Path missed: fall back to the whole decoded body.
            answer = data if isinstance(data, str) else str(data)
        return Response(
            case_id=case.id,
            text=str(answer),
            latency_ms=self._elapsed_ms(start),
            tokens_in=_search_int(self._tokens_in_path, data),
            tokens_out=_search_int(self._tokens_out_path, data),
            cost_usd=_search_float(self._cost_path, data),
            model=_search_str(self._model_path, data),
            source=self.name,
            raw=data if isinstance(data, dict) else {"body": data},
        )

    async def aclose(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()
117
+
118
+
119
+ def _search_int(path: str | None, data: Any) -> int | None:
120
+ if not path:
121
+ return None
122
+ v = jmespath.search(path, data)
123
+ try:
124
+ return int(v) if v is not None else None
125
+ except (TypeError, ValueError):
126
+ return None
127
+
128
+
129
+ def _search_str(path: str | None, data: Any) -> str | None:
130
+ if not path:
131
+ return None
132
+ v = jmespath.search(path, data)
133
+ return str(v) if v is not None else None
134
+
135
+
136
+ def _search_float(path: str | None, data: Any) -> float | None:
137
+ if not path:
138
+ return None
139
+ v = jmespath.search(path, data)
140
+ try:
141
+ return float(v) if v is not None else None
142
+ except (TypeError, ValueError):
143
+ return None
@@ -0,0 +1,198 @@
1
+ """LLM chat adapter — evaluate a model/assistant directly.
2
+
3
+ Supports Anthropic (via the bundled `anthropic` dep) and any OpenAI-compatible
4
+ chat endpoint (OpenAI, Azure, local servers, etc.) via the optional `openai`
5
+ extra. The target *is* the model: each case prompt becomes a single user turn
6
+ under an optional system prompt.
7
+
8
+ adapter:
9
+ type: llm
10
+ options:
11
+ provider: anthropic # or "openai"
12
+ model: claude-opus-4-8
13
+ system: "You are a helpful customer-support assistant."
14
+ max_tokens: 512
15
+ # base_url: https://... # openai-compatible servers
16
+ # api_key_env: OPENAI_API_KEY
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import os
22
+ import time
23
+ from typing import Any
24
+
25
+ from ..models import Case, Response
26
+ from .base import BaseAdapter
27
+
28
+ # Small built-in per-1k-token price table (USD), keyed by a substring of the
29
+ # model id. Best-effort defaults; override per-run via options.price_per_1k_in /
30
+ # price_per_1k_out. Prices are illustrative, not authoritative.
31
+ _PRICE_PER_1K: dict[str, tuple[float, float]] = {
32
+ "opus": (0.015, 0.075),
33
+ "sonnet": (0.003, 0.015),
34
+ "haiku": (0.0008, 0.004),
35
+ "gpt-4o-mini": (0.00015, 0.0006),
36
+ "gpt-4o": (0.0025, 0.01),
37
+ "gpt-4": (0.03, 0.06),
38
+ "gpt-3.5": (0.0005, 0.0015),
39
+ }
40
+
41
+
42
+ def _lookup_price(model: str) -> tuple[float, float] | None:
43
+ ml = (model or "").lower()
44
+ for key, price in _PRICE_PER_1K.items():
45
+ if key in ml:
46
+ return price
47
+ return None
48
+
49
+
50
def compute_cost(
    model: str,
    tokens_in: int | None,
    tokens_out: int | None,
    *,
    price_in: float | None = None,
    price_out: float | None = None,
) -> float | None:
    """Best-effort USD cost from token usage.

    Prices are per 1,000 tokens. An explicit ``price_in`` / ``price_out`` wins;
    whichever is missing is filled from the built-in table for ``model`` when
    the model id matches an entry.

    Returns ``None`` when the cost cannot be determined: no token count is
    known at all, or no price is known (neither supplied nor found in the
    table). If only one side is known, the unknown side contributes zero.
    """
    # Unknown usage must not be reported as a real cost of $0.
    if tokens_in is None and tokens_out is None:
        return None
    if price_in is None or price_out is None:
        looked = _lookup_price(model)
        if looked is not None:
            if price_in is None:
                price_in = looked[0]
            if price_out is None:
                price_out = looked[1]
    if price_in is None and price_out is None:
        return None
    cost_in = ((tokens_in or 0) / 1000.0) * (price_in or 0.0)
    cost_out = ((tokens_out or 0) / 1000.0) * (price_out or 0.0)
    return cost_in + cost_out
72
+
73
+
74
class LLMAdapter(BaseAdapter):
    """Adapter whose target is a chat model: each case prompt is sent as a
    single user turn under an optional system prompt.

    ``provider`` is "anthropic" or an OpenAI-compatible value ("openai",
    "openai-compatible", "compatible"). The local-server aliases "ollama",
    "vllm", "lmstudio" and "local" are mapped to the OpenAI path with a
    default ``base_url`` unless one is given.
    """

    name = "llm"

    # Default OpenAI-compatible endpoint per local-server alias. Each server
    # listens on its own default port, so one URL cannot serve all of them.
    _LOCAL_BASE_URLS = {
        "ollama": "http://localhost:11434/v1",
        "vllm": "http://localhost:8000/v1",
        "lmstudio": "http://localhost:1234/v1",
        "local": "http://localhost:11434/v1",
    }

    def __init__(
        self,
        name: str | None = None,
        *,
        provider: str = "anthropic",
        model: str = "claude-opus-4-8",
        system: str = "",
        max_tokens: int = 512,
        temperature: float = 0.0,
        base_url: str | None = None,
        api_key_env: str | None = None,
        price_per_1k_in: float | None = None,
        price_per_1k_out: float | None = None,
        **_: Any,
    ):
        super().__init__(name)
        prov = provider.lower()
        # Local OpenAI-compatible servers speak the OpenAI protocol; normalize
        # them to the openai path with that server's default URL.
        if prov in self._LOCAL_BASE_URLS:
            base_url = base_url or self._LOCAL_BASE_URLS[prov]
            prov = "openai"
        self._provider = prov
        self._model = model
        self._system = system
        self._max_tokens = max_tokens
        self._temperature = temperature
        self._base_url = base_url
        self._api_key_env = api_key_env
        self._price_in = price_per_1k_in
        self._price_out = price_per_1k_out
        # The SDK client is created lazily on first use (see _ensure_client).
        self._client: Any = None

    def _ensure_client(self) -> Any:
        """Create the provider SDK client on first use and cache it.

        Raises:
            ImportError: if the openai provider is selected but not installed.
            ValueError: if the provider is not recognized.
        """
        if self._client is not None:
            return self._client
        if self._provider == "anthropic":
            import anthropic

            key = os.environ.get(self._api_key_env or "ANTHROPIC_API_KEY")
            self._client = anthropic.AsyncAnthropic(api_key=key)
        elif self._provider in ("openai", "openai-compatible", "compatible"):
            try:
                import openai
            except ImportError as exc:  # pragma: no cover
                raise ImportError(
                    "the LLM adapter's openai provider needs the optional dep: "
                    "pip install 'promptpolygraph[llm]'"
                ) from exc
            key = os.environ.get(self._api_key_env or "OPENAI_API_KEY")
            if not key and self._base_url and not os.environ.get("OPENAI_API_KEY"):
                # The OpenAI SDK refuses to build a client with no key at all,
                # while self-hosted servers typically do not check it; supply a
                # placeholder so a keyless local setup works.
                key = "not-needed"
            self._client = openai.AsyncOpenAI(api_key=key, base_url=self._base_url)
        else:
            raise ValueError(f"unknown llm provider: {self._provider!r}")
        return self._client

    async def query(self, case: Case) -> Response:
        """Send the case prompt to the model and return its reply with usage and cost.

        Any exception (client construction, network, API error) is reported
        through ``Response.error`` instead of being raised.
        """
        start = time.perf_counter()
        try:
            if self._provider == "anthropic":
                text, tin, tout = await self._anthropic(case.prompt)
            else:
                text, tin, tout = await self._openai(case.prompt)
        except Exception as exc:
            return Response(
                case_id=case.id,
                error=f"{type(exc).__name__}: {exc}",
                latency_ms=self._elapsed_ms(start),
                source=self.name,
                model=self._model,
            )
        return Response(
            case_id=case.id,
            text=text,
            latency_ms=self._elapsed_ms(start),
            tokens_in=tin,
            tokens_out=tout,
            cost_usd=compute_cost(
                self._model, tin, tout,
                price_in=self._price_in, price_out=self._price_out,
            ),
            model=self._model,
            source=self.name,
        )

    async def _anthropic(self, prompt: str) -> tuple[str, int | None, int | None]:
        """Call the Anthropic Messages API; return (text, input tokens, output tokens)."""
        client = self._ensure_client()
        request: dict[str, Any] = {
            "model": self._model,
            "max_tokens": self._max_tokens,
            "temperature": self._temperature,
            "messages": [{"role": "user", "content": prompt}],
        }
        if self._system:
            # Only send a system prompt when one is configured.
            request["system"] = self._system
        msg = await client.messages.create(**request)
        # A reply may contain several content blocks; keep every text block so
        # the answer is not cut off after the first one.
        pieces = []
        for block in msg.content or []:
            t = getattr(block, "text", None)
            if t is not None:
                pieces.append(t)
        text = "".join(pieces)
        usage = getattr(msg, "usage", None)
        tin = getattr(usage, "input_tokens", None) if usage else None
        tout = getattr(usage, "output_tokens", None) if usage else None
        return text, tin, tout

    async def _openai(self, prompt: str) -> tuple[str, int | None, int | None]:
        """Call an OpenAI-compatible chat completions endpoint; return
        (text, prompt tokens, completion tokens)."""
        client = self._ensure_client()
        messages = []
        if self._system:
            messages.append({"role": "system", "content": self._system})
        messages.append({"role": "user", "content": prompt})
        resp = await client.chat.completions.create(
            model=self._model,
            max_tokens=self._max_tokens,
            temperature=self._temperature,
            messages=messages,
        )
        text = resp.choices[0].message.content or ""
        usage = getattr(resp, "usage", None)
        tin = getattr(usage, "prompt_tokens", None) if usage else None
        tout = getattr(usage, "completion_tokens", None) if usage else None
        return text, tin, tout