fluiq 0.0.1a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fluiq-0.0.1a1/PKG-INFO +6 -0
- fluiq-0.0.1a1/pyproject.toml +15 -0
- fluiq-0.0.1a1/setup.cfg +4 -0
- fluiq-0.0.1a1/src/fluiq/__init__.py +2 -0
- fluiq-0.0.1a1/src/fluiq/client.py +20 -0
- fluiq-0.0.1a1/src/fluiq/config.py +21 -0
- fluiq-0.0.1a1/src/fluiq/decorator/__init__.py +67 -0
- fluiq-0.0.1a1/src/fluiq/evaluations/__init__.py +21 -0
- fluiq-0.0.1a1/src/fluiq/evaluations/base.py +85 -0
- fluiq-0.0.1a1/src/fluiq/evaluations/hallucination.py +142 -0
- fluiq-0.0.1a1/src/fluiq/evaluations/judge.py +167 -0
- fluiq-0.0.1a1/src/fluiq/evaluations/ragas.py +188 -0
- fluiq-0.0.1a1/src/fluiq/integrations/Anthropic/helper/mcp_trace.py +45 -0
- fluiq-0.0.1a1/src/fluiq/integrations/Anthropic/helper/streaming.py +120 -0
- fluiq-0.0.1a1/src/fluiq/integrations/Anthropic/helper/thinking_trace.py +35 -0
- fluiq-0.0.1a1/src/fluiq/integrations/Anthropic/helper/tool_trace.py +83 -0
- fluiq-0.0.1a1/src/fluiq/integrations/Anthropic/helper/utils.py +25 -0
- fluiq-0.0.1a1/src/fluiq/integrations/Anthropic/trace.py +349 -0
- fluiq-0.0.1a1/src/fluiq/integrations/Gemini/helper/mcp_trace.py +99 -0
- fluiq-0.0.1a1/src/fluiq/integrations/Gemini/helper/thinking_trace.py +26 -0
- fluiq-0.0.1a1/src/fluiq/integrations/Gemini/helper/tool_trace.py +168 -0
- fluiq-0.0.1a1/src/fluiq/integrations/Gemini/helper/utils.py +49 -0
- fluiq-0.0.1a1/src/fluiq/integrations/Gemini/trace.py +458 -0
- fluiq-0.0.1a1/src/fluiq/integrations/GoogleADK/__init__.py +1 -0
- fluiq-0.0.1a1/src/fluiq/integrations/GoogleADK/helper/__init__.py +1 -0
- fluiq-0.0.1a1/src/fluiq/integrations/GoogleADK/helper/utils.py +145 -0
- fluiq-0.0.1a1/src/fluiq/integrations/GoogleADK/plugin.py +221 -0
- fluiq-0.0.1a1/src/fluiq/integrations/GoogleADK/trace.py +28 -0
- fluiq-0.0.1a1/src/fluiq/integrations/LangGraph/__init__.py +1 -0
- fluiq-0.0.1a1/src/fluiq/integrations/LangGraph/trace.py +9 -0
- fluiq-0.0.1a1/src/fluiq/integrations/Langchain/__init__.py +1 -0
- fluiq-0.0.1a1/src/fluiq/integrations/Langchain/handler.py +215 -0
- fluiq-0.0.1a1/src/fluiq/integrations/Langchain/helper/__init__.py +1 -0
- fluiq-0.0.1a1/src/fluiq/integrations/Langchain/helper/utils.py +142 -0
- fluiq-0.0.1a1/src/fluiq/integrations/Langchain/trace.py +19 -0
- fluiq-0.0.1a1/src/fluiq/integrations/OpenAI/endpoints.py +129 -0
- fluiq-0.0.1a1/src/fluiq/integrations/OpenAI/helper/mcp_trace.py +32 -0
- fluiq-0.0.1a1/src/fluiq/integrations/OpenAI/helper/streaming.py +201 -0
- fluiq-0.0.1a1/src/fluiq/integrations/OpenAI/helper/thinking_trace.py +41 -0
- fluiq-0.0.1a1/src/fluiq/integrations/OpenAI/helper/tool_trace.py +76 -0
- fluiq-0.0.1a1/src/fluiq/integrations/OpenAI/helper/utils.py +36 -0
- fluiq-0.0.1a1/src/fluiq/integrations/OpenAI/trace.py +449 -0
- fluiq-0.0.1a1/src/fluiq/integrations/__init__.py +48 -0
- fluiq-0.0.1a1/src/fluiq/integrations/shared/chain.py +64 -0
- fluiq-0.0.1a1/src/fluiq/integrations/shared/context.py +49 -0
- fluiq-0.0.1a1/src/fluiq/integrations/shared/mcp_patch.py +40 -0
- fluiq-0.0.1a1/src/fluiq/integrations/shared/models.py +62 -0
- fluiq-0.0.1a1/src/fluiq/tracer.py +14 -0
- fluiq-0.0.1a1/src/fluiq.egg-info/PKG-INFO +6 -0
- fluiq-0.0.1a1/src/fluiq.egg-info/SOURCES.txt +51 -0
- fluiq-0.0.1a1/src/fluiq.egg-info/dependency_links.txt +1 -0
- fluiq-0.0.1a1/src/fluiq.egg-info/requires.txt +1 -0
- fluiq-0.0.1a1/src/fluiq.egg-info/top_level.txt +1 -0
fluiq-0.0.1a1/PKG-INFO
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "fluiq"
version = "0.0.1a1"
description = "Fluiq SDK"
requires-python = ">=3.9"
# Every package imported unconditionally at module import time must be
# declared here, otherwise `import fluiq` fails on a clean install:
#   - fluiq/client.py and fluiq/evaluations/judge.py import `requests`
#   - fluiq/config.py imports `dotenv` (distribution name: python-dotenv)
#   - fluiq/evaluations/base.py imports `pydantic` and uses the v2 API
#     (`ConfigDict`, `model_dump`)
# Vendor SDKs (openai, anthropic, google-genai) are imported lazily inside
# the judge and are intentionally left out.
dependencies = [
    "requests>=2.31",
    "pydantic>=2.0",
    "python-dotenv>=1.0",
]

[tool.setuptools.packages.find]
where = ["src"]
|
fluiq-0.0.1a1/setup.cfg
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
from fluiq.config import _config
|
|
3
|
+
|
|
4
|
+
def send_event(data):
    """POST one event to the configured ingest URL on a best-effort basis.

    Does nothing when ``_config["enabled"]`` is falsy. Otherwise the event
    is sent as JSON together with the configured API key, with a one second
    timeout. Any exception raised while building or sending the request
    (including a non-2xx status) is printed and swallowed, so the caller
    never sees a failure.

    Args:
        data: The event payload; it is placed under the ``"event"`` key of
            the JSON body.
    """
    if _config["enabled"]:
        try:
            url = "{}/{}/ingest".format(_config["endpoint"], _config["version"])
            body = {
                "api_key": _config["api_key"],
                "event": data,
            }
            response = requests.post(url, json=body, timeout=1)
            response.raise_for_status()
        except Exception as err:
            print("[fluiq] send event failed: ", repr(err))
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from dotenv import load_dotenv
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
load_dotenv()
|
|
5
|
+
|
|
6
|
+
ENDPOINT = os.getenv("FLUIQ_API_ENDPOINT","http://localhost:8000/api")
|
|
7
|
+
|
|
8
|
+
_config={
|
|
9
|
+
"api_key": None,
|
|
10
|
+
"enabled": True,
|
|
11
|
+
"version": "v1",
|
|
12
|
+
"endpoint": ENDPOINT
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
def init(api_key: str, version="v1", endpoint=ENDPOINT):
    """Store the SDK settings and initialise the integrations package.

    Args:
        api_key: Key written to ``_config["api_key"]``.
        version: API version segment written to ``_config["version"]``.
        endpoint: Base URL written to ``_config["endpoint"]``; defaults to
            the module-level ``ENDPOINT``.
    """
    _config.update(api_key=api_key, version=version, endpoint=endpoint)

    # Imported inside the function rather than at module load — presumably
    # to avoid an import cycle with fluiq.integrations (confirm).
    from fluiq.integrations import init as start_integrations

    start_integrations()
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import time
|
|
3
|
+
import uuid
|
|
4
|
+
from fluiq.tracer import log_trace
|
|
5
|
+
from fluiq.integrations.shared.models import LogTrace, TraceType
|
|
6
|
+
from fluiq.integrations.shared.context import (
|
|
7
|
+
current_parent_id,
|
|
8
|
+
push_trace_id,
|
|
9
|
+
pop_trace_id,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _emit(trace_id, parent_id, func_name, args, kwargs, result, exc, start, end):
    """Build a ``LogTrace`` for one finished call and hand it to ``log_trace``.

    Args:
        trace_id: Identifier of this call's trace.
        parent_id: Identifier of the enclosing trace, as read by the caller.
        func_name: Name recorded in the ``function`` field.
        args: Positional arguments of the call (stringified into ``input``).
        kwargs: Keyword arguments of the call (stringified into ``input``).
        result: Return value; stringified into ``output`` on success.
        exc: The raised exception, or ``None`` when the call succeeded;
            stringified into ``output`` on failure.
        start: Start timestamp.
        end: End timestamp; ``end - start`` is recorded as the latency.
    """
    succeeded = exc is None
    rendered_input = "".join((str(args), str(kwargs)))
    if succeeded:
        rendered_output = str(result)
    else:
        rendered_output = str(exc)

    record = LogTrace(
        trace_id=trace_id,
        parent_id=parent_id,
        integration=TraceType.General_Function,
        function=func_name,
        input=rendered_input,
        output=rendered_output,
        latency=end - start,
        success=succeeded,
    )
    log_trace(record.model_dump(mode="json"))
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def trace(func):
    """Decorate ``func`` so that every call is reported through ``_emit``.

    Works for both plain functions and coroutine functions. For each call
    the wrapper generates a fresh trace id, reads the current parent id,
    pushes the new id onto the trace context, times the call, restores the
    context and emits one trace record describing the outcome. The wrapped
    function's return value is returned unchanged and any exception it
    raises propagates to the caller with its original traceback.

    Compared with a naive wrapper this implementation:

    * preserves ``__name__``, ``__doc__`` and friends via ``functools.wraps``;
    * restores the trace context in a ``finally`` clause, so it is also
      popped when a ``BaseException`` such as ``asyncio.CancelledError`` or
      ``KeyboardInterrupt`` escapes (such calls are recorded as failures);
    * re-raises with a bare ``raise`` so no extra frame is added.

    Args:
        func: The function or coroutine function to instrument.

    Returns:
        A wrapper with the same call signature as ``func``.
    """
    # Local import keeps this block self-contained.
    from functools import wraps

    # Callables such as functools.partial have no __name__.
    func_name = getattr(func, "__name__", repr(func))

    def _begin():
        """Open a trace: returns (trace_id, parent_id, token, start)."""
        trace_id = str(uuid.uuid4())
        # Must be read before pushing, otherwise we would be our own parent.
        parent_id = current_parent_id()
        token = push_trace_id(trace_id)
        return trace_id, parent_id, token, time.time()

    def _finish(state, args, kwargs, result, exc):
        """Close a trace: restore the context, then emit the record."""
        trace_id, parent_id, token, start = state
        end = time.time()
        pop_trace_id(token)
        _emit(trace_id, parent_id, func_name, args, kwargs, result, exc, start, end)

    if asyncio.iscoroutinefunction(func):

        @wraps(func)
        async def async_wrapper(*args, **kwargs):
            state = _begin()
            result = None
            exc = None
            try:
                result = await func(*args, **kwargs)
                return result
            except BaseException as err:
                exc = err
                raise
            finally:
                _finish(state, args, kwargs, result, exc)

        return async_wrapper

    @wraps(func)
    def wrapper(*args, **kwargs):
        state = _begin()
        result = None
        exc = None
        try:
            result = func(*args, **kwargs)
            return result
        except BaseException as err:
            exc = err
            raise
        finally:
            _finish(state, args, kwargs, result, exc)

    return wrapper
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from fluiq.evaluations.base import BaseEvaluator, EvalResult, LLMJudge
|
|
2
|
+
from fluiq.evaluations.hallucination import HallucinationEvaluator
|
|
3
|
+
from fluiq.evaluations.ragas import (
|
|
4
|
+
AnswerRelevancy,
|
|
5
|
+
ContextPrecision,
|
|
6
|
+
ContextRecall,
|
|
7
|
+
Faithfulness,
|
|
8
|
+
Ragas,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"BaseEvaluator",
|
|
13
|
+
"EvalResult",
|
|
14
|
+
"LLMJudge",
|
|
15
|
+
"HallucinationEvaluator",
|
|
16
|
+
"Faithfulness",
|
|
17
|
+
"AnswerRelevancy",
|
|
18
|
+
"ContextPrecision",
|
|
19
|
+
"ContextRecall",
|
|
20
|
+
"Ragas",
|
|
21
|
+
]
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import re
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import Any, Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class EvalResult(BaseModel):
    """Outcome of a single evaluation.

    Extra, undeclared fields are accepted (``extra="allow"``).
    """

    model_config = ConfigDict(extra="allow")

    # Name of the evaluator that produced this result.
    name: str
    # Required score, validated to lie in the closed interval [0.0, 1.0].
    score: float = Field(..., ge=0.0, le=1.0)
    # Pass/fail flag; None when not set.
    passed: Optional[bool] = Field(default=None)
    # Optional human-readable explanation.
    reason: Optional[str] = Field(default=None)
    # Optional free-form supporting data.
    details: Optional[Dict[str, Any]] = Field(default=None)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class BaseEvaluator(ABC):
    """Abstract base for evaluators that produce an ``EvalResult``.

    Subclasses implement ``evaluate`` and typically override ``name``.
    """

    name: str = "evaluator"

    def __init__(self, threshold: float = 0.5):
        """Remember the score at or above which a result counts as passed."""
        self.threshold = threshold

    @abstractmethod
    def evaluate(self, **kwargs: Any) -> EvalResult:
        """Run the evaluation; concrete subclasses must implement this."""
        ...

    def _result(
        self,
        score: float,
        reason: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None,
    ) -> EvalResult:
        """Wrap a score in an ``EvalResult`` carrying this evaluator's name.

        ``passed`` is true when the score, converted to ``float``, is at
        least ``self.threshold``.
        """
        numeric_score = float(score)
        fields = {
            "name": self.name,
            "score": numeric_score,
            "passed": numeric_score >= self.threshold,
            "reason": reason,
            "details": details,
        }
        return EvalResult(**fields)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _parse_json_object(raw: str) -> Dict[str, Any]:
|
|
45
|
+
if not raw:
|
|
46
|
+
return {}
|
|
47
|
+
raw = raw.strip()
|
|
48
|
+
try:
|
|
49
|
+
data = json.loads(raw)
|
|
50
|
+
return data if isinstance(data, dict) else {"value": data}
|
|
51
|
+
except json.JSONDecodeError:
|
|
52
|
+
pass
|
|
53
|
+
match = re.search(r"\{.*\}", raw, re.DOTALL)
|
|
54
|
+
if match:
|
|
55
|
+
try:
|
|
56
|
+
data = json.loads(match.group(0))
|
|
57
|
+
return data if isinstance(data, dict) else {"value": data}
|
|
58
|
+
except json.JSONDecodeError:
|
|
59
|
+
return {}
|
|
60
|
+
return {}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _coerce_contexts(contexts: Any) -> List[str]:
|
|
64
|
+
if contexts is None:
|
|
65
|
+
return []
|
|
66
|
+
if isinstance(contexts, str):
|
|
67
|
+
return [contexts]
|
|
68
|
+
if isinstance(contexts, (list, tuple)):
|
|
69
|
+
return [str(c) for c in contexts if c is not None]
|
|
70
|
+
return [str(contexts)]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _clamp_unit(value: Any, default: float = 0.0) -> float:
|
|
74
|
+
try:
|
|
75
|
+
f = float(value)
|
|
76
|
+
except (TypeError, ValueError):
|
|
77
|
+
return default
|
|
78
|
+
if f < 0.0:
|
|
79
|
+
return 0.0
|
|
80
|
+
if f > 1.0:
|
|
81
|
+
return 1.0
|
|
82
|
+
return f
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
from fluiq.evaluations.judge import JudgeFn, LLMJudge # noqa: E402,F401
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
from typing import Any, List, Optional
|
|
2
|
+
|
|
3
|
+
from fluiq.evaluations.base import (
|
|
4
|
+
BaseEvaluator,
|
|
5
|
+
EvalResult,
|
|
6
|
+
LLMJudge,
|
|
7
|
+
_clamp_unit,
|
|
8
|
+
_coerce_contexts,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Prompt template asking the judge to list the factual claims in an answer.
# It is filled in with str.format(answer=...), so the doubled braces below
# are escapes that render as single literal braces in the final prompt.
_CLAIMS_PROMPT = (
    "Extract every standalone factual claim from the ANSWER below. "
    "Return JSON: {{\"claims\": [\"claim 1\", \"claim 2\", ...]}}.\n\n"
    "ANSWER:\n{answer}"
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Prompt template asking the judge to mark each claim as supported or not by
# the reference text. It is filled in with
# str.format(reference=..., claims=...); the doubled braces are escapes that
# render as single literal braces in the final prompt.
_VERIFY_PROMPT = (
    "You are checking whether each CLAIM is supported by the REFERENCE. "
    "A claim is SUPPORTED only if the reference entails it; if the reference "
    "neither states nor implies it, mark it UNSUPPORTED. Speculation, added "
    "details, and contradictions are UNSUPPORTED.\n\n"
    "REFERENCE:\n{reference}\n\n"
    "CLAIMS:\n{claims}\n\n"
    "Return JSON: {{\"verdicts\": [{{\"claim\": str, \"supported\": bool, "
    "\"reason\": str}}]}}"
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class HallucinationEvaluator(BaseEvaluator):
    """Hallucination detector.

    Score is 1.0 when no hallucinated claims are detected and 0.0 when every
    claim in the answer is unsupported. Lower scores are worse. The evaluator
    accepts either retrieved `contexts` (RAG style) or a single `reference`
    string (closed-book / ground truth) as the source of truth.

    The evaluation makes two judge calls: one to extract claims from the
    answer and one to verify those claims against the reference text.
    """

    name = "hallucination"

    # String spellings of the judge's "supported" field that count as true.
    _AFFIRMATIVE = frozenset({"true", "yes", "y", "1", "supported"})

    def __init__(
        self,
        judge: Optional[LLMJudge] = None,
        threshold: float = 0.7,
    ):
        """Create the evaluator.

        Args:
            judge: Judge used for both LLM calls; a default ``LLMJudge()``
                is created when omitted.
            threshold: Minimum score for a result to count as passed.
        """
        super().__init__(threshold=threshold)
        self.judge = judge or LLMJudge()

    def evaluate(
        self,
        answer: str,
        contexts: Any = None,
        reference: Optional[str] = None,
        **_: Any,
    ) -> EvalResult:
        """Score how much of ``answer`` is backed by the reference material.

        Args:
            answer: Text whose claims are checked.
            contexts: Retrieved context(s); a string, list/tuple, or None.
            reference: Optional ground-truth text, appended to the contexts.
            **_: Extra keyword arguments are accepted and ignored.

        Returns:
            An ``EvalResult`` whose score is the fraction of returned
            verdicts marked supported. Short-circuits with score 1.0 for an
            empty answer or when no claims are extracted, and with score 0.0
            when no reference text is available or the judge returns no
            verdicts.
        """
        if not answer or not answer.strip():
            return self._result(
                score=1.0,
                reason="empty answer; nothing to hallucinate",
                details={"claims": [], "verdicts": []},
            )

        reference_text = _build_reference(contexts, reference)
        if not reference_text:
            return self._result(
                score=0.0,
                reason="no reference or contexts provided",
                details={"claims": [], "verdicts": []},
            )

        claims = self._extract_claims(answer)
        if not claims:
            return self._result(
                score=1.0,
                reason="no factual claims extracted from answer",
                details={"claims": [], "verdicts": []},
            )

        verdicts = self._verify_claims(reference_text, claims)
        if not verdicts:
            return self._result(
                score=0.0,
                reason="judge returned no verdicts",
                details={"claims": claims, "verdicts": []},
            )

        supported = sum(1 for v in verdicts if v.get("supported") is True)
        score = _clamp_unit(supported / len(verdicts))
        unsupported = [v for v in verdicts if v.get("supported") is not True]
        reason = (
            f"{supported}/{len(verdicts)} claims supported"
            if not unsupported
            else f"{len(unsupported)} unsupported claim(s) of {len(verdicts)}"
        )
        return self._result(
            score=score,
            reason=reason,
            details={
                "claims": claims,
                "verdicts": verdicts,
                "supported_count": supported,
                "total_claims": len(verdicts),
            },
        )

    @classmethod
    def _is_supported(cls, value: Any) -> bool:
        """Interpret the judge's ``supported`` field as a boolean.

        Judges do not always return a real JSON boolean. A plain ``bool()``
        would turn the non-empty strings "false" or "UNSUPPORTED" into
        True, so strings are matched against a small affirmative set.
        Numbers keep their usual truthiness; any other type counts as not
        supported.
        """
        if isinstance(value, bool):
            return value
        if isinstance(value, str):
            return value.strip().lower() in cls._AFFIRMATIVE
        if isinstance(value, (int, float)):
            return bool(value)
        return False

    def _extract_claims(self, answer: str) -> List[str]:
        """Ask the judge for the claims in ``answer``; return cleaned strings.

        Returns an empty list when the judge's ``claims`` value is missing
        or not a list. ``None`` and blank entries are dropped.
        """
        data = self.judge.judge_json(_CLAIMS_PROMPT.format(answer=answer))
        raw = data.get("claims") or []
        if not isinstance(raw, list):
            return []
        claims: List[str] = []
        for item in raw:
            # A JSON null must not become the literal claim "None".
            if item is None:
                continue
            text = str(item).strip()
            if text:
                claims.append(text)
        return claims

    def _verify_claims(self, reference: str, claims: List[str]) -> List[dict]:
        """Ask the judge to check ``claims`` against ``reference``.

        Returns one normalised dict (``claim``, ``supported``, ``reason``)
        per dict-shaped verdict the judge returned; other entries are
        skipped. Returns an empty list when ``verdicts`` is missing or not
        a list.
        """
        prompt = _VERIFY_PROMPT.format(
            reference=reference,
            claims="\n".join(f"- {c}" for c in claims),
        )
        data = self.judge.judge_json(prompt)
        raw = data.get("verdicts") or []
        if not isinstance(raw, list):
            return []
        verdicts: List[dict] = []
        for v in raw:
            if not isinstance(v, dict):
                continue
            claim = v.get("claim")
            reason = v.get("reason")
            # JSON nulls are mapped to ""/None rather than the text "None".
            claim_text = "" if claim is None else str(claim).strip()
            reason_text = "" if reason is None else str(reason).strip()
            verdicts.append({
                "claim": claim_text,
                "supported": self._is_supported(v.get("supported")),
                "reason": reason_text or None,
            })
        return verdicts
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _build_reference(contexts: Any, reference: Optional[str]) -> str:
    """Join the non-blank contexts and the reference into one text block.

    Contexts come first (in order), then the reference; every piece is
    stripped and pieces are separated by a blank line. Returns an empty
    string when nothing non-blank was supplied.
    """
    pieces: List[str] = [
        text.strip() for text in _coerce_contexts(contexts) if text.strip()
    ]
    if reference and reference.strip():
        pieces.append(reference.strip())
    return "\n\n".join(pieces)
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Any, Callable, Dict, Optional
|
|
3
|
+
|
|
4
|
+
import requests
|
|
5
|
+
|
|
6
|
+
from fluiq.evaluations.base import _parse_json_object
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Signature of a user-supplied judge: takes the prompt, returns the raw reply.
JudgeFn = Callable[[str], str]

# Provider names accepted by LLMJudge; anything else raises ValueError.
PROVIDERS = ("openai", "anthropic", "gemini", "fluiq")

# Model used for each provider when the caller does not pass one.
DEFAULT_MODELS: Dict[str, str] = {
    "openai": "gpt-4o-mini",
    "anthropic": "claude-3-5-haiku-latest",
    "gemini": "gemini-2.5-flash",
    "fluiq": "fluiq-judge",
}

# System instruction sent with every request to the OpenAI, Anthropic and
# Gemini backends.
_SYSTEM_PROMPT = (
    "You are a strict evaluator. Always respond with a single valid JSON "
    "object and nothing else."
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class LLMJudge:
    """LLM-as-judge with a selectable backend.

    ``provider`` picks the backend: ``openai`` (the default), ``anthropic``,
    ``gemini`` or ``fluiq``. The first three talk to the vendor SDK with the
    caller's own API key; ``fluiq`` posts the prompt to the Fluiq API
    instead.

    Supplying ``judge_fn(prompt: str) -> str`` bypasses the provider
    backends completely, which is handy for offline tests and self-hosted
    models.
    """

    def __init__(
        self,
        provider: str = "openai",
        model: Optional[str] = None,
        judge_fn: Optional[JudgeFn] = None,
        api_key: Optional[str] = None,
        temperature: float = 0.0,
    ):
        """Configure the judge.

        Args:
            provider: One of ``PROVIDERS``.
            model: Model name; defaults to ``DEFAULT_MODELS[provider]``.
            judge_fn: Optional callable used instead of any provider.
            api_key: Optional key; otherwise the provider's environment
                variable (or the Fluiq config) is consulted at call time.
            temperature: Sampling temperature forwarded to the backend.

        Raises:
            ValueError: If ``provider`` is not in ``PROVIDERS``.
        """
        if provider not in PROVIDERS:
            message = (
                f"Unsupported judge provider: {provider!r}. Use one of: {PROVIDERS}"
            )
            raise ValueError(message)

        self.provider = provider
        self.model = model if model else DEFAULT_MODELS[provider]
        self.temperature = temperature
        self._judge_fn = judge_fn
        self._api_key = api_key
        # Vendor SDK client, created on first use and then reused.
        self._client = None

    def __call__(self, prompt: str) -> str:
        """Send ``prompt`` to the judge and return its raw text reply."""
        custom = self._judge_fn
        if custom is not None:
            return custom(prompt)

        routes = (
            ("openai", self._call_openai),
            ("anthropic", self._call_anthropic),
            ("gemini", self._call_gemini),
            ("fluiq", self._call_fluiq),
        )
        for provider_name, backend in routes:
            if self.provider == provider_name:
                return backend(prompt)
        raise RuntimeError(f"Unsupported provider: {self.provider}")

    def judge_json(self, prompt: str) -> Dict[str, Any]:
        """Call the judge and parse its reply with ``_parse_json_object``."""
        reply = self(prompt)
        return _parse_json_object(reply)

    def _call_openai(self, prompt: str) -> str:
        """Query OpenAI chat completions in JSON mode; "{}" on empty content."""
        try:
            from openai import OpenAI
        except ImportError as exc:
            raise RuntimeError(
                "judge provider 'openai' requires the `openai` package"
            ) from exc

        client = self._client
        if client is None:
            key = self._api_key or os.getenv("OPENAI_API_KEY")
            client_kwargs = {"api_key": key} if key else {}
            client = self._client = OpenAI(**client_kwargs)

        conversation = [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ]
        completion = client.chat.completions.create(
            model=self.model,
            temperature=self.temperature,
            messages=conversation,
            response_format={"type": "json_object"},
        )
        content = completion.choices[0].message.content
        return content if content else "{}"

    def _call_anthropic(self, prompt: str) -> str:
        """Query Anthropic messages; return the first non-empty text block."""
        try:
            import anthropic
        except ImportError as exc:
            raise RuntimeError(
                "judge provider 'anthropic' requires the `anthropic` package"
            ) from exc

        client = self._client
        if client is None:
            key = self._api_key or os.getenv("ANTHROPIC_API_KEY")
            client_kwargs = {"api_key": key} if key else {}
            client = self._client = anthropic.Anthropic(**client_kwargs)

        reply = client.messages.create(
            model=self.model,
            max_tokens=1024,
            temperature=self.temperature,
            system=_SYSTEM_PROMPT,
            messages=[{"role": "user", "content": prompt}],
        )
        blocks = getattr(reply, "content", []) or []
        texts = (getattr(block, "text", None) for block in blocks)
        # Falls back to an empty JSON object when no block carries text.
        return next((text for text in texts if text), "{}")

    def _call_gemini(self, prompt: str) -> str:
        """Query Gemini with a JSON response MIME type; "{}" on empty text."""
        try:
            from google import genai
            from google.genai import types
        except ImportError as exc:
            raise RuntimeError(
                "judge provider 'gemini' requires the `google-genai` package"
            ) from exc

        client = self._client
        if client is None:
            key = (
                self._api_key
                or os.getenv("GEMINI_API_KEY")
                or os.getenv("GOOGLE_API_KEY")
            )
            client_kwargs = {"api_key": key} if key else {}
            client = self._client = genai.Client(**client_kwargs)

        generation_config = types.GenerateContentConfig(
            system_instruction=_SYSTEM_PROMPT,
            temperature=self.temperature,
            response_mime_type="application/json",
        )
        reply = client.models.generate_content(
            model=self.model,
            contents=prompt,
            config=generation_config,
        )
        text = getattr(reply, "text", None)
        return text if text else "{}"

    def _call_fluiq(self, prompt: str) -> str:
        """POST the prompt to the Fluiq ``/judge`` endpoint.

        Returns the ``content`` field of the JSON reply, or "{}" when it is
        missing or empty.

        Raises:
            RuntimeError: If no API key is available from the constructor
                or from the Fluiq config.
        """
        from fluiq.config import _config

        api_key = self._api_key or _config.get("api_key")
        if not api_key:
            raise RuntimeError(
                "judge provider 'fluiq' requires fluiq.instrument(api_key=...) "
                "or LLMJudge(api_key=...)"
            )

        endpoint = _config.get("endpoint")
        version = _config.get("version", "v1")
        payload = {
            "api_key": api_key,
            "model": self.model,
            "prompt": prompt,
            "temperature": self.temperature,
        }
        response = requests.post(
            f"{endpoint}/{version}/judge",
            json=payload,
            timeout=60,
        )
        response.raise_for_status()
        body = response.json() or {}
        content = body.get("content")
        return content if content else "{}"
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
|