whatap-python 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- whatap/LICENSE +0 -0
- whatap/README.rst +49 -0
- whatap/__init__.py +923 -0
- whatap/__main__.py +4 -0
- whatap/agent/darwin/amd64/whatap_python +0 -0
- whatap/agent/darwin/arm64/whatap_python +0 -0
- whatap/agent/linux/amd64/whatap_python +0 -0
- whatap/agent/linux/arm64/whatap_python +0 -0
- whatap/agent/windows/whatap_python.exe +0 -0
- whatap/bootstrap/__init__.py +0 -0
- whatap/bootstrap/sitecustomize.py +19 -0
- whatap/build.py +4 -0
- whatap/conf/__init__.py +0 -0
- whatap/conf/configuration.py +280 -0
- whatap/conf/configure.py +105 -0
- whatap/conf/license.py +49 -0
- whatap/control/__init__.py +0 -0
- whatap/counter/__init__.py +14 -0
- whatap/counter/counter_manager.py +45 -0
- whatap/counter/tasks/__init__.py +3 -0
- whatap/counter/tasks/base_task.py +26 -0
- whatap/counter/tasks/llm_evaluator_task.py +501 -0
- whatap/counter/tasks/llm_log_sink_task.py +309 -0
- whatap/counter/tasks/llm_stat_task.py +78 -0
- whatap/counter/tasks/openfiledescriptor.py +67 -0
- whatap/io/__init__.py +1 -0
- whatap/io/data_inputx.py +161 -0
- whatap/io/data_outputx.py +262 -0
- whatap/llm/__init__.py +17 -0
- whatap/llm/definitions.py +43 -0
- whatap/llm/evaluators/__init__.py +136 -0
- whatap/llm/evaluators/base.py +114 -0
- whatap/llm/evaluators/builtins/__init__.py +91 -0
- whatap/llm/evaluators/builtins/answer_relevance.py +46 -0
- whatap/llm/evaluators/builtins/combined_judge.py +271 -0
- whatap/llm/evaluators/builtins/factuality.py +71 -0
- whatap/llm/evaluators/builtins/hallucination.py +97 -0
- whatap/llm/evaluators/builtins/llm_judge.py +516 -0
- whatap/llm/evaluators/builtins/pii_leak.py +214 -0
- whatap/llm/evaluators/builtins/prompt_injection.py +71 -0
- whatap/llm/evaluators/builtins/toxicity.py +53 -0
- whatap/llm/evaluators/builtins/url_scan.py +194 -0
- whatap/llm/evaluators/registry.py +192 -0
- whatap/llm/evaluators/sampler.py +83 -0
- whatap/llm/evaluators/scope.py +334 -0
- whatap/llm/features.py +66 -0
- whatap/llm/log_sink_packs/__init__.py +9 -0
- whatap/llm/log_sink_packs/llm_input_message.py +16 -0
- whatap/llm/log_sink_packs/llm_log_sink_pack.py +72 -0
- whatap/llm/log_sink_packs/llm_output_message.py +19 -0
- whatap/llm/log_sink_packs/llm_step_eval_status.py +94 -0
- whatap/llm/log_sink_packs/llm_step_status.py +118 -0
- whatap/llm/log_sink_packs/llm_system_message.py +16 -0
- whatap/llm/log_sink_packs/llm_tool_calls.py +44 -0
- whatap/llm/log_sink_packs/llm_tool_results.py +16 -0
- whatap/llm/log_sink_packs/llm_tx_status.py +108 -0
- whatap/llm/pricing.py +236 -0
- whatap/llm/prompt_meta.py +288 -0
- whatap/llm/providers/__init__.py +0 -0
- whatap/llm/providers/anthropic/__init__.py +37 -0
- whatap/llm/providers/anthropic/messages/__init__.py +0 -0
- whatap/llm/providers/anthropic/messages/messages.py +70 -0
- whatap/llm/providers/anthropic/messages/messages_context.py +76 -0
- whatap/llm/providers/anthropic/messages/messages_extractor.py +126 -0
- whatap/llm/providers/interceptor.py +182 -0
- whatap/llm/providers/openai/__init__.py +133 -0
- whatap/llm/providers/openai/chat/__init__.py +0 -0
- whatap/llm/providers/openai/chat/chat.py +82 -0
- whatap/llm/providers/openai/chat/chat_context.py +78 -0
- whatap/llm/providers/openai/chat/chat_extractor.py +127 -0
- whatap/llm/providers/openai/completions/__init__.py +0 -0
- whatap/llm/providers/openai/completions/completions.py +70 -0
- whatap/llm/providers/openai/completions/completions_context.py +31 -0
- whatap/llm/providers/openai/completions/completions_extractor.py +61 -0
- whatap/llm/providers/openai/content_parser.py +41 -0
- whatap/llm/providers/openai/embeddings/__init__.py +0 -0
- whatap/llm/providers/openai/embeddings/embeddings.py +59 -0
- whatap/llm/providers/openai/embeddings/embeddings_context.py +25 -0
- whatap/llm/providers/openai/embeddings/embeddings_extractor.py +26 -0
- whatap/llm/providers/openai/responses/__init__.py +0 -0
- whatap/llm/providers/openai/responses/responses.py +70 -0
- whatap/llm/providers/openai/responses/responses_context.py +88 -0
- whatap/llm/providers/openai/responses/responses_extractor.py +126 -0
- whatap/llm/providers/stream_accumulator.py +73 -0
- whatap/llm/stats/__init__.py +35 -0
- whatap/llm/stats/active_stat.py +86 -0
- whatap/llm/stats/answer_relevance_eval_stat.py +10 -0
- whatap/llm/stats/api_status_stat.py +35 -0
- whatap/llm/stats/base_stat.py +107 -0
- whatap/llm/stats/combined_judge_eval_stat.py +11 -0
- whatap/llm/stats/error_stat.py +59 -0
- whatap/llm/stats/eval_stat.py +225 -0
- whatap/llm/stats/factuality_eval_stat.py +10 -0
- whatap/llm/stats/feature_stat.py +104 -0
- whatap/llm/stats/finish_stat.py +105 -0
- whatap/llm/stats/hallucination_eval_stat.py +10 -0
- whatap/llm/stats/meter.py +18 -0
- whatap/llm/stats/perf_stat.py +117 -0
- whatap/llm/stats/pii_leak_eval_stat.py +12 -0
- whatap/llm/stats/prompt_injection_eval_stat.py +10 -0
- whatap/llm/stats/token_usage_stat.py +133 -0
- whatap/llm/stats/toxicity_eval_stat.py +10 -0
- whatap/llm/stats/url_scan_eval_stat.py +12 -0
- whatap/net/__init__.py +0 -0
- whatap/net/async_sender.py +107 -0
- whatap/net/packet_enum.py +44 -0
- whatap/net/packet_type_enum.py +31 -0
- whatap/net/param_def.py +69 -0
- whatap/net/stackhelper.py +87 -0
- whatap/net/udp_session.py +394 -0
- whatap/net/udp_thread.py +54 -0
- whatap/pack/__init__.py +0 -0
- whatap/pack/logSinkPack.py +77 -0
- whatap/pack/pack.py +34 -0
- whatap/pack/pack_enum.py +41 -0
- whatap/pack/tagCountPack.py +61 -0
- whatap/scripts/__init__.py +208 -0
- whatap/trace/__init__.py +12 -0
- whatap/trace/mod/__init__.py +0 -0
- whatap/trace/mod/amqp/__init__.py +0 -0
- whatap/trace/mod/amqp/kombu.py +122 -0
- whatap/trace/mod/amqp/pika.py +62 -0
- whatap/trace/mod/application/__init__.py +0 -0
- whatap/trace/mod/application/bottle.py +34 -0
- whatap/trace/mod/application/celery.py +81 -0
- whatap/trace/mod/application/cherrypy.py +30 -0
- whatap/trace/mod/application/django.py +287 -0
- whatap/trace/mod/application/django_asgi.py +266 -0
- whatap/trace/mod/application/django_py3.py +251 -0
- whatap/trace/mod/application/fastapi/__init__.py +31 -0
- whatap/trace/mod/application/fastapi/endpoint.py +73 -0
- whatap/trace/mod/application/fastapi/exception_log.py +63 -0
- whatap/trace/mod/application/fastapi/instrumentation.py +204 -0
- whatap/trace/mod/application/fastapi/scope.py +115 -0
- whatap/trace/mod/application/fastapi/transaction.py +67 -0
- whatap/trace/mod/application/flask.py +52 -0
- whatap/trace/mod/application/frappe.py +224 -0
- whatap/trace/mod/application/graphql.py +170 -0
- whatap/trace/mod/application/nameko.py +39 -0
- whatap/trace/mod/application/odoo.py +63 -0
- whatap/trace/mod/application/starlette.py +126 -0
- whatap/trace/mod/application/tornado.py +163 -0
- whatap/trace/mod/application/wsgi.py +195 -0
- whatap/trace/mod/database/__init__.py +0 -0
- whatap/trace/mod/database/cxoracle.py +49 -0
- whatap/trace/mod/database/mongo.py +169 -0
- whatap/trace/mod/database/mysql.py +80 -0
- whatap/trace/mod/database/neo4j.py +90 -0
- whatap/trace/mod/database/psycopg2.py +45 -0
- whatap/trace/mod/database/psycopg3.py +359 -0
- whatap/trace/mod/database/redis.py +122 -0
- whatap/trace/mod/database/sqlalchemy.py +213 -0
- whatap/trace/mod/database/sqlite3.py +130 -0
- whatap/trace/mod/database/util.py +630 -0
- whatap/trace/mod/email/__init__.py +0 -0
- whatap/trace/mod/email/smtp.py +78 -0
- whatap/trace/mod/httpc/__init__.py +0 -0
- whatap/trace/mod/httpc/django.py +31 -0
- whatap/trace/mod/httpc/httplib.py +70 -0
- whatap/trace/mod/httpc/httpx.py +62 -0
- whatap/trace/mod/httpc/requests.py +20 -0
- whatap/trace/mod/httpc/urllib3.py +27 -0
- whatap/trace/mod/httpc/util.py +388 -0
- whatap/trace/mod/logging.py +161 -0
- whatap/trace/mod/plugin.py +84 -0
- whatap/trace/mod/standalone/__init__.py +0 -0
- whatap/trace/mod/standalone/multiple.py +293 -0
- whatap/trace/mod/standalone/single.py +135 -0
- whatap/trace/simple_trace_context.py +18 -0
- whatap/trace/trace_context.py +212 -0
- whatap/trace/trace_context_manager.py +244 -0
- whatap/trace/trace_error.py +84 -0
- whatap/trace/trace_handler.py +89 -0
- whatap/trace/trace_import.py +91 -0
- whatap/trace/trace_module_definition.py +156 -0
- whatap/util/__init__.py +0 -0
- whatap/util/bit_util.py +49 -0
- whatap/util/cardinality/__init__.py +0 -0
- whatap/util/cardinality/hyperloglog.py +84 -0
- whatap/util/cardinality/murmurhash.py +20 -0
- whatap/util/cardinality/registerset.py +60 -0
- whatap/util/compare_util.py +19 -0
- whatap/util/date_util.py +55 -0
- whatap/util/debug_util.py +73 -0
- whatap/util/escape_literal_sql.py +233 -0
- whatap/util/frame_util.py +20 -0
- whatap/util/hash_util.py +103 -0
- whatap/util/hexa32.py +66 -0
- whatap/util/int_set.py +199 -0
- whatap/util/ip_util.py +63 -0
- whatap/util/keygen.py +11 -0
- whatap/util/linked_list.py +113 -0
- whatap/util/linked_map.py +359 -0
- whatap/util/metering_util.py +103 -0
- whatap/util/request_double_queue.py +68 -0
- whatap/util/request_queue.py +60 -0
- whatap/util/string_util.py +20 -0
- whatap/util/throttle_util.py +99 -0
- whatap/util/userid_util.py +134 -0
- whatap/value/__init__.py +1 -0
- whatap/value/blob_value.py +38 -0
- whatap/value/boolean_value.py +33 -0
- whatap/value/decimal_value.py +36 -0
- whatap/value/double_summary.py +86 -0
- whatap/value/double_value.py +33 -0
- whatap/value/float_array.py +42 -0
- whatap/value/float_value.py +34 -0
- whatap/value/int_array.py +42 -0
- whatap/value/ip4_value.py +50 -0
- whatap/value/list_value.py +105 -0
- whatap/value/long_array.py +44 -0
- whatap/value/long_summary.py +83 -0
- whatap/value/map_value.py +154 -0
- whatap/value/null_value.py +21 -0
- whatap/value/number_value.py +33 -0
- whatap/value/summary_value.py +39 -0
- whatap/value/text_array.py +58 -0
- whatap/value/text_hash_value.py +37 -0
- whatap/value/text_value.py +43 -0
- whatap/value/value.py +26 -0
- whatap/value/value_enum.py +80 -0
- whatap/whatap.conf +14 -0
- whatap_python-2.1.0.dist-info/METADATA +87 -0
- whatap_python-2.1.0.dist-info/RECORD +227 -0
- whatap_python-2.1.0.dist-info/WHEEL +5 -0
- whatap_python-2.1.0.dist-info/entry_points.txt +6 -0
- whatap_python-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,516 @@
|
|
|
1
|
+
"""LLM-as-a-judge 평가자의 베이스 + 어댑터.
|
|
2
|
+
|
|
3
|
+
LLM judge 평가자는 자체적으로 다른 LLM 을 호출해 평가를 수행한다.
|
|
4
|
+
- 비용이 비싸므로 보통 샘플링과 함께 사용한다 (whatap.conf 의 llm_eval_sample_rate).
|
|
5
|
+
- 워커 스레드의 재귀 가드 (LlmEvaluatorTask._safe_run_one 의 thread-local) 가
|
|
6
|
+
judge 의 LLM 호출이 다시 평가 큐에 들어가는 것을 자동 차단한다.
|
|
7
|
+
|
|
8
|
+
사용:
|
|
9
|
+
1) judge_fn 을 직접 작성하거나 make_openai_judge() 로 만든 뒤,
|
|
10
|
+
2) HallucinationEvaluator/AnswerRelevanceEvaluator/ToxicityEvaluator 등에 주입.
|
|
11
|
+
|
|
12
|
+
>>> judge = make_openai_judge(model='gpt-4o-mini')
|
|
13
|
+
>>> register_evaluator(HallucinationEvaluator(judge_fn=judge))
|
|
14
|
+
"""
|
|
15
|
+
import json
|
|
16
|
+
import re
|
|
17
|
+
|
|
18
|
+
from whatap import logging
|
|
19
|
+
from whatap.llm.evaluators.base import BaseEvaluator, EvaluatorResult
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Process-wide fallback judge that the user registers via set_default_judge_fn().
# Per the original author's note it is meant for environments where the user's
# client cannot be captured (CLI scripts and the like).
# Kept in a one-element list so the accessor functions can replace the value by
# index instead of rebinding a module global.
_default_judge_fn = [None]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def set_default_judge_fn(judge_fn):
    """Register the process-wide default judge (call once at application start-up).

    The registered callable is stored in the module-level holder and returned
    by ``get_default_judge_fn()``. Judge evaluators that were not given an
    explicit ``judge_fn`` of their own fall back to it. Passing ``None``
    clears the registration.

    Registering a default is the way to point evaluation at a backend that
    cannot be recognised automatically, such as a local vLLM server or a
    self-hosted OpenAI-compatible endpoint.

    Example::

        from whatap.llm.evaluators.builtins import set_default_judge_fn, make_openai_judge

        # once, at application start-up
        set_default_judge_fn(make_openai_judge(
            base_url='http://localhost:8002/v1',
            api_key='EMPTY',
            model='qwen-7b',
        ))

    :param judge_fn: a ``(system, user) -> str`` callable, or ``None``.
    :raises TypeError: if ``judge_fn`` is neither ``None`` nor callable.
    """
    acceptable = judge_fn is None or callable(judge_fn)
    if not acceptable:
        raise TypeError('judge_fn must be callable')
    _default_judge_fn[0] = judge_fn
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_default_judge_fn():
    """Return the globally registered default judge_fn, or ``None`` when none is set."""
    registered = _default_judge_fn[0]
    return registered
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class LLMJudgeBase(BaseEvaluator):
    """Base class for evaluators that delegate the judgment to an LLM.

    Subclasses only need to implement ``make_prompt(ctx)`` and
    ``parse_judgment(raw)``.

    How the judge LLM is chosen depends on the constructor arguments:

    1) ``judge_fn`` given — a ``(system, user) -> str`` callable; fully custom.
    2) only ``model`` given — the backend is derived from the evaluated call
       (captured client, or ``ctx.provider``) and this model is used.
    3) neither given — the backend is derived the same way; the model falls
       back to ``ctx.model`` when a client was captured, otherwise to a small
       per-provider default (OpenAI: gpt-4o-mini,
       Anthropic: claude-3-5-haiku-latest).

    ``_resolve_judge_fn`` documents the exact lookup order.

    Judges built from ``ctx.provider`` create their SDK client without
    explicit credentials, so authentication is whatever the SDK picks up by
    default (presumably the OPENAI_API_KEY / ANTHROPIC_API_KEY environment
    variables the application already uses).
    """

    # Marks this evaluator as one that performs LLM judge calls. Per the
    # original author's note, ``LlmEvaluatorTask._run_one`` reads this flag to
    # decide whether the run is counted in ``EvalStat`` (judge call statistics);
    # rule-based evaluators keep the ``BaseEvaluator`` default and are not
    # counted there. NOTE(review): that consumer is outside this file — confirm.
    USES_LLM_JUDGE = True

    # Filled in by concrete subclasses; LABEL is used in log messages below.
    LABEL = None
    METRIC_TYPE = None

    # Default judge models, per provider.
    _DEFAULT_OPENAI_MODEL = 'gpt-4o-mini'
    _DEFAULT_ANTHROPIC_MODEL = 'claude-3-5-haiku-latest'

    def __init__(self, judge_fn=None, model=None):
        """Store the optional explicit judge and judge model.

        :param judge_fn: ``(system, user) -> str`` callable, or ``None`` to
            resolve a judge automatically at evaluation time.
        :param model: judge model name, or ``None`` to use a fallback.
        :raises TypeError: if ``judge_fn`` is neither ``None`` nor callable.
        """
        if judge_fn is not None and not callable(judge_fn):
            raise TypeError('judge_fn must be callable')
        self._judge_fn = judge_fn
        self._model = model
        # Cache of provider-derived judge functions, keyed by
        # 'openai' / 'anthropic'.
        self._auto_cache = {}

    def make_prompt(self, ctx):
        """Build the evaluation prompt. Returns a ``(system_prompt, user_prompt)`` tuple."""
        raise NotImplementedError

    def parse_judgment(self, raw):
        """Convert the judge's raw text response into an ``EvaluatorResult``."""
        raise NotImplementedError

    def _resolve_judge_fn(self, ctx):
        """Pick the judge callable for this evaluation.

        Lookup order:

        1) the ``judge_fn`` given to this evaluator,
        2) the global default registered with ``set_default_judge_fn()``,
        3) a judge built around the client captured on ``ctx.client``
           (reusing that client, and ``ctx.event_loop`` for async clients),
        4) a judge chosen from the ``ctx.provider`` string ('anthropic',
           'openai.com' or 'azure.com' substrings).

        :return: a ``(system, user) -> str`` callable, or ``None`` when no
            judge can be resolved.
        """
        # 1) explicit judge on the evaluator
        if self._judge_fn is not None:
            return self._judge_fn

        # 2) global default
        default = get_default_judge_fn()
        if default is not None:
            return default

        # 3) reuse the client (and event loop) captured from the user's call
        captured = getattr(ctx, 'client', None)
        captured_loop = getattr(ctx, 'event_loop', None)
        if captured is not None:
            # The last-resort model must belong to the captured client's
            # provider: an Anthropic client cannot serve the OpenAI default.
            # Same class-name heuristic as _judge_fn_from_captured_client.
            if 'Anthropic' in type(captured).__name__:
                fallback_model = self._DEFAULT_ANTHROPIC_MODEL
            else:
                fallback_model = self._DEFAULT_OPENAI_MODEL
            judge_model = self._model or getattr(ctx, 'model', None) or fallback_model
            judge_fn = _judge_fn_from_captured_client(captured, model=judge_model,
                                                      event_loop=captured_loop)
            if judge_fn is not None:
                return judge_fn

        # 4) match a known cloud provider from the ctx.provider string
        provider = (getattr(ctx, 'provider', '') or '').lower()
        if 'anthropic' in provider:
            key = 'anthropic'
        elif 'openai.com' in provider or 'azure.com' in provider:
            key = 'openai'
        else:
            return None

        if key in self._auto_cache:
            return self._auto_cache[key]

        if key == 'anthropic':
            judge_fn = make_anthropic_judge(
                model=self._model or self._DEFAULT_ANTHROPIC_MODEL,
            )
        else:
            judge_fn = make_openai_judge(
                model=self._model or self._DEFAULT_OPENAI_MODEL,
            )
        self._auto_cache[key] = judge_fn
        return judge_fn

    def evaluate(self, ctx):
        """Run the judge for ``ctx`` and return an ``EvaluatorResult``.

        Failures are reported as results rather than raised: a categorical
        ``'no_judge_configured'`` result when no judge can be resolved, and a
        categorical ``'judge_error'`` result when prompt building, the judge
        call or parsing raises.
        """
        judge_fn = self._resolve_judge_fn(ctx)
        if judge_fn is None:
            logging.warning(
                '[LLM] judge evaluator %s skipped: no judge_fn resolvable for provider=%r. '
                'Set one via evaluator(judge_fn=...) or set_default_judge_fn().'
                % (self.LABEL, getattr(ctx, 'provider', '')),
                extra={'id': 'LLM062'},
            )
            return EvaluatorResult(
                value='no_judge_configured',
                reasoning='No judge_fn for provider=%s. Use set_default_judge_fn().'
                          % getattr(ctx, 'provider', '?'),
                metric_type='categorical',
                metadata={'provider': getattr(ctx, 'provider', '')},
            )
        try:
            system, user = self.make_prompt(ctx)
            raw = judge_fn(system, user)
            # Custom judges may return non-string values; normalise to text.
            if not isinstance(raw, str):
                raw = str(raw) if raw is not None else ''
            return self.parse_judgment(raw)
        except Exception as e:
            logging.warning('[LLM] judge evaluator %s exception: %s' % (self.LABEL, e),
                            extra={'id': 'LLM060'})
            return EvaluatorResult(
                value='judge_error',
                reasoning='%s: %s' % (type(e).__name__, e),
                metric_type='categorical',
                metadata={'error_type': type(e).__name__, 'error_message': str(e)},
            )
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _judge_fn_from_captured_client(captured, model, temperature=0.0, max_tokens=2048,
|
|
191
|
+
event_loop=None):
|
|
192
|
+
"""interceptor 가 캡처한 user 의 client 를 그대로 재사용해 judge_fn 을 만든다.
|
|
193
|
+
|
|
194
|
+
핵심: **새 sync client / httpx.Client 생성 안 함.** user 가 만들어 쓰던 그
|
|
195
|
+
OpenAI / AsyncOpenAI / Anthropic / AsyncAnthropic 인스턴스를 그대로 호출.
|
|
196
|
+
|
|
197
|
+
sync client (OpenAI / Anthropic) → 워커 스레드에서 그대로 ``client.method(...)`` 호출.
|
|
198
|
+
async client (AsyncOpenAI / AsyncAnthropic) → ``run_coroutine_threadsafe`` 로 user 의
|
|
199
|
+
running event loop 에 dispatch → 그 loop 에서 user 의 AsyncClient 가 호출됨.
|
|
200
|
+
|
|
201
|
+
`event_loop` 가 None 이면 (sync 호출이었거나 capture 실패) async client 는 호출 불가
|
|
202
|
+
→ None 반환.
|
|
203
|
+
"""
|
|
204
|
+
import inspect
|
|
205
|
+
cls_name = type(captured).__name__
|
|
206
|
+
|
|
207
|
+
# async 여부 판별 — 클래스명 prefix + create 메서드가 coroutine 인지
|
|
208
|
+
is_async = cls_name.startswith('Async')
|
|
209
|
+
if not is_async:
|
|
210
|
+
try:
|
|
211
|
+
if 'Anthropic' in cls_name:
|
|
212
|
+
m = captured.messages.create
|
|
213
|
+
else:
|
|
214
|
+
m = captured.chat.completions.create
|
|
215
|
+
if inspect.iscoroutinefunction(m):
|
|
216
|
+
is_async = True
|
|
217
|
+
except Exception:
|
|
218
|
+
pass
|
|
219
|
+
|
|
220
|
+
# ── sync user client → 워커 스레드에서 직접 호출 ──
|
|
221
|
+
if not is_async:
|
|
222
|
+
if 'Anthropic' in cls_name:
|
|
223
|
+
return make_anthropic_judge(client=captured, model=model,
|
|
224
|
+
temperature=temperature, max_tokens=max_tokens)
|
|
225
|
+
return make_openai_judge(client=captured, model=model,
|
|
226
|
+
temperature=temperature, max_tokens=max_tokens)
|
|
227
|
+
|
|
228
|
+
# ── async user client → user loop 에 dispatch ──
|
|
229
|
+
if event_loop is None:
|
|
230
|
+
# loop capture 실패 — async client 그대로 호출 불가
|
|
231
|
+
return None
|
|
232
|
+
|
|
233
|
+
return _make_async_dispatched_judge(captured, model, temperature, max_tokens,
|
|
234
|
+
event_loop=event_loop,
|
|
235
|
+
kind='anthropic' if 'Anthropic' in cls_name else 'openai')
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def _make_async_dispatched_judge(async_client, model, temperature, max_tokens,
                                 event_loop, kind):
    """Wrap an async client in a synchronous judge_fn that runs on ``event_loop``.

    Every call schedules a coroutine with
    ``asyncio.run_coroutine_threadsafe(coro, event_loop)`` and blocks on its
    result, so the request is issued by ``async_client`` on that loop. No
    additional client is created.

    The scheduled coroutine first sets the trace context variable to ``None``
    and sets ``_eval_worker_cv`` to the caller's evaluator-worker state, and
    restores both when it finishes. Per the original author's notes this keeps
    the judge call from inheriting another request's trace context and lets
    interception recognise it as evaluator-originated.

    :param kind: ``'anthropic'`` for the Messages API; any other value uses
        the OpenAI chat-completions API.
    :return: a ``(system, user) -> str`` callable.
    """
    import asyncio
    from whatap.counter.tasks.llm_evaluator_task import _eval_worker_cv, _get_eval_worker_state

    def _enter_dispatched_ctx(state):
        """Prepare context variables for the dispatched coroutine.

        Returns ``(cv_token, trace_token)``; either token is ``None`` when the
        corresponding ``set`` failed, so the exit step can skip it.
        """
        trace_tok = None
        try:
            from whatap.trace.trace_context_manager import TraceContextManager
            trace_tok = TraceContextManager.whatap_coroutine_context.set(None)
        except Exception:
            pass
        worker_tok = None
        try:
            worker_tok = _eval_worker_cv.set(state or {'in_eval': True})
        except Exception:
            pass
        return worker_tok, trace_tok

    def _exit_dispatched_ctx(worker_tok, trace_tok):
        """Undo ``_enter_dispatched_ctx``; failures to reset are ignored."""
        if worker_tok is not None:
            try:
                _eval_worker_cv.reset(worker_tok)
            except Exception:
                pass
        if trace_tok is not None:
            try:
                from whatap.trace.trace_context_manager import TraceContextManager
                TraceContextManager.whatap_coroutine_context.reset(trace_tok)
            except Exception:
                pass

    if kind == 'anthropic':
        async def _request(system, user):
            reply = await async_client.messages.create(
                model=model, system=system,
                messages=[{'role': 'user', 'content': user}],
                max_tokens=max_tokens, temperature=temperature,
            )
            # The first text block is the verdict; other block types are skipped.
            for part in reply.content:
                if getattr(part, 'type', None) == 'text':
                    return part.text or ''
            return ''
    else:
        async def _request(system, user):
            reply = await async_client.chat.completions.create(
                model=model,
                messages=[
                    {'role': 'system', 'content': system},
                    {'role': 'user', 'content': user},
                ],
                temperature=temperature, max_tokens=max_tokens,
            )
            return reply.choices[0].message.content or ''

    async def _call(state, system, user):
        # Awaited inline (not as a separate task) so the request runs under
        # the context variables set just above.
        worker_tok, trace_tok = _enter_dispatched_ctx(state)
        try:
            return await _request(system, user)
        finally:
            _exit_dispatched_ctx(worker_tok, trace_tok)

    def judge_fn(system, user):
        if not event_loop.is_running():
            raise RuntimeError(
                'captured event loop is no longer running — user app likely shut down'
            )
        # Capture the worker state in the calling thread and hand it to the
        # coroutine, which executes on the loop's thread.
        state = _get_eval_worker_state()
        pending = asyncio.run_coroutine_threadsafe(_call(state, system, user), event_loop)
        # Bound the wait so a stalled loop cannot block the caller forever;
        # on any failure the scheduled work is cancelled before re-raising.
        limit = _judge_timeout_sec()
        try:
            if limit:
                return pending.result(timeout=limit)
            return pending.result()
        except Exception:
            try:
                pending.cancel()
            except Exception:
                pass
            raise

    return judge_fn
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def _judge_timeout_sec():
    """Return the judge-call timeout in seconds from the configuration.

    Reads ``llm_eval_judge_timeout_sec`` from ``Configure`` (default 30 when
    the attribute is missing) and converts it with ``float()``, because the
    stored value may be a string.

    :return: the value as a float when it is greater than zero; ``None``
        (no timeout) when it is zero or negative; ``30.0`` when it cannot be
        converted to a float.
    """
    from whatap.conf.configure import Configure as conf

    configured = getattr(conf, 'llm_eval_judge_timeout_sec', 30)
    try:
        seconds = float(configured)
    except (TypeError, ValueError):
        return 30.0
    if seconds > 0:
        return seconds
    return None
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def make_openai_judge(client=None, model='gpt-4o-mini', api_key=None, base_url=None,
                      temperature=0.0, max_tokens=2048):
    """Create a judge_fn backed by an OpenAI-compatible chat-completions API.

    :param client: existing client object exposing
        ``chat.completions.create``; reused as-is when given.
    :param model: model used for judging.
    :param api_key: API key for a lazily created client (ignored when falsy).
    :param base_url: OpenAI-compatible endpoint for a lazily created client
        (ignored when falsy).
    :param temperature: sampling temperature passed to the API.
    :param max_tokens: response token limit passed to the API.
    :return: ``(system, user) -> str`` callable; returns ``''`` when the
        response message has no content.

    When ``client`` is ``None`` an ``openai.OpenAI`` client is created on the
    first call and kept for subsequent calls.
    """
    holder = {'client': client}

    def _client():
        current = holder['client']
        if current is None:
            # Imported lazily so the openai package is only needed when used.
            import openai
            options = {}
            if api_key:
                options['api_key'] = api_key
            if base_url:
                options['base_url'] = base_url
            current = openai.OpenAI(**options)
            holder['client'] = current
        return current

    def judge_fn(system, user):
        api = _client()
        conversation = [
            {'role': 'system', 'content': system},
            {'role': 'user', 'content': user},
        ]
        reply = api.chat.completions.create(
            model=model,
            messages=conversation,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        return reply.choices[0].message.content or ''

    return judge_fn
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def make_anthropic_judge(client=None, model='claude-3-5-haiku-latest', api_key=None,
                         max_tokens=2048, temperature=0.0):
    """Create a judge_fn backed by the Anthropic Messages API.

    :param client: existing client object exposing ``messages.create``;
        reused as-is when given.
    :param model: model used for judging.
    :param api_key: API key for a lazily created client (ignored when falsy).
    :param max_tokens: response token limit passed to the API.
    :param temperature: sampling temperature passed to the API.
    :return: ``(system, user) -> str`` callable that returns the text of the
        first ``'text'`` content block, or ``''`` when there is none.

    When ``client`` is ``None`` an ``anthropic.Anthropic`` client is created
    on the first call and kept for subsequent calls.
    """
    holder = {'client': client}

    def _client():
        current = holder['client']
        if current is None:
            # Imported lazily so the anthropic package is only needed when used.
            import anthropic
            options = {}
            if api_key:
                options['api_key'] = api_key
            current = anthropic.Anthropic(**options)
            holder['client'] = current
        return current

    def judge_fn(system, user):
        api = _client()
        reply = api.messages.create(
            model=model,
            system=system,
            messages=[{'role': 'user', 'content': user}],
            max_tokens=max_tokens,
            temperature=temperature,
        )
        text_parts = (
            part.text or ''
            for part in reply.content
            if getattr(part, 'type', None) == 'text'
        )
        return next(text_parts, '')

    return judge_fn
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
# ── parsing helpers ──

# Greedy pattern: matches from the first '{' to the last '}' in the text.
# re.DOTALL lets '.' cross newlines so multi-line JSON is captured in one match.
_JSON_BLOCK = re.compile(r'\{.*\}', re.DOTALL)
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def _salvage_truncated_object(text):
|
|
427
|
+
"""잘린 (truncated) JSON 객체에서 **완전하게 끝난 top-level 항목만** 살려낸다.
|
|
428
|
+
|
|
429
|
+
judge 응답이 토큰 한도 등으로 중간에서 끊기면 닫는 ``}`` 가 없어 정상 파싱이
|
|
430
|
+
전부 실패한다 → 5 aspect 점수가 통째로 날아감. 이 함수는 마지막 안전망:
|
|
431
|
+
outer 객체를 스캔하다 depth 1 (top-level) 에서 만난 마지막 콤마까지를 잘라
|
|
432
|
+
``}`` 로 닫아 — 그 전까지 **완전히 끝난 key:value 쌍** 들만 복구한다.
|
|
433
|
+
중간에 끊긴 마지막 항목은 버린다.
|
|
434
|
+
|
|
435
|
+
flat 객체 (개별 aspect judge: ``{"score":..,"reasoning":..}``) 와 중첩 객체
|
|
436
|
+
(combined judge: ``{"hallucination":{..},"toxicity":{..}}``) 모두에 동작 —
|
|
437
|
+
top-level 콤마만 경계로 본다.
|
|
438
|
+
|
|
439
|
+
:return: 복구된 dict, 또는 복구 불가 시 None.
|
|
440
|
+
"""
|
|
441
|
+
start = text.find('{')
|
|
442
|
+
if start < 0:
|
|
443
|
+
return None
|
|
444
|
+
s = text[start:]
|
|
445
|
+
in_str = False
|
|
446
|
+
esc = False
|
|
447
|
+
depth = 0
|
|
448
|
+
last_safe = None # depth 1 의 마지막 콤마 인덱스 — s[:last_safe] + '}' 가 valid
|
|
449
|
+
for i, ch in enumerate(s):
|
|
450
|
+
if in_str:
|
|
451
|
+
if esc:
|
|
452
|
+
esc = False
|
|
453
|
+
elif ch == '\\':
|
|
454
|
+
esc = True
|
|
455
|
+
elif ch == '"':
|
|
456
|
+
in_str = False
|
|
457
|
+
continue
|
|
458
|
+
if ch == '"':
|
|
459
|
+
in_str = True
|
|
460
|
+
elif ch == '{' or ch == '[':
|
|
461
|
+
depth += 1
|
|
462
|
+
elif ch == '}' or ch == ']':
|
|
463
|
+
depth -= 1
|
|
464
|
+
if depth == 0:
|
|
465
|
+
# outer 객체가 정상적으로 닫혔다 (truncation 아님) → 그대로 파싱
|
|
466
|
+
try:
|
|
467
|
+
return json.loads(s[:i + 1])
|
|
468
|
+
except ValueError:
|
|
469
|
+
return None
|
|
470
|
+
elif ch == ',' and depth == 1:
|
|
471
|
+
last_safe = i # 이 콤마 앞까지가 완전히 끝난 쌍들
|
|
472
|
+
if last_safe is None:
|
|
473
|
+
return None
|
|
474
|
+
try:
|
|
475
|
+
return json.loads(s[:last_safe] + '}')
|
|
476
|
+
except ValueError:
|
|
477
|
+
return None
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
def parse_json_response(raw):
    """Extract and parse the JSON payload from a judge's raw text response.

    Responses often arrive wrapped in a ```json ... ``` fence or with extra
    text around the JSON, so parsing is attempted in stages:

    1. strip a leading fence line and a trailing fence, then parse the text;
    2. parse the span from the first ``{`` to the last ``}``;
    3. as a last resort, recover the completed top-level entries of a
       truncated object via ``_salvage_truncated_object`` (a warning is
       logged when this succeeds).

    :param raw: the judge's response text.
    :return: the parsed JSON value.
    :raises ValueError: if ``raw`` is empty or no JSON could be recovered.
    """
    if not raw:
        raise ValueError('empty judge response')

    text = raw.strip()
    if text.startswith('```'):
        # Drop the opening fence line (e.g. "```json") and the closing fence.
        newline_at = text.find('\n')
        if newline_at > 0:
            text = text[newline_at + 1:]
        if text.endswith('```'):
            text = text[:-3]
        text = text.strip()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        braced = _JSON_BLOCK.search(text)

    if braced is not None:
        try:
            return json.loads(braced.group(0))
        except json.JSONDecodeError:
            pass

    # Last safety net: keep only the entries that were completely written.
    salvaged = _salvage_truncated_object(text)
    if not salvaged:
        raise ValueError('no JSON object found in judge response: %r' % raw[:200])

    logging.warning(
        '[LLM] recovered truncated judge JSON — %d top-level key(s) salvaged, '
        'rest dropped (response likely hit max_tokens)' % len(salvaged),
        extra={'id': 'LLM065'},
    )
    return salvaged
|