whatap-python 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- whatap/LICENSE +0 -0
- whatap/README.rst +49 -0
- whatap/__init__.py +923 -0
- whatap/__main__.py +4 -0
- whatap/agent/darwin/amd64/whatap_python +0 -0
- whatap/agent/darwin/arm64/whatap_python +0 -0
- whatap/agent/linux/amd64/whatap_python +0 -0
- whatap/agent/linux/arm64/whatap_python +0 -0
- whatap/agent/windows/whatap_python.exe +0 -0
- whatap/bootstrap/__init__.py +0 -0
- whatap/bootstrap/sitecustomize.py +19 -0
- whatap/build.py +4 -0
- whatap/conf/__init__.py +0 -0
- whatap/conf/configuration.py +280 -0
- whatap/conf/configure.py +105 -0
- whatap/conf/license.py +49 -0
- whatap/control/__init__.py +0 -0
- whatap/counter/__init__.py +14 -0
- whatap/counter/counter_manager.py +45 -0
- whatap/counter/tasks/__init__.py +3 -0
- whatap/counter/tasks/base_task.py +26 -0
- whatap/counter/tasks/llm_evaluator_task.py +501 -0
- whatap/counter/tasks/llm_log_sink_task.py +309 -0
- whatap/counter/tasks/llm_stat_task.py +78 -0
- whatap/counter/tasks/openfiledescriptor.py +67 -0
- whatap/io/__init__.py +1 -0
- whatap/io/data_inputx.py +161 -0
- whatap/io/data_outputx.py +262 -0
- whatap/llm/__init__.py +17 -0
- whatap/llm/definitions.py +43 -0
- whatap/llm/evaluators/__init__.py +136 -0
- whatap/llm/evaluators/base.py +114 -0
- whatap/llm/evaluators/builtins/__init__.py +91 -0
- whatap/llm/evaluators/builtins/answer_relevance.py +46 -0
- whatap/llm/evaluators/builtins/combined_judge.py +271 -0
- whatap/llm/evaluators/builtins/factuality.py +71 -0
- whatap/llm/evaluators/builtins/hallucination.py +97 -0
- whatap/llm/evaluators/builtins/llm_judge.py +516 -0
- whatap/llm/evaluators/builtins/pii_leak.py +214 -0
- whatap/llm/evaluators/builtins/prompt_injection.py +71 -0
- whatap/llm/evaluators/builtins/toxicity.py +53 -0
- whatap/llm/evaluators/builtins/url_scan.py +194 -0
- whatap/llm/evaluators/registry.py +192 -0
- whatap/llm/evaluators/sampler.py +83 -0
- whatap/llm/evaluators/scope.py +334 -0
- whatap/llm/features.py +66 -0
- whatap/llm/log_sink_packs/__init__.py +9 -0
- whatap/llm/log_sink_packs/llm_input_message.py +16 -0
- whatap/llm/log_sink_packs/llm_log_sink_pack.py +72 -0
- whatap/llm/log_sink_packs/llm_output_message.py +19 -0
- whatap/llm/log_sink_packs/llm_step_eval_status.py +94 -0
- whatap/llm/log_sink_packs/llm_step_status.py +118 -0
- whatap/llm/log_sink_packs/llm_system_message.py +16 -0
- whatap/llm/log_sink_packs/llm_tool_calls.py +44 -0
- whatap/llm/log_sink_packs/llm_tool_results.py +16 -0
- whatap/llm/log_sink_packs/llm_tx_status.py +108 -0
- whatap/llm/pricing.py +236 -0
- whatap/llm/prompt_meta.py +288 -0
- whatap/llm/providers/__init__.py +0 -0
- whatap/llm/providers/anthropic/__init__.py +37 -0
- whatap/llm/providers/anthropic/messages/__init__.py +0 -0
- whatap/llm/providers/anthropic/messages/messages.py +70 -0
- whatap/llm/providers/anthropic/messages/messages_context.py +76 -0
- whatap/llm/providers/anthropic/messages/messages_extractor.py +126 -0
- whatap/llm/providers/interceptor.py +182 -0
- whatap/llm/providers/openai/__init__.py +133 -0
- whatap/llm/providers/openai/chat/__init__.py +0 -0
- whatap/llm/providers/openai/chat/chat.py +82 -0
- whatap/llm/providers/openai/chat/chat_context.py +78 -0
- whatap/llm/providers/openai/chat/chat_extractor.py +127 -0
- whatap/llm/providers/openai/completions/__init__.py +0 -0
- whatap/llm/providers/openai/completions/completions.py +70 -0
- whatap/llm/providers/openai/completions/completions_context.py +31 -0
- whatap/llm/providers/openai/completions/completions_extractor.py +61 -0
- whatap/llm/providers/openai/content_parser.py +41 -0
- whatap/llm/providers/openai/embeddings/__init__.py +0 -0
- whatap/llm/providers/openai/embeddings/embeddings.py +59 -0
- whatap/llm/providers/openai/embeddings/embeddings_context.py +25 -0
- whatap/llm/providers/openai/embeddings/embeddings_extractor.py +26 -0
- whatap/llm/providers/openai/responses/__init__.py +0 -0
- whatap/llm/providers/openai/responses/responses.py +70 -0
- whatap/llm/providers/openai/responses/responses_context.py +88 -0
- whatap/llm/providers/openai/responses/responses_extractor.py +126 -0
- whatap/llm/providers/stream_accumulator.py +73 -0
- whatap/llm/stats/__init__.py +35 -0
- whatap/llm/stats/active_stat.py +86 -0
- whatap/llm/stats/answer_relevance_eval_stat.py +10 -0
- whatap/llm/stats/api_status_stat.py +35 -0
- whatap/llm/stats/base_stat.py +107 -0
- whatap/llm/stats/combined_judge_eval_stat.py +11 -0
- whatap/llm/stats/error_stat.py +59 -0
- whatap/llm/stats/eval_stat.py +225 -0
- whatap/llm/stats/factuality_eval_stat.py +10 -0
- whatap/llm/stats/feature_stat.py +104 -0
- whatap/llm/stats/finish_stat.py +105 -0
- whatap/llm/stats/hallucination_eval_stat.py +10 -0
- whatap/llm/stats/meter.py +18 -0
- whatap/llm/stats/perf_stat.py +117 -0
- whatap/llm/stats/pii_leak_eval_stat.py +12 -0
- whatap/llm/stats/prompt_injection_eval_stat.py +10 -0
- whatap/llm/stats/token_usage_stat.py +133 -0
- whatap/llm/stats/toxicity_eval_stat.py +10 -0
- whatap/llm/stats/url_scan_eval_stat.py +12 -0
- whatap/net/__init__.py +0 -0
- whatap/net/async_sender.py +107 -0
- whatap/net/packet_enum.py +44 -0
- whatap/net/packet_type_enum.py +31 -0
- whatap/net/param_def.py +69 -0
- whatap/net/stackhelper.py +87 -0
- whatap/net/udp_session.py +394 -0
- whatap/net/udp_thread.py +54 -0
- whatap/pack/__init__.py +0 -0
- whatap/pack/logSinkPack.py +77 -0
- whatap/pack/pack.py +34 -0
- whatap/pack/pack_enum.py +41 -0
- whatap/pack/tagCountPack.py +61 -0
- whatap/scripts/__init__.py +208 -0
- whatap/trace/__init__.py +12 -0
- whatap/trace/mod/__init__.py +0 -0
- whatap/trace/mod/amqp/__init__.py +0 -0
- whatap/trace/mod/amqp/kombu.py +122 -0
- whatap/trace/mod/amqp/pika.py +62 -0
- whatap/trace/mod/application/__init__.py +0 -0
- whatap/trace/mod/application/bottle.py +34 -0
- whatap/trace/mod/application/celery.py +81 -0
- whatap/trace/mod/application/cherrypy.py +30 -0
- whatap/trace/mod/application/django.py +287 -0
- whatap/trace/mod/application/django_asgi.py +266 -0
- whatap/trace/mod/application/django_py3.py +251 -0
- whatap/trace/mod/application/fastapi/__init__.py +31 -0
- whatap/trace/mod/application/fastapi/endpoint.py +73 -0
- whatap/trace/mod/application/fastapi/exception_log.py +63 -0
- whatap/trace/mod/application/fastapi/instrumentation.py +204 -0
- whatap/trace/mod/application/fastapi/scope.py +115 -0
- whatap/trace/mod/application/fastapi/transaction.py +67 -0
- whatap/trace/mod/application/flask.py +52 -0
- whatap/trace/mod/application/frappe.py +224 -0
- whatap/trace/mod/application/graphql.py +170 -0
- whatap/trace/mod/application/nameko.py +39 -0
- whatap/trace/mod/application/odoo.py +63 -0
- whatap/trace/mod/application/starlette.py +126 -0
- whatap/trace/mod/application/tornado.py +163 -0
- whatap/trace/mod/application/wsgi.py +195 -0
- whatap/trace/mod/database/__init__.py +0 -0
- whatap/trace/mod/database/cxoracle.py +49 -0
- whatap/trace/mod/database/mongo.py +169 -0
- whatap/trace/mod/database/mysql.py +80 -0
- whatap/trace/mod/database/neo4j.py +90 -0
- whatap/trace/mod/database/psycopg2.py +45 -0
- whatap/trace/mod/database/psycopg3.py +359 -0
- whatap/trace/mod/database/redis.py +122 -0
- whatap/trace/mod/database/sqlalchemy.py +213 -0
- whatap/trace/mod/database/sqlite3.py +130 -0
- whatap/trace/mod/database/util.py +630 -0
- whatap/trace/mod/email/__init__.py +0 -0
- whatap/trace/mod/email/smtp.py +78 -0
- whatap/trace/mod/httpc/__init__.py +0 -0
- whatap/trace/mod/httpc/django.py +31 -0
- whatap/trace/mod/httpc/httplib.py +70 -0
- whatap/trace/mod/httpc/httpx.py +62 -0
- whatap/trace/mod/httpc/requests.py +20 -0
- whatap/trace/mod/httpc/urllib3.py +27 -0
- whatap/trace/mod/httpc/util.py +388 -0
- whatap/trace/mod/logging.py +161 -0
- whatap/trace/mod/plugin.py +84 -0
- whatap/trace/mod/standalone/__init__.py +0 -0
- whatap/trace/mod/standalone/multiple.py +293 -0
- whatap/trace/mod/standalone/single.py +135 -0
- whatap/trace/simple_trace_context.py +18 -0
- whatap/trace/trace_context.py +212 -0
- whatap/trace/trace_context_manager.py +244 -0
- whatap/trace/trace_error.py +84 -0
- whatap/trace/trace_handler.py +89 -0
- whatap/trace/trace_import.py +91 -0
- whatap/trace/trace_module_definition.py +156 -0
- whatap/util/__init__.py +0 -0
- whatap/util/bit_util.py +49 -0
- whatap/util/cardinality/__init__.py +0 -0
- whatap/util/cardinality/hyperloglog.py +84 -0
- whatap/util/cardinality/murmurhash.py +20 -0
- whatap/util/cardinality/registerset.py +60 -0
- whatap/util/compare_util.py +19 -0
- whatap/util/date_util.py +55 -0
- whatap/util/debug_util.py +73 -0
- whatap/util/escape_literal_sql.py +233 -0
- whatap/util/frame_util.py +20 -0
- whatap/util/hash_util.py +103 -0
- whatap/util/hexa32.py +66 -0
- whatap/util/int_set.py +199 -0
- whatap/util/ip_util.py +63 -0
- whatap/util/keygen.py +11 -0
- whatap/util/linked_list.py +113 -0
- whatap/util/linked_map.py +359 -0
- whatap/util/metering_util.py +103 -0
- whatap/util/request_double_queue.py +68 -0
- whatap/util/request_queue.py +60 -0
- whatap/util/string_util.py +20 -0
- whatap/util/throttle_util.py +99 -0
- whatap/util/userid_util.py +134 -0
- whatap/value/__init__.py +1 -0
- whatap/value/blob_value.py +38 -0
- whatap/value/boolean_value.py +33 -0
- whatap/value/decimal_value.py +36 -0
- whatap/value/double_summary.py +86 -0
- whatap/value/double_value.py +33 -0
- whatap/value/float_array.py +42 -0
- whatap/value/float_value.py +34 -0
- whatap/value/int_array.py +42 -0
- whatap/value/ip4_value.py +50 -0
- whatap/value/list_value.py +105 -0
- whatap/value/long_array.py +44 -0
- whatap/value/long_summary.py +83 -0
- whatap/value/map_value.py +154 -0
- whatap/value/null_value.py +21 -0
- whatap/value/number_value.py +33 -0
- whatap/value/summary_value.py +39 -0
- whatap/value/text_array.py +58 -0
- whatap/value/text_hash_value.py +37 -0
- whatap/value/text_value.py +43 -0
- whatap/value/value.py +26 -0
- whatap/value/value_enum.py +80 -0
- whatap/whatap.conf +14 -0
- whatap_python-2.1.0.dist-info/METADATA +87 -0
- whatap_python-2.1.0.dist-info/RECORD +227 -0
- whatap_python-2.1.0.dist-info/WHEEL +5 -0
- whatap_python-2.1.0.dist-info/entry_points.txt +6 -0
- whatap_python-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,501 @@
|
|
|
1
|
+
"""LLM 평가자 큐잉 + 비동기 실행 + 결과 송출 태스크.
|
|
2
|
+
|
|
3
|
+
데이터 흐름:
|
|
4
|
+
[Producer] LlmLogSinkTask.dispatch(LlmStepStatus)
|
|
5
|
+
└─ enqueue_evaluation(pack)
|
|
6
|
+
└─ self._q.put((pack, scope_evs)) ← 큐 적재
|
|
7
|
+
|
|
8
|
+
[Dispatcher Thread] daemon, q.get() blocking loop
|
|
9
|
+
└─ for evaluator in registry + scope:
|
|
10
|
+
if sampler.should_run(evaluator, pack.txid):
|
|
11
|
+
executor.submit(_run_one, evaluator, pack)
|
|
12
|
+
|
|
13
|
+
[Worker Thread (ThreadPoolExecutor)]
|
|
14
|
+
└─ evaluator.evaluate(ctx) → EvaluatorResult (+extras)
|
|
15
|
+
└─ eval_pack = LlmStepEvalStatus.from_step_status(pack)
|
|
16
|
+
└─ score 5 필드 채움 (eval_hallucination.n 등)
|
|
17
|
+
└─ dispatch_llm_evaluation_pack(eval_pack)
|
|
18
|
+
└─ LlmLogSinkTask._send_log_sink(eval_pack)
|
|
19
|
+
|
|
20
|
+
평가 결과 pack 은 원본 LlmStepStatus 와 동일한 구조 (model/tokens/cost/latency/...) +
|
|
21
|
+
``llm_log_type=llm_step_eval_status`` 로 차별 + 평가 점수 5 필드 추가.
|
|
22
|
+
|
|
23
|
+
평가는 fire-and-forget. LlmStepStatus 송출과 완전 독립 — 사용자 트랜잭션 영향 0.
|
|
24
|
+
모든 예외는 swallow 후 [LLM] 로그로만 기록.
|
|
25
|
+
"""
|
|
26
|
+
import contextvars
|
|
27
|
+
import queue
|
|
28
|
+
import threading
|
|
29
|
+
import time
|
|
30
|
+
|
|
31
|
+
from whatap import logging
|
|
32
|
+
from whatap.conf.configure import Configure as conf
|
|
33
|
+
|
|
34
|
+
from whatap.llm.evaluators.base import EvaluatorContext, EvaluatorResult
|
|
35
|
+
from whatap.llm.evaluators.registry import EvaluatorRegistry
|
|
36
|
+
from whatap.llm.evaluators.sampler import EvaluatorSampler
|
|
37
|
+
from whatap.llm.log_sink_packs.llm_step_eval_status import LlmStepEvalStatus
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Fallback size of the evaluation queue, used when ``conf.llm_eval_buffer_limit``
# is missing or cannot be coerced to a positive int.
_DEFAULT_BUFFER_LIMIT = 1000
# Fallback worker-thread count for the evaluation thread pool, used when
# ``conf.llm_eval_workers`` is missing or cannot be coerced to a positive int.
_DEFAULT_WORKERS = 4
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _conf_truthy(name, default=False):
|
|
45
|
+
"""Configure attr 을 bool 로 안전 변환.
|
|
46
|
+
|
|
47
|
+
Configure.setProperty 가 config file 에서 읽은 문자열 'true'/'false' 를 그대로
|
|
48
|
+
setattr 해서 string 으로 저장 → 'false' 도 truthy 로 평가되는 reload 버그 우회.
|
|
49
|
+
"""
|
|
50
|
+
try:
|
|
51
|
+
val = getattr(conf, name, default)
|
|
52
|
+
except Exception:
|
|
53
|
+
return bool(default)
|
|
54
|
+
if isinstance(val, bool):
|
|
55
|
+
return val
|
|
56
|
+
if isinstance(val, str):
|
|
57
|
+
return val.strip().lower() in ('true', 'yes', '1', 'on')
|
|
58
|
+
return bool(val)
|
|
59
|
+
|
|
60
|
+
# Attribute names that evaluation scores are mapped to.
# Looked up with evaluator.LABEL (or a label key of EvaluatorResult.extras).
_LABEL_TO_FIELD = {
    'hallucination': 'eval_hallucination',
    'answer_relevance': 'eval_answer_relevance',
    'toxicity': 'eval_toxicity',
    'prompt_injection': 'eval_prompt_injection',
    'factuality': 'eval_factuality',
    'pii_leak': 'eval_pii_leak',
    'url_scan': 'eval_url_scan',
    'combined_judge': 'eval_combined_judge',
}
|
|
72
|
+
|
|
73
|
+
# Flag telling whether we are inside an evaluation worker — kept in two stores at once:
#   - thread-local : for LLM calls issued on the sync worker thread itself
#   - ContextVar   : propagated so the flag is also visible in the context of a task
#                    that an async judge dispatches to the user loop via
#                    ``run_coroutine_threadsafe``
# Two purposes: prevent infinite recursion (a judge call must not be put on the
# evaluation queue again) + let the interceptor still trace judge calls normally
# (thanks to the recursion guard they are simply not enqueued for evaluation again).
_eval_worker_local = threading.local()
_eval_worker_cv = contextvars.ContextVar('whatap_eval_worker', default=False)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _is_in_evaluator_worker():
    """Return True when the caller runs inside an evaluation worker.

    The thread-local flag is checked first; otherwise the ContextVar decides.
    The ContextVar may hold either a state dict (its 'in_eval' entry is used)
    or a plain value (its truthiness is used).
    """
    if getattr(_eval_worker_local, 'in_eval', False):
        return True
    marker = _eval_worker_cv.get()
    flag = marker.get('in_eval') if isinstance(marker, dict) else marker
    return bool(flag)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _get_eval_worker_state():
    """Return the current eval-worker state, or None when there is none.

    Shape: {'in_eval', 'parent_txid', 'parent_step_id', 'parent_index'}.
    The ContextVar takes precedence; the thread-local store is the fallback.
    """
    from_cv = _eval_worker_cv.get()
    if isinstance(from_cv, dict) and from_cv.get('in_eval'):
        return from_cv
    if not getattr(_eval_worker_local, 'in_eval', False):
        return None
    # Rebuild the same dict shape from the thread-local attributes.
    state = {'in_eval': True}
    for key in ('parent_txid', 'parent_step_id', 'parent_index'):
        state[key] = getattr(_eval_worker_local, key, None)
    return state
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# EvalStat 인스턴스 캐시.
|
|
113
|
+
_eval_stat_cache = [None]
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _eval_stat():
|
|
117
|
+
if _eval_stat_cache[0] is not None:
|
|
118
|
+
return _eval_stat_cache[0]
|
|
119
|
+
try:
|
|
120
|
+
from whatap.counter.tasks.llm_stat_task import LlmStatTask
|
|
121
|
+
stat = LlmStatTask.get_stat('EvalStat')
|
|
122
|
+
if stat is not None:
|
|
123
|
+
_eval_stat_cache[0] = stat
|
|
124
|
+
return stat
|
|
125
|
+
except Exception:
|
|
126
|
+
return None
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class LlmEvaluatorTask(object):
    """LLM evaluation pipeline (queue + dispatcher thread + worker pool).

    Producers call :meth:`enqueue` with a finished ``LlmStepStatus`` pack. A
    daemon dispatcher thread drains the queue, selects the evaluators that pass
    sampling and submits one worker task per pack. The worker runs those
    evaluators sequentially and emits a single ``LlmStepEvalStatus`` pack.
    """

    _instance = None
    _lock = threading.Lock()

    def __init__(self):
        # Both settings fall back to the module defaults when missing/invalid.
        buffer_limit = _coerce_int(getattr(conf, 'llm_eval_buffer_limit', _DEFAULT_BUFFER_LIMIT),
                                   _DEFAULT_BUFFER_LIMIT)
        self._q = queue.Queue(buffer_limit)
        self._workers = _coerce_int(getattr(conf, 'llm_eval_workers', _DEFAULT_WORKERS),
                                    _DEFAULT_WORKERS)
        self._executor = None  # created lazily in _ensure_started()
        self._sampler = EvaluatorSampler()
        self._started = False
        self._start_lock = threading.Lock()

    @classmethod
    def get_instance(cls):
        """Return the process-wide instance, creating it on first use.

        The instance is checked once without the lock and once again while
        holding it, so only one instance is ever created.
        """
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = cls()
        return cls._instance

    # ── public producers ──

    def enqueue(self, pack):
        """Queue a pack for evaluation; called by LlmLogSinkTask.dispatch after each LLM call.

        :param pack: the original ``LlmStepStatus`` pack. A worker clones it into
            an ``LlmStepEvalStatus``, fills in the scores and sends it. Because
            this runs on the user thread, scope evaluators are captured here too.
        """
        if not _conf_truthy('llm_eval_enabled'):
            return
        if _is_in_evaluator_worker():
            # Blocks the infinite recursion where a judge evaluator's own LLM
            # call would be put on the evaluation queue again.
            return
        if pack is None:
            return

        # Auto-register built-ins based on conf.llm_eval_evaluators (the original
        # comment states this is a one-time operation with an internal fast path).
        # Must run before the is_empty() check so the first enqueue is not skipped.
        try:
            from whatap.llm.evaluators.registry import bootstrap_from_conf
            bootstrap_from_conf()
        except Exception as e:
            logging.warning('[LLM] evaluator bootstrap failed: %s' % e,
                            extra={'id': 'LLM048'})

        # Capture scope evaluators on the user thread (the dispatcher thread
        # cannot see them).
        try:
            from whatap.llm.evaluators.scope import get_scope_evaluators
            scope_evs = list(get_scope_evaluators())
        except Exception:
            scope_evs = []

        if EvaluatorRegistry.get_instance().is_empty() and not scope_evs:
            return

        try:
            self._q.put_nowait((pack, scope_evs))
        except queue.Full:
            # getattr + 1-tuple: this path runs on the user thread and must not
            # raise because of a pack without txid or a tuple-valued txid.
            logging.warning('[LLM] eval queue full, pack dropped: txid=%s'
                            % (getattr(pack, 'txid', None),),
                            extra={'id': 'LLM030'})
            return
        self._ensure_started()

    # ── internal ──

    def _ensure_started(self):
        """Create the worker pool and start the dispatcher thread once.

        A start failure is logged and leaves ``_started`` False, so the next
        enqueue retries.
        """
        if self._started:
            return
        with self._start_lock:
            if self._started:
                return
            try:
                from concurrent.futures import ThreadPoolExecutor
                self._executor = ThreadPoolExecutor(
                    max_workers=self._workers,
                    thread_name_prefix='whatap-llm-eval',
                )
                dispatcher = threading.Thread(target=self._run, daemon=True,
                                              name='whatap-llm-eval-dispatch')
                dispatcher.start()
                self._started = True
            except Exception as e:
                logging.warning('[LLM] eval task start failed: %s' % e,
                                extra={'id': 'LLM035'})

    def _run(self):
        """Dispatcher thread body: take (pack, scope_evs) off the queue and fan out."""
        while True:
            try:
                item = self._q.get()
            except Exception:
                continue
            if item is None:
                continue
            try:
                pack, scope_evs = item
            except (ValueError, TypeError):
                continue
            try:
                merged = self._merge_evaluators(scope_evs)
                ctx_txid = pack.txid
                # Only evaluators that pass sampling are collected and bundled
                # into ONE worker task, so that a single LlmStepEvalStatus pack
                # is sent per LLM call.
                active = [
                    evaluator for evaluator in merged
                    if getattr(evaluator, 'LABEL', None)
                    and self._sampler.should_run(evaluator, ctx_txid)
                ]
                if active:
                    self._executor.submit(self._safe_run_pack, active, pack)
            except Exception as e:
                logging.warning('[LLM] eval dispatch failed: %s' % e,
                                extra={'id': 'LLM031'})

    @staticmethod
    def _merge_evaluators(scope_evs=None):
        """Merge the global registry with captured scope evaluators.

        For equal LABELs the scope evaluator wins. Evaluators without a usable
        LABEL are skipped for both sources (the dispatcher ignores them anyway),
        so one malformed registry entry cannot abort dispatch for the pack.

        :param scope_evs: evaluators captured on the user thread, or None.
        :return: list of evaluators, one per LABEL.
        """
        merged = {}
        for source in (EvaluatorRegistry.get_instance().all(), scope_evs or ()):
            for evaluator in source:
                label = getattr(evaluator, 'LABEL', None)
                if label:
                    merged[label] = evaluator
        return list(merged.values())

    def _safe_run_pack(self, evaluators, pack):
        """Worker-thread entry point: run the active evaluators for one pack.

        Swallows every exception, installs the recursion guard and propagates
        the parent step keys.

        Stashed in both the thread-local and the ContextVar:
          - in_eval=True (recursion guard)
          - parent_txid / parent_step_id / parent_index: keys of the original
            user step. Per the original note, with
            ``llm_eval_track_judge_calls=true`` these override the join keys of
            judge LLM call packs so they appear under the same transaction.

        The thread-local covers LLM calls made on the worker thread itself
        (sync judge); the ContextVar covers coroutines dispatched to the user
        loop (async judge).
        """
        cv_token = None
        try:
            # Reading the pack keys happens INSIDE the try: this method runs via
            # executor.submit() and nobody inspects the returned Future, so an
            # exception raised outside the try would vanish without a log line.
            state = {
                'in_eval': True,
                'parent_txid': pack.txid,
                'parent_step_id': pack.step_id,
                'parent_index': pack.index,
            }
            _eval_worker_local.in_eval = True
            _eval_worker_local.parent_txid = state['parent_txid']
            _eval_worker_local.parent_step_id = state['parent_step_id']
            _eval_worker_local.parent_index = state['parent_index']
            cv_token = _eval_worker_cv.set(state)
            self._run_pack(evaluators, pack)
        except Exception as e:
            logging.warning('[LLM] evaluator worker crashed: count=%d err=%s'
                            % (len(evaluators), e),
                            extra={'id': 'LLM036'})
        finally:
            if cv_token is not None:
                _eval_worker_cv.reset(cv_token)
            # Pool threads are reused, so always clear the thread-local state.
            _eval_worker_local.in_eval = False
            _eval_worker_local.parent_txid = None
            _eval_worker_local.parent_step_id = None
            _eval_worker_local.parent_index = None

    def _run_pack(self, evaluators, pack):
        """Run all active evaluators for one pack and send one LlmStepEvalStatus.

        Invariant: one LLM call → one LlmStepEvalStatus pack, however many
        evaluators ran. Score fields that nobody produced (evaluator not
        registered, or failed) are never assigned here.

        Metrics are updated per evaluator inside :meth:`_evaluate_one` through
        ``update_eval_metrics()``.
        """
        from whatap.counter.tasks.llm_log_sink_task import dispatch_llm_evaluation_pack

        ctx = build_eval_context_from_pack(pack)

        all_scores = {}            # {label: float} — every score to put on the pack
        any_score_emitted = False  # True once any evaluator produced a score
        all_succeeded = True       # stays True only if every evaluator succeeded

        for evaluator in evaluators:
            scores, success = self._evaluate_one(evaluator, ctx, pack)
            if scores:
                all_scores.update(scores)
                any_score_emitted = True
            if not success:
                all_succeeded = False

        # Clone the original pack (model/tokens/cost/latency/txid/...) into the eval pack.
        eval_pack = LlmStepEvalStatus.from_step_status(pack)
        # True only when at least one score was produced AND nobody failed.
        eval_pack.eval_success = bool(any_score_emitted and all_succeeded)

        for label, score in all_scores.items():
            self._assign_score(eval_pack, label, score)

        try:
            dispatch_llm_evaluation_pack(eval_pack)
        except Exception as e:
            logging.warning('[LLM] eval pack send failed: %s' % e,
                            extra={'id': 'LLM033'})

    @staticmethod
    def _to_score(value):
        """Return ``value`` as a float when it is a real number, else None.

        Bools are rejected even though ``bool`` is an ``int`` subclass; an int
        too large for a float also yields None instead of raising.
        """
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return None
        try:
            return float(value)
        except OverflowError:
            return None

    def _evaluate_one(self, evaluator, ctx, pack):
        """Run one evaluator and update its metrics.

        Does not send a pack; the caller ``_run_pack`` sends one pack after all
        evaluators have finished.

        :return: ``({label: score}, success)``; success is True only when the
            evaluator did not raise and produced at least one numeric score.
        """
        start = time.monotonic()
        success = True
        result = None
        try:
            raw = evaluator.evaluate(ctx)
            result = _coerce_result(raw, getattr(evaluator, 'METRIC_TYPE', None))
        except Exception as e:
            success = False
            logging.warning('[LLM] evaluator %s failed: %s'
                            % (getattr(evaluator, 'LABEL', '?'), e),
                            extra={'id': 'LLM032'})

        duration_ms = int((time.monotonic() - start) * 1000)

        # Extract scores (primary + extras) — used for the pack and for EvalStat.
        scores = {}  # {label: float}
        if result is not None:
            primary = self._to_score(result.value)
            if primary is not None:
                scores[evaluator.LABEL] = primary
            extras = getattr(result, 'extras', None) or {}
            if not hasattr(extras, 'items'):
                # A malformed extras value must not discard the scores of the
                # other evaluators of this pack.
                extras = {}
            for ex_label, extra in extras.items():
                if not ex_label:
                    continue
                if isinstance(extra, EvaluatorResult):
                    candidate = extra.value
                elif isinstance(extra, dict):
                    candidate = extra.get('value')
                else:
                    candidate = extra
                extra_score = self._to_score(candidate)
                if extra_score is not None:
                    scores[ex_label] = extra_score

        # Was the judge HTTP call actually made / is the result valid:
        #   USES_LLM_JUDGE=False → rule-based evaluator (PIILeak / URLScan) → no call
        #   no_judge_configured  → no judge_fn at all → no call
        #   judge_error          → call attempted, HTTP/parse failed → call made, no score (failure)
        #   numeric score        → call succeeded (success)
        # called_judge=False tells update_eval_metrics that no judge call was made
        # (the original note says call statistics are then skipped while the
        # score histograms are unaffected).
        called_judge = bool(getattr(evaluator, 'USES_LLM_JUDGE', False))
        if called_judge and result is not None and result.value == 'no_judge_configured':
            called_judge = False
        eval_success = success and bool(scores)  # a score received means success

        # Update evaluation metrics — llm_eval_stat + the llm_eval_<label> family at once.
        try:
            from whatap.llm.stats.eval_stat import update_eval_metrics
            judge_model = getattr(evaluator, '_model', None) or ctx.model
            update_eval_metrics(
                model=judge_model,
                provider=ctx.provider,
                operation_type=ctx.operation_type,
                url=ctx.url,
                prompt_version=getattr(pack, 'prompt_version', 'v1') or 'v1',
                called_judge=called_judge,
                success=eval_success,
                latency_ms=duration_ms,
                scores=scores,
            )
        except Exception as e:
            logging.warning('[LLM] eval metrics update failed: %s' % e,
                            extra={'id': 'LLM038'})

        return scores, eval_success

    @staticmethod
    def _assign_score(eval_pack, label, value):
        """Set the eval_<x> attribute mapped to ``label``; ignore unmapped labels.

        Only numeric scores are written (categorical outcomes such as
        judge_error are expressed through eval_success=False instead).
        """
        attr = _LABEL_TO_FIELD.get(label)
        if attr is None:
            return
        score = LlmEvaluatorTask._to_score(value)
        if score is None:
            return
        setattr(eval_pack, attr, score)
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
# ── helpers ──
|
|
428
|
+
|
|
429
|
+
def _coerce_int(value, default):
|
|
430
|
+
try:
|
|
431
|
+
v = int(value)
|
|
432
|
+
return v if v > 0 else default
|
|
433
|
+
except (TypeError, ValueError):
|
|
434
|
+
return default
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
def _coerce_result(raw, metric_type_hint=None):
    """Normalize the raw return value of evaluator.evaluate() into an EvaluatorResult.

    A non-EvaluatorResult value is wrapped. When the result carries no
    metric_type, the hint is used if given; otherwise the type is inferred from
    the value (bool → 'boolean', int/float → 'score', dict → 'json', anything
    else → 'categorical', with non-string values stringified).

    :param raw: whatever the evaluator returned.
    :param metric_type_hint: the evaluator's declared metric type, if any.
    :return: the EvaluatorResult (the passed-in object is updated in place when
        it already was one).
    """
    result = raw if isinstance(raw, EvaluatorResult) else EvaluatorResult(value=raw)

    if result.metric_type:
        return result
    if metric_type_hint:
        result.metric_type = metric_type_hint
        return result

    value = result.value
    if isinstance(value, bool):
        # bool must be tested before int: bool is an int subclass.
        result.metric_type = 'boolean'
        return result
    if isinstance(value, (int, float)):
        result.metric_type = 'score'
        return result
    if isinstance(value, dict):
        result.metric_type = 'json'
        return result

    result.metric_type = 'categorical'
    if not (value is None or isinstance(value, str)):
        result.value = str(value)
    return result
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
# ── module-level API (LlmLogSinkTask 가 호출) ──
|
|
463
|
+
|
|
464
|
+
def enqueue_evaluation(pack):
    """Entry point called at the end of LlmLogSinkTask.dispatch().

    :param pack: the original ``LlmStepStatus``; a worker clones it into an
        LlmStepEvalStatus and sends that.
    """
    task = LlmEvaluatorTask.get_instance()
    task.enqueue(pack)
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def build_eval_context_from_pack(pack):
    """Build the EvaluatorContext (input of evaluator.evaluate) from an LlmStepStatus pack.

    ``txid``, ``step_id``, ``index``, ``provider`` and ``url`` are read directly
    from the pack; every other field is optional and falls back to an empty
    string, 'unknown', False or None as written below.
    """
    raw_system = getattr(pack, 'system_texts', None) or []
    if not isinstance(raw_system, (list, tuple)):
        system_text = str(raw_system) if raw_system else ''
    else:
        # Join the non-empty system messages, one per line.
        system_text = '\n'.join([str(part) for part in raw_system if part])

    def text_of(attr):
        # Optional text field: missing or falsy values become ''.
        return getattr(pack, attr, '') or ''

    return EvaluatorContext(
        txid=pack.txid,
        step_id=pack.step_id,
        index=pack.index,
        provider=pack.provider or '',
        url=pack.url or '',
        model=getattr(pack, 'model', None),
        operation_type=getattr(pack, 'operation_type', '') or 'unknown',
        input_text=text_of('prompt_text'),
        output_text=text_of('completion_text'),
        system_text=system_text,
        reasoning_text=text_of('reasoning_text'),
        tool_calls_text=text_of('tool_calls_text'),
        tool_results_text=text_of('tool_results_text'),
        success=bool(getattr(pack, 'success', False)),
        finish_reason=getattr(pack, 'finish_reason', None),
        latency_ms=getattr(pack, 'latency', None),
        input_tokens=getattr(pack, 'input_tokens', None),
        output_tokens=getattr(pack, 'output_tokens', None),
        client=getattr(pack, '_llm_client', None),
        event_loop=getattr(pack, '_llm_event_loop', None),
    )
|