whatap-python 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227) hide show
  1. whatap/LICENSE +0 -0
  2. whatap/README.rst +49 -0
  3. whatap/__init__.py +923 -0
  4. whatap/__main__.py +4 -0
  5. whatap/agent/darwin/amd64/whatap_python +0 -0
  6. whatap/agent/darwin/arm64/whatap_python +0 -0
  7. whatap/agent/linux/amd64/whatap_python +0 -0
  8. whatap/agent/linux/arm64/whatap_python +0 -0
  9. whatap/agent/windows/whatap_python.exe +0 -0
  10. whatap/bootstrap/__init__.py +0 -0
  11. whatap/bootstrap/sitecustomize.py +19 -0
  12. whatap/build.py +4 -0
  13. whatap/conf/__init__.py +0 -0
  14. whatap/conf/configuration.py +280 -0
  15. whatap/conf/configure.py +105 -0
  16. whatap/conf/license.py +49 -0
  17. whatap/control/__init__.py +0 -0
  18. whatap/counter/__init__.py +14 -0
  19. whatap/counter/counter_manager.py +45 -0
  20. whatap/counter/tasks/__init__.py +3 -0
  21. whatap/counter/tasks/base_task.py +26 -0
  22. whatap/counter/tasks/llm_evaluator_task.py +501 -0
  23. whatap/counter/tasks/llm_log_sink_task.py +309 -0
  24. whatap/counter/tasks/llm_stat_task.py +78 -0
  25. whatap/counter/tasks/openfiledescriptor.py +67 -0
  26. whatap/io/__init__.py +1 -0
  27. whatap/io/data_inputx.py +161 -0
  28. whatap/io/data_outputx.py +262 -0
  29. whatap/llm/__init__.py +17 -0
  30. whatap/llm/definitions.py +43 -0
  31. whatap/llm/evaluators/__init__.py +136 -0
  32. whatap/llm/evaluators/base.py +114 -0
  33. whatap/llm/evaluators/builtins/__init__.py +91 -0
  34. whatap/llm/evaluators/builtins/answer_relevance.py +46 -0
  35. whatap/llm/evaluators/builtins/combined_judge.py +271 -0
  36. whatap/llm/evaluators/builtins/factuality.py +71 -0
  37. whatap/llm/evaluators/builtins/hallucination.py +97 -0
  38. whatap/llm/evaluators/builtins/llm_judge.py +516 -0
  39. whatap/llm/evaluators/builtins/pii_leak.py +214 -0
  40. whatap/llm/evaluators/builtins/prompt_injection.py +71 -0
  41. whatap/llm/evaluators/builtins/toxicity.py +53 -0
  42. whatap/llm/evaluators/builtins/url_scan.py +194 -0
  43. whatap/llm/evaluators/registry.py +192 -0
  44. whatap/llm/evaluators/sampler.py +83 -0
  45. whatap/llm/evaluators/scope.py +334 -0
  46. whatap/llm/features.py +66 -0
  47. whatap/llm/log_sink_packs/__init__.py +9 -0
  48. whatap/llm/log_sink_packs/llm_input_message.py +16 -0
  49. whatap/llm/log_sink_packs/llm_log_sink_pack.py +72 -0
  50. whatap/llm/log_sink_packs/llm_output_message.py +19 -0
  51. whatap/llm/log_sink_packs/llm_step_eval_status.py +94 -0
  52. whatap/llm/log_sink_packs/llm_step_status.py +118 -0
  53. whatap/llm/log_sink_packs/llm_system_message.py +16 -0
  54. whatap/llm/log_sink_packs/llm_tool_calls.py +44 -0
  55. whatap/llm/log_sink_packs/llm_tool_results.py +16 -0
  56. whatap/llm/log_sink_packs/llm_tx_status.py +108 -0
  57. whatap/llm/pricing.py +236 -0
  58. whatap/llm/prompt_meta.py +288 -0
  59. whatap/llm/providers/__init__.py +0 -0
  60. whatap/llm/providers/anthropic/__init__.py +37 -0
  61. whatap/llm/providers/anthropic/messages/__init__.py +0 -0
  62. whatap/llm/providers/anthropic/messages/messages.py +70 -0
  63. whatap/llm/providers/anthropic/messages/messages_context.py +76 -0
  64. whatap/llm/providers/anthropic/messages/messages_extractor.py +126 -0
  65. whatap/llm/providers/interceptor.py +182 -0
  66. whatap/llm/providers/openai/__init__.py +133 -0
  67. whatap/llm/providers/openai/chat/__init__.py +0 -0
  68. whatap/llm/providers/openai/chat/chat.py +82 -0
  69. whatap/llm/providers/openai/chat/chat_context.py +78 -0
  70. whatap/llm/providers/openai/chat/chat_extractor.py +127 -0
  71. whatap/llm/providers/openai/completions/__init__.py +0 -0
  72. whatap/llm/providers/openai/completions/completions.py +70 -0
  73. whatap/llm/providers/openai/completions/completions_context.py +31 -0
  74. whatap/llm/providers/openai/completions/completions_extractor.py +61 -0
  75. whatap/llm/providers/openai/content_parser.py +41 -0
  76. whatap/llm/providers/openai/embeddings/__init__.py +0 -0
  77. whatap/llm/providers/openai/embeddings/embeddings.py +59 -0
  78. whatap/llm/providers/openai/embeddings/embeddings_context.py +25 -0
  79. whatap/llm/providers/openai/embeddings/embeddings_extractor.py +26 -0
  80. whatap/llm/providers/openai/responses/__init__.py +0 -0
  81. whatap/llm/providers/openai/responses/responses.py +70 -0
  82. whatap/llm/providers/openai/responses/responses_context.py +88 -0
  83. whatap/llm/providers/openai/responses/responses_extractor.py +126 -0
  84. whatap/llm/providers/stream_accumulator.py +73 -0
  85. whatap/llm/stats/__init__.py +35 -0
  86. whatap/llm/stats/active_stat.py +86 -0
  87. whatap/llm/stats/answer_relevance_eval_stat.py +10 -0
  88. whatap/llm/stats/api_status_stat.py +35 -0
  89. whatap/llm/stats/base_stat.py +107 -0
  90. whatap/llm/stats/combined_judge_eval_stat.py +11 -0
  91. whatap/llm/stats/error_stat.py +59 -0
  92. whatap/llm/stats/eval_stat.py +225 -0
  93. whatap/llm/stats/factuality_eval_stat.py +10 -0
  94. whatap/llm/stats/feature_stat.py +104 -0
  95. whatap/llm/stats/finish_stat.py +105 -0
  96. whatap/llm/stats/hallucination_eval_stat.py +10 -0
  97. whatap/llm/stats/meter.py +18 -0
  98. whatap/llm/stats/perf_stat.py +117 -0
  99. whatap/llm/stats/pii_leak_eval_stat.py +12 -0
  100. whatap/llm/stats/prompt_injection_eval_stat.py +10 -0
  101. whatap/llm/stats/token_usage_stat.py +133 -0
  102. whatap/llm/stats/toxicity_eval_stat.py +10 -0
  103. whatap/llm/stats/url_scan_eval_stat.py +12 -0
  104. whatap/net/__init__.py +0 -0
  105. whatap/net/async_sender.py +107 -0
  106. whatap/net/packet_enum.py +44 -0
  107. whatap/net/packet_type_enum.py +31 -0
  108. whatap/net/param_def.py +69 -0
  109. whatap/net/stackhelper.py +87 -0
  110. whatap/net/udp_session.py +394 -0
  111. whatap/net/udp_thread.py +54 -0
  112. whatap/pack/__init__.py +0 -0
  113. whatap/pack/logSinkPack.py +77 -0
  114. whatap/pack/pack.py +34 -0
  115. whatap/pack/pack_enum.py +41 -0
  116. whatap/pack/tagCountPack.py +61 -0
  117. whatap/scripts/__init__.py +208 -0
  118. whatap/trace/__init__.py +12 -0
  119. whatap/trace/mod/__init__.py +0 -0
  120. whatap/trace/mod/amqp/__init__.py +0 -0
  121. whatap/trace/mod/amqp/kombu.py +122 -0
  122. whatap/trace/mod/amqp/pika.py +62 -0
  123. whatap/trace/mod/application/__init__.py +0 -0
  124. whatap/trace/mod/application/bottle.py +34 -0
  125. whatap/trace/mod/application/celery.py +81 -0
  126. whatap/trace/mod/application/cherrypy.py +30 -0
  127. whatap/trace/mod/application/django.py +287 -0
  128. whatap/trace/mod/application/django_asgi.py +266 -0
  129. whatap/trace/mod/application/django_py3.py +251 -0
  130. whatap/trace/mod/application/fastapi/__init__.py +31 -0
  131. whatap/trace/mod/application/fastapi/endpoint.py +73 -0
  132. whatap/trace/mod/application/fastapi/exception_log.py +63 -0
  133. whatap/trace/mod/application/fastapi/instrumentation.py +204 -0
  134. whatap/trace/mod/application/fastapi/scope.py +115 -0
  135. whatap/trace/mod/application/fastapi/transaction.py +67 -0
  136. whatap/trace/mod/application/flask.py +52 -0
  137. whatap/trace/mod/application/frappe.py +224 -0
  138. whatap/trace/mod/application/graphql.py +170 -0
  139. whatap/trace/mod/application/nameko.py +39 -0
  140. whatap/trace/mod/application/odoo.py +63 -0
  141. whatap/trace/mod/application/starlette.py +126 -0
  142. whatap/trace/mod/application/tornado.py +163 -0
  143. whatap/trace/mod/application/wsgi.py +195 -0
  144. whatap/trace/mod/database/__init__.py +0 -0
  145. whatap/trace/mod/database/cxoracle.py +49 -0
  146. whatap/trace/mod/database/mongo.py +169 -0
  147. whatap/trace/mod/database/mysql.py +80 -0
  148. whatap/trace/mod/database/neo4j.py +90 -0
  149. whatap/trace/mod/database/psycopg2.py +45 -0
  150. whatap/trace/mod/database/psycopg3.py +359 -0
  151. whatap/trace/mod/database/redis.py +122 -0
  152. whatap/trace/mod/database/sqlalchemy.py +213 -0
  153. whatap/trace/mod/database/sqlite3.py +130 -0
  154. whatap/trace/mod/database/util.py +630 -0
  155. whatap/trace/mod/email/__init__.py +0 -0
  156. whatap/trace/mod/email/smtp.py +78 -0
  157. whatap/trace/mod/httpc/__init__.py +0 -0
  158. whatap/trace/mod/httpc/django.py +31 -0
  159. whatap/trace/mod/httpc/httplib.py +70 -0
  160. whatap/trace/mod/httpc/httpx.py +62 -0
  161. whatap/trace/mod/httpc/requests.py +20 -0
  162. whatap/trace/mod/httpc/urllib3.py +27 -0
  163. whatap/trace/mod/httpc/util.py +388 -0
  164. whatap/trace/mod/logging.py +161 -0
  165. whatap/trace/mod/plugin.py +84 -0
  166. whatap/trace/mod/standalone/__init__.py +0 -0
  167. whatap/trace/mod/standalone/multiple.py +293 -0
  168. whatap/trace/mod/standalone/single.py +135 -0
  169. whatap/trace/simple_trace_context.py +18 -0
  170. whatap/trace/trace_context.py +212 -0
  171. whatap/trace/trace_context_manager.py +244 -0
  172. whatap/trace/trace_error.py +84 -0
  173. whatap/trace/trace_handler.py +89 -0
  174. whatap/trace/trace_import.py +91 -0
  175. whatap/trace/trace_module_definition.py +156 -0
  176. whatap/util/__init__.py +0 -0
  177. whatap/util/bit_util.py +49 -0
  178. whatap/util/cardinality/__init__.py +0 -0
  179. whatap/util/cardinality/hyperloglog.py +84 -0
  180. whatap/util/cardinality/murmurhash.py +20 -0
  181. whatap/util/cardinality/registerset.py +60 -0
  182. whatap/util/compare_util.py +19 -0
  183. whatap/util/date_util.py +55 -0
  184. whatap/util/debug_util.py +73 -0
  185. whatap/util/escape_literal_sql.py +233 -0
  186. whatap/util/frame_util.py +20 -0
  187. whatap/util/hash_util.py +103 -0
  188. whatap/util/hexa32.py +66 -0
  189. whatap/util/int_set.py +199 -0
  190. whatap/util/ip_util.py +63 -0
  191. whatap/util/keygen.py +11 -0
  192. whatap/util/linked_list.py +113 -0
  193. whatap/util/linked_map.py +359 -0
  194. whatap/util/metering_util.py +103 -0
  195. whatap/util/request_double_queue.py +68 -0
  196. whatap/util/request_queue.py +60 -0
  197. whatap/util/string_util.py +20 -0
  198. whatap/util/throttle_util.py +99 -0
  199. whatap/util/userid_util.py +134 -0
  200. whatap/value/__init__.py +1 -0
  201. whatap/value/blob_value.py +38 -0
  202. whatap/value/boolean_value.py +33 -0
  203. whatap/value/decimal_value.py +36 -0
  204. whatap/value/double_summary.py +86 -0
  205. whatap/value/double_value.py +33 -0
  206. whatap/value/float_array.py +42 -0
  207. whatap/value/float_value.py +34 -0
  208. whatap/value/int_array.py +42 -0
  209. whatap/value/ip4_value.py +50 -0
  210. whatap/value/list_value.py +105 -0
  211. whatap/value/long_array.py +44 -0
  212. whatap/value/long_summary.py +83 -0
  213. whatap/value/map_value.py +154 -0
  214. whatap/value/null_value.py +21 -0
  215. whatap/value/number_value.py +33 -0
  216. whatap/value/summary_value.py +39 -0
  217. whatap/value/text_array.py +58 -0
  218. whatap/value/text_hash_value.py +37 -0
  219. whatap/value/text_value.py +43 -0
  220. whatap/value/value.py +26 -0
  221. whatap/value/value_enum.py +80 -0
  222. whatap/whatap.conf +14 -0
  223. whatap_python-2.1.0.dist-info/METADATA +87 -0
  224. whatap_python-2.1.0.dist-info/RECORD +227 -0
  225. whatap_python-2.1.0.dist-info/WHEEL +5 -0
  226. whatap_python-2.1.0.dist-info/entry_points.txt +6 -0
  227. whatap_python-2.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,516 @@
1
+ """LLM-as-a-judge 평가자의 베이스 + 어댑터.
2
+
3
+ LLM judge 평가자는 자체적으로 다른 LLM 을 호출해 평가를 수행한다.
4
+ - 비용이 비싸므로 보통 샘플링과 함께 사용한다 (whatap.conf 의 llm_eval_sample_rate).
5
+ - 워커 스레드의 재귀 가드 (LlmEvaluatorTask._safe_run_one 의 thread-local) 가
6
+ judge 의 LLM 호출이 다시 평가 큐에 들어가는 것을 자동 차단한다.
7
+
8
+ 사용:
9
+ 1) judge_fn 을 직접 작성하거나 make_openai_judge() 로 만든 뒤,
10
+ 2) HallucinationEvaluator/AnswerRelevanceEvaluator/ToxicityEvaluator 등에 주입.
11
+
12
+ >>> judge = make_openai_judge(model='gpt-4o-mini')
13
+ >>> register_evaluator(HallucinationEvaluator(judge_fn=judge))
14
+ """
15
+ import json
16
+ import re
17
+
18
+ from whatap import logging
19
+ from whatap.llm.evaluators.base import BaseEvaluator, EvaluatorResult
20
+
21
+
22
# Process-wide fallback judge that the user registers via set_default_judge_fn().
# Per the original author's note it is meant for environments where the user's
# client cannot be captured (CLI scripts and the like).
# Stored in a one-element list so the setter can replace slot 0 in place.
_default_judge_fn = [None]
25
+
26
+
27
def set_default_judge_fn(judge_fn):
    """Register (or clear) the process-wide default judge callable.

    Intended to be called once at application start-up.

    ``LLMJudgeBase._resolve_judge_fn`` picks a judge in this order:
      1) the ``judge_fn`` given to the evaluator instance,
      2) the default registered here,
      3) the client captured on ``ctx.client`` (re-used as is),
      4) ``ctx.provider`` string matching (OpenAI / Azure / Anthropic hosts).

    When none of these yields a judge (for example a local vLLM server or another
    self-hosted LLM with no captured client), the evaluator reports
    ``no_judge_configured`` -- registering a default here avoids that.

    Example:
        from whatap.llm.evaluators.builtins import set_default_judge_fn, make_openai_judge

        # once, at application start-up
        set_default_judge_fn(make_openai_judge(
            base_url='http://localhost:8002/v1',
            api_key='EMPTY',
            model='qwen-7b',
        ))

    :param judge_fn: ``(system, user) -> str`` callable, or None to clear the default.
    :raises TypeError: if ``judge_fn`` is neither None nor callable.
    """
    acceptable = judge_fn is None or callable(judge_fn)
    if not acceptable:
        raise TypeError('judge_fn must be callable')
    _default_judge_fn[0] = judge_fn
51
+
52
+
53
def get_default_judge_fn():
    """Return the currently registered process-wide default judge, or None if unset."""
    registered = _default_judge_fn[0]
    return registered
56
+
57
+
58
class LLMJudgeBase(BaseEvaluator):
    """Base class for evaluators that ask an LLM (the "judge") to grade a call.

    Subclasses only implement ``make_prompt(ctx)`` and ``parse_judgment(raw)``.

    The judge callable is chosen per evaluation, in this priority order
    (see ``_resolve_judge_fn``):
      1) ``judge_fn`` passed to the constructor -- a ``(system, user) -> str``
         callable, for fully custom setups.
      2) The process-wide default registered with ``set_default_judge_fn()``.
      3) The client found on ``ctx.client``, re-used as is.
      4) ``ctx.provider`` string matching, which builds a fresh OpenAI or
         Anthropic judge.

    ``model`` overrides the judge model in cases 3) and 4).  When it is not given,
    case 3) uses ``ctx.model`` and then a small per-provider default, and case 4)
    uses the per-provider default directly.

    In case 4) no API key is passed, so authentication is left to the SDK's own
    default lookup (presumably the OPENAI_API_KEY / ANTHROPIC_API_KEY environment
    variables, as the original note states).
    """

    # Marks this evaluator as one that issues LLM judge calls.  Per the original
    # author's note, the evaluator task uses the flag to decide whether a run is
    # counted in the judge-call statistics (EvalStat), while rule-based evaluators
    # keep the BaseEvaluator default.
    # NOTE(review): the consumer of this flag lives outside this file -- confirm.
    USES_LLM_JUDGE = True

    # Label used in this class's log messages; presumably overridden by subclasses.
    LABEL = None
    # Not read in this class; presumably overridden by subclasses.
    METRIC_TYPE = None

    # Default judge model per provider family.
    _DEFAULT_OPENAI_MODEL = 'gpt-4o-mini'
    _DEFAULT_ANTHROPIC_MODEL = 'claude-3-5-haiku-latest'

    def __init__(self, judge_fn=None, model=None):
        """Store the optional explicit judge and judge model.

        :param judge_fn: ``(system, user) -> str`` callable, or None to auto-resolve.
        :param model: judge model name, or None to use ctx.model / the defaults.
        :raises TypeError: if ``judge_fn`` is neither None nor callable.
        """
        if judge_fn is not None and not callable(judge_fn):
            raise TypeError('judge_fn must be callable')
        self._judge_fn = judge_fn
        self._model = model
        # Cache of provider-matched judges, keyed by 'openai' / 'anthropic'.
        self._auto_cache = {}

    def make_prompt(self, ctx):
        """Build the evaluation prompt; return a ``(system_prompt, user_prompt)`` tuple."""
        raise NotImplementedError

    def parse_judgment(self, raw):
        """Convert the judge's raw text reply into an EvaluatorResult."""
        raise NotImplementedError

    def _resolve_judge_fn(self, ctx):
        """Pick the judge callable for this evaluation, or None if none is usable.

        Priority:
          1) the judge_fn given to this evaluator
          2) the process-wide default from set_default_judge_fn()
          3) the client on ctx.client, re-used together with ctx.event_loop
          4) ctx.provider string matching ('anthropic', 'openai.com', 'azure.com')
        """
        if self._judge_fn is not None:
            return self._judge_fn

        # 2) process-wide default
        default = get_default_judge_fn()
        if default is not None:
            return default

        # 3) re-use the captured user client (and its event loop, for async clients)
        captured = getattr(ctx, 'client', None)
        captured_loop = getattr(ctx, 'event_loop', None)
        if captured is not None:
            # The last-resort model must belong to the captured client's own
            # provider: an Anthropic client cannot serve the OpenAI default model.
            if 'Anthropic' in type(captured).__name__:
                fallback_model = self._DEFAULT_ANTHROPIC_MODEL
            else:
                fallback_model = self._DEFAULT_OPENAI_MODEL
            judge_model = self._model or getattr(ctx, 'model', None) or fallback_model
            judge_fn = _judge_fn_from_captured_client(captured, model=judge_model,
                                                      event_loop=captured_loop)
            if judge_fn is not None:
                return judge_fn

        # 4) match well-known cloud providers by the ctx.provider string
        provider = (getattr(ctx, 'provider', '') or '').lower()
        if 'anthropic' in provider:
            key = 'anthropic'
        elif 'openai.com' in provider or 'azure.com' in provider:
            key = 'openai'
        else:
            return None

        if key in self._auto_cache:
            return self._auto_cache[key]

        if key == 'anthropic':
            judge_fn = make_anthropic_judge(
                model=self._model or self._DEFAULT_ANTHROPIC_MODEL,
            )
        else:
            judge_fn = make_openai_judge(
                model=self._model or self._DEFAULT_OPENAI_MODEL,
            )
        self._auto_cache[key] = judge_fn
        return judge_fn

    def evaluate(self, ctx):
        """Run the judge for ``ctx`` and return an EvaluatorResult.

        Never raises for judge problems: an unresolvable judge yields a
        ``no_judge_configured`` result, and any exception raised while building the
        prompt, calling the judge or parsing its reply yields a ``judge_error``
        result.  Both cases are logged as warnings.
        """
        judge_fn = self._resolve_judge_fn(ctx)
        if judge_fn is None:
            logging.warning(
                '[LLM] judge evaluator %s skipped: no judge_fn resolvable for provider=%r. '
                'Set one via evaluator(judge_fn=...) or set_default_judge_fn().'
                % (self.LABEL, getattr(ctx, 'provider', '')),
                extra={'id': 'LLM062'},
            )
            return EvaluatorResult(
                value='no_judge_configured',
                reasoning='No judge_fn for provider=%s. Use set_default_judge_fn().'
                          % getattr(ctx, 'provider', '?'),
                metric_type='categorical',
                metadata={'provider': getattr(ctx, 'provider', '')},
            )
        try:
            system, user = self.make_prompt(ctx)
            raw = judge_fn(system, user)
            # Custom judges may return non-str values; normalise before parsing.
            if not isinstance(raw, str):
                raw = str(raw) if raw is not None else ''
            return self.parse_judgment(raw)
        except Exception as e:
            logging.warning('[LLM] judge evaluator %s exception: %s' % (self.LABEL, e),
                            extra={'id': 'LLM060'})
            return EvaluatorResult(
                value='judge_error',
                reasoning='%s: %s' % (type(e).__name__, e),
                metric_type='categorical',
                metadata={'error_type': type(e).__name__, 'error_message': str(e)},
            )
188
+
189
+
190
+ def _judge_fn_from_captured_client(captured, model, temperature=0.0, max_tokens=2048,
191
+ event_loop=None):
192
+ """interceptor 가 캡처한 user 의 client 를 그대로 재사용해 judge_fn 을 만든다.
193
+
194
+ 핵심: **새 sync client / httpx.Client 생성 안 함.** user 가 만들어 쓰던 그
195
+ OpenAI / AsyncOpenAI / Anthropic / AsyncAnthropic 인스턴스를 그대로 호출.
196
+
197
+ sync client (OpenAI / Anthropic) → 워커 스레드에서 그대로 ``client.method(...)`` 호출.
198
+ async client (AsyncOpenAI / AsyncAnthropic) → ``run_coroutine_threadsafe`` 로 user 의
199
+ running event loop 에 dispatch → 그 loop 에서 user 의 AsyncClient 가 호출됨.
200
+
201
+ `event_loop` 가 None 이면 (sync 호출이었거나 capture 실패) async client 는 호출 불가
202
+ → None 반환.
203
+ """
204
+ import inspect
205
+ cls_name = type(captured).__name__
206
+
207
+ # async 여부 판별 — 클래스명 prefix + create 메서드가 coroutine 인지
208
+ is_async = cls_name.startswith('Async')
209
+ if not is_async:
210
+ try:
211
+ if 'Anthropic' in cls_name:
212
+ m = captured.messages.create
213
+ else:
214
+ m = captured.chat.completions.create
215
+ if inspect.iscoroutinefunction(m):
216
+ is_async = True
217
+ except Exception:
218
+ pass
219
+
220
+ # ── sync user client → 워커 스레드에서 직접 호출 ──
221
+ if not is_async:
222
+ if 'Anthropic' in cls_name:
223
+ return make_anthropic_judge(client=captured, model=model,
224
+ temperature=temperature, max_tokens=max_tokens)
225
+ return make_openai_judge(client=captured, model=model,
226
+ temperature=temperature, max_tokens=max_tokens)
227
+
228
+ # ── async user client → user loop 에 dispatch ──
229
+ if event_loop is None:
230
+ # loop capture 실패 — async client 그대로 호출 불가
231
+ return None
232
+
233
+ return _make_async_dispatched_judge(captured, model, temperature, max_tokens,
234
+ event_loop=event_loop,
235
+ kind='anthropic' if 'Anthropic' in cls_name else 'openai')
236
+
237
+
238
def _make_async_dispatched_judge(async_client, model, temperature, max_tokens,
                                 event_loop, kind):
    """Return a sync judge_fn that runs an async client call on ``event_loop``.

    Each call submits a coroutine with ``run_coroutine_threadsafe(coro, event_loop)``
    and blocks on its result, so the coroutine runs on the user's loop and uses the
    user's own async client.  No additional client is created here.

    The dispatched coroutine first sets the trace context to None.  Per the original
    author's note, this stops it from inheriting the context of another request the
    loop is serving, which would attribute the intercepted call to the wrong txid.

    :param async_client: the user's async client instance.
    :param kind: ``'anthropic'`` selects ``messages.create``; anything else selects
        ``chat.completions.create``.
    :return: ``(system, user) -> str`` callable.  It raises RuntimeError when the
        loop is no longer running and re-raises whatever the wait on the dispatched
        call raises (including a timeout).
    """
    import asyncio
    from whatap.counter.tasks.llm_evaluator_task import _eval_worker_cv, _get_eval_worker_state

    def _setup_dispatched_ctx(state):
        """Prepare context variables at the start of the dispatched coroutine.

        - Sets the trace context to None (see the enclosing function's docstring).
        - Sets ``_eval_worker_cv`` to the worker's state dict, or to
          ``{'in_eval': True}`` when ``state`` is empty.  Per the original author's
          note, this lets intercepted LLM calls inside the task recognise that they
          run on behalf of the evaluator worker and see its parent ids.

        Returns ``(cv_token, trace_token)`` for a later reset; a token is None when
        the corresponding set failed.  Per the original author's note, the explicit
        reset guards against leaking into user code in cases such as
        ``loop.run_until_complete`` where the same task context continues.
        """
        trace_token = None
        try:
            from whatap.trace.trace_context_manager import TraceContextManager
            trace_token = TraceContextManager.whatap_coroutine_context.set(None)
        except Exception:
            # best effort: tracing may be unavailable
            pass
        cv_token = None
        try:
            cv_token = _eval_worker_cv.set(state if state else {'in_eval': True})
        except Exception:
            pass
        return cv_token, trace_token

    def _reset_dispatched_ctx(cv_token, trace_token):
        # Undo _setup_dispatched_ctx; a None token means that set never happened.
        if cv_token is not None:
            try:
                _eval_worker_cv.reset(cv_token)
            except Exception:
                pass
        if trace_token is not None:
            try:
                from whatap.trace.trace_context_manager import TraceContextManager
                TraceContextManager.whatap_coroutine_context.reset(trace_token)
            except Exception:
                pass

    if kind == 'anthropic':
        async def _call(state, system, user):
            cv_tok, tr_tok = _setup_dispatched_ctx(state)
            try:
                resp = await async_client.messages.create(
                    model=model, system=system,
                    messages=[{'role': 'user', 'content': user}],
                    max_tokens=max_tokens, temperature=temperature,
                )
                # Return the first text block; other block types are skipped.
                for block in resp.content:
                    if getattr(block, 'type', None) == 'text':
                        return block.text or ''
                return ''
            finally:
                _reset_dispatched_ctx(cv_tok, tr_tok)
    else:
        async def _call(state, system, user):
            cv_tok, tr_tok = _setup_dispatched_ctx(state)
            try:
                resp = await async_client.chat.completions.create(
                    model=model,
                    messages=[
                        {'role': 'system', 'content': system},
                        {'role': 'user', 'content': user},
                    ],
                    temperature=temperature, max_tokens=max_tokens,
                )
                return resp.choices[0].message.content or ''
            finally:
                _reset_dispatched_ctx(cv_tok, tr_tok)

    def judge_fn(system, user):
        if not event_loop.is_running():
            raise RuntimeError(
                'captured event loop is no longer running — user app likely shut down'
            )
        # Capture the worker state in the calling thread and hand it to the
        # coroutine as an argument, so it is available on the loop's thread too.
        state = _get_eval_worker_state()
        future = asyncio.run_coroutine_threadsafe(_call(state, system, user), event_loop)
        # Bound the wait so a hung or overloaded user loop cannot block the caller
        # forever; a falsy timeout means "wait without limit".
        timeout = _judge_timeout_sec()
        try:
            return future.result(timeout=timeout) if timeout else future.result()
        except Exception:
            # Cancel so an abandoned task does not linger on the user's loop.
            try:
                future.cancel()
            except Exception:
                pass
            raise
    return judge_fn
343
+
344
+
345
def _judge_timeout_sec():
    """Return the judge-call timeout in seconds, or None for "no limit".

    Reads ``Configure.llm_eval_judge_timeout_sec`` (default 30).  The value is
    converted with ``float()`` because, per the original author's note, the
    configuration may hold it as a string after a reload.

    - missing attribute, unparsable value, or NaN -> 30.0 (the default)
    - zero, negative, or infinite value            -> None (wait without limit)
    - any other positive number                    -> that number as float

    Infinity is mapped to None because a blocking wait with an infinite timeout
    raises OverflowError instead of waiting forever.
    """
    import math
    from whatap.conf.configure import Configure as conf
    raw = getattr(conf, 'llm_eval_judge_timeout_sec', 30)
    try:
        v = float(raw)
    except (TypeError, ValueError):
        return 30.0
    if math.isnan(v):
        # NaN is not a usable duration; treat it like any other invalid value.
        return 30.0
    if v <= 0 or math.isinf(v):
        return None
    return v
357
+
358
+
359
def make_openai_judge(client=None, model='gpt-4o-mini', api_key=None, base_url=None,
                      temperature=0.0, max_tokens=2048):
    """Create a judge_fn backed by an OpenAI-compatible chat completions API.

    :param client: existing client object exposing ``chat.completions.create``;
        re-used as is when given.
    :param model: judge model name.
    :param api_key: passed to ``openai.OpenAI`` when no client is given (if truthy).
    :param base_url: OpenAI-compatible endpoint, used when no client is given
        (if truthy).
    :param temperature: forwarded to the completion call.
    :param max_tokens: forwarded to the completion call.
    :return: ``(system, user) -> str`` callable returning the first choice's
        message content, or ``''`` when that content is empty/None.
    """
    holder = {'client': client}

    def _ensure_client():
        # Create the client lazily on first use so that merely building the judge
        # neither imports openai nor needs credentials.
        existing = holder['client']
        if existing is not None:
            return existing
        import openai
        options = {}
        if api_key:
            options['api_key'] = api_key
        if base_url:
            options['base_url'] = base_url
        holder['client'] = openai.OpenAI(**options)
        return holder['client']

    def judge_fn(system, user):
        active = _ensure_client()
        conversation = [
            {'role': 'system', 'content': system},
            {'role': 'user', 'content': user},
        ]
        completion = active.chat.completions.create(
            model=model,
            messages=conversation,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        answer = completion.choices[0].message.content
        return answer or ''
    return judge_fn
391
+
392
+
393
def make_anthropic_judge(client=None, model='claude-3-5-haiku-latest', api_key=None,
                         max_tokens=2048, temperature=0.0):
    """Create a judge_fn backed by the Anthropic messages API.

    :param client: existing client object exposing ``messages.create``; re-used as
        is when given.
    :param model: judge model name.
    :param api_key: passed to ``anthropic.Anthropic`` when no client is given
        (if truthy).
    :param max_tokens: forwarded to the messages call.
    :param temperature: forwarded to the messages call.
    :return: ``(system, user) -> str`` callable returning the text of the first
        ``text`` content block, or ``''`` when there is none or it is empty.
    """
    holder = {'client': client}

    def judge_fn(system, user):
        if holder['client'] is None:
            # Lazy creation: anthropic is imported only when actually needed.
            import anthropic
            options = {}
            if api_key:
                options['api_key'] = api_key
            holder['client'] = anthropic.Anthropic(**options)
        reply = holder['client'].messages.create(
            model=model,
            system=system,
            messages=[{'role': 'user', 'content': user}],
            max_tokens=max_tokens,
            temperature=temperature,
        )
        first_text = next(
            (part for part in reply.content if getattr(part, 'type', None) == 'text'),
            None,
        )
        if first_text is None:
            return ''
        return first_text.text or ''
    return judge_fn
419
+
420
+
421
# ── parsing helpers ──

# Greedy DOTALL pattern: matches from the first '{' to the last '}' of a text,
# across newlines.
_JSON_BLOCK = re.compile(r'\{.*\}', re.DOTALL)
424
+
425
+
426
+ def _salvage_truncated_object(text):
427
+ """잘린 (truncated) JSON 객체에서 **완전하게 끝난 top-level 항목만** 살려낸다.
428
+
429
+ judge 응답이 토큰 한도 등으로 중간에서 끊기면 닫는 ``}`` 가 없어 정상 파싱이
430
+ 전부 실패한다 → 5 aspect 점수가 통째로 날아감. 이 함수는 마지막 안전망:
431
+ outer 객체를 스캔하다 depth 1 (top-level) 에서 만난 마지막 콤마까지를 잘라
432
+ ``}`` 로 닫아 — 그 전까지 **완전히 끝난 key:value 쌍** 들만 복구한다.
433
+ 중간에 끊긴 마지막 항목은 버린다.
434
+
435
+ flat 객체 (개별 aspect judge: ``{"score":..,"reasoning":..}``) 와 중첩 객체
436
+ (combined judge: ``{"hallucination":{..},"toxicity":{..}}``) 모두에 동작 —
437
+ top-level 콤마만 경계로 본다.
438
+
439
+ :return: 복구된 dict, 또는 복구 불가 시 None.
440
+ """
441
+ start = text.find('{')
442
+ if start < 0:
443
+ return None
444
+ s = text[start:]
445
+ in_str = False
446
+ esc = False
447
+ depth = 0
448
+ last_safe = None # depth 1 의 마지막 콤마 인덱스 — s[:last_safe] + '}' 가 valid
449
+ for i, ch in enumerate(s):
450
+ if in_str:
451
+ if esc:
452
+ esc = False
453
+ elif ch == '\\':
454
+ esc = True
455
+ elif ch == '"':
456
+ in_str = False
457
+ continue
458
+ if ch == '"':
459
+ in_str = True
460
+ elif ch == '{' or ch == '[':
461
+ depth += 1
462
+ elif ch == '}' or ch == ']':
463
+ depth -= 1
464
+ if depth == 0:
465
+ # outer 객체가 정상적으로 닫혔다 (truncation 아님) → 그대로 파싱
466
+ try:
467
+ return json.loads(s[:i + 1])
468
+ except ValueError:
469
+ return None
470
+ elif ch == ',' and depth == 1:
471
+ last_safe = i # 이 콤마 앞까지가 완전히 끝난 쌍들
472
+ if last_safe is None:
473
+ return None
474
+ try:
475
+ return json.loads(s[:last_safe] + '}')
476
+ except ValueError:
477
+ return None
478
+
479
+
480
def parse_json_response(raw):
    """Extract and parse the JSON object contained in a judge's raw text reply.

    LLM replies often wrap the JSON in a ```json ... ``` fence or surround it with
    prose, so parsing is attempted in three steps:

      1) the whole (fence-stripped) text,
      2) the span from the first ``{`` to the last ``}``,
      3) ``_salvage_truncated_object`` -- recovers the finished top-level entries
         of a reply that was cut off mid-way (a warning is logged).

    Only a JSON *object* is accepted.  If the whole text parses to some other JSON
    value (list, number, string, ...), the object inside it -- if any -- is
    extracted by step 2 instead of handing a non-dict to the caller.

    :param raw: the judge's reply text.
    :return: the parsed dict.
    :raises ValueError: if ``raw`` is empty or no JSON object can be extracted.
    """
    if not raw:
        raise ValueError('empty judge response')
    text = raw.strip()
    if text.startswith('```'):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        first_nl = text.find('\n')
        if first_nl > 0:
            text = text[first_nl + 1:]
        if text.endswith('```'):
            text = text[:-3]
        text = text.strip()

    # 1) the whole text is JSON (json.JSONDecodeError is a ValueError)
    try:
        parsed = json.loads(text)
    except ValueError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # 2) outermost {...} span: first '{' through last '}'
    start = text.find('{')
    end = text.rfind('}')
    if 0 <= start < end:
        try:
            return json.loads(text[start:end + 1])
        except ValueError:
            pass

    # 3) last resort: keep only the finished top-level entries of a truncated object
    salvaged = _salvage_truncated_object(text)
    if salvaged:
        logging.warning(
            '[LLM] recovered truncated judge JSON — %d top-level key(s) salvaged, '
            'rest dropped (response likely hit max_tokens)' % len(salvaged),
            extra={'id': 'LLM065'},
        )
        return salvaged
    raise ValueError('no JSON object found in judge response: %r' % raw[:200])