whatap-python 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227) hide show
  1. whatap/LICENSE +0 -0
  2. whatap/README.rst +49 -0
  3. whatap/__init__.py +923 -0
  4. whatap/__main__.py +4 -0
  5. whatap/agent/darwin/amd64/whatap_python +0 -0
  6. whatap/agent/darwin/arm64/whatap_python +0 -0
  7. whatap/agent/linux/amd64/whatap_python +0 -0
  8. whatap/agent/linux/arm64/whatap_python +0 -0
  9. whatap/agent/windows/whatap_python.exe +0 -0
  10. whatap/bootstrap/__init__.py +0 -0
  11. whatap/bootstrap/sitecustomize.py +19 -0
  12. whatap/build.py +4 -0
  13. whatap/conf/__init__.py +0 -0
  14. whatap/conf/configuration.py +280 -0
  15. whatap/conf/configure.py +105 -0
  16. whatap/conf/license.py +49 -0
  17. whatap/control/__init__.py +0 -0
  18. whatap/counter/__init__.py +14 -0
  19. whatap/counter/counter_manager.py +45 -0
  20. whatap/counter/tasks/__init__.py +3 -0
  21. whatap/counter/tasks/base_task.py +26 -0
  22. whatap/counter/tasks/llm_evaluator_task.py +501 -0
  23. whatap/counter/tasks/llm_log_sink_task.py +309 -0
  24. whatap/counter/tasks/llm_stat_task.py +78 -0
  25. whatap/counter/tasks/openfiledescriptor.py +67 -0
  26. whatap/io/__init__.py +1 -0
  27. whatap/io/data_inputx.py +161 -0
  28. whatap/io/data_outputx.py +262 -0
  29. whatap/llm/__init__.py +17 -0
  30. whatap/llm/definitions.py +43 -0
  31. whatap/llm/evaluators/__init__.py +136 -0
  32. whatap/llm/evaluators/base.py +114 -0
  33. whatap/llm/evaluators/builtins/__init__.py +91 -0
  34. whatap/llm/evaluators/builtins/answer_relevance.py +46 -0
  35. whatap/llm/evaluators/builtins/combined_judge.py +271 -0
  36. whatap/llm/evaluators/builtins/factuality.py +71 -0
  37. whatap/llm/evaluators/builtins/hallucination.py +97 -0
  38. whatap/llm/evaluators/builtins/llm_judge.py +516 -0
  39. whatap/llm/evaluators/builtins/pii_leak.py +214 -0
  40. whatap/llm/evaluators/builtins/prompt_injection.py +71 -0
  41. whatap/llm/evaluators/builtins/toxicity.py +53 -0
  42. whatap/llm/evaluators/builtins/url_scan.py +194 -0
  43. whatap/llm/evaluators/registry.py +192 -0
  44. whatap/llm/evaluators/sampler.py +83 -0
  45. whatap/llm/evaluators/scope.py +334 -0
  46. whatap/llm/features.py +66 -0
  47. whatap/llm/log_sink_packs/__init__.py +9 -0
  48. whatap/llm/log_sink_packs/llm_input_message.py +16 -0
  49. whatap/llm/log_sink_packs/llm_log_sink_pack.py +72 -0
  50. whatap/llm/log_sink_packs/llm_output_message.py +19 -0
  51. whatap/llm/log_sink_packs/llm_step_eval_status.py +94 -0
  52. whatap/llm/log_sink_packs/llm_step_status.py +118 -0
  53. whatap/llm/log_sink_packs/llm_system_message.py +16 -0
  54. whatap/llm/log_sink_packs/llm_tool_calls.py +44 -0
  55. whatap/llm/log_sink_packs/llm_tool_results.py +16 -0
  56. whatap/llm/log_sink_packs/llm_tx_status.py +108 -0
  57. whatap/llm/pricing.py +236 -0
  58. whatap/llm/prompt_meta.py +288 -0
  59. whatap/llm/providers/__init__.py +0 -0
  60. whatap/llm/providers/anthropic/__init__.py +37 -0
  61. whatap/llm/providers/anthropic/messages/__init__.py +0 -0
  62. whatap/llm/providers/anthropic/messages/messages.py +70 -0
  63. whatap/llm/providers/anthropic/messages/messages_context.py +76 -0
  64. whatap/llm/providers/anthropic/messages/messages_extractor.py +126 -0
  65. whatap/llm/providers/interceptor.py +182 -0
  66. whatap/llm/providers/openai/__init__.py +133 -0
  67. whatap/llm/providers/openai/chat/__init__.py +0 -0
  68. whatap/llm/providers/openai/chat/chat.py +82 -0
  69. whatap/llm/providers/openai/chat/chat_context.py +78 -0
  70. whatap/llm/providers/openai/chat/chat_extractor.py +127 -0
  71. whatap/llm/providers/openai/completions/__init__.py +0 -0
  72. whatap/llm/providers/openai/completions/completions.py +70 -0
  73. whatap/llm/providers/openai/completions/completions_context.py +31 -0
  74. whatap/llm/providers/openai/completions/completions_extractor.py +61 -0
  75. whatap/llm/providers/openai/content_parser.py +41 -0
  76. whatap/llm/providers/openai/embeddings/__init__.py +0 -0
  77. whatap/llm/providers/openai/embeddings/embeddings.py +59 -0
  78. whatap/llm/providers/openai/embeddings/embeddings_context.py +25 -0
  79. whatap/llm/providers/openai/embeddings/embeddings_extractor.py +26 -0
  80. whatap/llm/providers/openai/responses/__init__.py +0 -0
  81. whatap/llm/providers/openai/responses/responses.py +70 -0
  82. whatap/llm/providers/openai/responses/responses_context.py +88 -0
  83. whatap/llm/providers/openai/responses/responses_extractor.py +126 -0
  84. whatap/llm/providers/stream_accumulator.py +73 -0
  85. whatap/llm/stats/__init__.py +35 -0
  86. whatap/llm/stats/active_stat.py +86 -0
  87. whatap/llm/stats/answer_relevance_eval_stat.py +10 -0
  88. whatap/llm/stats/api_status_stat.py +35 -0
  89. whatap/llm/stats/base_stat.py +107 -0
  90. whatap/llm/stats/combined_judge_eval_stat.py +11 -0
  91. whatap/llm/stats/error_stat.py +59 -0
  92. whatap/llm/stats/eval_stat.py +225 -0
  93. whatap/llm/stats/factuality_eval_stat.py +10 -0
  94. whatap/llm/stats/feature_stat.py +104 -0
  95. whatap/llm/stats/finish_stat.py +105 -0
  96. whatap/llm/stats/hallucination_eval_stat.py +10 -0
  97. whatap/llm/stats/meter.py +18 -0
  98. whatap/llm/stats/perf_stat.py +117 -0
  99. whatap/llm/stats/pii_leak_eval_stat.py +12 -0
  100. whatap/llm/stats/prompt_injection_eval_stat.py +10 -0
  101. whatap/llm/stats/token_usage_stat.py +133 -0
  102. whatap/llm/stats/toxicity_eval_stat.py +10 -0
  103. whatap/llm/stats/url_scan_eval_stat.py +12 -0
  104. whatap/net/__init__.py +0 -0
  105. whatap/net/async_sender.py +107 -0
  106. whatap/net/packet_enum.py +44 -0
  107. whatap/net/packet_type_enum.py +31 -0
  108. whatap/net/param_def.py +69 -0
  109. whatap/net/stackhelper.py +87 -0
  110. whatap/net/udp_session.py +394 -0
  111. whatap/net/udp_thread.py +54 -0
  112. whatap/pack/__init__.py +0 -0
  113. whatap/pack/logSinkPack.py +77 -0
  114. whatap/pack/pack.py +34 -0
  115. whatap/pack/pack_enum.py +41 -0
  116. whatap/pack/tagCountPack.py +61 -0
  117. whatap/scripts/__init__.py +208 -0
  118. whatap/trace/__init__.py +12 -0
  119. whatap/trace/mod/__init__.py +0 -0
  120. whatap/trace/mod/amqp/__init__.py +0 -0
  121. whatap/trace/mod/amqp/kombu.py +122 -0
  122. whatap/trace/mod/amqp/pika.py +62 -0
  123. whatap/trace/mod/application/__init__.py +0 -0
  124. whatap/trace/mod/application/bottle.py +34 -0
  125. whatap/trace/mod/application/celery.py +81 -0
  126. whatap/trace/mod/application/cherrypy.py +30 -0
  127. whatap/trace/mod/application/django.py +287 -0
  128. whatap/trace/mod/application/django_asgi.py +266 -0
  129. whatap/trace/mod/application/django_py3.py +251 -0
  130. whatap/trace/mod/application/fastapi/__init__.py +31 -0
  131. whatap/trace/mod/application/fastapi/endpoint.py +73 -0
  132. whatap/trace/mod/application/fastapi/exception_log.py +63 -0
  133. whatap/trace/mod/application/fastapi/instrumentation.py +204 -0
  134. whatap/trace/mod/application/fastapi/scope.py +115 -0
  135. whatap/trace/mod/application/fastapi/transaction.py +67 -0
  136. whatap/trace/mod/application/flask.py +52 -0
  137. whatap/trace/mod/application/frappe.py +224 -0
  138. whatap/trace/mod/application/graphql.py +170 -0
  139. whatap/trace/mod/application/nameko.py +39 -0
  140. whatap/trace/mod/application/odoo.py +63 -0
  141. whatap/trace/mod/application/starlette.py +126 -0
  142. whatap/trace/mod/application/tornado.py +163 -0
  143. whatap/trace/mod/application/wsgi.py +195 -0
  144. whatap/trace/mod/database/__init__.py +0 -0
  145. whatap/trace/mod/database/cxoracle.py +49 -0
  146. whatap/trace/mod/database/mongo.py +169 -0
  147. whatap/trace/mod/database/mysql.py +80 -0
  148. whatap/trace/mod/database/neo4j.py +90 -0
  149. whatap/trace/mod/database/psycopg2.py +45 -0
  150. whatap/trace/mod/database/psycopg3.py +359 -0
  151. whatap/trace/mod/database/redis.py +122 -0
  152. whatap/trace/mod/database/sqlalchemy.py +213 -0
  153. whatap/trace/mod/database/sqlite3.py +130 -0
  154. whatap/trace/mod/database/util.py +630 -0
  155. whatap/trace/mod/email/__init__.py +0 -0
  156. whatap/trace/mod/email/smtp.py +78 -0
  157. whatap/trace/mod/httpc/__init__.py +0 -0
  158. whatap/trace/mod/httpc/django.py +31 -0
  159. whatap/trace/mod/httpc/httplib.py +70 -0
  160. whatap/trace/mod/httpc/httpx.py +62 -0
  161. whatap/trace/mod/httpc/requests.py +20 -0
  162. whatap/trace/mod/httpc/urllib3.py +27 -0
  163. whatap/trace/mod/httpc/util.py +388 -0
  164. whatap/trace/mod/logging.py +161 -0
  165. whatap/trace/mod/plugin.py +84 -0
  166. whatap/trace/mod/standalone/__init__.py +0 -0
  167. whatap/trace/mod/standalone/multiple.py +293 -0
  168. whatap/trace/mod/standalone/single.py +135 -0
  169. whatap/trace/simple_trace_context.py +18 -0
  170. whatap/trace/trace_context.py +212 -0
  171. whatap/trace/trace_context_manager.py +244 -0
  172. whatap/trace/trace_error.py +84 -0
  173. whatap/trace/trace_handler.py +89 -0
  174. whatap/trace/trace_import.py +91 -0
  175. whatap/trace/trace_module_definition.py +156 -0
  176. whatap/util/__init__.py +0 -0
  177. whatap/util/bit_util.py +49 -0
  178. whatap/util/cardinality/__init__.py +0 -0
  179. whatap/util/cardinality/hyperloglog.py +84 -0
  180. whatap/util/cardinality/murmurhash.py +20 -0
  181. whatap/util/cardinality/registerset.py +60 -0
  182. whatap/util/compare_util.py +19 -0
  183. whatap/util/date_util.py +55 -0
  184. whatap/util/debug_util.py +73 -0
  185. whatap/util/escape_literal_sql.py +233 -0
  186. whatap/util/frame_util.py +20 -0
  187. whatap/util/hash_util.py +103 -0
  188. whatap/util/hexa32.py +66 -0
  189. whatap/util/int_set.py +199 -0
  190. whatap/util/ip_util.py +63 -0
  191. whatap/util/keygen.py +11 -0
  192. whatap/util/linked_list.py +113 -0
  193. whatap/util/linked_map.py +359 -0
  194. whatap/util/metering_util.py +103 -0
  195. whatap/util/request_double_queue.py +68 -0
  196. whatap/util/request_queue.py +60 -0
  197. whatap/util/string_util.py +20 -0
  198. whatap/util/throttle_util.py +99 -0
  199. whatap/util/userid_util.py +134 -0
  200. whatap/value/__init__.py +1 -0
  201. whatap/value/blob_value.py +38 -0
  202. whatap/value/boolean_value.py +33 -0
  203. whatap/value/decimal_value.py +36 -0
  204. whatap/value/double_summary.py +86 -0
  205. whatap/value/double_value.py +33 -0
  206. whatap/value/float_array.py +42 -0
  207. whatap/value/float_value.py +34 -0
  208. whatap/value/int_array.py +42 -0
  209. whatap/value/ip4_value.py +50 -0
  210. whatap/value/list_value.py +105 -0
  211. whatap/value/long_array.py +44 -0
  212. whatap/value/long_summary.py +83 -0
  213. whatap/value/map_value.py +154 -0
  214. whatap/value/null_value.py +21 -0
  215. whatap/value/number_value.py +33 -0
  216. whatap/value/summary_value.py +39 -0
  217. whatap/value/text_array.py +58 -0
  218. whatap/value/text_hash_value.py +37 -0
  219. whatap/value/text_value.py +43 -0
  220. whatap/value/value.py +26 -0
  221. whatap/value/value_enum.py +80 -0
  222. whatap/whatap.conf +14 -0
  223. whatap_python-2.1.0.dist-info/METADATA +87 -0
  224. whatap_python-2.1.0.dist-info/RECORD +227 -0
  225. whatap_python-2.1.0.dist-info/WHEEL +5 -0
  226. whatap_python-2.1.0.dist-info/entry_points.txt +6 -0
  227. whatap_python-2.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,225 @@
1
+ """llm_eval_stat 카테고리 — LLM 평가 호출 통계 + 평가 점수 stat 공통 베이스.
2
+
3
+ 이 모듈은 LLM 평가 메트릭의 메인 진입점:
4
+
5
+ EvalStat — llm_eval_stat 카테고리 (호출 통계)
6
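+ ScoreHistogramStat — shared base for the eight score stats (hallucination /
7
+ answer_relevance / toxicity / combined_judge / prompt_injection / factuality / pii_leak / url_scan)
8
+ update_eval_metrics() — helper that updates EvalStat and every matching score stat after one evaluator run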
9
+
10
+ Tags (모든 카테고리 공통):
11
+ pid / model / provider / operation_type / url / prompt_version
12
+
13
+ Fields:
14
+ llm_eval_stat:
15
+ call_count — judge LLM API 호출 횟수 (judge_fn 실제 호출만)
16
+ failures — 평가 실패 횟수 (judge HTTP 에러 / parse 에러)
17
+ latency_sum — judge 호출 소요 시간 합 (ms)
18
+ latency_sketch — KLL sketch (datasketches 설치 시) — p50/p95/p99 분위수용
19
+
20
+ llm_eval_hallucination / llm_eval_answer_relevance /
21
+ llm_eval_toxicity / llm_eval_combined_judge /
22
+ llm_eval_prompt_injection / llm_eval_factuality /
23
+ llm_eval_pii_leak / llm_eval_url_scan:
24
+ value0 ~ value10 — 11-bucket 점수 카운트.
25
+ value0 : 0.0 ≤ s < 0.1
26
+ value1 : 0.1 ≤ s < 0.2
27
+ ...
28
+ value9 : 0.9 ≤ s < 1.0
29
+ value10 : s == 1.0 (만점만 별도)
30
+ """
31
+ from collections import defaultdict
32
+
33
+ from whatap.llm.stats.base_stat import BaseStat
34
+
35
+ try:
36
+ from datasketches import kll_doubles_sketch
37
+ HAS_DATASKETCHES = True
38
+ except ImportError:
39
+ HAS_DATASKETCHES = False
40
+
41
+ _SKETCH_K = 200
42
+
43
+
44
+ # ─────────────────────────────────────────────────────────────────────────
45
+ # llm_eval_stat — 평가 호출 통계 (call_count / failures / latency)
46
+ # ─────────────────────────────────────────────────────────────────────────
47
+
48
class EvalStat(BaseStat):
    """Call statistics for LLM evaluation runs (category ``llm_eval_stat``).

    For each (model, provider, operation_type, url, prompt_version) key this
    stat accumulates the number of judge calls, the number of failed calls
    and the summed call latency. Score distributions are not handled here;
    the ``ScoreHistogramStat`` subclasses take care of those.
    """

    _category = 'llm_eval_stat'
    _conf_enabled_key = 'llm_eval_enabled'

    def _use_sketch(self):
        """Tell whether a latency sketch should be collected.

        Needs the optional ``datasketches`` import to have succeeded and the
        ``llm_perf_sketch_enabled`` setting (default ``True``) to be truthy.
        """
        from whatap.conf.configure import Configure as conf
        if not HAS_DATASKETCHES:
            return False
        return getattr(conf, 'llm_perf_sketch_enabled', True)

    def _get_sketch_k(self):
        """Return the sketch ``k`` parameter from configuration as an int."""
        from whatap.conf.configure import Configure as conf
        configured = getattr(conf, 'llm_perf_sketch_k', _SKETCH_K)
        return int(configured)

    def _empty_stats(self):
        """Build a fresh, empty accumulator: one defaultdict per field."""
        counters = {
            'call_count': defaultdict(int),
            'failures': defaultdict(int),
            'latency_sum': defaultdict(float),
        }
        if not self._use_sketch():
            return counters
        sketch_k = self._get_sketch_k()
        counters['latency_sketch'] = defaultdict(
            lambda: kll_doubles_sketch(sketch_k))
        return counters

    def _get_keys(self, stats):
        """Return the keys that have at least one recorded call."""
        return stats['call_count'].keys()

    def update_call(self, model, provider, operation_type, url, prompt_version,
                    called_judge, success, latency_ms):
        """Record one finished evaluation call.

        Nothing is counted when ``called_judge`` is falsy (the case where no
        judge function was invoked at all). Missing tag values fall back to
        ``'unknown'`` / ``''`` / ``'default'`` / ``'v1'``.

        Args:
            model, provider, operation_type, url, prompt_version: tag values
                that together form the aggregation key.
            called_judge: whether the judge was actually called.
            success: falsy values increment ``failures``.
            latency_ms: call duration; values that are ``None``, not
                convertible to float, or not greater than zero are ignored.
        """
        if not called_judge:
            return
        key = (
            model or 'unknown',
            provider or '',
            operation_type or 'default',
            url or '',
            prompt_version or 'v1',
        )
        with self._lock:
            table = self._stats
            table['call_count'][key] += 1
            if not success:
                table['failures'][key] += 1

            if latency_ms is None:
                elapsed = 0.0
            else:
                try:
                    elapsed = float(latency_ms)
                except (TypeError, ValueError):
                    elapsed = 0.0
            if elapsed > 0:
                table['latency_sum'][key] += elapsed
                if 'latency_sketch' in table:
                    table['latency_sketch'][key].update(elapsed)

    def _build_fields(self, pack, stats, key):
        """Write the accumulated values for ``key`` into ``pack.fields``."""
        for name in ('call_count', 'failures', 'latency_sum'):
            pack.fields.putAuto(name, stats[name][key])

        if 'latency_sketch' not in stats:
            return
        from whatap.value.blob_value import BlobValue
        sketch = stats['latency_sketch'].get(key)
        # Only ship a sketch that actually received samples.
        if sketch is not None and not sketch.is_empty():
            pack.fields.putValue('latency_sketch', BlobValue(sketch.serialize()))
109
+
110
+
111
+ # ─────────────────────────────────────────────────────────────────────────
112
+ # Shared score-histogram base — each evaluator-label stat subclasses it and only sets _category
113
+ # ─────────────────────────────────────────────────────────────────────────
114
+
115
+ _BUCKET_COUNT = 11 # value0 ~ value10
116
+
117
+
118
+ def _bucket_idx(score):
119
+ """0.0 ~ 1.0 score 를 11-bucket index (0~10) 로 변환.
120
+
121
+ bucket 정의:
122
+ value0 : 0.0 ≤ s < 0.1
123
+ value1 : 0.1 ≤ s < 0.2
124
+ ...
125
+ value9 : 0.9 ≤ s < 1.0
126
+ value10 : s == 1.0 (만점만 별도 bucket)
127
+
128
+ 범위 밖이면 클램프 (음수 → 0, 1.0 초과 → 10), 변환 실패 시 0 (방어).
129
+ """
130
+ try:
131
+ s = float(score)
132
+ except (TypeError, ValueError):
133
+ return 0
134
+ if s <= 0.0:
135
+ return 0
136
+ if s >= 1.0:
137
+ return 10
138
+ idx = int(s * 10)
139
+ if idx > 10:
140
+ idx = 10
141
+ return idx
142
+
143
+
144
class ScoreHistogramStat(BaseStat):
    """Base class for per-label evaluation score histograms.

    Subclasses only set ``_category``. Each aggregation key owns a list of
    ``_BUCKET_COUNT`` counters that are emitted as fields ``value0`` ..
    ``value10``; ``_bucket_idx`` decides which counter a score lands in.
    """

    _conf_enabled_key = 'llm_eval_enabled'

    def _empty_stats(self):
        """Return an empty accumulator: key -> list of zeroed bucket counters."""
        def zeroed_buckets():
            return [0] * _BUCKET_COUNT

        return defaultdict(zeroed_buckets)

    def _get_keys(self, stats):
        """Return every key that has recorded at least one score."""
        return stats.keys()

    def update_score(self, model, provider, operation_type, url, prompt_version, score):
        """Count one score in the bucket it belongs to.

        A ``None`` score is ignored. Missing tag values fall back to
        ``'unknown'`` / ``''`` / ``'default'`` / ``'v1'``.
        """
        if score is None:
            return
        key = (
            model or 'unknown',
            provider or '',
            operation_type or 'default',
            url or '',
            prompt_version or 'v1',
        )
        bucket = _bucket_idx(score)
        with self._lock:
            self._stats[key][bucket] += 1

    def _build_fields(self, pack, stats, key):
        """Write the bucket counters for ``key`` as ``value<N>`` fields."""
        counters = stats.get(key)
        if not counters:
            counters = [0] * _BUCKET_COUNT
        for position, total in enumerate(counters):
            pack.fields.putAuto('value%d' % position, total)
176
+
177
+
178
+ # ─────────────────────────────────────────────────────────────────────────
179
+ # Routing helper — updates EvalStat and the matching score stats right after one evaluator run
180
+ # ─────────────────────────────────────────────────────────────────────────
181
+
182
+ # evaluator label → score-stat 클래스명 매핑.
183
+ # LlmEvaluatorTask._run_one 이 score 별로 적절한 stat 으로 라우팅.
184
+ _LABEL_TO_STAT_NAME = {
185
+ 'hallucination': 'HallucinationEvalStat',
186
+ 'answer_relevance': 'AnswerRelevanceEvalStat',
187
+ 'toxicity': 'ToxicityEvalStat',
188
+ 'prompt_injection': 'PromptInjectionEvalStat',
189
+ 'factuality': 'FactualityEvalStat',
190
+ 'pii_leak': 'PIILeakEvalStat',
191
+ 'url_scan': 'URLScanEvalStat',
192
+ 'combined_judge': 'CombinedJudgeEvalStat',
193
+ }
194
+
195
+
196
def update_eval_metrics(model, provider, operation_type, url, prompt_version,
                        called_judge, success, latency_ms, scores=None):
    """Update call statistics and score histograms after one evaluation run.

    Per the original note this is meant to be called by
    ``LlmEvaluatorTask._run_one`` right after ``evaluator.evaluate()``.

    Args:
        model, provider, operation_type, url, prompt_version: tag values
            forwarded unchanged to every stat that gets updated.
        called_judge, success, latency_ms: forwarded to
            ``EvalStat.update_call``.
        scores: optional mapping of evaluator label -> score. Labels without
            an entry in ``_LABEL_TO_STAT_NAME`` and stats that are not
            registered are skipped silently.
    """
    from whatap.counter.tasks.llm_stat_task import LlmStatTask

    tags = {
        'model': model,
        'provider': provider,
        'operation_type': operation_type,
        'url': url,
        'prompt_version': prompt_version,
    }

    # 1. EvalStat: call statistics.
    call_stat = LlmStatTask.get_stat('EvalStat')
    if call_stat is not None:
        call_stat.update_call(
            called_judge=called_judge, success=success, latency_ms=latency_ms,
            **tags
        )

    # 2. Score histograms: route only the labels that have a mapped stat.
    if not scores:
        return
    for label, score in scores.items():
        stat_name = _LABEL_TO_STAT_NAME.get(label)
        if stat_name is None:
            continue
        histogram = LlmStatTask.get_stat(stat_name)
        if histogram is None:
            continue
        histogram.update_score(score=score, **tags)
@@ -0,0 +1,10 @@
1
+ """llm_eval_factuality 카테고리 — Factuality 평가 점수 히스토그램.
2
+
3
+ 11-bucket 분포 (value0~value10). 자세한 동작은 ``eval_stat.ScoreHistogramStat``
4
+ docstring 참고.
5
+ """
6
+ from whatap.llm.stats.eval_stat import ScoreHistogramStat
7
+
8
+
9
class FactualityEvalStat(ScoreHistogramStat):
    """Score histogram reported under the ``llm_eval_factuality`` category."""

    _category = 'llm_eval_factuality'
@@ -0,0 +1,104 @@
1
+ """llm_feature_stat 카테고리 — LLM 호출 feature 분포.
2
+
3
+ pack.features (csv) 의 raw 값을 그대로 누적. closed-set 검증 안 함 —
4
+ logsink 와 metric 이 동일한 데이터를 다룬다는 원칙. feature 이름은 list field
5
+ (@id / features / features_count) 안의 entry 이지 dimension (tag) 이 아니므로
6
+ 새 값이 들어와도 row cardinality 폭주는 없음.
7
+
8
+ feature 는 호출당 multi-valued (vision,tool_use,... / 0개 가능) 라 finish_stat 과
9
+ 달리 sum(features_count) != 호출 수다. features_count 는 "그 feature 를 쓴 호출
10
+ 수" 이고, 전체 호출 수(adoption rate 분모) 는 동일 5-tuple 키의 llm_perf_stat.
11
+ call_count 를 쓴다. feature 가 없는 호출은 어떤 feature 에도 안 잡히므로 (none
12
+ 버킷 없음), feature 를 하나도 안 쓴 key 는 이 메트릭에 등장하지 않는다.
13
+
14
+ Tags : pid / model / provider / operation_type / url / prompt_version (+ !rectype=2)
15
+ Fields:
16
+ @id — (model, provider, op_type, prompt_version, feature) 해시 list
17
+ features — feature 이름 list (raw value 그대로)
18
+ features_count — feature 별 카운트 list
19
+ """
20
+ from collections import defaultdict
21
+
22
+ from whatap.llm.stats.base_stat import BaseStat
23
+ from whatap.util.hash_util import HashUtil
24
+
25
+
26
class FeatureStat(BaseStat):
    """Feature usage distribution (category ``llm_feature_stat``).

    Counts, per (model, provider, operation_type, url, prompt_version) key,
    how many calls used each feature name. Feature names are taken as-is
    from the comma-separated ``features`` string; calls without any feature
    are not recorded at all.
    """

    _category = "llm_feature_stat"
    _is_listed = True

    def _empty_stats(self):
        """Return an empty accumulator: key -> {feature name: count}."""
        return {
            'features': defaultdict(lambda: defaultdict(int)),
        }

    def _get_keys(self, stats):
        """Return the keys that recorded at least one feature."""
        return stats['features'].keys()

    def update_stats(self, model_name, host, operation_type, url='',
                     features='', prompt_version='v1'):
        """Count each feature listed in the comma-separated ``features``.

        Empty ``features`` is a no-op. Entries are stripped of surrounding
        whitespace and blank entries are dropped.
        """
        if not features:
            return
        key = (
            model_name or 'unknown',
            host or '',
            operation_type or 'default',
            url or '',
            prompt_version or 'v1',
        )
        names = [part for part in (raw.strip() for raw in features.split(',')) if part]
        with self._lock:
            for name in names:
                self._stats['features'][key][name] += 1

    def update_from_pack(self, pack):
        """Feed one pack's tag values and ``features`` into ``update_stats``."""
        version = getattr(pack, 'prompt_version', 'v1') or 'v1'
        self.update_stats(
            pack.model or 'unknown',
            pack.provider or '',
            pack.operation_type or 'default',
            url=pack.url or '',
            features=pack.features or '',
            prompt_version=version,
        )

    def _build_fields_listed(self, pack, stats, keys, pid):
        """Flatten every (key, feature) pair into parallel list fields.

        One row is appended per feature of each key, so all nine lists end
        up with the same length:
            - key columns: pid / model / provider / operation_type / url /
              prompt_version
            - feature columns: @id / features / features_count
        """
        fields = pack.fields
        ids = fields.newList("@id")
        pids = fields.newList("pid")
        models = fields.newList("model")
        providers = fields.newList("provider")
        op_types = fields.newList("operation_type")
        urls = fields.newList("url")
        versions = fields.newList("prompt_version")

        feature_names = fields.newList("features")
        feature_counts = fields.newList("features_count")

        for key in keys:
            try:
                model, provider, op_type, url, prompt_version = key
            except ValueError:
                # Tolerate 4-item keys that carry no prompt_version.
                model, provider, op_type, url = key
                prompt_version = 'v1'

            counts = stats['features'].get(key) or {}

            # Sorted so rows come out in a stable order.
            for feat in sorted(counts):
                # NOTE(review): url is part of the key but not of the @id
                # hash input; confirm this is intended.
                row_id = HashUtil.hashFromString(
                    "{}:{}:{}:{}:{}".format(model, provider, op_type, prompt_version, feat))
                ids.addLong(row_id)
                feature_names.addString(feat)
                feature_counts.addLong(counts[feat])

                pids.addLong(pid)
                models.addString(model)
                providers.addString(provider)
                op_types.addString(op_type)
                urls.addString(url)
                versions.addString(prompt_version)
@@ -0,0 +1,105 @@
1
+ """llm_finish_stat 카테고리 — LLM 호출 finish_reason 분포.
2
+
3
+ pack.finish_reason 의 raw 값을 그대로 누적. closed-set 검증 안 함 — logsink 와
4
+ metric 이 동일한 데이터를 다룬다는 원칙. reason 이름은 list field 안의 entry
5
+ 이지 dimension (tag) 이 아니므로 cardinality 폭주 없음.
6
+
7
+ Provider 별로 들어오는 raw 값 예:
8
+ OpenAI Chat/Completions : stop / length / tool_calls / function_call / content_filter
9
+ OpenAI Responses (status) : completed / failed / incomplete / cancelled / in_progress
10
+ Anthropic Messages : end_turn / max_tokens / stop_sequence / tool_use
11
+
12
+ finish_reason 이 비는 호출(에러/중단/미완료 등 raw 값이 None/빈문자열) 은 "none"
13
+ 버킷으로 누적한다. 따라서 finish_reasons_count 의 합 == 해당 key 의 전체 호출 수
14
+ 이고, 배열 한 칸 = 그 finish_reason 으로 끝난 실제 호출 수로 유의미하다.
15
+ 별도 call_count 필드는 두지 않는다(llm_perf_stat 으로 일원화).
16
+
17
+ Tags : pid / model / provider / operation_type / url / prompt_version (+ !rectype=2)
18
+ Fields:
19
+ @id — (model, provider, op_type, prompt_version, reason) 해시 list
20
+ finish_reasons — reason 이름 list (raw value 그대로, 빈 값은 "none")
21
+ finish_reasons_count — reason 별 카운트 list
22
+ """
23
+ from collections import defaultdict
24
+
25
+ from whatap.llm.stats.base_stat import BaseStat
26
+ from whatap.util.hash_util import HashUtil
27
+
28
+
29
class FinishStat(BaseStat):
    """finish_reason distribution (category ``llm_finish_stat``).

    Counts, per (model, provider, operation_type, url, prompt_version) key,
    how many calls ended with each finish_reason. Raw values are kept as-is;
    an empty reason is counted under ``"none"``, so every recorded call
    lands in exactly one bucket.
    """

    _category = "llm_finish_stat"
    _is_listed = True

    def _empty_stats(self):
        """Return an empty accumulator: key -> {finish_reason: count}."""
        return {
            'reasons': defaultdict(lambda: defaultdict(int)),
        }

    def _get_keys(self, stats):
        """Return the keys that recorded at least one call."""
        return stats['reasons'].keys()

    def update_stats(self, model_name, host, operation_type, url='',
                     finish_reason='', prompt_version='v1'):
        """Count one call under its finish_reason.

        A falsy or whitespace-only ``finish_reason`` is counted as
        ``'none'``; anything else is stringified and stripped.
        """
        key = (
            model_name or 'unknown',
            host or '',
            operation_type or 'default',
            url or '',
            prompt_version or 'v1',
        )
        label = str(finish_reason).strip() if finish_reason else ''
        label = label or 'none'
        with self._lock:
            self._stats['reasons'][key][label] += 1

    def update_from_pack(self, pack):
        """Feed one pack's tag values and finish_reason into ``update_stats``."""
        reason = getattr(pack, 'finish_reason', '') or ''
        version = getattr(pack, 'prompt_version', 'v1') or 'v1'
        self.update_stats(
            pack.model or 'unknown',
            pack.provider or '',
            pack.operation_type or 'default',
            url=pack.url or '',
            finish_reason=reason,
            prompt_version=version,
        )

    def _build_fields_listed(self, pack, stats, keys, pid):
        """Flatten every (key, finish_reason) pair into parallel list fields.

        One row is appended per reason of each key, so all nine lists end up
        with the same length:
            - key columns: pid / model / provider / operation_type / url /
              prompt_version
            - reason columns: @id / finish_reasons / finish_reasons_count
        """
        fields = pack.fields
        ids = fields.newList("@id")
        pids = fields.newList("pid")
        models = fields.newList("model")
        providers = fields.newList("provider")
        op_types = fields.newList("operation_type")
        urls = fields.newList("url")
        versions = fields.newList("prompt_version")

        reason_names = fields.newList("finish_reasons")
        reason_totals = fields.newList("finish_reasons_count")

        for key in keys:
            try:
                model, provider, op_type, url, prompt_version = key
            except ValueError:
                # Tolerate 4-item keys that carry no prompt_version.
                model, provider, op_type, url = key
                prompt_version = 'v1'

            counts = stats['reasons'].get(key) or {}

            # Sorted so rows come out in a stable order.
            for reason in sorted(counts):
                # NOTE(review): url is part of the key but not of the @id
                # hash input; confirm this is intended.
                row_id = HashUtil.hashFromString(
                    "{}:{}:{}:{}:{}".format(model, provider, op_type, prompt_version, reason))
                ids.addLong(row_id)
                reason_names.addString(reason)
                reason_totals.addLong(counts[reason])

                pids.addLong(pid)
                models.addString(model)
                providers.addString(provider)
                op_types.addString(op_type)
                urls.addString(url)
                versions.addString(prompt_version)
@@ -0,0 +1,10 @@
1
+ """llm_eval_hallucination 카테고리 — Hallucination 평가 점수 히스토그램.
2
+
3
+ 11-bucket 분포 (value0~value10). 자세한 동작은 ``eval_stat.ScoreHistogramStat``
4
+ docstring 참고.
5
+ """
6
+ from whatap.llm.stats.eval_stat import ScoreHistogramStat
7
+
8
+
9
class HallucinationEvalStat(ScoreHistogramStat):
    """Score histogram reported under the ``llm_eval_hallucination`` category."""

    _category = 'llm_eval_hallucination'
@@ -0,0 +1,18 @@
1
+ import threading
2
+
3
+
4
class Meter:
    """Class-level counter shared by every caller and guarded by one lock."""

    _lock = threading.Lock()
    _count = 0

    @classmethod
    def increment(cls):
        """Add one to the counter."""
        with cls._lock:
            cls._count = cls._count + 1

    @classmethod
    def get_and_reset(cls):
        """Return the current count and set the counter back to zero."""
        with cls._lock:
            snapshot, cls._count = cls._count, 0
        return snapshot
@@ -0,0 +1,117 @@
1
+ from collections import defaultdict
2
+
3
+ from whatap.llm.stats.base_stat import BaseStat
4
+
5
+ try:
6
+ from datasketches import kll_doubles_sketch
7
+ HAS_DATASKETCHES = True
8
+ except ImportError:
9
+ HAS_DATASKETCHES = False
10
+
11
+ _SKETCH_K = 200
12
+
13
+
14
class PerfStat(BaseStat):
    """Per-call performance statistics (category ``llm_perf_stat``).

    For each (model, provider, operation_type, url, prompt_version) key this
    stat accumulates call / error / stream counts plus sums (and, when
    enabled, sketches) of latency, ttft and tpot.
    """

    _category = "llm_perf_stat"

    def _use_sketch(self):
        """Tell whether sketches should be collected.

        Needs the optional ``datasketches`` import to have succeeded and the
        ``llm_perf_sketch_enabled`` setting (default ``True``) to be truthy.
        """
        from whatap.conf.configure import Configure as conf
        if not HAS_DATASKETCHES:
            return False
        return getattr(conf, 'llm_perf_sketch_enabled', True)

    def _get_sketch_k(self):
        """Return the sketch ``k`` parameter from configuration as an int."""
        from whatap.conf.configure import Configure as conf
        configured = getattr(conf, 'llm_perf_sketch_k', _SKETCH_K)
        return int(configured)

    def _empty_stats(self):
        """Build a fresh, empty accumulator: one defaultdict per field."""
        counters = {
            'call_count': defaultdict(int),
            'error_count': defaultdict(int),
            'stream_count': defaultdict(int),
            'latency_sum': defaultdict(float),
            'ttft_sum': defaultdict(float),
            'ttft_count': defaultdict(int),
            'tpot_sum': defaultdict(float),
            'tpot_count': defaultdict(int),
        }
        if not self._use_sketch():
            return counters
        sketch_k = self._get_sketch_k()
        for name in ('latency_sketch', 'ttft_sketch', 'tpot_sketch'):
            counters[name] = defaultdict(lambda: kll_doubles_sketch(sketch_k))
        return counters

    def _get_keys(self, stats):
        """Return the keys that have at least one recorded call."""
        return stats['call_count'].keys()

    def update_stats(self, model_name, host, operation_type, url='',
                     latency=None, ttft=None, tpot=None, prompt_version='v1',
                     error_count=0, stream=False):
        """Record one call.

        ``call_count`` always goes up by one and ``error_count`` by the
        given amount; ``stream_count`` goes up when ``stream`` is truthy.
        Each of ``latency`` / ``ttft`` / ``tpot`` is only accumulated when
        it is not ``None`` and is >= 0.
        """
        key = (
            model_name or 'unknown',
            host or '',
            operation_type or 'default',
            url or '',
            prompt_version or 'v1',
        )
        # (sample, sum field, count field or None, sketch field)
        samples = (
            (latency, 'latency_sum', None, 'latency_sketch'),
            (ttft, 'ttft_sum', 'ttft_count', 'ttft_sketch'),
            (tpot, 'tpot_sum', 'tpot_count', 'tpot_sketch'),
        )
        with self._lock:
            table = self._stats
            table['call_count'][key] += 1
            table['error_count'][key] += error_count
            if stream:
                table['stream_count'][key] += 1

            for sample, sum_field, count_field, sketch_field in samples:
                if sample is None or not (sample >= 0):
                    continue
                table[sum_field][key] += sample
                if count_field is not None:
                    table[count_field][key] += 1
                if sketch_field in table:
                    table[sketch_field][key].update(sample)

    def update_from_pack(self, pack):
        """Derive tpot from the pack and forward everything to ``update_stats``.

        tpot is ``(latency - ttft) / (output_tokens - 1)`` and is only
        computed when both timings are present and more than one output
        token was produced; otherwise it stays ``None``.
        """
        produced = getattr(pack, 'output_tokens', None) or 0
        if pack.ttft is not None and pack.latency is not None and produced > 1:
            tpot = (pack.latency - pack.ttft) / (produced - 1)
        else:
            tpot = None
        self.update_stats(
            pack.model or 'unknown',
            pack.provider or '',
            pack.operation_type or 'default',
            url=pack.url or '',
            latency=pack.latency,
            ttft=pack.ttft,
            tpot=tpot,
            prompt_version=getattr(pack, 'prompt_version', 'v1') or 'v1',
            error_count=0 if pack.success else 1,
            stream=pack.stream,
        )

    def _build_fields(self, pack, stats, key):
        """Write the accumulated values for ``key`` into ``pack.fields``."""
        scalar_fields = (
            "call_count", "error_count", "stream_count", "latency_sum",
            "ttft_sum", "ttft_count", "tpot_sum", "tpot_count",
        )
        for name in scalar_fields:
            pack.fields.putAuto(name, stats[name][key])

        if 'latency_sketch' not in stats:
            return
        from whatap.value.blob_value import BlobValue

        for name in ("latency_sketch", "ttft_sketch", "tpot_sketch"):
            sketch = stats[name].get(key)
            # Only ship a sketch that actually received samples.
            if sketch and not sketch.is_empty():
                pack.fields.putValue(name, BlobValue(sketch.serialize()))
@@ -0,0 +1,12 @@
1
+ """llm_eval_pii_leak 카테고리 — PII 노출 탐지 점수 히스토그램.
2
+
3
+ 11-bucket 분포 (value0~value10). 자세한 동작은 ``eval_stat.ScoreHistogramStat``
4
+ docstring 참고.
5
+
6
+ 값 의미: 0.0 = PII 미탐지, 1.0 = 다수 탐지 (output_text 길이 대비 정규화).
7
+ """
8
+ from whatap.llm.stats.eval_stat import ScoreHistogramStat
9
+
10
+
11
class PIILeakEvalStat(ScoreHistogramStat):
    """Score histogram reported under the ``llm_eval_pii_leak`` category."""

    _category = 'llm_eval_pii_leak'
@@ -0,0 +1,10 @@
1
+ """llm_eval_prompt_injection 카테고리 — Prompt Injection 평가 점수 히스토그램.
2
+
3
+ 11-bucket 분포 (value0~value10). 자세한 동작은 ``eval_stat.ScoreHistogramStat``
4
+ docstring 참고.
5
+ """
6
+ from whatap.llm.stats.eval_stat import ScoreHistogramStat
7
+
8
+
9
class PromptInjectionEvalStat(ScoreHistogramStat):
    """Score histogram reported under the ``llm_eval_prompt_injection`` category."""

    _category = 'llm_eval_prompt_injection'