PyPI - whatap-python - Versions diffs - 2.0.3rc1__tar.gz → 2.1.0__tar.gz - Mend

whatap-python 2.0.3rc1__tar.gz → 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (241) hide show

{whatap_python-2.0.3rc1 → whatap_python-2.1.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: whatap-python
-Version: 2.0.3rc1
+Version: 2.1.0
 Summary: Monitoring and Profiling Service
 Home-page: https://www.whatap.io
 Author: whatap

whatap_python-2.1.0/whatap/build.py ADDED Viewed

@@ -0,0 +1,4 @@
+app = 'Python'
+name = 'whatap-python'
+version = '2.1.0'
+release_date = '20260610'

{whatap_python-2.0.3rc1 → whatap_python-2.1.0}/whatap/conf/configuration.py RENAMED Viewed

@@ -232,5 +232,49 @@ Configuration = {
     "force_llm_net_udp_port": False,
     "llm_model_pricing": "",
     "llm_perf_sketch_enabled": True,
-    "llm_perf_sketch_k": 200
+    "llm_perf_sketch_k": 200,
+    # ── LLM Evaluation ──
+    # 평가 파이프라인 마스터 토글. false면 enqueue/worker가 동작하지 않음.
+    "llm_eval_enabled": False,
+    # judge LLM 호출 평가자의 샘플링 비율 (0.0 ~ 1.0).
+    #   1.0 (기본) — 모든 judge 평가자 항상 실행
+    #   0.1       — judge 평가자 10% 만 실행 (비용 1/10)
+    #   0.0       — judge 평가자 전부 skip
+    # 규칙 기반 평가자 (PIILeak / URLScan 등 USES_LLM_JUDGE=False) 는 영향 X.
+    # 결정론적 샘플링 — 같은 txid 는 항상 같은 결정.
+    "llm_eval_sample_rate": 1.0,
+    # 평가 큐 최대 크기. 초과 시 drop + LLM030 경고.
+    "llm_eval_buffer_limit": 1000,
+    # 평가자 실행에 쓸 ThreadPoolExecutor worker 수.
+    "llm_eval_workers": 4,
+    # judge LLM 1회 호출의 최대 대기 시간 (초). 초과 시 TimeoutError → judge_error
+    # 로 graceful degrade. user app 의 event loop 가 hang/backpressure 일 때
+    # 평가 워커가 무기한 블록되는 것을 차단. 0 또는 음수면 무제한 (legacy 동작).
+    "llm_eval_judge_timeout_sec": 30,
+    # judge LLM 호출도 기존 인스트루멘테이션 (intercept) 으로 추적할지 여부.
+    # False (기본): judge 호출은 intercept 우회 → 메트릭/logsink 에 안 잡힘.
+    #               (eval 결과 점수만 LlmStepEvalStatus + llm_eval_stat 로 송출)
+    # True       : judge 호출이 llm_step_status pack + 메트릭에 잡힘 (사용자 호출과 동일).
+    #               cost/token/latency 가시화. 메트릭 카운트 2배 증가 가능 (user + judge).
+    "llm_eval_track_judge_calls": False,
+    # 활성 평가자 csv. 미지정 (빈 값) 이면 default 3 종 자동 활성:
+    #   combined_judge, pii_leak, url_scan
+    # 개별 aspect evaluator (hallucination / answer_relevance / toxicity /
+    # prompt_injection / factuality) 는 combined_judge 가 1회 judge 호출로 이미
+    # 모두 산출하므로 default 에서 제외 — 명시 활성 시에만 별도 evaluator 가 추가
+    # judge 호출을 발생시킴.
+    #
+    # 가용 라벨 (각 라벨이 evaluator 1개 + 동명 stat 카테고리에 매핑):
+    #   combined_judge   — 1번의 LLM judge 호출로 5 의미 aspect 동시 평가
+    #                      (hallucination / answer_relevance / toxicity /
+    #                       prompt_injection / factuality)
+    #   pii_leak         — 정규식 + Luhn 으로 PII 노출 탐지 (LLM 호출 X)
+    #   url_scan         — URL 추출 + suspicious 패턴 매칭 (LLM 호출 X)
+    #   hallucination / answer_relevance / toxicity / prompt_injection / factuality
+    #                    — combined_judge 대신 개별 evaluator 만 쓰고 싶을 때
+    # 예: "combined_judge,pii_leak,url_scan"  (default 와 동일)
+    # 예: "pii_leak,url_scan"                 (정형 평가만, LLM judge 0회)
+    # 예: "hallucination,toxicity,pii_leak"   (개별 aspect — combined 대신)
+    "llm_eval_evaluators": "",
 }

{whatap_python-2.0.3rc1 → whatap_python-2.1.0}/whatap/conf/configure.py RENAMED Viewed

@@ -60,7 +60,7 @@ class Configure(object):
                     except Exception as e:
                         print('WHATAP: ', e)
                         continue
-            if not getattr(cls, "license") and getattr(cls, "accesskey"):
+            if not getattr(cls, "license", None) and getattr(cls, "accesskey", None):
                 setattr(cls, "license", getattr(cls, "accesskey"))
             for callback in cls.observers:
                 callback()

whatap_python-2.1.0/whatap/counter/tasks/llm_evaluator_task.py ADDED Viewed

@@ -0,0 +1,501 @@
+"""LLM evaluator queueing + asynchronous execution + result dispatch task.
+Data flow:
+    [Producer]  LlmLogSinkTask.dispatch(LlmStepStatus)
+                  └─ enqueue_evaluation(pack)
+                          └─ self._q.put_nowait((pack, scope_evs))    ← enqueue (dropped if full)
+    [Dispatcher Thread]  daemon, q.get() blocking loop
+                  └─ active = [evaluator for evaluator in registry + scope
+                               if sampler.should_run(evaluator, pack.txid)]
+                          └─ executor.submit(_safe_run_pack, active, pack)
+    [Worker Thread (ThreadPoolExecutor)]
+                  └─ for each active evaluator: evaluator.evaluate(ctx) → EvaluatorResult (+extras)
+                          └─ eval_pack = LlmStepEvalStatus.from_step_status(pack)
+                                  └─ fill the score fields (eval_hallucination etc., see _LABEL_TO_FIELD)
+                                  └─ dispatch_llm_evaluation_pack(eval_pack)
+                                          └─ LlmLogSinkTask._send_log_sink(eval_pack)
+The evaluation result pack has the same structure as the original LlmStepStatus
+(model/tokens/cost/latency/...), is distinguished by
+``llm_log_type=llm_step_eval_status``, and adds the evaluation score fields.
+Evaluation is fire-and-forget. It is fully independent of the LlmStepStatus dispatch —
+zero impact on the user transaction.
+All exceptions are swallowed and only recorded as [LLM] logs.
+"""
+import contextvars
+import queue
+import threading
+import time
+from whatap import logging
+from whatap.conf.configure import Configure as conf
+from whatap.llm.evaluators.base import EvaluatorContext, EvaluatorResult
+from whatap.llm.evaluators.registry import EvaluatorRegistry
+from whatap.llm.evaluators.sampler import EvaluatorSampler
+from whatap.llm.log_sink_packs.llm_step_eval_status import LlmStepEvalStatus
+_DEFAULT_BUFFER_LIMIT = 1000
+_DEFAULT_WORKERS = 4
+def _conf_truthy(name, default=False):
+    """Safely convert a Configure attribute to bool.
+    Per the original author's note, Configure.setProperty stores the strings
+    'true'/'false' read from the config file as-is via setattr, so a plain truthiness
+    test would treat 'false' as true after a reload; this helper works around that.
+    :param name: attribute name looked up on ``conf``.
+    :param default: value used when the attribute is missing or the lookup raises.
+    :return: bool values unchanged; strings are True only for 'true'/'yes'/'1'/'on'
+        (stripped, case-insensitive); any other type goes through ``bool()``.
+    """
+    try:
+        val = getattr(conf, name, default)
+    except Exception:
+        return bool(default)
+    if isinstance(val, bool):
+        return val
+    if isinstance(val, str):
+        # Only the explicit "on" spellings count as True; 'false', '' and anything else are False.
+        return val.strip().lower() in ('true', 'yes', '1', 'on')
+    return bool(val)
+# Attribute names that evaluation scores are mapped to.
+# Looked up with evaluator.LABEL (or a label from EvaluatorResult.extras) as the key.
+_LABEL_TO_FIELD = {
+    'hallucination': 'eval_hallucination',
+    'answer_relevance': 'eval_answer_relevance',
+    'toxicity': 'eval_toxicity',
+    'prompt_injection': 'eval_prompt_injection',
+    'factuality': 'eval_factuality',
+    'pii_leak': 'eval_pii_leak',
+    'url_scan': 'eval_url_scan',
+    'combined_judge': 'eval_combined_judge',
+}
+# Flag that tells whether we are inside an evaluation worker — kept in two stores at once:
+#   - thread-local : to recognize LLM calls made on the sync worker thread itself
+#   - ContextVar   : propagated so that it is also recognized in the context of a task
+#                    that an async judge dispatched to the user loop via
+#                    ``run_coroutine_threadsafe``
+# Two purposes: prevent infinite recursion (judge calls must not re-enter the evaluation
+# queue), and let intercept track judge calls normally (the recursion guard still keeps
+# them from being enqueued for evaluation again).
+_eval_worker_local = threading.local()
+_eval_worker_cv = contextvars.ContextVar('whatap_eval_worker', default=False)
+def _is_in_evaluator_worker():
+    """Return True when running inside an evaluation worker.
+    Checks the thread-local flag first, then the ContextVar.
+    """
+    if getattr(_eval_worker_local, 'in_eval', False):
+        return True
+    cv_val = _eval_worker_cv.get()
+    # The ContextVar holds either its default (False) or a state dict with an 'in_eval' key.
+    if isinstance(cv_val, dict):
+        return bool(cv_val.get('in_eval'))
+    return bool(cv_val)
+def _get_eval_worker_state():
+    """Return the current eval worker state, or None when there is none.
+    Shape: {'in_eval', 'parent_txid', 'parent_step_id', 'parent_index'}.
+    The ContextVar is preferred; the thread-local is the fallback.
+    """
+    cv_val = _eval_worker_cv.get()
+    if isinstance(cv_val, dict) and cv_val.get('in_eval'):
+        return cv_val
+    if getattr(_eval_worker_local, 'in_eval', False):
+        # Rebuild the same dict shape from the thread-local attributes.
+        return {
+            'in_eval': True,
+            'parent_txid': getattr(_eval_worker_local, 'parent_txid', None),
+            'parent_step_id': getattr(_eval_worker_local, 'parent_step_id', None),
+            'parent_index': getattr(_eval_worker_local, 'parent_index', None),
+        }
+    return None
+# EvalStat instance cache (single-slot list so the function can update it in place).
+_eval_stat_cache = [None]
+def _eval_stat():
+    """Return the 'EvalStat' object from LlmStatTask, caching it once it is non-None.
+    Returns None when the lookup yields None or raises.
+    """
+    if _eval_stat_cache[0] is not None:
+        return _eval_stat_cache[0]
+    try:
+        # Imported inside the function rather than at module top.
+        from whatap.counter.tasks.llm_stat_task import LlmStatTask
+        stat = LlmStatTask.get_stat('EvalStat')
+        if stat is not None:
+            _eval_stat_cache[0] = stat
+        return stat
+    except Exception:
+        return None
+class LlmEvaluatorTask(object):
+    """LLM 평가 파이프라인 (큐 + dispatcher + worker pool)."""
+    _instance = None
+    _lock = threading.Lock()
+    def __init__(self):
+        """Create the bounded queue and read worker settings; no thread is started here."""
+        # _coerce_int falls back to the default for non-integer or non-positive values.
+        buffer_limit = _coerce_int(getattr(conf, 'llm_eval_buffer_limit', _DEFAULT_BUFFER_LIMIT),
+                                   _DEFAULT_BUFFER_LIMIT)
+        self._q = queue.Queue(buffer_limit)
+        self._workers = _coerce_int(getattr(conf, 'llm_eval_workers', _DEFAULT_WORKERS),
+                                    _DEFAULT_WORKERS)
+        # The executor and dispatcher thread are created later by _ensure_started().
+        self._executor = None
+        self._sampler = EvaluatorSampler()
+        self._started = False
+        self._start_lock = threading.Lock()
+    @classmethod
+    def get_instance(cls):
+        """Return the shared instance, creating it on first use under ``cls._lock``."""
+        if cls._instance is None:
+            with cls._lock:
+                # Re-check after taking the lock so only one instance is created.
+                if cls._instance is None:
+                    cls._instance = cls()
+        return cls._instance
+    # ── public producers ──
+    def enqueue(self, pack):
+        """Queue a pack for evaluation; per the original note, LlmLogSinkTask.dispatch
+        calls this at the end of every LLM call.
+        :param pack: the original ``LlmStepStatus`` pack. A worker clones it into an
+            LlmStepEvalStatus, fills in the scores and sends it. Because this runs on
+            the user thread, scope evaluators are captured here as well.
+        """
+        if not _conf_truthy('llm_eval_enabled'):
+            return
+        if _is_in_evaluator_worker():
+            return  # blocks the infinite recursion of a judge evaluator's own call re-entering the evaluation queue
+        if pack is None:
+            return
+        # Auto-register built-ins based on conf.llm_eval_evaluators (once — fast-path inside).
+        # Must be called before the is_empty() check so the first enqueue is not skipped.
+        try:
+            from whatap.llm.evaluators.registry import bootstrap_from_conf
+            bootstrap_from_conf()
+        except Exception as e:
+            logging.warning('[LLM] evaluator bootstrap failed: %s' % e,
+                            extra={'id': 'LLM048'})
+        # Capture scope evaluators on the user thread (the dispatcher thread cannot see them).
+        try:
+            from whatap.llm.evaluators.scope import get_scope_evaluators
+            scope_evs = list(get_scope_evaluators())
+        except Exception:
+            scope_evs = []
+        if EvaluatorRegistry.get_instance().is_empty() and not scope_evs:
+            return
+        try:
+            self._q.put_nowait((pack, scope_evs))
+        except queue.Full:
+            # Never block the caller: drop the pack and log a warning instead.
+            logging.warning('[LLM] eval queue full, pack dropped: txid=%s' % pack.txid,
+                            extra={'id': 'LLM030'})
+            return
+        self._ensure_started()
+    # ── internal ──
+    def _ensure_started(self):
+        """Create the worker pool and start the dispatcher thread on first use.
+        Start-up failures are logged (LLM035) and not raised.
+        """
+        if self._started:
+            return
+        with self._start_lock:
+            # Re-check under the lock so concurrent callers start the pipeline only once.
+            if self._started:
+                return
+            try:
+                from concurrent.futures import ThreadPoolExecutor
+                self._executor = ThreadPoolExecutor(
+                    max_workers=self._workers,
+                    thread_name_prefix='whatap-llm-eval',
+                )
+                t = threading.Thread(target=self._run, daemon=True,
+                                     name='whatap-llm-eval-dispatch')
+                t.start()
+                self._started = True
+            except Exception as e:
+                # NOTE(review): if the thread fails to start after the executor was built,
+                # _started stays False and a later call builds another executor without
+                # shutting this one down — confirm whether that is acceptable.
+                logging.warning('[LLM] eval task start failed: %s' % e,
+                                extra={'id': 'LLM035'})
+    def _run(self):
+        """Dispatcher thread body. Takes (pack, scope_evs) from the queue and fans the
+        pack out to the evaluators. Loops forever; dispatch errors are logged (LLM031).
+        """
+        while True:
+            try:
+                item = self._q.get()
+            except Exception:
+                continue
+            if item is None:
+                continue
+            try:
+                pack, scope_evs = item
+            except (ValueError, TypeError):
+                # Malformed queue item — skip it.
+                continue
+            try:
+                merged = self._merge_evaluators(scope_evs)
+                ctx_txid = pack.txid
+                # Collect only the evaluators that pass sampling and bundle them into one
+                # worker task — so that a single LlmStepEvalStatus pack is sent per LLM call.
+                active = []
+                for evaluator in merged:
+                    if not evaluator.LABEL:
+                        continue
+                    if not self._sampler.should_run(evaluator, ctx_txid):
+                        continue
+                    active.append(evaluator)
+                if active:
+                    self._executor.submit(self._safe_run_pack, active, pack)
+            except Exception as e:
+                logging.warning('[LLM] eval dispatch failed: %s' % e,
+                                extra={'id': 'LLM031'})
+    @staticmethod
+    def _merge_evaluators(scope_evs=None):
+        """Merge the global registry with the captured scope evaluators.
+        When two evaluators share a LABEL, the scope evaluator wins.
+        :param scope_evs: evaluators captured on the user thread, or None.
+        :return: list of evaluators, one per distinct LABEL.
+        """
+        base_evs = EvaluatorRegistry.get_instance().all()
+        merged = {}
+        for e in base_evs:
+            merged[e.LABEL] = e
+        if scope_evs:
+            for e in scope_evs:
+                # Scope evaluators without a LABEL are ignored; the rest overwrite registry entries.
+                if getattr(e, 'LABEL', None):
+                    merged[e.LABEL] = e
+        return list(merged.values())
+    def _safe_run_pack(self, evaluators, pack):
+        """Worker thread entry point — runs the active evaluators for one pack in
+        sequence and sends a single LlmStepEvalStatus pack.
+        Swallows exceptions from the run, guards against recursion, and propagates the
+        parent step keys.
+        Stashed in the thread-local (TL) and ContextVar (CV):
+          - in_eval=True (recursion guard)
+          - parent_txid / parent_step_id / parent_index : keys of the original user step.
+            When ``llm_eval_track_judge_calls=true``, the join keys of the judge LLM call
+            pack are overridden to match the user step so that the dashboard shows them
+            under the same transaction.
+        The TL covers LLM calls made on the worker thread itself (sync judge); the CV is
+        meant to propagate into a dispatched coroutine (async judge, dispatched to the
+        user loop).
+        """
+        # NOTE(review): pack.txid / step_id / index are read before the try block, so an
+        # error raised here is not logged by this method — confirm packs always carry them.
+        state = {
+            'in_eval': True,
+            'parent_txid': pack.txid,
+            'parent_step_id': pack.step_id,
+            'parent_index': pack.index,
+        }
+        _eval_worker_local.in_eval = True
+        _eval_worker_local.parent_txid = pack.txid
+        _eval_worker_local.parent_step_id = pack.step_id
+        _eval_worker_local.parent_index = pack.index
+        cv_token = _eval_worker_cv.set(state)
+        try:
+            self._run_pack(evaluators, pack)
+        except Exception as e:
+            logging.warning('[LLM] evaluator worker crashed: count=%d err=%s'
+                            % (len(evaluators), e),
+                            extra={'id': 'LLM036'})
+        finally:
+            # Always clear the worker markers so they do not leak into the next task on this thread.
+            _eval_worker_cv.reset(cv_token)
+            _eval_worker_local.in_eval = False
+            _eval_worker_local.parent_txid = None
+            _eval_worker_local.parent_step_id = None
+            _eval_worker_local.parent_index = None
+    def _run_pack(self, evaluators, pack):
+        """Run every active evaluator for one pack in sequence, collect the scores and
+        send them as a single LlmStepEvalStatus pack.
+        Invariant: one LLM call → one LlmStepEvalStatus pack, however many evaluators
+        ran. Per the original note, fields without a score (the evaluator was not
+        registered or failed) stay None and are omitted when fields() is sent.
+        Metric updates are done per evaluator via ``update_eval_metrics()``; per the
+        original note:
+          - llm_eval_stat (call statistics — call_count/failures/latency_sum/latency_sketch).
+            Only evaluators with ``USES_LLM_JUDGE=True`` are counted — rule-based ones
+            (PII/URL) are skipped.
+          - llm_eval_<label> (score histogram value0~value10).
+        """
+        from whatap.counter.tasks.llm_log_sink_task import dispatch_llm_evaluation_pack
+        ctx = build_eval_context_from_pack(pack)
+        all_scores = {}      # {label: float} — every score to be merged into the pack
+        any_score_emitted = False  # True once at least one evaluator produced a score
+        all_succeeded = True       # eval_success=True only when every evaluator succeeded
+        for evaluator in evaluators:
+            scores, success = self._evaluate_one(evaluator, ctx, pack)
+            if scores:
+                all_scores.update(scores)
+                any_score_emitted = True
+            if not success:
+                all_succeeded = False
+        # Clone the original pack (model/tokens/cost/latency/txid/...) into the eval pack.
+        eval_pack = LlmStepEvalStatus.from_step_status(pack)
+        # True only when at least one evaluator produced a score and none failed.
+        eval_pack.eval_success = bool(any_score_emitted and all_succeeded)
+        for label, score in all_scores.items():
+            self._assign_score(eval_pack, label, score)
+        try:
+            dispatch_llm_evaluation_pack(eval_pack)
+        except Exception as e:
+            logging.warning('[LLM] eval pack send failed: %s' % e,
+                            extra={'id': 'LLM033'})
+    def _evaluate_one(self, evaluator, ctx, pack):
+        """Run one evaluator, update metrics, and return ({label: score}, success bool).
+        Does not send a pack (the caller ``_run_pack`` sends one pack after all
+        evaluators have finished).
+        """
+        start = time.monotonic()
+        success = True
+        result = None
+        try:
+            raw = evaluator.evaluate(ctx)
+            result = _coerce_result(raw, getattr(evaluator, 'METRIC_TYPE', None))
+        except Exception as e:
+            success = False
+            logging.warning('[LLM] evaluator %s failed: %s'
+                            % (getattr(evaluator, 'LABEL', '?'), e),
+                            extra={'id': 'LLM032'})
+        duration_ms = int((time.monotonic() - start) * 1000)
+        # Extract scores (primary + extras) — used both to fill the pack and to update metrics.
+        scores = {}  # {label: float}
+        if result is not None:
+            v = result.value
+            # bool is excluded explicitly because it is a subclass of int.
+            if isinstance(v, (int, float)) and not isinstance(v, bool):
+                scores[evaluator.LABEL] = float(v)
+            extras = getattr(result, 'extras', None) or {}
+            for ex_label, extra in extras.items():
+                if not ex_label:
+                    continue
+                # An extra may be an EvaluatorResult, a dict with a 'value' key, or a bare value.
+                if isinstance(extra, EvaluatorResult):
+                    val = extra.value
+                elif isinstance(extra, dict):
+                    val = extra.get('value')
+                else:
+                    val = extra
+                if isinstance(val, (int, float)) and not isinstance(val, bool):
+                    scores[ex_label] = float(val)
+        # Whether the judge HTTP call was actually made / validity of the result:
+        #   USES_LLM_JUDGE=False → rule-based evaluator (PIILeak / URLScan) → no call
+        #   no_judge_configured  → no judge_fn at all → no call
+        #   judge_error          → call attempted, HTTP/parse failed → called, no score (failure)
+        #   numeric score        → call succeeded (success)
+        # When called_judge=False the call statistics update is skipped — the score
+        # histogram is unaffected.
+        called_judge = bool(getattr(evaluator, 'USES_LLM_JUDGE', False))
+        if called_judge and result is not None and result.value == 'no_judge_configured':
+            called_judge = False
+        eval_success = success and bool(scores)  # success means at least one score was obtained
+        # Update evaluation metrics — llm_eval_stat + the llm_eval_<label> series in one call.
+        try:
+            from whatap.llm.stats.eval_stat import update_eval_metrics
+            # Prefer the evaluator's own model when it has one; otherwise the model from the context.
+            judge_model = evaluator._model if getattr(evaluator, '_model', None) else ctx.model
+            update_eval_metrics(
+                model=judge_model,
+                provider=ctx.provider,
+                operation_type=ctx.operation_type,
+                url=ctx.url,
+                prompt_version=getattr(pack, 'prompt_version', 'v1') or 'v1',
+                called_judge=called_judge,
+                success=eval_success,
+                latency_ms=duration_ms,
+                scores=scores,
+            )
+        except Exception as e:
+            logging.warning('[LLM] eval metrics update failed: %s' % e,
+                            extra={'id': 'LLM038'})
+        return scores, eval_success
+    @staticmethod
+    def _assign_score(eval_pack, label, value):
+        """Set the eval_<x> attribute mapped to ``label`` to the score; ignore unmapped labels."""
+        attr = _LABEL_TO_FIELD.get(label)
+        if attr is None:
+            return
+        if isinstance(value, bool) or not isinstance(value, (int, float)):
+            return  # numeric scores only (categorical values such as judge_error are expressed via eval_success=False)
+        setattr(eval_pack, attr, float(value))
+# ── helpers ──
+def _coerce_int(value, default):
+    """Convert ``value`` to a positive int; return ``default`` when the conversion
+    fails (TypeError/ValueError) or the result is not greater than zero.
+    """
+    try:
+        v = int(value)
+        return v if v > 0 else default
+    except (TypeError, ValueError):
+        return default
+def _coerce_result(raw, metric_type_hint=None):
+    """Normalize the raw return value of evaluator.evaluate() into an EvaluatorResult.
+    :param raw: an EvaluatorResult, or a bare value that gets wrapped in one.
+    :param metric_type_hint: metric type used when the result does not carry one.
+    :return: the EvaluatorResult with ``metric_type`` filled in.
+    """
+    if isinstance(raw, EvaluatorResult):
+        result = raw
+    else:
+        result = EvaluatorResult(value=raw)
+    if not result.metric_type:
+        if metric_type_hint:
+            result.metric_type = metric_type_hint
+        else:
+            # No hint: infer the metric type from the Python type of the value.
+            v = result.value
+            # bool is tested before int/float because bool is a subclass of int.
+            if isinstance(v, bool):
+                result.metric_type = 'boolean'
+            elif isinstance(v, (int, float)):
+                result.metric_type = 'score'
+            elif isinstance(v, dict):
+                result.metric_type = 'json'
+            else:
+                result.metric_type = 'categorical'
+                # Categorical values other than None and str are stringified.
+                if v is not None and not isinstance(v, str):
+                    result.value = str(v)
+    return result
+# ── module-level API (called by LlmLogSinkTask) ──
+def enqueue_evaluation(pack):
+    """Entry point called at the end of LlmLogSinkTask.dispatch().
+    :param pack: the original ``LlmStepStatus``. A worker clones it into an
+        LlmStepEvalStatus and sends that.
+    """
+    LlmEvaluatorTask.get_instance().enqueue(pack)
+def build_eval_context_from_pack(pack):
+    """Build an EvaluatorContext (the input of evaluator.evaluate) from an LlmStepStatus pack.
+    txid, step_id, index, provider and url are read directly from the pack; the other
+    fields are read with getattr and fall back to a default when missing.
+    """
+    system_texts = getattr(pack, 'system_texts', None) or []
+    if isinstance(system_texts, (list, tuple)):
+        # Join the non-empty entries with newlines into one system text.
+        system_text = '\n'.join(str(t) for t in system_texts if t)
+    else:
+        system_text = str(system_texts) if system_texts else ''
+    return EvaluatorContext(
+        txid=pack.txid,
+        step_id=pack.step_id,
+        index=pack.index,
+        provider=pack.provider or '',
+        url=pack.url or '',
+        model=getattr(pack, 'model', None),
+        operation_type=getattr(pack, 'operation_type', '') or 'unknown',
+        input_text=getattr(pack, 'prompt_text', '') or '',
+        output_text=getattr(pack, 'completion_text', '') or '',
+        system_text=system_text,
+        reasoning_text=getattr(pack, 'reasoning_text', '') or '',
+        tool_calls_text=getattr(pack, 'tool_calls_text', '') or '',
+        tool_results_text=getattr(pack, 'tool_results_text', '') or '',
+        success=bool(getattr(pack, 'success', False)),
+        finish_reason=getattr(pack, 'finish_reason', None),
+        latency_ms=getattr(pack, 'latency', None),
+        input_tokens=getattr(pack, 'input_tokens', None),
+        output_tokens=getattr(pack, 'output_tokens', None),
+        client=getattr(pack, '_llm_client', None),
+        event_loop=getattr(pack, '_llm_event_loop', None),
+    )

{whatap_python-2.0.3rc1 → whatap_python-2.1.0}/whatap/counter/tasks/llm_log_sink_task.py RENAMED Viewed

@@ -23,6 +23,16 @@ _MAX_CONTENT_BYTES = 20000
 class LlmLogSinkTask(object):
+    """LLM logsink pack 송출 파이프라인.
+    LLM 호출 결과 pack 을 받아 즉시 송출. 평가 (LLM-judge 등) 는 별도 파이프라인
+    (``LlmEvaluatorTask``) 에서 비동기로 처리되며, 평가 결과는 ``LlmStepEvalStatus``
+    pack (``llm_log_type=llm_step_eval_status``) 으로 송출된다 — 원본 LlmStepStatus 와
+    동일한 fields/tags + 평가 점수 5 필드. (txid, step_id) 결합 키로 백엔드 사후 결합.
+    이 단순한 모델 덕분에 hold/timeout/callback 등의 복잡도가 모두 제거됨.
+    """
     _instance = None
     _lock = threading.Lock()
@@ -39,6 +49,14 @@ class LlmLogSinkTask(object):
         return cls._instance
     def dispatch(self, pack):
+        """LLM 호출 결과 pack 을 즉시 송출 + 평가 큐 트리거.
+        평가 워커 안에서 호출된 judge LLM call 인 경우 (intercept 가 fire 한 케이스):
+          - operation_type='whatap_evaluation' / prompt_version='v1' 고정
+          - parent (user) step 의 (txid, step_id, index) 그대로 propagate
+            → dashboard 에서 같은 transaction 으로 묶여 보임
+          - eval enqueue 는 재귀 가드가 자동 차단
+        """
         if not isinstance(pack, LlmStepStatus):
             return
@@ -55,6 +73,25 @@ class LlmLogSinkTask(object):
                             % (pack.model, pack.input_tokens, pack.output_tokens, pack.cached_tokens, e),
                             extra={'id': 'LLM024'})
+        # ── judge LLM call (eval 워커 안에서 발생) — 고정 라벨 + parent step ids ──
+        try:
+            from whatap.counter.tasks.llm_evaluator_task import (
+                _is_in_evaluator_worker, _get_eval_worker_state,
+            )
+            if _is_in_evaluator_worker():
+                state = _get_eval_worker_state() or {}
+                pack.operation_type = 'whatap_evaluation'
+                pack.prompt_version = 'v1'
+                if state.get('parent_txid'):
+                    pack.txid = state['parent_txid']
+                if state.get('parent_step_id'):
+                    pack.step_id = state['parent_step_id']
+                if state.get('parent_index') is not None:
+                    pack.index = state['parent_index']
+        except Exception:
+            pass
+        # tx_summary 누적
         self._accumulate_tx_summary(ctx, pack)
         if ctx and not pack.success and pack.error_type:
@@ -63,7 +100,22 @@ class LlmLogSinkTask(object):
             ctx._llm_last_error_provider = pack.provider or ''
             ctx._llm_last_error_op_type = pack.operation_type or 'unknown'
             ctx._llm_last_error_url = pack.url or ''
+            ctx._llm_last_error_prompt_version = (
+                getattr(pack, 'prompt_version', 'v1') or 'v1')
+        # pack 즉시 송출 — eval 결과 기다리지 않음
+        self._enqueue_for_send(pack)
+        # 평가 큐 트리거 — worker 가 pack 을 복제해 LlmStepEvalStatus 로 만들어 점수 채워 송출
+        # judge 호출 인 경우 enqueue_evaluation 의 재귀 가드가 차단
+        try:
+            from whatap.counter.tasks.llm_evaluator_task import enqueue_evaluation
+            enqueue_evaluation(pack)
+        except Exception as e:
+            logging.warning('[LLM] eval enqueue failed: %s' % e, extra={'id': 'LLM034'})
+    def _enqueue_for_send(self, pack):
+        """완성된 pack 을 송출 큐에 적재. queue full 이면 drop + LLM025."""
         self._ensure_started()
         if self._q.full():
             logging.warning('[LLM] send queue full, pack dropped: model=%s' % pack.model,
@@ -242,3 +294,16 @@ def send_llm_pack(metadata):
         if hasattr(pack, key):
             setattr(pack, key, val)
     dispatch_llm_pack(pack)
+def dispatch_llm_evaluation_pack(pack):
+    """Send an LlmStepEvalStatus pack through the LogSink infrastructure.
+    LlmEvaluatorTask._run_pack builds the LlmStepEvalStatus by cloning the original
+    LlmStepStatus and sends it through this function. Per the original note, the
+    backend joins it with the LlmStepStatus afterwards on the (txid, step_id) key.
+    Any exception from the send is logged as a warning (LLM037) and not re-raised.
+    """
+    try:
+        LlmLogSinkTask.get_instance()._send_log_sink(pack)
+    except Exception as e:
+        logging.warning('[LLM] dispatch_llm_evaluation_pack failed: %s' % e,
+                        extra={'id': 'LLM037'})

{whatap_python-2.0.3rc1 → whatap_python-2.1.0}/whatap/counter/tasks/llm_stat_task.py RENAMED Viewed

@@ -68,6 +68,7 @@ class LlmStatTask(Thread):
                 getattr(ctx, '_llm_last_error_op_type', 'unknown'),
                 url=getattr(ctx, '_llm_last_error_url', ''),
                 error_type=last_error_type,
+                prompt_version=getattr(ctx, '_llm_last_error_prompt_version', 'v1'),
             )
     @classmethod

whatap-python 2.0.3rc1__tar.gz → 2.1.0__tar.gz

whatap-python 2.0.3rc1__tar.gz → 2.1.0__tar.gz