trace-score 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ from .trace import compute_TRACE, TRACEEvaluator, WEIGHT_PRESETS
2
+ from .components.temporal import compute_T
3
+ from .components.reliability import compute_R
4
+ from .components.adaptive import compute_A
5
+ from .components.coherence import compute_C
6
+ from .components.epistemic import compute_E
7
+
8
+ __version__ = "0.1.0"
9
+ __author__ = "Girinath V"
@@ -0,0 +1,5 @@
1
+ from .temporal import compute_T
2
+ from .reliability import compute_R
3
+ from .adaptive import compute_A
4
+ from .coherence import compute_C
5
+ from .epistemic import compute_E
@@ -0,0 +1,111 @@
1
+ import re
2
+ import numpy as np
3
+ from typing import List, Tuple, Dict, Optional
4
# Regex cues that typically signal a user correcting or reminding the
# assistant. Patterns are word-boundary anchored and matched against a
# lowercased copy of the text.
CORRECTION_MARKERS = [
    r"\bactually\b", r"\bno[,\s]",
    r"\bnot exactly\b", r"\bi said\b",
    r"\bi mentioned\b", r"\bi told you\b",
    r"\bplease avoid\b", r"\bplease don'?t\b",
    r"\bdon'?t forget\b", r"\bi already said\b",
    r"\bcorrection\b", r"\bwait[,\s]",
    r"\bi meant\b", r"\bthat'?s wrong\b",
    r"\bnot quite\b", r"\byou forgot\b",
    r"\byou missed\b", r"\bremember[,\s]",
]


def has_correction_marker(text: str) -> bool:
    """Return True if *text* contains any lexical correction cue."""
    text_lower = text.lower()
    return any(re.search(p, text_lower) for p in CORRECTION_MARKERS)


def is_correction_turn(
    user_text: str,
    prev_asst_text: Optional[str],
    nli_model=None
) -> bool:
    """Decide whether a user turn corrects the previous assistant turn.

    A turn is a correction candidate only if it contains a lexical marker.
    When both the previous assistant text and an NLI cross-encoder are
    available, the candidate is confirmed only if the contradiction
    probability (class index 0 of the softmaxed NLI output) reaches 0.35;
    otherwise the lexical cue alone decides.

    BUG FIX: the original returned
    ``float(scores[0][0]) >= 0.35 or has_correction_marker(user_text)``,
    and the trailing ``or`` clause is always True at that point (the marker
    check already passed at the top), so the NLI prediction was computed
    and then ignored. The NLI gate now actually filters candidates.
    """
    if not has_correction_marker(user_text):
        return False
    if prev_asst_text is None or nli_model is None:
        # Cannot verify semantically; trust the lexical cue.
        return True
    scores = nli_model.predict(
        [[prev_asst_text, user_text]],
        apply_softmax=True
    )
    # Index 0 is the contradiction class for the nli-deberta-v3 family
    # used elsewhere in this package.
    return float(scores[0][0]) >= 0.35
32
def compute_A(
    conversation: List[Tuple[str, str]],
    sbert_model=None,
    nli_model=None,
    retention_threshold: float = 0.55,
    gamma: float = 0.80
) -> Dict:
    """Adaptive-correction score: did the assistant retain user corrections?

    Every user turn flagged by is_correction_turn becomes a correction;
    each correction is considered retained (score 1.0) when its best SBERT
    cosine similarity against any *subsequent* assistant turn reaches
    *retention_threshold*, else forgotten (0.0). Per-correction scores are
    combined with time-decay weights gamma**(K-1-k), so recent corrections
    count more. With no corrections the score is vacuously 1.0.
    """
    # Lazy-load default models only when the caller did not inject them.
    if sbert_model is None:
        from sentence_transformers import SentenceTransformer
        sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
    if nli_model is None:
        from sentence_transformers import CrossEncoder
        nli_model = CrossEncoder(
            "cross-encoder/nli-deberta-v3-small",
            max_length=512
        )

    # Scan the dialogue and collect user turns flagged as corrections.
    corrections = []
    last_assistant_text = None
    for turn_no, (speaker, utterance) in enumerate(conversation):
        if speaker == "assistant":
            last_assistant_text = utterance
        elif speaker == "user" and is_correction_turn(
            utterance, last_assistant_text, nli_model
        ):
            corrections.append({"turn_index": turn_no, "text": utterance})

    K = len(corrections)
    if K == 0:
        return {
            "score": 1.0,
            "per_turn_scores": [],
            "decay_weights": [],
            "corrections_found": 0,
            "retained_count": 0,
            "details": [],
            "explanation": "No correction turns detected. A = 1.0 (vacuously true)."
        }

    per_turn_scores = []
    details = []
    for correction in corrections:
        turn_idx = correction["turn_index"]
        correction_text = correction["text"]
        later_replies = [
            utt for pos, (spk, utt) in enumerate(conversation)
            if spk == "assistant" and pos > turn_idx
        ]
        if later_replies:
            corr_vec = sbert_model.encode(
                [correction_text], convert_to_numpy=True, normalize_embeddings=True
            )
            reply_vecs = sbert_model.encode(
                later_replies, convert_to_numpy=True, normalize_embeddings=True
            )
            # Embeddings are unit-normalized, so dot product == cosine similarity.
            similarities = np.dot(corr_vec, reply_vecs.T)[0]
            max_sim = float(similarities.max())
            A_k = 1.0 if max_sim >= retention_threshold else 0.0
        else:
            # Nothing follows the correction, so retention is vacuously true.
            A_k = 1.0
            max_sim = 1.0
        per_turn_scores.append(A_k)
        details.append({
            "turn_index": turn_idx,
            "text": correction_text[:100],
            "retained": bool(A_k),
            "max_sim": round(max_sim, 4),
        })

    decay_weights = [gamma ** (K - 1 - k) for k in range(K)]
    Z = sum(decay_weights)
    A_score = sum(w * s for w, s in zip(decay_weights, per_turn_scores)) / Z
    return {
        "score": round(A_score, 4),
        "per_turn_scores": per_turn_scores,
        "decay_weights": [round(w, 4) for w in decay_weights],
        "corrections_found": K,
        "retained_count": int(sum(per_turn_scores)),
        "details": details,
        "explanation": (
            f"Found {K} correction(s). "
            f"Per-correction scores: {per_turn_scores}. "
            f"A = {A_score:.4f}."
        )
    }
@@ -0,0 +1,46 @@
1
+ import numpy as np
2
+ from typing import List, Tuple, Dict
3
def compute_C(
    conversation: List[Tuple[str, str]],
    sbert_model=None,
    gamma: float = 0.80
) -> Dict:
    """Context-coherence score from adjacent-turn SBERT similarity.

    Each consecutive pair of turns contributes its cosine similarity
    (clipped to [0, 1], rounded to 4 places); pair scores are combined
    with time-decay weights so later transitions matter more. mean_drift
    is 1 minus the unweighted mean similarity.
    """
    if sbert_model is None:
        from sentence_transformers import SentenceTransformer
        sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

    texts = [utterance for _, utterance in conversation]
    n_turns = len(texts)
    if n_turns <= 1:
        # A zero- or one-turn conversation cannot drift.
        return {
            "score": 1.0,
            "per_turn_sims": [],
            "decay_weights": [],
            "mean_drift": 0.0,
            "explanation": f"Only {n_turns} turn(s). C = 1.0."
        }

    vectors = sbert_model.encode(
        texts,
        convert_to_numpy=True,
        normalize_embeddings=True,
        batch_size=64,
        show_progress_bar=False
    )
    # Unit vectors: elementwise product summed along axis 1 == cosine of
    # each adjacent pair of turns.
    adjacent = np.clip(np.sum(vectors[:-1] * vectors[1:], axis=1), 0.0, 1.0)
    per_turn_sims = [round(float(s), 4) for s in adjacent]

    M = len(per_turn_sims)
    decay_weights = [gamma ** (M - 1 - t) for t in range(M)]
    Z = sum(decay_weights)
    C_score = sum(w * s for w, s in zip(decay_weights, per_turn_sims)) / Z
    mean_drift = 1.0 - float(np.mean(adjacent))
    return {
        "score": round(C_score, 4),
        "per_turn_sims": per_turn_sims,
        "decay_weights": [round(w, 4) for w in decay_weights],
        "mean_drift": round(mean_drift, 4),
        "explanation": (
            f"Encoded {n_turns} turns via SBERT. "
            f"Adjacent-pair sims: {per_turn_sims}. "
            f"C = {C_score:.4f}, mean drift = {mean_drift:.4f}."
        )
    }
@@ -0,0 +1,110 @@
1
+ import numpy as np
2
+ from typing import List, Tuple, Dict
3
import re

# Hedge/assertiveness phrases mapped to a scalar confidence in [0, 1].
# Multi-word phrases are included so e.g. "i am not sure" is scored as a
# hedge rather than assertive text.
CONFIDENCE_LEXICON = {
    "definitely": 1.00, "certainly": 1.00,
    "absolutely": 1.00, "undoubtedly": 0.98,
    "without a doubt": 0.97, "for sure": 0.95,
    "clearly": 0.94, "obviously": 0.93,
    "i am certain": 0.95, "i am confident": 0.92,
    "i know": 0.90, "i believe": 0.80,
    "i think": 0.75, "likely": 0.77,
    "probably": 0.74, "it seems": 0.72,
    "generally": 0.76, "typically": 0.75,
    "usually": 0.74, "might": 0.52,
    "could": 0.51, "may": 0.56,
    "possibly": 0.46, "i suppose": 0.51,
    "i guess": 0.46, "sometimes": 0.56,
    "perhaps": 0.31, "maybe": 0.31,
    "i am not sure": 0.20, "i'm not sure": 0.20,
    "uncertain": 0.26, "unclear": 0.26,
    "it depends": 0.31, "hard to say": 0.20,
    "i doubt": 0.22, "i cannot say": 0.15,
}

# Confidence assumed when no lexical marker is present.
DEFAULT_CONFIDENCE = 0.68

# Anchor sentences for the SBERT-similarity confidence signal (compute_E).
HIGH_CONF_ANCHOR = "I am absolutely certain and confident about this."
LOW_CONF_ANCHOR = "I am not sure and quite uncertain about this perhaps."


def lexicon_confidence(text: str) -> float:
    """Average the confidence scores of all lexicon markers found in *text*.

    Returns DEFAULT_CONFIDENCE when no marker matches.

    BUG FIX: the original used plain substring matching, so short markers
    fired inside longer words ("may" inside "maybe", "could" inside
    "couldn't" etc.), dragging spurious scores into the average. Markers
    are now matched with word boundaries.
    """
    text_lower = text.lower()
    found_scores = [
        score for marker, score in CONFIDENCE_LEXICON.items()
        if re.search(r"\b" + re.escape(marker) + r"\b", text_lower)
    ]
    return float(np.mean(found_scores)) if found_scores else DEFAULT_CONFIDENCE
35
def sbert_anchor_confidence(text, sbert_model, high_emb, low_emb) -> float:
    """Confidence from relative similarity to high/low-confidence anchors.

    Embeds *text* (normalized) and returns sim_high / (sim_high + sim_low);
    falls back to DEFAULT_CONFIDENCE when the combined similarity mass is
    not positive.
    """
    embedded = sbert_model.encode(
        [text], convert_to_numpy=True, normalize_embeddings=True
    )
    vec = embedded[0]
    sim_high = float(np.dot(vec, high_emb))
    sim_low = float(np.dot(vec, low_emb))
    total = sim_high + sim_low
    if total > 0:
        return float(sim_high / total)
    return DEFAULT_CONFIDENCE
43
def compute_E(
    conversation: List[Tuple[str, str]],
    sbert_model=None,
    gamma: float = 0.80,
    lexicon_weight: float = 0.60,
    sbert_weight: float = 0.40
) -> Dict:
    """Epistemic-stability score: how steady is the assistant's confidence?

    Each assistant turn receives a confidence that blends the lexicon
    signal (weight *lexicon_weight*) with the SBERT-anchor signal
    (*sbert_weight*). The score is the time-decayed mean of per-turn
    confidences; the variance penalty V_norm is the raw confidence
    variance normalized by 0.25 (the maximum variance attainable by values
    in [0, 1]) and capped at 1.
    """
    if sbert_model is None:
        from sentence_transformers import SentenceTransformer
        sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

    anchor_vecs = sbert_model.encode(
        [HIGH_CONF_ANCHOR, LOW_CONF_ANCHOR],
        convert_to_numpy=True,
        normalize_embeddings=True
    )
    high_emb, low_emb = anchor_vecs[0], anchor_vecs[1]

    def _blend(utterance):
        # Weighted mix of the two confidence signals for one turn.
        return (
            lexicon_weight * lexicon_confidence(utterance) +
            sbert_weight * sbert_anchor_confidence(
                utterance, sbert_model, high_emb, low_emb
            )
        )

    assistant_texts = [utt for speaker, utt in conversation if speaker == "assistant"]
    N = len(assistant_texts)

    if N == 0:
        return {
            "score": 1.0,
            "variance_penalty": 0.0,
            "turn_confidences": [],
            "decay_weights": [],
            "mean_confidence": DEFAULT_CONFIDENCE,
            "raw_variance": 0.0,
            "explanation": "No assistant turns. E = 1.0, V = 0.0."
        }

    if N == 1:
        # A single turn has no variance by definition.
        conf = _blend(assistant_texts[0])
        return {
            "score": round(conf, 4),
            "variance_penalty": 0.0,
            "turn_confidences": [round(conf, 4)],
            "decay_weights": [1.0],
            "mean_confidence": round(conf, 4),
            "raw_variance": 0.0,
            "explanation": f"Single assistant turn. E = {conf:.4f}, V = 0.0."
        }

    turn_confidences = [_blend(t) for t in assistant_texts]
    conf_array = np.array(turn_confidences)
    decay_weights = [gamma ** (N - 1 - t) for t in range(N)]
    Z = sum(decay_weights)
    E_score = sum(w * c for w, c in zip(decay_weights, turn_confidences)) / Z
    mu = float(conf_array.mean())
    raw_variance = float(np.mean((conf_array - mu) ** 2))
    # 0.25 is the largest possible variance of values confined to [0, 1].
    V_norm = min(raw_variance / 0.25, 1.0)
    return {
        "score": round(E_score, 4),
        "variance_penalty": round(V_norm, 4),
        "turn_confidences": [round(c, 4) for c in turn_confidences],
        "decay_weights": [round(w, 4) for w in decay_weights],
        "mean_confidence": round(mu, 4),
        "raw_variance": round(raw_variance, 6),
        "explanation": (
            f"Analyzed {N} assistant turns. "
            f"Confidences: {[round(c,3) for c in turn_confidences]}. "
            f"E = {E_score:.4f}, V_norm = {V_norm:.4f}."
        )
    }
@@ -0,0 +1,87 @@
1
+ from itertools import combinations
2
+ from typing import List, Tuple, Dict
3
+ import numpy as np
4
def compute_R(
    conversation: List[Tuple[str, str]],
    nli_model=None,
    threshold: float = 0.75,
    gamma: float = 0.80,
    bidirectional: bool = True
) -> Dict:
    """Reliability score: penalize assistant turns that contradict each other.

    Every unordered pair of assistant turns is scored by an NLI
    cross-encoder (class index 0 = contradiction probability); when
    *bidirectional*, both premise/hypothesis orders are scored and the
    maximum is used. Any turn involved in a pair at or above *threshold*
    scores 0.0, all others 1.0, and per-turn scores are combined with
    time-decay weights gamma**(N-1-t).
    """
    if nli_model is None:
        from sentence_transformers import CrossEncoder
        nli_model = CrossEncoder(
            "cross-encoder/nli-deberta-v3-small",
            max_length=512
        )

    assistant_turns = [
        (idx, utterance)
        for idx, (speaker, utterance) in enumerate(conversation)
        if speaker == "assistant"
    ]
    N = len(assistant_turns)
    if N < 2:
        # Fewer than two assistant turns: contradiction is impossible.
        return {
            "score": 1.0,
            "penalty": 0.0,
            "per_turn_scores": [1.0] * N,
            "decay_weights": [1.0] * N,
            "total_pairs": 0,
            "contradiction_count": 0,
            "contradictions": [],
            "explanation": f"Only {N} assistant turn(s). R = 1.0, P = 0.0."
        }

    all_pairs = list(combinations(range(N), 2))

    def _contradiction_probs(ordered_pairs):
        # Batch-score (premise, hypothesis) pairs; keep contradiction column.
        batch = [
            [assistant_turns[a][1], assistant_turns[b][1]]
            for a, b in ordered_pairs
        ]
        return nli_model.predict(batch, apply_softmax=True)[:, 0]

    contradiction_probs = _contradiction_probs(all_pairs)
    if bidirectional:
        # NLI is asymmetric: also score the reversed order, take the max.
        reversed_probs = _contradiction_probs([(j, i) for i, j in all_pairs])
        contradiction_probs = np.maximum(contradiction_probs, reversed_probs)

    contradictions = []
    contradicted_set = set()
    for pair_no, (i, j) in enumerate(all_pairs):
        prob = float(contradiction_probs[pair_no])
        if prob < threshold:
            continue
        contradictions.append({
            "turn_i": assistant_turns[i][0],
            "turn_j": assistant_turns[j][0],
            "contradiction_probability": round(prob, 4),
            "text_i": assistant_turns[i][1][:100],
            "text_j": assistant_turns[j][1][:100],
        })
        contradicted_set.update((i, j))

    per_turn_scores = [
        0.0 if t in contradicted_set else 1.0
        for t in range(N)
    ]
    decay_weights = [gamma ** (N - 1 - t) for t in range(N)]
    Z = sum(decay_weights)
    R_score = sum(w * s for w, s in zip(decay_weights, per_turn_scores)) / Z
    total_pairs = len(all_pairs)
    P = len(contradictions) / total_pairs if total_pairs > 0 else 0.0
    return {
        "score": round(R_score, 4),
        "penalty": round(P, 4),
        "per_turn_scores": per_turn_scores,
        "decay_weights": [round(w, 4) for w in decay_weights],
        "total_pairs": total_pairs,
        "contradiction_count": len(contradictions),
        "contradictions": contradictions,
        "explanation": (
            f"Checked {total_pairs} pairs across {N} assistant turns. "
            f"Contradictions: {len(contradictions)}. "
            f"R = {R_score:.4f}, P (penalty) = {P:.4f}."
        )
    }
@@ -0,0 +1,94 @@
1
+ import re
2
+ import numpy as np
3
+ from typing import List, Tuple, Dict
4
# (pattern, template) pairs for mining simple first-person facts from user
# turns. Each pattern's capture group(s) feed str.format on the template;
# the trailing alternation stops a capture at "and", a comma, sentence
# punctuation, or end of string.
FACT_PATTERNS = [
    (r"i(?:'m| am) ([\w\s]+?)(?:\s+and\s+|\s*,\s*|\.|\?|$)",
     "user is {}"),
    (r"i have ([\w\s]+?)(?:\s+and\s+|\s*,\s*|\.|\?|$)",
     "user has {}"),
    (r"i (hate|love|prefer|avoid|dislike|like|eat|drink) ([\w\s]+?)(?:\s+and\s+|\s*,\s*|\.|\?|$)",
     "user {}s {}"),
    (r"allergic to ([\w\s]+?)(?:\s+and\s+|\s*,\s*|\.|\?|$)",
     "user is allergic to {}"),
    (r"my ([\w\s]+?) is ([\w\s\d]+?)(?:\s+and\s+|\s*,\s*|\.|\?|$)",
     "user's {} is {}"),
]


def extract_atomic_facts(text: str) -> List[str]:
    """Extract short, normalized fact strings from a (user) utterance.

    Facts shorter than three words are dropped as too generic to match
    reliably later.

    BUG FIX: the original deduplicated with ``list(set(facts))``, whose
    order depends on PYTHONHASHSEED — at odds with the package's claim of
    deterministic scoring. Deduplication now preserves first-seen order
    (dict.fromkeys), so repeated runs yield identical fact lists.
    """
    facts = []
    text_lower = text.lower().strip()
    for pattern, template in FACT_PATTERNS:
        for match in re.findall(pattern, text_lower):
            if isinstance(match, tuple):
                fact = template.format(*[m.strip() for m in match])
            else:
                fact = template.format(match.strip())
            if len(fact.split()) >= 3:
                facts.append(fact)
    # Order-preserving dedupe keeps the output deterministic.
    return list(dict.fromkeys(facts))
28
def compute_T(
    conversation: List[Tuple[str, str]],
    sbert_model=None,
    threshold: float = 0.60,
    gamma: float = 0.80
) -> Dict:
    """Temporal-retention score: does each assistant turn recall user facts?

    User facts are mined incrementally with extract_atomic_facts. Every
    assistant turn is scored by the fraction of facts stated *before* it
    whose SBERT cosine similarity to the turn reaches *threshold*.
    Per-turn scores are combined with time-decay weights gamma**(N-1-t).
    """
    if sbert_model is None:
        from sentence_transformers import SentenceTransformer
        sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

    cumulative_facts = []
    assistant_turns = []
    for speaker, utterance in conversation:
        if speaker == "user":
            # Accumulate newly stated facts, keeping first-seen order.
            for fact in extract_atomic_facts(utterance):
                if fact not in cumulative_facts:
                    cumulative_facts.append(fact)
        elif speaker == "assistant":
            # Snapshot the facts known before this reply.
            assistant_turns.append({
                "text": utterance,
                "facts": list(cumulative_facts)
            })

    N = len(assistant_turns)
    if N == 0:
        return {
            "score": 1.0,
            "per_turn_scores": [],
            "decay_weights": [],
            "user_facts": cumulative_facts,
            "explanation": "No assistant turns found. T = 1.0."
        }

    per_turn_scores = []
    for snapshot in assistant_turns:
        known_facts = snapshot["facts"]
        reply_text = snapshot["text"]
        if not known_facts:
            # Nothing to remember yet: perfect retention by definition.
            per_turn_scores.append(1.0)
            continue
        fact_embs = sbert_model.encode(
            known_facts,
            convert_to_numpy=True,
            normalize_embeddings=True
        )
        turn_emb = sbert_model.encode(
            [reply_text],
            convert_to_numpy=True,
            normalize_embeddings=True
        )
        # Cosine similarity of each fact against the reply (unit vectors).
        sims = np.dot(fact_embs, turn_emb.T).flatten()
        recalled = int(np.sum(sims >= threshold))
        per_turn_scores.append(round(recalled / len(known_facts), 4))

    decay_weights = [gamma ** (N - 1 - t) for t in range(N)]
    Z = sum(decay_weights)
    T_score = sum(w * s for w, s in zip(decay_weights, per_turn_scores)) / Z
    return {
        "score": round(T_score, 4),
        "per_turn_scores": per_turn_scores,
        "decay_weights": [round(w, 4) for w in decay_weights],
        "user_facts": cumulative_facts,
        "explanation": (
            f"Computed T across {N} assistant turns with gamma={gamma}. "
            f"Per-turn scores: {per_turn_scores}. "
            f"Decay weights: {[round(w,3) for w in decay_weights]}. "
            f"T = {T_score:.4f}."
        )
    }
trace_score/trace.py ADDED
@@ -0,0 +1,166 @@
1
+ from typing import List, Tuple, Dict, Optional
2
+ from .components.temporal import compute_T
3
+ from .components.reliability import compute_R
4
+ from .components.adaptive import compute_A
5
+ from .components.coherence import compute_C
6
+ from .components.epistemic import compute_E
7
# Named component-weight presets; each must sum to 1.0.
WEIGHT_PRESETS = {
    "equal": {"w_T": 0.20, "w_R": 0.20, "w_A": 0.20, "w_C": 0.20, "w_E": 0.20},
    "customer_service": {"w_T": 0.30, "w_R": 0.20, "w_A": 0.30, "w_C": 0.10, "w_E": 0.10},
    "technical_qa": {"w_T": 0.20, "w_R": 0.30, "w_A": 0.10, "w_C": 0.10, "w_E": 0.30},
    "medical_chatbot": {"w_T": 0.30, "w_R": 0.30, "w_A": 0.20, "w_C": 0.10, "w_E": 0.10},
    "education_tutor": {"w_T": 0.20, "w_R": 0.10, "w_A": 0.30, "w_C": 0.30, "w_E": 0.10},
}

# (lower bound, label) pairs, checked top-down by _interpret.
INTERPRETATION_THRESHOLDS = [
    (0.85, "Excellent — conversation is highly consistent"),
    (0.70, "Good — minor consistency issues"),
    (0.55, "Moderate — notable consistency failures"),
    (0.40, "Poor — significant consistency problems"),
    (0.00, "Very poor — conversation is highly inconsistent"),
]


class TRACEEvaluator:
    """Computes the TRACE consistency score for a multi-turn conversation.

    The SBERT embedder and NLI cross-encoder are loaded once (lazily) and
    reused across calls, so a single evaluator instance is much faster for
    batch evaluation than repeated compute_TRACE calls.
    """

    def __init__(self):
        # Models are loaded lazily on the first evaluate() that needs them.
        self._sbert = None
        self._nli = None

    def _load_models(self):
        """Load (once) the sentence embedder and the NLI cross-encoder."""
        if self._sbert is None:
            from sentence_transformers import SentenceTransformer
            self._sbert = SentenceTransformer("all-MiniLM-L6-v2")
        if self._nli is None:
            from sentence_transformers import CrossEncoder
            self._nli = CrossEncoder(
                "cross-encoder/nli-deberta-v3-small",
                max_length=512
            )

    def evaluate(
        self,
        conversation: List[Tuple[str, str]],
        weights: Optional[Dict[str, float]] = None,
        preset: str = "equal",
        gamma: float = 0.80,
        lambda_: float = 0.15,
        delta: float = 0.10,
        alpha: float = 0.05,
        beta: float = 0.05,
        verbose: bool = False
    ) -> Dict:
        """Score *conversation* and return the full TRACE breakdown dict.

        Parameters
        ----------
        conversation : list of (role, text); role must be "user" or "assistant".
        weights : custom component weights summing to 1.0; overrides *preset*.
        preset : key of WEIGHT_PRESETS used when *weights* is None.
        gamma : time-decay factor forwarded to every component.
        lambda_, delta : contradiction / variance penalty coefficients.
        alpha, beta : T*C and A*R interaction coefficients.
        verbose : include per-component detail dicts under "details".

        Raises
        ------
        ValueError : unknown role, empty turn text, or invalid weights/preset.

        BUG FIX: input validation and the empty-conversation shortcut now
        run BEFORE model loading; the original called _load_models() first,
        so an empty or malformed conversation still triggered the (possibly
        multi-hundred-MB) model import/download.
        """
        if not conversation:
            return self._empty_result(weights or WEIGHT_PRESETS[preset])
        for role, text in conversation:
            if role not in ("user", "assistant"):
                raise ValueError(f"Invalid role '{role}'. Must be 'user' or 'assistant'.")
            if not text or not text.strip():
                raise ValueError("Empty turn text found in conversation.")
        w = self._resolve_weights(weights, preset)
        self._load_models()

        # Component scores; each returns a dict with at least "score".
        t_result = compute_T(conversation, sbert_model=self._sbert, gamma=gamma)
        r_result = compute_R(conversation, nli_model=self._nli, gamma=gamma)
        a_result = compute_A(conversation, sbert_model=self._sbert,
                             nli_model=self._nli, gamma=gamma)
        c_result = compute_C(conversation, sbert_model=self._sbert, gamma=gamma)
        e_result = compute_E(conversation, sbert_model=self._sbert, gamma=gamma)
        T = t_result["score"]
        R = r_result["score"]
        A = a_result["score"]
        C = c_result["score"]
        E = e_result["score"]
        P = r_result["penalty"]           # global contradiction rate
        V = e_result["variance_penalty"]  # normalized confidence variance

        base_score = (
            w["w_T"] * T +
            w["w_R"] * R +
            w["w_A"] * A +
            w["w_C"] * C +
            w["w_E"] * E
        )
        penalty_term = lambda_ * P + delta * V
        interaction_term = alpha * (T * C) + beta * (A * R)
        raw_trace = base_score - penalty_term + interaction_term
        # Penalties/interactions can push past [0, 1]; clamp before rounding.
        trace_score = round(max(0.0, min(1.0, raw_trace)), 4)
        formula_breakdown = (
            f"TRACE = [{w['w_T']}×{T:.3f} + {w['w_R']}×{R:.3f} + "
            f"{w['w_A']}×{A:.3f} + {w['w_C']}×{C:.3f} + {w['w_E']}×{E:.3f}]"
            f" - [{lambda_}×{P:.3f} + {delta}×{V:.3f}]"
            f" + [{alpha}×({T:.3f}×{C:.3f}) + {beta}×({A:.3f}×{R:.3f})]"
            f" = {base_score:.4f} - {penalty_term:.4f} + {interaction_term:.4f}"
            f" = {raw_trace:.4f} → clamped → {trace_score:.4f}"
        )
        result = {
            "trace_score": trace_score,
            "base_score": round(base_score, 4),
            "penalty_term": round(penalty_term, 4),
            "interaction_term": round(interaction_term, 4),
            "T": T, "R": R, "A": A, "C": C, "E": E,
            "P": P, "V": V,
            "weights": w,
            "preset": preset if weights is None else "custom",
            "gamma": gamma,
            "lambda": lambda_,
            "delta": delta,
            "alpha": alpha,
            "beta": beta,
            "interpretation": self._interpret(trace_score),
            "formula_breakdown": formula_breakdown,
            "num_turns": len(conversation),
        }
        if verbose:
            result["details"] = {
                "T": t_result,
                "R": r_result,
                "A": a_result,
                "C": c_result,
                "E": e_result,
            }
        return result

    def _resolve_weights(self, weights, preset):
        """Validate custom weights or look up a named preset."""
        if weights is not None:
            total = sum(weights.values())
            if abs(total - 1.0) > 0.01:
                raise ValueError(f"Weights must sum to 1.0, got {total:.4f}.")
            return weights
        if preset not in WEIGHT_PRESETS:
            raise ValueError(
                f"Unknown preset '{preset}'. "
                f"Available: {list(WEIGHT_PRESETS.keys())}"
            )
        return WEIGHT_PRESETS[preset]

    def _interpret(self, score: float) -> str:
        """Map a score to a human-readable label (first threshold <= score)."""
        for threshold, label in INTERPRETATION_THRESHOLDS:
            if score >= threshold:
                return label
        # Only reachable for negative scores (trace_score is clamped >= 0).
        return INTERPRETATION_THRESHOLDS[-1][1]

    def _empty_result(self, w):
        """Neutral result for an empty conversation.

        NOTE(review): this dict omits some keys present in full evaluate()
        results (formula_breakdown, preset, gamma, ...) — confirm downstream
        consumers tolerate the narrower shape before widening it.
        """
        return {
            "trace_score": 1.0,
            "base_score": 1.0,
            "penalty_term": 0.0,
            "interaction_term": 0.0,
            "T": 1.0, "R": 1.0, "A": 1.0, "C": 1.0, "E": 1.0,
            "P": 0.0, "V": 0.0,
            "weights": w,
            "interpretation": "Empty conversation. TRACE = 1.0.",
            "num_turns": 0,
        }
144
def compute_TRACE(
    conversation: List[Tuple[str, str]],
    weights: Optional[Dict[str, float]] = None,
    preset: str = "equal",
    gamma: float = 0.80,
    lambda_: float = 0.15,
    delta: float = 0.10,
    alpha: float = 0.05,
    beta: float = 0.05,
    verbose: bool = False
) -> Dict:
    """One-shot convenience wrapper around TRACEEvaluator.evaluate.

    A fresh evaluator is built per call (models load each time); for batch
    scoring, instantiate TRACEEvaluator once and reuse it instead.
    """
    return TRACEEvaluator().evaluate(
        conversation,
        weights=weights,
        preset=preset,
        gamma=gamma,
        lambda_=lambda_,
        delta=delta,
        alpha=alpha,
        beta=beta,
        verbose=verbose
    )
@@ -0,0 +1,270 @@
1
+ Metadata-Version: 2.4
2
+ Name: trace-score
3
+ Version: 0.1.0
4
+ Summary: Multi-turn LLM Conversation Consistency Metric
5
+ Home-page: https://github.com/Giri530/trace-score
6
+ Author: Girinath V
7
+ Author-email: Girinath V <your-email@gmail.com>
8
+ License: MIT License
9
+
10
+ Copyright (c) 2026 Girinath V
11
+
12
+ Permission is hereby granted, free of charge, to any person obtaining a copy
13
+ of this software and associated documentation files (the "Software"), to deal
14
+ in the Software without restriction, including without limitation the rights
15
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the Software is
17
+ furnished to do so, subject to the following conditions:
18
+
19
+ The above copyright notice and this permission notice shall be included in all
20
+ copies or substantial portions of the Software.
21
+
22
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28
+ SOFTWARE.
29
+
30
+ Project-URL: Homepage, https://github.com/Giri530/trace-score
31
+ Project-URL: Repository, https://github.com/Giri530/trace-score
32
+ Keywords: nlp,llm,evaluation,multi-turn,consistency,trace-score
33
+ Classifier: Development Status :: 3 - Alpha
34
+ Classifier: Intended Audience :: Science/Research
35
+ Classifier: License :: OSI Approved :: MIT License
36
+ Classifier: Programming Language :: Python :: 3
37
+ Requires-Python: >=3.8
38
+ Description-Content-Type: text/markdown
39
+ License-File: LICENSE
40
+ Requires-Dist: sentence-transformers>=2.2.0
41
+ Requires-Dist: numpy>=1.21.0
42
+ Requires-Dist: torch>=1.11.0
43
+ Dynamic: author
44
+ Dynamic: home-page
45
+ Dynamic: license-file
46
+ Dynamic: requires-python
47
+
48
+ # TRACE Score
49
+
50
+ **Multi-turn LLM Conversation Consistency Metric**
51
+
52
+ > The first unified, deterministic, reference-free evaluation metric for
53
+ > multi-turn conversational consistency in Large Language Models.
54
+
55
+ [![PyPI version](https://badge.fury.io/py/trace-score.svg)](https://pypi.org/project/trace-score/)
56
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
57
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
58
+
59
+ ---
60
+
61
+ ## The Problem
62
+
63
+ Existing metrics (BLEU, ROUGE, BERTScore, RAGAS) evaluate each conversation
64
+ turn **in isolation**. They cannot detect failures that only become visible
65
+ **across multiple turns**:
66
+
67
+ | Failure Type | Example | BLEU | ROUGE | BERTScore | TRACE |
68
+ |---|---|---|---|---|---|
69
+ | Fact forgotten | User says "I am diabetic" → model recommends sugar-rich food 5 turns later | Miss | Miss | Miss | **Catch** |
70
+ | Correction ignored | User corrects model → model reverts to old behavior | Miss | Miss | Miss | **Catch** |
71
+ | Self-contradiction | Model says X at turn 2, contradicts X at turn 7 | Miss | Miss | Miss | **Catch** |
72
+ | Topic drift | Conversation gradually drifts off-topic | Miss | Miss | Miss | **Catch** |
73
+ | Confidence drift | Model says "definitely" then "perhaps" about same claim | Miss | Miss | Miss | **Catch** |
74
+
75
+ ---
76
+
77
+ ## Formula
78
+
79
+ ```
80
+ TRACE(C) = Σ(wᵢ · Sᵢ) − λ·P − δ·V + α·(T·C) + β·(A·R)
81
+ ```
82
+
83
+ Each component uses **time-decay aggregation** — recent turns weighted more:
84
+
85
+ ```
86
+ Sᵢ = (1/Z) · Σₜ γ^(N-t) · Sᵢ,ₜ      (t = 1…N; the most recent turn gets weight γ⁰ = 1)
87
+ Z = Σₜ γ^(N-t)
88
+ ```
89
+
90
+ | Symbol | Component | Measures |
91
+ |--------|-----------|---------|
92
+ | **T** | Temporal Retention | Did assistant remember user-stated facts? |
93
+ | **R** | Reliability Consistency | Did assistant contradict itself? |
94
+ | **A** | Adaptive Correction | Did assistant retain user corrections? |
95
+ | **C** | Context Coherence | Did conversation stay on topic? |
96
+ | **E** | Epistemic Stability | Did confidence stay calibrated? |
97
+ | P | Contradiction penalty | Global contradiction rate |
98
+ | V | Variance penalty | Confidence variance |
99
+ | γ | Time decay factor | Default: 0.80 |
100
+ | λ | Contradiction weight | Default: 0.15 |
101
+ | δ | Variance weight | Default: 0.10 |
102
+ | α | T·C interaction | Default: 0.05 |
103
+ | β | A·R interaction | Default: 0.05 |
104
+
105
+ ---
106
+
107
+ ## Install
108
+
109
+ ```bash
110
+ pip install trace-score
111
+ ```
112
+
113
+ ---
114
+
115
+ ## Quick Start
116
+
117
+ ```python
118
+ from trace_score import compute_TRACE
119
+
120
+ conversation = [
121
+ ("user", "I am diabetic and hate spicy food"),
122
+ ("assistant", "I will suggest low sugar mild options."),
123
+ ("user", "Actually I eat fish too. I am pescatarian."),
124
+ ("assistant", "Spicy chicken with cashews!"), # failure turn
125
+ ]
126
+
127
+ result = compute_TRACE(conversation, verbose=True)
128
+
129
+ print(result["trace_score"]) # 0.41 — catches failures
130
+ print(result["T"]) # 0.50 — forgot user facts
131
+ print(result["A"]) # 0.00 — ignored correction
132
+ print(result["formula_breakdown"]) # full formula with values
133
+ print(result["interpretation"]) # "Poor — significant consistency problems"
134
+ ```
135
+
136
+ ---
137
+
138
+ ## Batch Evaluation
139
+
140
+ ```python
141
+ from trace_score import TRACEEvaluator
142
+
143
+ # Models loaded once, reused across all calls — much faster
144
+ evaluator = TRACEEvaluator()
145
+ results = [evaluator.evaluate(conv) for conv in conversations]
146
+ ```
147
+
148
+ ---
149
+
150
+ ## Adaptive Weights
151
+
152
+ ```python
153
+ # Equal weights (default)
154
+ result = compute_TRACE(conv, preset="equal")
155
+
156
+ # Medical chatbot — memory and reliability weighted more
157
+ result = compute_TRACE(conv, preset="medical_chatbot")
158
+
159
+ # Custom weights — must sum to 1.0
160
+ result = compute_TRACE(conv, weights={
161
+ "w_T": 0.35, "w_R": 0.25,
162
+ "w_A": 0.20, "w_C": 0.10, "w_E": 0.10
163
+ })
164
+ ```
165
+
166
+ Available presets: `equal`, `customer_service`, `technical_qa`,
167
+ `medical_chatbot`, `education_tutor`
168
+
169
+ ---
170
+
171
+ ## Benchmark Results
172
+
173
+ Evaluated on **30 multi-turn conversations** across 3 categories
174
+ (Fact Memory, Correction Retention, Contradiction Detection).
175
+ Conversations generated by **Llama-3.1-8B via Groq API**.
176
+
177
+ ### Overall Metric Comparison
178
+
179
+ | Metric | Overall | Fact Memory | Correction | Contradiction |
180
+ |--------|---------|-------------|------------|---------------|
181
+ | **TRACE** | **0.699** | **0.703** | **0.550** | **0.843** |
182
+ | BLEU | 0.102 | 0.046 | 0.149 | 0.110 |
183
+ | ROUGE-L | 0.239 | 0.177 | 0.301 | 0.239 |
184
+ | BERTScore | 0.822 | 0.800 | 0.842 | 0.823 |
185
+
186
+ **Key finding:** BLEU and ROUGE-L show similar low scores across all categories
187
+ — they cannot distinguish between different types of consistency failures.
188
+ BERTScore appears high but provides no diagnostic breakdown.
189
+ **TRACE clearly separates Correction (0.550) from Contradiction (0.843)**,
190
+ revealing that Llama-3.1-8B struggles most with retaining user corrections.
191
+
192
+ ---
193
+
194
+ ### TRACE Component Breakdown by Category
195
+
196
+ | Category | T | R | A | C | E |
197
+ |----------|---|---|---|---|---|
198
+ | Fact Memory | 0.137 | 0.955 | **1.000** | 0.503 | 0.697 |
199
+ | Correction | 0.491 | 0.927 | **0.144** | 0.465 | 0.712 |
200
+ | Contradiction | **0.973** | 0.875 | 0.900 | 0.510 | 0.696 |
201
+
202
+ **Diagnostic insight:**
203
+
204
+ - Fact Memory: T=0.137 — model **forgets user-stated facts** (A=1.0 means
205
+ no corrections were needed, so A is vacuously true here)
206
+ - Correction: A=0.144 — model **ignores user corrections** (critical failure)
207
+ - Contradiction: T=0.973, A=0.900 — model handles these well
208
+
209
+ No existing metric (BLEU, ROUGE, BERTScore) can produce this breakdown.
210
+
211
+ ---
212
+
213
+ ### The Gap TRACE Reveals — BERTScore vs TRACE
214
+
215
+ Conversations where **BERTScore is high but TRACE is low**
216
+ (failures invisible to BERTScore, caught by TRACE):
217
+
218
+ | Conversation | Category | TRACE | BERTScore | Gap |
219
+ |---|---|---|---|---|
220
+ | CR_006 | Correction | 0.314 | 0.876 | +0.562 |
221
+ | CR_009 | Correction | 0.381 | 0.861 | +0.480 |
222
+ | CR_004 | Correction | 0.535 | 0.884 | +0.349 |
223
+ | CR_003 | Correction | 0.494 | 0.864 | +0.370 |
224
+ | CR_002 | Correction | 0.442 | 0.822 | +0.380 |
225
+
226
+ **In all 5 cases:** BERTScore ≥ 0.82 (looks good), TRACE < 0.55 (failures detected).
227
+ The A component reveals why — user corrections completely ignored (A=0.00).
228
+ This is invisible to any per-turn metric.
229
+
230
+ ---
231
+
232
+ ## Why TRACE?
233
+
234
+ | Metric | Multi-turn | Reference-free | Deterministic | Time-decay | Diagnostic |
235
+ |--------|-----------|----------------|---------------|-----------|-----------|
236
+ | BLEU | No | No | Yes | No | No |
237
+ | ROUGE | No | No | Yes | No | No |
238
+ | BERTScore | No | No | Yes | No | No |
239
+ | RAGAS | No | Yes | No | No | Partial |
240
+ | **TRACE** | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** |
241
+
242
+ ---
243
+
244
+ ## Models Used
245
+
246
+ | Model | Purpose | Size |
247
+ |-------|---------|------|
248
+ | `all-MiniLM-L6-v2` | Semantic similarity (T, A, C, E) | 80MB |
249
+ | `cross-encoder/nli-deberta-v3-small` | Contradiction detection (R, A) | 184MB |
250
+
251
+ Models downloaded automatically on first use (~264MB total).
252
+ CPU-friendly — no GPU required.
253
+
254
+ ---
255
+
256
+ ## Citation
257
+
258
+ ```bibtex
259
+ @article{girinathv2026trace,
260
+ title = {TRACE: A Unified Deterministic Metric for Multi-turn
261
+ Conversational Consistency in Large Language Models},
262
+ author = {Girinath, V},
263
+ year = {2026}
264
+ }
265
+ ```
266
+
267
+ ---
268
+
269
+ *Author: Girinath V*
270
+ *GitHub: https://github.com/Giri530/trace-score*
@@ -0,0 +1,13 @@
1
+ trace_score/__init__.py,sha256=--Nr7FRCszGBpz1D-CMH8cxsAwcbsS9UYo67X0U0iog,341
2
+ trace_score/trace.py,sha256=60MxMgmPkV04CLnuOZqWJnRqVGtJJX5gpjn02pL4ZXw,6858
3
+ trace_score/components/__init__.py,sha256=HPkrl-R3_AmDdcVjYnZC6VlhKarNAeDrYsYQ03TB9fc,171
4
+ trace_score/components/adaptive.py,sha256=yfA9I02o9gRfVkymTFfDkUSv8q23VbhBTGJukrxhSIY,3984
5
+ trace_score/components/coherence.py,sha256=JNsVH0dqkST_p2gTv9riKjFZ0DALkAk1YsQ5LZjac1o,1618
6
+ trace_score/components/epistemic.py,sha256=4qOKEQdHXj-EOl9jtNBO1jSFjjysj8isX3xNNanbPw0,4352
7
+ trace_score/components/reliability.py,sha256=nJjXwjWIEl5ZItXW_U8l9Gy_BtyimV83N7U9FAldAm0,3365
8
+ trace_score/components/temporal.py,sha256=OCQw4lZEUh4UepelbDzV1pAIpZIz9oFh-MJgdjaZh7Y,3442
9
+ trace_score-0.1.0.dist-info/licenses/LICENSE,sha256=FWXZwUInhOZvTxKqwnCPYbVby_65dHGXRCLrdAFhtJ8,1067
10
+ trace_score-0.1.0.dist-info/METADATA,sha256=PxZfpV6eSf-yS9nAWwKLFcUziNLmeZ76UlYf3hqfDlI,9294
11
+ trace_score-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
12
+ trace_score-0.1.0.dist-info/top_level.txt,sha256=QNr7AmluwDQlIJJgmyz35K5ACpW9gEiOPEDECwrft_4,12
13
+ trace_score-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Girinath V
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ trace_score