trace-score 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trace_score/__init__.py +9 -0
- trace_score/components/__init__.py +5 -0
- trace_score/components/adaptive.py +111 -0
- trace_score/components/coherence.py +46 -0
- trace_score/components/epistemic.py +110 -0
- trace_score/components/reliability.py +87 -0
- trace_score/components/temporal.py +94 -0
- trace_score/trace.py +166 -0
- trace_score-0.1.0.dist-info/METADATA +270 -0
- trace_score-0.1.0.dist-info/RECORD +13 -0
- trace_score-0.1.0.dist-info/WHEEL +5 -0
- trace_score-0.1.0.dist-info/licenses/LICENSE +21 -0
- trace_score-0.1.0.dist-info/top_level.txt +1 -0
trace_score/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# trace_score package — public API surface.
# Re-export the top-level entry points and each component scorer so users
# can write `from trace_score import compute_TRACE`.
from .trace import compute_TRACE, TRACEEvaluator, WEIGHT_PRESETS
from .components.temporal import compute_T
from .components.reliability import compute_R
from .components.adaptive import compute_A
from .components.coherence import compute_C
from .components.epistemic import compute_E

# Package metadata.
__version__ = "0.1.0"
__author__ = "Girinath V"
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import numpy as np
|
|
3
|
+
from typing import List, Tuple, Dict, Optional
|
|
4
|
+
# Regex markers that signal the user is correcting or reminding the assistant.
CORRECTION_MARKERS = [
    r"\bactually\b", r"\bno[,\s]",
    r"\bnot exactly\b", r"\bi said\b",
    r"\bi mentioned\b", r"\bi told you\b",
    r"\bplease avoid\b", r"\bplease don'?t\b",
    r"\bdon'?t forget\b", r"\bi already said\b",
    r"\bcorrection\b", r"\bwait[,\s]",
    r"\bi meant\b", r"\bthat'?s wrong\b",
    r"\bnot quite\b", r"\byou forgot\b",
    r"\byou missed\b", r"\bremember[,\s]",
]


def has_correction_marker(text: str) -> bool:
    """Return True if *text* contains any lexical correction marker."""
    text_lower = text.lower()
    return any(re.search(p, text_lower) for p in CORRECTION_MARKERS)


def is_correction_turn(
    user_text: str,
    prev_asst_text: Optional[str],
    nli_model=None
) -> bool:
    """Decide whether a user turn corrects the previous assistant turn.

    A turn must contain a lexical correction marker. When both the previous
    assistant text and an NLI cross-encoder are available, the lexical signal
    is additionally gated on the NLI contradiction probability between the
    assistant turn (premise) and the user turn (hypothesis).

    Args:
        user_text: The user's turn text.
        prev_asst_text: The most recent assistant turn, or None.
        nli_model: Optional CrossEncoder-style model with a ``predict`` method.

    Returns:
        True if the turn is judged to be a correction.
    """
    if not has_correction_marker(user_text):
        return False
    # Without context or a model we can only rely on the lexical signal.
    if prev_asst_text is None or nli_model is None:
        return True
    scores = nli_model.predict(
        [[prev_asst_text, user_text]],
        apply_softmax=True
    )
    # BUGFIX: the original returned
    #   float(scores[0][0]) >= 0.35 or has_correction_marker(user_text)
    # but the marker check already passed above, so the `or` clause was
    # always True — the NLI gate was dead code and the model call wasted.
    # Gate on the contradiction probability alone.
    return float(scores[0][0]) >= 0.35
|
|
32
|
+
def compute_A(
    conversation: List[Tuple[str, str]],
    sbert_model=None,
    nli_model=None,
    retention_threshold: float = 0.55,
    gamma: float = 0.80
) -> Dict:
    """Adaptive-correction score: did the assistant retain user corrections?

    Every user turn flagged as a correction is checked for semantic presence
    in at least one later assistant turn (max cosine similarity against the
    correction text >= retention_threshold). The resulting binary
    per-correction scores are aggregated with exponential time decay so
    recent corrections weigh more.
    """
    if sbert_model is None:
        from sentence_transformers import SentenceTransformer
        sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
    if nli_model is None:
        from sentence_transformers import CrossEncoder
        nli_model = CrossEncoder(
            "cross-encoder/nli-deberta-v3-small",
            max_length=512
        )

    # Collect user turns flagged as corrections, remembering their positions.
    corrections = []
    last_assistant = None
    for idx, (role, text) in enumerate(conversation):
        if role == "assistant":
            last_assistant = text
        elif role == "user" and is_correction_turn(text, last_assistant, nli_model):
            corrections.append({"turn_index": idx, "text": text})

    K = len(corrections)
    if K == 0:
        return {
            "score": 1.0,
            "per_turn_scores": [],
            "decay_weights": [],
            "corrections_found": 0,
            "retained_count": 0,
            "details": [],
            "explanation": "No correction turns detected. A = 1.0 (vacuously true)."
        }

    per_turn_scores = []
    details = []
    for correction in corrections:
        turn_idx = correction["turn_index"]
        c_text = correction["text"]
        # Assistant turns that come after the correction.
        later_replies = [
            text
            for pos, (role, text) in enumerate(conversation)
            if role == "assistant" and pos > turn_idx
        ]
        if later_replies:
            c_emb = sbert_model.encode(
                [c_text], convert_to_numpy=True, normalize_embeddings=True
            )
            reply_embs = sbert_model.encode(
                later_replies, convert_to_numpy=True, normalize_embeddings=True
            )
            sims = np.dot(c_emb, reply_embs.T)[0]
            max_sim = float(sims.max())
            A_k = 1.0 if max_sim >= retention_threshold else 0.0
        else:
            # Nothing after the correction to judge — count as retained.
            A_k, max_sim = 1.0, 1.0
        per_turn_scores.append(A_k)
        details.append({
            "turn_index": turn_idx,
            "text": c_text[:100],
            "retained": bool(A_k),
            "max_sim": round(max_sim, 4),
        })

    # Recency-weighted mean: the most recent correction gets weight gamma^0.
    decay_weights = [gamma ** (K - 1 - k) for k in range(K)]
    Z = sum(decay_weights)
    A_score = sum(w * s for w, s in zip(decay_weights, per_turn_scores)) / Z
    return {
        "score": round(A_score, 4),
        "per_turn_scores": per_turn_scores,
        "decay_weights": [round(w, 4) for w in decay_weights],
        "corrections_found": K,
        "retained_count": int(sum(per_turn_scores)),
        "details": details,
        "explanation": (
            f"Found {K} correction(s). "
            f"Per-correction scores: {per_turn_scores}. "
            f"A = {A_score:.4f}."
        )
    }
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from typing import List, Tuple, Dict
|
|
3
|
+
def compute_C(
    conversation: List[Tuple[str, str]],
    sbert_model=None,
    gamma: float = 0.80
) -> Dict:
    """Context-coherence score from adjacent-turn SBERT similarity.

    Consecutive turn embeddings are compared with cosine similarity
    (embeddings are unit-normalized, so a dot product suffices); the
    similarities are clipped to [0, 1] and aggregated with exponential
    time decay. ``mean_drift`` is 1 minus the unweighted mean similarity.
    """
    if sbert_model is None:
        from sentence_transformers import SentenceTransformer
        sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

    texts = [turn_text for _, turn_text in conversation]
    n_turns = len(texts)
    if n_turns <= 1:
        # A single turn (or none) is trivially coherent.
        return {
            "score": 1.0,
            "per_turn_sims": [],
            "decay_weights": [],
            "mean_drift": 0.0,
            "explanation": f"Only {n_turns} turn(s). C = 1.0."
        }

    embs = sbert_model.encode(
        texts,
        convert_to_numpy=True,
        normalize_embeddings=True,
        batch_size=64,
        show_progress_bar=False
    )
    # Dot products of each adjacent embedding pair, clipped to [0, 1].
    adjacent = np.clip(np.sum(embs[:-1] * embs[1:], axis=1), 0.0, 1.0)
    per_turn_sims = [round(float(v), 4) for v in adjacent]

    M = len(per_turn_sims)
    weights = [gamma ** (M - 1 - t) for t in range(M)]
    norm = sum(weights)
    C_score = sum(w * s for w, s in zip(weights, per_turn_sims)) / norm
    mean_drift = 1.0 - float(np.mean(adjacent))
    return {
        "score": round(C_score, 4),
        "per_turn_sims": per_turn_sims,
        "decay_weights": [round(w, 4) for w in weights],
        "mean_drift": round(mean_drift, 4),
        "explanation": (
            f"Encoded {n_turns} turns via SBERT. "
            f"Adjacent-pair sims: {per_turn_sims}. "
            f"C = {C_score:.4f}, mean drift = {mean_drift:.4f}."
        )
    }
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from typing import List, Tuple, Dict
|
|
3
|
+
# Phrase -> confidence mapping used for lexical confidence estimation.
CONFIDENCE_LEXICON = {
    "definitely": 1.00, "certainly": 1.00,
    "absolutely": 1.00, "undoubtedly": 0.98,
    "without a doubt": 0.97, "for sure": 0.95,
    "clearly": 0.94, "obviously": 0.93,
    "i am certain": 0.95, "i am confident": 0.92,
    "i know": 0.90, "i believe": 0.80,
    "i think": 0.75, "likely": 0.77,
    "probably": 0.74, "it seems": 0.72,
    "generally": 0.76, "typically": 0.75,
    "usually": 0.74, "might": 0.52,
    "could": 0.51, "may": 0.56,
    "possibly": 0.46, "i suppose": 0.51,
    "i guess": 0.46, "sometimes": 0.56,
    "perhaps": 0.31, "maybe": 0.31,
    "i am not sure": 0.20, "i'm not sure": 0.20,
    "uncertain": 0.26, "unclear": 0.26,
    "it depends": 0.31, "hard to say": 0.20,
    "i doubt": 0.22, "i cannot say": 0.15,
}

# Fallback confidence when no lexicon phrase is present.
DEFAULT_CONFIDENCE = 0.68

# Anchor sentences for SBERT-based confidence scoring.
HIGH_CONF_ANCHOR = "I am absolutely certain and confident about this."
LOW_CONF_ANCHOR = "I am not sure and quite uncertain about this perhaps."


def lexicon_confidence(text: str) -> float:
    """Average the lexicon scores of every confidence phrase found in *text*.

    NOTE(review): matching is plain substring containment, so e.g. "maybe"
    also matches "may" — presumably acceptable for a coarse signal; confirm
    if word-boundary matching is intended.

    Returns DEFAULT_CONFIDENCE when no phrase matches.
    """
    lowered = text.lower()
    # Longest phrases first (order does not change the mean, but keeps
    # iteration deterministic and mirrors longest-match intent).
    ordered = sorted(
        CONFIDENCE_LEXICON.items(), key=lambda kv: len(kv[0]), reverse=True
    )
    hits = [score for phrase, score in ordered if phrase in lowered]
    if not hits:
        return DEFAULT_CONFIDENCE
    return float(np.mean(hits))
|
|
35
|
+
def sbert_anchor_confidence(text, sbert_model, high_emb, low_emb) -> float:
    """Confidence from similarity to high/low-confidence anchor sentences.

    Args:
        text: Sentence to score.
        sbert_model: Encoder with an ``encode`` method producing
            unit-normalized embeddings.
        high_emb: Pre-computed embedding of the high-confidence anchor.
        low_emb: Pre-computed embedding of the low-confidence anchor.

    Returns:
        sim_high / (sim_high + sim_low), clamped to [0, 1];
        DEFAULT_CONFIDENCE when the denominator is not positive.
    """
    text_emb = sbert_model.encode(
        [text], convert_to_numpy=True, normalize_embeddings=True
    )[0]
    sim_high = float(np.dot(text_emb, high_emb))
    sim_low = float(np.dot(text_emb, low_emb))
    total = sim_high + sim_low
    if total <= 0:
        return DEFAULT_CONFIDENCE
    # BUGFIX: cosine similarities can be negative, so sim_high/total could
    # fall outside [0, 1] (e.g. sim_high=0.5, sim_low=-0.2 -> 1.67). Clamp
    # so the result is always a valid confidence value.
    return float(min(1.0, max(0.0, sim_high / total)))
|
|
43
|
+
def compute_E(
    conversation: List[Tuple[str, str]],
    sbert_model=None,
    gamma: float = 0.80,
    lexicon_weight: float = 0.60,
    sbert_weight: float = 0.40
) -> Dict:
    """Epistemic-stability score of the assistant's expressed confidence.

    Per-turn confidence blends a lexicon estimate with an SBERT anchor
    estimate (weights ``lexicon_weight``/``sbert_weight``); turns are
    aggregated with exponential time decay, and a normalized variance
    penalty captures confidence swings across turns.
    """
    assistant_texts = [text for role, text in conversation if role == "assistant"]
    N = len(assistant_texts)
    # PERF FIX (hoisted early return): the original loaded the SBERT model
    # and encoded both anchor sentences *before* checking N == 0, doing
    # model work for conversations with no assistant turns. Check first.
    if N == 0:
        return {
            "score": 1.0,
            "variance_penalty": 0.0,
            "turn_confidences": [],
            "decay_weights": [],
            "mean_confidence": DEFAULT_CONFIDENCE,
            "raw_variance": 0.0,
            "explanation": "No assistant turns. E = 1.0, V = 0.0."
        }

    if sbert_model is None:
        from sentence_transformers import SentenceTransformer
        sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
    anchors = sbert_model.encode(
        [HIGH_CONF_ANCHOR, LOW_CONF_ANCHOR],
        convert_to_numpy=True,
        normalize_embeddings=True
    )
    high_emb = anchors[0]
    low_emb = anchors[1]

    # Blended per-turn confidence in [0, 1].
    turn_confidences = [
        lexicon_weight * lexicon_confidence(t) +
        sbert_weight * sbert_anchor_confidence(t, sbert_model, high_emb, low_emb)
        for t in assistant_texts
    ]

    if N == 1:
        # A single turn has no variance by definition.
        conf = turn_confidences[0]
        return {
            "score": round(conf, 4),
            "variance_penalty": 0.0,
            "turn_confidences": [round(conf, 4)],
            "decay_weights": [1.0],
            "mean_confidence": round(conf, 4),
            "raw_variance": 0.0,
            "explanation": f"Single assistant turn. E = {conf:.4f}, V = 0.0."
        }

    conf_array = np.array(turn_confidences)
    decay_weights = [gamma ** (N - 1 - t) for t in range(N)]
    Z = sum(decay_weights)
    E_score = sum(w * c for w, c in zip(decay_weights, turn_confidences)) / Z
    mu = float(conf_array.mean())
    raw_variance = float(np.mean((conf_array - mu) ** 2))
    # 0.25 is the maximum possible variance of values in [0, 1], so V_norm
    # is the variance normalized into [0, 1].
    V_norm = min(raw_variance / 0.25, 1.0)
    return {
        "score": round(E_score, 4),
        "variance_penalty": round(V_norm, 4),
        "turn_confidences": [round(c, 4) for c in turn_confidences],
        "decay_weights": [round(w, 4) for w in decay_weights],
        "mean_confidence": round(mu, 4),
        "raw_variance": round(raw_variance, 6),
        "explanation": (
            f"Analyzed {N} assistant turns. "
            f"Confidences: {[round(c,3) for c in turn_confidences]}. "
            f"E = {E_score:.4f}, V_norm = {V_norm:.4f}."
        )
    }
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
from itertools import combinations
|
|
2
|
+
from typing import List, Tuple, Dict
|
|
3
|
+
import numpy as np
|
|
4
|
+
def compute_R(
    conversation: List[Tuple[str, str]],
    nli_model=None,
    threshold: float = 0.75,
    gamma: float = 0.80,
    bidirectional: bool = True
) -> Dict:
    """Reliability score: do assistant turns contradict each other?

    Every unordered pair of assistant turns is scored by an NLI
    cross-encoder for contradiction probability (optionally in both
    premise/hypothesis directions, taking the max). Turns involved in any
    above-threshold contradiction score 0, others 1; the per-turn scores
    are aggregated with exponential time decay. ``penalty`` is the fraction
    of contradicting pairs.
    """
    if nli_model is None:
        from sentence_transformers import CrossEncoder
        nli_model = CrossEncoder(
            "cross-encoder/nli-deberta-v3-small",
            max_length=512
        )

    assistant_turns = [
        (idx, text)
        for idx, (role, text) in enumerate(conversation)
        if role == "assistant"
    ]
    N = len(assistant_turns)
    if N < 2:
        # Fewer than two assistant turns cannot contradict each other.
        return {
            "score": 1.0,
            "penalty": 0.0,
            "per_turn_scores": [1.0] * N,
            "decay_weights": [1.0] * N,
            "total_pairs": 0,
            "contradiction_count": 0,
            "contradictions": [],
            "explanation": f"Only {N} assistant turn(s). R = 1.0, P = 0.0."
        }

    all_pairs = list(combinations(range(N), 2))
    texts = [t for _, t in assistant_turns]

    def _contradiction_probs(pair_inputs):
        # Column 0 of the softmaxed NLI output is the contradiction class.
        scores = nli_model.predict(pair_inputs, apply_softmax=True)
        return scores[:, 0]

    forward_probs = _contradiction_probs(
        [[texts[i], texts[j]] for i, j in all_pairs]
    )
    if bidirectional:
        # NLI is directional; score both orders and keep the worse case.
        backward_probs = _contradiction_probs(
            [[texts[j], texts[i]] for i, j in all_pairs]
        )
        contradiction_probs = np.maximum(forward_probs, backward_probs)
    else:
        contradiction_probs = forward_probs

    contradictions = []
    flagged = set()
    for pair_idx, (i, j) in enumerate(all_pairs):
        prob = float(contradiction_probs[pair_idx])
        if prob < threshold:
            continue
        contradictions.append({
            "turn_i": assistant_turns[i][0],
            "turn_j": assistant_turns[j][0],
            "contradiction_probability": round(prob, 4),
            "text_i": texts[i][:100],
            "text_j": texts[j][:100],
        })
        flagged.update((i, j))

    per_turn_scores = [0.0 if t in flagged else 1.0 for t in range(N)]
    decay_weights = [gamma ** (N - 1 - t) for t in range(N)]
    Z = sum(decay_weights)
    R_score = sum(w * s for w, s in zip(decay_weights, per_turn_scores)) / Z
    total_pairs = len(all_pairs)
    P = len(contradictions) / total_pairs if total_pairs > 0 else 0.0
    return {
        "score": round(R_score, 4),
        "penalty": round(P, 4),
        "per_turn_scores": per_turn_scores,
        "decay_weights": [round(w, 4) for w in decay_weights],
        "total_pairs": total_pairs,
        "contradiction_count": len(contradictions),
        "contradictions": contradictions,
        "explanation": (
            f"Checked {total_pairs} pairs across {N} assistant turns. "
            f"Contradictions: {len(contradictions)}. "
            f"R = {R_score:.4f}, P (penalty) = {P:.4f}."
        )
    }
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import numpy as np
|
|
3
|
+
from typing import List, Tuple, Dict
|
|
4
|
+
# (regex, template) pairs: each regex captures a first-person statement and
# the template rewrites it as a third-person atomic fact string.
FACT_PATTERNS = [
    (r"i(?:'m| am) ([\w\s]+?)(?:\s+and\s+|\s*,\s*|\.|\?|$)",
     "user is {}"),
    (r"i have ([\w\s]+?)(?:\s+and\s+|\s*,\s*|\.|\?|$)",
     "user has {}"),
    (r"i (hate|love|prefer|avoid|dislike|like|eat|drink) ([\w\s]+?)(?:\s+and\s+|\s*,\s*|\.|\?|$)",
     "user {}s {}"),
    (r"allergic to ([\w\s]+?)(?:\s+and\s+|\s*,\s*|\.|\?|$)",
     "user is allergic to {}"),
    (r"my ([\w\s]+?) is ([\w\s\d]+?)(?:\s+and\s+|\s*,\s*|\.|\?|$)",
     "user's {} is {}"),
]


def extract_atomic_facts(text: str) -> List[str]:
    """Extract simple first-person facts from *text* as 'user ...' strings.

    Each FACT_PATTERNS regex is applied to the lowercased text and its
    captures are substituted into the paired template. Facts shorter than
    three words are discarded as degenerate captures.

    BUGFIX: the original deduplicated with ``list(set(facts))``, which made
    the returned order nondeterministic across runs (set iteration order
    depends on hashing). ``dict.fromkeys`` dedupes while preserving
    extraction order, so output is stable and reproducible.

    Returns:
        Ordered list of unique fact strings.
    """
    facts = []
    text_lower = text.lower().strip()
    for pattern, template in FACT_PATTERNS:
        for match in re.findall(pattern, text_lower):
            # Multi-group patterns yield tuples; single-group yield strings.
            if isinstance(match, tuple):
                fact = template.format(*[m.strip() for m in match])
            else:
                fact = template.format(match.strip())
            if len(fact.split()) >= 3:
                facts.append(fact)
    return list(dict.fromkeys(facts))
|
|
28
|
+
def compute_T(
    conversation: List[Tuple[str, str]],
    sbert_model=None,
    threshold: float = 0.60,
    gamma: float = 0.80
) -> Dict:
    """Temporal-retention score: does the assistant recall user-stated facts?

    User facts are accumulated as the dialogue progresses; each assistant
    turn is scored by the fraction of facts known at that point whose SBERT
    similarity to the turn meets ``threshold``. Per-turn recall fractions
    are aggregated with exponential time decay.
    """
    if sbert_model is None:
        from sentence_transformers import SentenceTransformer
        sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

    # Walk the dialogue, accumulating user facts and snapshotting the fact
    # set in effect at each assistant turn.
    cumulative_facts = []
    assistant_snapshots = []
    for role, text in conversation:
        if role == "user":
            for fact in extract_atomic_facts(text):
                if fact not in cumulative_facts:
                    cumulative_facts.append(fact)
        elif role == "assistant":
            assistant_snapshots.append(
                {"text": text, "facts": list(cumulative_facts)}
            )

    N = len(assistant_snapshots)
    if N == 0:
        return {
            "score": 1.0,
            "per_turn_scores": [],
            "decay_weights": [],
            "user_facts": cumulative_facts,
            "explanation": "No assistant turns found. T = 1.0."
        }

    per_turn_scores = []
    for snapshot in assistant_snapshots:
        known_facts = snapshot["facts"]
        reply = snapshot["text"]
        if not known_facts:
            # No facts stated yet — nothing could be forgotten.
            per_turn_scores.append(1.0)
            continue
        fact_embs = sbert_model.encode(
            known_facts,
            convert_to_numpy=True,
            normalize_embeddings=True
        )
        reply_emb = sbert_model.encode(
            [reply],
            convert_to_numpy=True,
            normalize_embeddings=True
        )
        # Cosine similarity of each fact against the assistant turn.
        sims = np.dot(fact_embs, reply_emb.T).flatten()
        recalled = int(np.sum(sims >= threshold))
        per_turn_scores.append(round(recalled / len(known_facts), 4))

    decay_weights = [gamma ** (N - 1 - t) for t in range(N)]
    Z = sum(decay_weights)
    T_score = sum(w * s for w, s in zip(decay_weights, per_turn_scores)) / Z
    return {
        "score": round(T_score, 4),
        "per_turn_scores": per_turn_scores,
        "decay_weights": [round(w, 4) for w in decay_weights],
        "user_facts": cumulative_facts,
        "explanation": (
            f"Computed T across {N} assistant turns with gamma={gamma}. "
            f"Per-turn scores: {per_turn_scores}. "
            f"Decay weights: {[round(w,3) for w in decay_weights]}. "
            f"T = {T_score:.4f}."
        )
    }
|
trace_score/trace.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
from typing import List, Tuple, Dict, Optional
|
|
2
|
+
from .components.temporal import compute_T
|
|
3
|
+
from .components.reliability import compute_R
|
|
4
|
+
from .components.adaptive import compute_A
|
|
5
|
+
from .components.coherence import compute_C
|
|
6
|
+
from .components.epistemic import compute_E
|
|
7
|
+
# Per-domain presets for the five TRACE component weights; each preset
# sums to 1.0 (validated for custom weights in _resolve_weights).
WEIGHT_PRESETS = {
    "equal": {"w_T": 0.20, "w_R": 0.20, "w_A": 0.20, "w_C": 0.20, "w_E": 0.20},
    "customer_service": {"w_T": 0.30, "w_R": 0.20, "w_A": 0.30, "w_C": 0.10, "w_E": 0.10},
    "technical_qa": {"w_T": 0.20, "w_R": 0.30, "w_A": 0.10, "w_C": 0.10, "w_E": 0.30},
    "medical_chatbot": {"w_T": 0.30, "w_R": 0.30, "w_A": 0.20, "w_C": 0.10, "w_E": 0.10},
    "education_tutor": {"w_T": 0.20, "w_R": 0.10, "w_A": 0.30, "w_C": 0.30, "w_E": 0.10},
}

# (lower bound, label) pairs in descending order; _interpret() returns the
# first label whose bound the score meets.
INTERPRETATION_THRESHOLDS = [
    (0.85, "Excellent — conversation is highly consistent"),
    (0.70, "Good — minor consistency issues"),
    (0.55, "Moderate — notable consistency failures"),
    (0.40, "Poor — significant consistency problems"),
    (0.00, "Very poor — conversation is highly inconsistent"),
]
|
|
21
|
+
class TRACEEvaluator:
    """Evaluates multi-turn conversations with the TRACE consistency metric.

    Loads the SBERT encoder and NLI cross-encoder lazily on first use and
    reuses them across ``evaluate()`` calls, so one instance can score many
    conversations without reloading models.
    """

    def __init__(self):
        # Models are populated lazily by _load_models().
        self._sbert = None
        self._nli = None

    def _load_models(self):
        # Lazy, idempotent loading; imports are deferred so constructing an
        # evaluator does not require sentence-transformers until first use.
        if self._sbert is None:
            from sentence_transformers import SentenceTransformer
            self._sbert = SentenceTransformer("all-MiniLM-L6-v2")
        if self._nli is None:
            from sentence_transformers import CrossEncoder
            self._nli = CrossEncoder(
                "cross-encoder/nli-deberta-v3-small",
                max_length=512
            )

    def evaluate(
        self,
        conversation : List[Tuple[str, str]],
        weights : Optional[Dict[str, float]] = None,
        preset : str = "equal",
        gamma : float = 0.80,
        lambda_ : float = 0.15,
        delta : float = 0.10,
        alpha : float = 0.05,
        beta : float = 0.05,
        verbose : bool = False
    ) -> Dict:
        """Compute the TRACE score for a conversation.

        Args:
            conversation: List of (role, text) pairs; role must be
                'user' or 'assistant' and text must be non-empty.
            weights: Custom component weights summing to 1.0; overrides preset.
            preset: Named entry in WEIGHT_PRESETS (used when weights is None).
            gamma: Time-decay factor passed to every component scorer.
            lambda_: Weight of the contradiction penalty P.
            delta: Weight of the confidence-variance penalty V.
            alpha: Weight of the T*C interaction term.
            beta: Weight of the A*R interaction term.
            verbose: If True, attach each component's full result dict.

        Returns:
            Dict with trace_score, component scores T/R/A/C/E, penalties
            P/V, parameters, an interpretation label, and a formula string.

        Raises:
            ValueError: On an invalid role, empty turn text, weights that do
                not sum to 1.0, or an unknown preset.
        """
        self._load_models()
        if not conversation:
            return self._empty_result(weights or WEIGHT_PRESETS[preset])
        # Validate turn structure before any model work.
        for role, text in conversation:
            if role not in ("user", "assistant"):
                raise ValueError(f"Invalid role '{role}'. Must be 'user' or 'assistant'.")
            if not text or not text.strip():
                raise ValueError("Empty turn text found in conversation.")
        w = self._resolve_weights(weights, preset)
        # Five component scores, each time-decay aggregated with gamma.
        t_result = compute_T(conversation, sbert_model=self._sbert, gamma=gamma)
        r_result = compute_R(conversation, nli_model=self._nli, gamma=gamma)
        a_result = compute_A(conversation, sbert_model=self._sbert,
                             nli_model=self._nli, gamma=gamma)
        c_result = compute_C(conversation, sbert_model=self._sbert, gamma=gamma)
        e_result = compute_E(conversation, sbert_model=self._sbert, gamma=gamma)
        T = t_result["score"]
        R = r_result["score"]
        A = a_result["score"]
        C = c_result["score"]
        E = e_result["score"]
        P = r_result["penalty"]           # fraction of contradicting pairs
        V = e_result["variance_penalty"]  # normalized confidence variance
        # TRACE = sum(w_i*S_i) - lambda*P - delta*V + alpha*(T*C) + beta*(A*R)
        base_score = (
            w["w_T"] * T +
            w["w_R"] * R +
            w["w_A"] * A +
            w["w_C"] * C +
            w["w_E"] * E
        )
        penalty_term = lambda_ * P + delta * V
        interaction_term = alpha * (T * C) + beta * (A * R)
        raw_trace = base_score - penalty_term + interaction_term
        # Penalties can push the raw value outside [0, 1]; clamp then round.
        trace_score = round(max(0.0, min(1.0, raw_trace)), 4)
        formula_breakdown = (
            f"TRACE = [{w['w_T']}×{T:.3f} + {w['w_R']}×{R:.3f} + "
            f"{w['w_A']}×{A:.3f} + {w['w_C']}×{C:.3f} + {w['w_E']}×{E:.3f}]"
            f" - [{lambda_}×{P:.3f} + {delta}×{V:.3f}]"
            f" + [{alpha}×({T:.3f}×{C:.3f}) + {beta}×({A:.3f}×{R:.3f})]"
            f" = {base_score:.4f} - {penalty_term:.4f} + {interaction_term:.4f}"
            f" = {raw_trace:.4f} → clamped → {trace_score:.4f}"
        )
        result = {
            "trace_score": trace_score,
            "base_score": round(base_score, 4),
            "penalty_term": round(penalty_term, 4),
            "interaction_term": round(interaction_term, 4),
            "T": T, "R": R, "A": A, "C": C, "E": E,
            "P": P, "V": V,
            "weights": w,
            "preset": preset if weights is None else "custom",
            "gamma": gamma,
            "lambda": lambda_,
            "delta": delta,
            "alpha": alpha,
            "beta": beta,
            "interpretation": self._interpret(trace_score),
            "formula_breakdown": formula_breakdown,
            "num_turns": len(conversation),
        }
        if verbose:
            # Full per-component result dicts for debugging/analysis.
            result["details"] = {
                "T": t_result,
                "R": r_result,
                "A": a_result,
                "C": c_result,
                "E": e_result,
            }
        return result

    def _resolve_weights(self, weights, preset):
        # Custom weights win over the preset; they must form a convex
        # combination (sum to 1.0 within a 0.01 tolerance).
        if weights is not None:
            total = sum(weights.values())
            if abs(total - 1.0) > 0.01:
                raise ValueError(f"Weights must sum to 1.0, got {total:.4f}.")
            return weights
        if preset not in WEIGHT_PRESETS:
            raise ValueError(
                f"Unknown preset '{preset}'. "
                f"Available: {list(WEIGHT_PRESETS.keys())}"
            )
        return WEIGHT_PRESETS[preset]

    def _interpret(self, score: float) -> str:
        # Thresholds are sorted descending; return the first label reached.
        for threshold, label in INTERPRETATION_THRESHOLDS:
            if score >= threshold:
                return label
        return INTERPRETATION_THRESHOLDS[-1][1]

    def _empty_result(self, w):
        # Vacuously perfect score for an empty conversation.
        # NOTE(review): this dict omits several keys the normal result has
        # (e.g. 'preset', 'gamma', 'formula_breakdown') — confirm consumers
        # tolerate the narrower schema.
        return {
            "trace_score": 1.0,
            "base_score": 1.0,
            "penalty_term": 0.0,
            "interaction_term": 0.0,
            "T": 1.0, "R": 1.0, "A": 1.0, "C": 1.0, "E": 1.0,
            "P": 0.0, "V": 0.0,
            "weights": w,
            "interpretation": "Empty conversation. TRACE = 1.0.",
            "num_turns": 0,
        }
|
|
144
|
+
def compute_TRACE(
    conversation: List[Tuple[str, str]],
    weights: Optional[Dict[str, float]] = None,
    preset: str = "equal",
    gamma: float = 0.80,
    lambda_: float = 0.15,
    delta: float = 0.10,
    alpha: float = 0.05,
    beta: float = 0.05,
    verbose: bool = False
) -> Dict:
    """One-shot convenience wrapper: build a TRACEEvaluator and evaluate.

    All parameters are forwarded unchanged to TRACEEvaluator.evaluate().
    For batch scoring, instantiate TRACEEvaluator once instead, so the
    underlying models are loaded a single time.
    """
    return TRACEEvaluator().evaluate(
        conversation,
        weights=weights,
        preset=preset,
        gamma=gamma,
        lambda_=lambda_,
        delta=delta,
        alpha=alpha,
        beta=beta,
        verbose=verbose
    )
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: trace-score
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Multi-turn LLM Conversation Consistency Metric
|
|
5
|
+
Home-page: https://github.com/Giri530/trace-score
|
|
6
|
+
Author: Girinath V
|
|
7
|
+
Author-email: Girinath V <your-email@gmail.com>
|
|
8
|
+
License: MIT License
|
|
9
|
+
|
|
10
|
+
Copyright (c) 2026 Girinath V
|
|
11
|
+
|
|
12
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
13
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
14
|
+
in the Software without restriction, including without limitation the rights
|
|
15
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
16
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
17
|
+
furnished to do so, subject to the following conditions:
|
|
18
|
+
|
|
19
|
+
The above copyright notice and this permission notice shall be included in all
|
|
20
|
+
copies or substantial portions of the Software.
|
|
21
|
+
|
|
22
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
23
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
24
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
25
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
26
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
27
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
28
|
+
SOFTWARE.
|
|
29
|
+
|
|
30
|
+
Project-URL: Homepage, https://github.com/Giri530/trace-score
|
|
31
|
+
Project-URL: Repository, https://github.com/Giri530/trace-score
|
|
32
|
+
Keywords: nlp,llm,evaluation,multi-turn,consistency,trace-score
|
|
33
|
+
Classifier: Development Status :: 3 - Alpha
|
|
34
|
+
Classifier: Intended Audience :: Science/Research
|
|
35
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
36
|
+
Classifier: Programming Language :: Python :: 3
|
|
37
|
+
Requires-Python: >=3.8
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
License-File: LICENSE
|
|
40
|
+
Requires-Dist: sentence-transformers>=2.2.0
|
|
41
|
+
Requires-Dist: numpy>=1.21.0
|
|
42
|
+
Requires-Dist: torch>=1.11.0
|
|
43
|
+
Dynamic: author
|
|
44
|
+
Dynamic: home-page
|
|
45
|
+
Dynamic: license-file
|
|
46
|
+
Dynamic: requires-python
|
|
47
|
+
|
|
48
|
+
# TRACE Score
|
|
49
|
+
|
|
50
|
+
**Multi-turn LLM Conversation Consistency Metric**
|
|
51
|
+
|
|
52
|
+
> The first unified, deterministic, reference-free evaluation metric for
|
|
53
|
+
> multi-turn conversational consistency in Large Language Models.
|
|
54
|
+
|
|
55
|
+
[](https://pypi.org/project/trace-score/)
|
|
56
|
+
[](https://opensource.org/licenses/MIT)
|
|
57
|
+
[](https://www.python.org/downloads/)
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## The Problem
|
|
62
|
+
|
|
63
|
+
Existing metrics (BLEU, ROUGE, BERTScore, RAGAS) evaluate each conversation
|
|
64
|
+
turn **in isolation**. They cannot detect failures that only become visible
|
|
65
|
+
**across multiple turns**:
|
|
66
|
+
|
|
67
|
+
| Failure Type | Example | BLEU | ROUGE | BERTScore | TRACE |
|
|
68
|
+
|---|---|---|---|---|---|
|
|
69
|
+
| Fact forgotten | User says "I am diabetic" → model recommends sugar-rich food 5 turns later | Miss | Miss | Miss | **Catch** |
|
|
70
|
+
| Correction ignored | User corrects model → model reverts to old behavior | Miss | Miss | Miss | **Catch** |
|
|
71
|
+
| Self-contradiction | Model says X at turn 2, contradicts X at turn 7 | Miss | Miss | Miss | **Catch** |
|
|
72
|
+
| Topic drift | Conversation gradually drifts off-topic | Miss | Miss | Miss | **Catch** |
|
|
73
|
+
| Confidence drift | Model says "definitely" then "perhaps" about same claim | Miss | Miss | Miss | **Catch** |
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Formula
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
TRACE(C) = Σ(wᵢ · Sᵢ) − λ·P − δ·V + α·(T·C) + β·(A·R)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Each component uses **time-decay aggregation** — recent turns weighted more:
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
Sᵢ = (1/Z) · Σ γ^(N-t) · Sᵢ,ₜ
|
|
87
|
+
Z = Σ γ^(N-t)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
| Symbol | Component | Measures |
|
|
91
|
+
|--------|-----------|---------|
|
|
92
|
+
| **T** | Temporal Retention | Did assistant remember user-stated facts? |
|
|
93
|
+
| **R** | Reliability Consistency | Did assistant contradict itself? |
|
|
94
|
+
| **A** | Adaptive Correction | Did assistant retain user corrections? |
|
|
95
|
+
| **C** | Context Coherence | Did conversation stay on topic? |
|
|
96
|
+
| **E** | Epistemic Stability | Did confidence stay calibrated? |
|
|
97
|
+
| P | Contradiction penalty | Global contradiction rate |
|
|
98
|
+
| V | Variance penalty | Confidence variance |
|
|
99
|
+
| γ | Time decay factor | Default: 0.80 |
|
|
100
|
+
| λ | Contradiction weight | Default: 0.15 |
|
|
101
|
+
| δ | Variance weight | Default: 0.10 |
|
|
102
|
+
| α | T·C interaction | Default: 0.05 |
|
|
103
|
+
| β | A·R interaction | Default: 0.05 |
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## Install
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
pip install trace-score
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## Quick Start
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from trace_score import compute_TRACE
|
|
119
|
+
|
|
120
|
+
conversation = [
|
|
121
|
+
("user", "I am diabetic and hate spicy food"),
|
|
122
|
+
("assistant", "I will suggest low sugar mild options."),
|
|
123
|
+
("user", "Actually I eat fish too. I am pescatarian."),
|
|
124
|
+
("assistant", "Spicy chicken with cashews!"), # failure turn
|
|
125
|
+
]
|
|
126
|
+
|
|
127
|
+
result = compute_TRACE(conversation, verbose=True)
|
|
128
|
+
|
|
129
|
+
print(result["trace_score"]) # 0.41 — catches failures
|
|
130
|
+
print(result["T"]) # 0.50 — forgot user facts
|
|
131
|
+
print(result["A"]) # 0.00 — ignored correction
|
|
132
|
+
print(result["formula_breakdown"]) # full formula with values
|
|
133
|
+
print(result["interpretation"]) # "Poor consistency"
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
## Batch Evaluation
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from trace_score import TRACEEvaluator
|
|
142
|
+
|
|
143
|
+
# Models loaded once, reused across all calls — much faster
|
|
144
|
+
evaluator = TRACEEvaluator()
|
|
145
|
+
results = [evaluator.evaluate(conv) for conv in conversations]
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## Adaptive Weights
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
# Equal weights (default)
|
|
154
|
+
result = compute_TRACE(conv, preset="equal")
|
|
155
|
+
|
|
156
|
+
# Medical chatbot — memory and reliability weighted more
|
|
157
|
+
result = compute_TRACE(conv, preset="medical_chatbot")
|
|
158
|
+
|
|
159
|
+
# Custom weights — must sum to 1.0
|
|
160
|
+
result = compute_TRACE(conv, weights={
|
|
161
|
+
"w_T": 0.35, "w_R": 0.25,
|
|
162
|
+
"w_A": 0.20, "w_C": 0.10, "w_E": 0.10
|
|
163
|
+
})
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
Available presets: `equal`, `customer_service`, `technical_qa`,
|
|
167
|
+
`medical_chatbot`, `education_tutor`
|
|
168
|
+
|
|
169
|
+
---
|
|
170
|
+
|
|
171
|
+
## Benchmark Results
|
|
172
|
+
|
|
173
|
+
Evaluated on **30 multi-turn conversations** across 3 categories
|
|
174
|
+
(Fact Memory, Correction Retention, Contradiction Detection).
|
|
175
|
+
Conversations generated by **Llama-3.1-8B via Groq API**.
|
|
176
|
+
|
|
177
|
+
### Overall Metric Comparison
|
|
178
|
+
|
|
179
|
+
| Metric | Overall | Fact Memory | Correction | Contradiction |
|
|
180
|
+
|--------|---------|-------------|------------|---------------|
|
|
181
|
+
| **TRACE** | **0.699** | **0.703** | **0.550** | **0.843** |
|
|
182
|
+
| BLEU | 0.102 | 0.046 | 0.149 | 0.110 |
|
|
183
|
+
| ROUGE-L | 0.239 | 0.177 | 0.301 | 0.239 |
|
|
184
|
+
| BERTScore | 0.822 | 0.800 | 0.842 | 0.823 |
|
|
185
|
+
|
|
186
|
+
**Key finding:** BLEU and ROUGE-L both stay uniformly low across all categories
|
|
187
|
+
— they cannot distinguish between different types of consistency failures.
|
|
188
|
+
BERTScore appears high but provides no diagnostic breakdown.
|
|
189
|
+
**TRACE clearly separates Correction (0.550) from Contradiction (0.843)**,
|
|
190
|
+
revealing that Llama-3.1-8B struggles most with retaining user corrections.
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
### TRACE Component Breakdown by Category
|
|
195
|
+
|
|
196
|
+
| Category | T | R | A | C | E |
|
|
197
|
+
|----------|---|---|---|---|---|
|
|
198
|
+
| Fact Memory | 0.137 | 0.955 | **1.000** | 0.503 | 0.697 |
|
|
199
|
+
| Correction | 0.491 | 0.927 | **0.144** | 0.465 | 0.712 |
|
|
200
|
+
| Contradiction | **0.973** | 0.875 | 0.900 | 0.510 | 0.696 |
|
|
201
|
+
|
|
202
|
+
**Diagnostic insight:**
|
|
203
|
+
|
|
204
|
+
- Fact Memory: T=0.137 — model **forgets user-stated facts** (A=1.0 means
|
|
205
|
+
no corrections occurred, so A trivially takes its maximum value here)
|
|
206
|
+
- Correction: A=0.144 — model **ignores user corrections** (critical failure)
|
|
207
|
+
- Contradiction: T=0.973, A=0.900 — model handles these well
|
|
208
|
+
|
|
209
|
+
No existing metric (BLEU, ROUGE, BERTScore) can produce this breakdown.
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
### The Gap TRACE Reveals — BERTScore vs TRACE
|
|
214
|
+
|
|
215
|
+
Conversations where **BERTScore is high but TRACE is low**
|
|
216
|
+
(failures invisible to BERTScore, caught by TRACE):
|
|
217
|
+
|
|
218
|
+
| Conversation | Category | TRACE | BERTScore | Gap |
|
|
219
|
+
|---|---|---|---|---|
|
|
220
|
+
| CR_006 | Correction | 0.314 | 0.876 | +0.562 |
|
|
221
|
+
| CR_009 | Correction | 0.381 | 0.861 | +0.480 |
|
|
222
|
+
| CR_004 | Correction | 0.535 | 0.884 | +0.349 |
|
|
223
|
+
| CR_003 | Correction | 0.494 | 0.864 | +0.370 |
|
|
224
|
+
| CR_002 | Correction | 0.442 | 0.822 | +0.380 |
|
|
225
|
+
|
|
226
|
+
**In all 5 cases:** BERTScore ≥ 0.82 (looks good), TRACE < 0.55 (failures detected).
|
|
227
|
+
The A component reveals why — user corrections completely ignored (A=0.00).
|
|
228
|
+
This is invisible to any per-turn metric.
|
|
229
|
+
|
|
230
|
+
---
|
|
231
|
+
|
|
232
|
+
## Why TRACE?
|
|
233
|
+
|
|
234
|
+
| Metric | Multi-turn | Reference-free | Deterministic | Time-decay | Diagnostic |
|
|
235
|
+
|--------|-----------|----------------|---------------|-----------|-----------|
|
|
236
|
+
| BLEU | No | No | Yes | No | No |
|
|
237
|
+
| ROUGE | No | No | Yes | No | No |
|
|
238
|
+
| BERTScore | No | No | Yes | No | No |
|
|
239
|
+
| RAGAS | No | Yes | No | No | Partial |
|
|
240
|
+
| **TRACE** | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** |
|
|
241
|
+
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
## Models Used
|
|
245
|
+
|
|
246
|
+
| Model | Purpose | Size |
|
|
247
|
+
|-------|---------|------|
|
|
248
|
+
| `all-MiniLM-L6-v2` | Semantic similarity (T, A, C, E) | 80MB |
|
|
249
|
+
| `cross-encoder/nli-deberta-v3-small` | Contradiction detection (R, A) | 184MB |
|
|
250
|
+
|
|
251
|
+
Models are downloaded automatically on first use (~264MB total).
|
|
252
|
+
CPU-friendly — no GPU required.
|
|
253
|
+
|
|
254
|
+
---
|
|
255
|
+
|
|
256
|
+
## Citation
|
|
257
|
+
|
|
258
|
+
```bibtex
|
|
259
|
+
@article{girinathv2026trace,
|
|
260
|
+
title = {TRACE: A Unified Deterministic Metric for Multi-turn
|
|
261
|
+
Conversational Consistency in Large Language Models},
|
|
262
|
+
author = {Girinath, V},
|
|
263
|
+
year = {2026}
|
|
264
|
+
}
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
---
|
|
268
|
+
|
|
269
|
+
*Author: Girinath V*
|
|
270
|
+
*GitHub: https://github.com/Giri530/trace-score*
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
trace_score/__init__.py,sha256=--Nr7FRCszGBpz1D-CMH8cxsAwcbsS9UYo67X0U0iog,341
|
|
2
|
+
trace_score/trace.py,sha256=60MxMgmPkV04CLnuOZqWJnRqVGtJJX5gpjn02pL4ZXw,6858
|
|
3
|
+
trace_score/components/__init__.py,sha256=HPkrl-R3_AmDdcVjYnZC6VlhKarNAeDrYsYQ03TB9fc,171
|
|
4
|
+
trace_score/components/adaptive.py,sha256=yfA9I02o9gRfVkymTFfDkUSv8q23VbhBTGJukrxhSIY,3984
|
|
5
|
+
trace_score/components/coherence.py,sha256=JNsVH0dqkST_p2gTv9riKjFZ0DALkAk1YsQ5LZjac1o,1618
|
|
6
|
+
trace_score/components/epistemic.py,sha256=4qOKEQdHXj-EOl9jtNBO1jSFjjysj8isX3xNNanbPw0,4352
|
|
7
|
+
trace_score/components/reliability.py,sha256=nJjXwjWIEl5ZItXW_U8l9Gy_BtyimV83N7U9FAldAm0,3365
|
|
8
|
+
trace_score/components/temporal.py,sha256=OCQw4lZEUh4UepelbDzV1pAIpZIz9oFh-MJgdjaZh7Y,3442
|
|
9
|
+
trace_score-0.1.0.dist-info/licenses/LICENSE,sha256=FWXZwUInhOZvTxKqwnCPYbVby_65dHGXRCLrdAFhtJ8,1067
|
|
10
|
+
trace_score-0.1.0.dist-info/METADATA,sha256=PxZfpV6eSf-yS9nAWwKLFcUziNLmeZ76UlYf3hqfDlI,9294
|
|
11
|
+
trace_score-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
12
|
+
trace_score-0.1.0.dist-info/top_level.txt,sha256=QNr7AmluwDQlIJJgmyz35K5ACpW9gEiOPEDECwrft_4,12
|
|
13
|
+
trace_score-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Girinath V
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
trace_score
|