trace-score 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trace_score-0.1.0/LICENSE +21 -0
- trace_score-0.1.0/PKG-INFO +270 -0
- trace_score-0.1.0/README.md +223 -0
- trace_score-0.1.0/pyproject.toml +33 -0
- trace_score-0.1.0/setup.cfg +4 -0
- trace_score-0.1.0/setup.py +36 -0
- trace_score-0.1.0/trace_score/__init__.py +9 -0
- trace_score-0.1.0/trace_score/components/__init__.py +5 -0
- trace_score-0.1.0/trace_score/components/adaptive.py +111 -0
- trace_score-0.1.0/trace_score/components/coherence.py +46 -0
- trace_score-0.1.0/trace_score/components/epistemic.py +110 -0
- trace_score-0.1.0/trace_score/components/reliability.py +87 -0
- trace_score-0.1.0/trace_score/components/temporal.py +94 -0
- trace_score-0.1.0/trace_score/trace.py +166 -0
- trace_score-0.1.0/trace_score.egg-info/PKG-INFO +270 -0
- trace_score-0.1.0/trace_score.egg-info/SOURCES.txt +17 -0
- trace_score-0.1.0/trace_score.egg-info/dependency_links.txt +1 -0
- trace_score-0.1.0/trace_score.egg-info/requires.txt +3 -0
- trace_score-0.1.0/trace_score.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Girinath V
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: trace-score
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Multi-turn LLM Conversation Consistency Metric
|
|
5
|
+
Home-page: https://github.com/Giri530/trace-score
|
|
6
|
+
Author: Girinath V
|
|
7
|
+
Author-email: Girinath V <girinathv48@gmail.com>
|
|
8
|
+
License: MIT License
|
|
9
|
+
|
|
10
|
+
Copyright (c) 2026 Girinath V
|
|
11
|
+
|
|
12
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
13
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
14
|
+
in the Software without restriction, including without limitation the rights
|
|
15
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
16
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
17
|
+
furnished to do so, subject to the following conditions:
|
|
18
|
+
|
|
19
|
+
The above copyright notice and this permission notice shall be included in all
|
|
20
|
+
copies or substantial portions of the Software.
|
|
21
|
+
|
|
22
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
23
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
24
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
25
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
26
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
27
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
28
|
+
SOFTWARE.
|
|
29
|
+
|
|
30
|
+
Project-URL: Homepage, https://github.com/Giri530/trace-score
|
|
31
|
+
Project-URL: Repository, https://github.com/Giri530/trace-score
|
|
32
|
+
Keywords: nlp,llm,evaluation,multi-turn,consistency,trace-score
|
|
33
|
+
Classifier: Development Status :: 3 - Alpha
|
|
34
|
+
Classifier: Intended Audience :: Science/Research
|
|
35
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
36
|
+
Classifier: Programming Language :: Python :: 3
|
|
37
|
+
Requires-Python: >=3.8
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
License-File: LICENSE
|
|
40
|
+
Requires-Dist: sentence-transformers>=2.2.0
|
|
41
|
+
Requires-Dist: numpy>=1.21.0
|
|
42
|
+
Requires-Dist: torch>=1.11.0
|
|
43
|
+
Dynamic: author
|
|
44
|
+
Dynamic: home-page
|
|
45
|
+
Dynamic: license-file
|
|
46
|
+
Dynamic: requires-python
|
|
47
|
+
|
|
48
|
+
# TRACE Score
|
|
49
|
+
|
|
50
|
+
**Multi-turn LLM Conversation Consistency Metric**
|
|
51
|
+
|
|
52
|
+
> The first unified, deterministic, reference-free evaluation metric for
|
|
53
|
+
> multi-turn conversational consistency in Large Language Models.
|
|
54
|
+
|
|
55
|
+
[](https://pypi.org/project/trace-score/)
|
|
56
|
+
[](https://opensource.org/licenses/MIT)
|
|
57
|
+
[](https://www.python.org/downloads/)
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## The Problem
|
|
62
|
+
|
|
63
|
+
Existing metrics (BLEU, ROUGE, BERTScore, RAGAS) evaluate each conversation
|
|
64
|
+
turn **in isolation**. They cannot detect failures that only become visible
|
|
65
|
+
**across multiple turns**:
|
|
66
|
+
|
|
67
|
+
| Failure Type | Example | BLEU | ROUGE | BERTScore | TRACE |
|
|
68
|
+
|---|---|---|---|---|---|
|
|
69
|
+
| Fact forgotten | User says "I am diabetic" → model recommends sugar-rich food 5 turns later | Miss | Miss | Miss | **Catch** |
|
|
70
|
+
| Correction ignored | User corrects model → model reverts to old behavior | Miss | Miss | Miss | **Catch** |
|
|
71
|
+
| Self-contradiction | Model says X at turn 2, contradicts X at turn 7 | Miss | Miss | Miss | **Catch** |
|
|
72
|
+
| Topic drift | Conversation gradually drifts off-topic | Miss | Miss | Miss | **Catch** |
|
|
73
|
+
| Confidence drift | Model says "definitely" then "perhaps" about same claim | Miss | Miss | Miss | **Catch** |
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Formula
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
TRACE(C) = Σ(wᵢ · Sᵢ) − λ·P − δ·V + α·(T·C) + β·(A·R)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Each component uses **time-decay aggregation** — recent turns weighted more:
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
Sᵢ = (1/Z) · Σ γ^(N-t) · Sᵢ,ₜ
|
|
87
|
+
Z = Σ γ^(N-t)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
| Symbol | Component | Measures |
|
|
91
|
+
|--------|-----------|---------|
|
|
92
|
+
| **T** | Temporal Retention | Did assistant remember user-stated facts? |
|
|
93
|
+
| **R** | Reliability Consistency | Did assistant contradict itself? |
|
|
94
|
+
| **A** | Adaptive Correction | Did assistant retain user corrections? |
|
|
95
|
+
| **C** | Context Coherence | Did conversation stay on topic? |
|
|
96
|
+
| **E** | Epistemic Stability | Did confidence stay calibrated? |
|
|
97
|
+
| P | Contradiction penalty | Global contradiction rate |
|
|
98
|
+
| V | Variance penalty | Confidence variance |
|
|
99
|
+
| γ | Time decay factor | Default: 0.80 |
|
|
100
|
+
| λ | Contradiction weight | Default: 0.15 |
|
|
101
|
+
| δ | Variance weight | Default: 0.10 |
|
|
102
|
+
| α | T·C interaction | Default: 0.05 |
|
|
103
|
+
| β | A·R interaction | Default: 0.05 |
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## Install
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
pip install trace-score
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## Quick Start
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from trace_score import compute_TRACE
|
|
119
|
+
|
|
120
|
+
conversation = [
|
|
121
|
+
("user", "I am diabetic and hate spicy food"),
|
|
122
|
+
("assistant", "I will suggest low sugar mild options."),
|
|
123
|
+
("user", "Actually I eat fish too. I am pescatarian."),
|
|
124
|
+
("assistant", "Spicy chicken with cashews!"), # failure turn
|
|
125
|
+
]
|
|
126
|
+
|
|
127
|
+
result = compute_TRACE(conversation, verbose=True)
|
|
128
|
+
|
|
129
|
+
print(result["trace_score"]) # 0.41 — catches failures
|
|
130
|
+
print(result["T"]) # 0.50 — forgot user facts
|
|
131
|
+
print(result["A"]) # 0.00 — ignored correction
|
|
132
|
+
print(result["formula_breakdown"]) # full formula with values
|
|
133
|
+
print(result["interpretation"]) # "Poor consistency"
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
## Batch Evaluation
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from trace_score import TRACEEvaluator
|
|
142
|
+
|
|
143
|
+
# Models loaded once, reused across all calls — much faster
|
|
144
|
+
evaluator = TRACEEvaluator()
|
|
145
|
+
results = [evaluator.evaluate(conv) for conv in conversations]
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## Adaptive Weights
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
# Equal weights (default)
|
|
154
|
+
result = compute_TRACE(conv, preset="equal")
|
|
155
|
+
|
|
156
|
+
# Medical chatbot — memory and reliability weighted more
|
|
157
|
+
result = compute_TRACE(conv, preset="medical_chatbot")
|
|
158
|
+
|
|
159
|
+
# Custom weights — must sum to 1.0
|
|
160
|
+
result = compute_TRACE(conv, weights={
|
|
161
|
+
"w_T": 0.35, "w_R": 0.25,
|
|
162
|
+
"w_A": 0.20, "w_C": 0.10, "w_E": 0.10
|
|
163
|
+
})
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
Available presets: `equal`, `customer_service`, `technical_qa`,
|
|
167
|
+
`medical_chatbot`, `education_tutor`
|
|
168
|
+
|
|
169
|
+
---
|
|
170
|
+
|
|
171
|
+
## Benchmark Results
|
|
172
|
+
|
|
173
|
+
Evaluated on **30 multi-turn conversations** across 3 categories
|
|
174
|
+
(Fact Memory, Correction Retention, Contradiction Detection).
|
|
175
|
+
Conversations generated by **Llama-3.1-8B via Groq API**.
|
|
176
|
+
|
|
177
|
+
### Overall Metric Comparison
|
|
178
|
+
|
|
179
|
+
| Metric | Overall | Fact Memory | Correction | Contradiction |
|
|
180
|
+
|--------|---------|-------------|------------|---------------|
|
|
181
|
+
| **TRACE** | **0.699** | **0.703** | **0.550** | **0.843** |
|
|
182
|
+
| BLEU | 0.102 | 0.046 | 0.149 | 0.110 |
|
|
183
|
+
| ROUGE-L | 0.239 | 0.177 | 0.301 | 0.239 |
|
|
184
|
+
| BERTScore | 0.822 | 0.800 | 0.842 | 0.823 |
|
|
185
|
+
|
|
186
|
+
**Key finding:** BLEU and ROUGE-L show similar low scores across all categories
|
|
187
|
+
— they cannot distinguish between different types of consistency failures.
|
|
188
|
+
BERTScore appears high but provides no diagnostic breakdown.
|
|
189
|
+
**TRACE clearly separates Correction (0.550) from Contradiction (0.843)**,
|
|
190
|
+
revealing that Llama-3.1-8B struggles most with retaining user corrections.
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
### TRACE Component Breakdown by Category
|
|
195
|
+
|
|
196
|
+
| Category | T | R | A | C | E |
|
|
197
|
+
|----------|---|---|---|---|---|
|
|
198
|
+
| Fact Memory | 0.137 | 0.955 | **1.000** | 0.503 | 0.697 |
|
|
199
|
+
| Correction | 0.491 | 0.927 | **0.144** | 0.465 | 0.712 |
|
|
200
|
+
| Contradiction | **0.973** | 0.875 | 0.900 | 0.510 | 0.696 |
|
|
201
|
+
|
|
202
|
+
**Diagnostic insight:**
|
|
203
|
+
|
|
204
|
+
- Fact Memory: T=0.137 — model **forgets user-stated facts** (A=1.0 means
|
|
205
|
+
no corrections were needed, so A is vacuously true here)
|
|
206
|
+
- Correction: A=0.144 — model **ignores user corrections** (critical failure)
|
|
207
|
+
- Contradiction: T=0.973, A=0.900 — model handles these well
|
|
208
|
+
|
|
209
|
+
No existing metric (BLEU, ROUGE, BERTScore) can produce this breakdown.
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
### The Gap TRACE Reveals — BERTScore vs TRACE
|
|
214
|
+
|
|
215
|
+
Conversations where **BERTScore is high but TRACE is low**
|
|
216
|
+
(failures invisible to BERTScore, caught by TRACE):
|
|
217
|
+
|
|
218
|
+
| Conversation | Category | TRACE | BERTScore | Gap |
|
|
219
|
+
|---|---|---|---|---|
|
|
220
|
+
| CR_006 | Correction | 0.314 | 0.876 | +0.562 |
|
|
221
|
+
| CR_009 | Correction | 0.381 | 0.861 | +0.480 |
|
|
222
|
+
| CR_004 | Correction | 0.535 | 0.884 | +0.349 |
|
|
223
|
+
| CR_003 | Correction | 0.494 | 0.864 | +0.370 |
|
|
224
|
+
| CR_002 | Correction | 0.442 | 0.822 | +0.380 |
|
|
225
|
+
|
|
226
|
+
**In all 5 cases:** BERTScore ≥ 0.82 (looks good), TRACE < 0.55 (failures detected).
|
|
227
|
+
The A component reveals why — user corrections completely ignored (A=0.00).
|
|
228
|
+
This is invisible to any per-turn metric.
|
|
229
|
+
|
|
230
|
+
---
|
|
231
|
+
|
|
232
|
+
## Why TRACE?
|
|
233
|
+
|
|
234
|
+
| Metric | Multi-turn | Reference-free | Deterministic | Time-decay | Diagnostic |
|
|
235
|
+
|--------|-----------|----------------|---------------|-----------|-----------|
|
|
236
|
+
| BLEU | No | No | Yes | No | No |
|
|
237
|
+
| ROUGE | No | No | Yes | No | No |
|
|
238
|
+
| BERTScore | No | No | Yes | No | No |
|
|
239
|
+
| RAGAS | No | Yes | No | No | Partial |
|
|
240
|
+
| **TRACE** | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** |
|
|
241
|
+
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
## Models Used
|
|
245
|
+
|
|
246
|
+
| Model | Purpose | Size |
|
|
247
|
+
|-------|---------|------|
|
|
248
|
+
| `all-MiniLM-L6-v2` | Semantic similarity (T, A, C, E) | 80MB |
|
|
249
|
+
| `cross-encoder/nli-deberta-v3-small` | Contradiction detection (R, A) | 184MB |
|
|
250
|
+
|
|
251
|
+
Models downloaded automatically on first use (~264MB total).
|
|
252
|
+
CPU-friendly — no GPU required.
|
|
253
|
+
|
|
254
|
+
---
|
|
255
|
+
|
|
256
|
+
## Citation
|
|
257
|
+
|
|
258
|
+
```bibtex
|
|
259
|
+
@article{girinathv2026trace,
|
|
260
|
+
title = {TRACE: A Unified Deterministic Metric for Multi-turn
|
|
261
|
+
Conversational Consistency in Large Language Models},
|
|
262
|
+
author = {Girinath, V},
|
|
263
|
+
year = {2026}
|
|
264
|
+
}
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
---
|
|
268
|
+
|
|
269
|
+
*Author: Girinath V*
|
|
270
|
+
*GitHub: https://github.com/Giri530/trace-score*
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
# TRACE Score
|
|
2
|
+
|
|
3
|
+
**Multi-turn LLM Conversation Consistency Metric**
|
|
4
|
+
|
|
5
|
+
> The first unified, deterministic, reference-free evaluation metric for
|
|
6
|
+
> multi-turn conversational consistency in Large Language Models.
|
|
7
|
+
|
|
8
|
+
[](https://pypi.org/project/trace-score/)
|
|
9
|
+
[](https://opensource.org/licenses/MIT)
|
|
10
|
+
[](https://www.python.org/downloads/)
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## The Problem
|
|
15
|
+
|
|
16
|
+
Existing metrics (BLEU, ROUGE, BERTScore, RAGAS) evaluate each conversation
|
|
17
|
+
turn **in isolation**. They cannot detect failures that only become visible
|
|
18
|
+
**across multiple turns**:
|
|
19
|
+
|
|
20
|
+
| Failure Type | Example | BLEU | ROUGE | BERTScore | TRACE |
|
|
21
|
+
|---|---|---|---|---|---|
|
|
22
|
+
| Fact forgotten | User says "I am diabetic" → model recommends sugar-rich food 5 turns later | Miss | Miss | Miss | **Catch** |
|
|
23
|
+
| Correction ignored | User corrects model → model reverts to old behavior | Miss | Miss | Miss | **Catch** |
|
|
24
|
+
| Self-contradiction | Model says X at turn 2, contradicts X at turn 7 | Miss | Miss | Miss | **Catch** |
|
|
25
|
+
| Topic drift | Conversation gradually drifts off-topic | Miss | Miss | Miss | **Catch** |
|
|
26
|
+
| Confidence drift | Model says "definitely" then "perhaps" about same claim | Miss | Miss | Miss | **Catch** |
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## Formula
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
TRACE(C) = Σ(wᵢ · Sᵢ) − λ·P − δ·V + α·(T·C) + β·(A·R)
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Each component uses **time-decay aggregation** — recent turns weighted more:
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
Sᵢ = (1/Z) · Σ γ^(N-t) · Sᵢ,ₜ
|
|
40
|
+
Z = Σ γ^(N-t)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
| Symbol | Component | Measures |
|
|
44
|
+
|--------|-----------|---------|
|
|
45
|
+
| **T** | Temporal Retention | Did assistant remember user-stated facts? |
|
|
46
|
+
| **R** | Reliability Consistency | Did assistant contradict itself? |
|
|
47
|
+
| **A** | Adaptive Correction | Did assistant retain user corrections? |
|
|
48
|
+
| **C** | Context Coherence | Did conversation stay on topic? |
|
|
49
|
+
| **E** | Epistemic Stability | Did confidence stay calibrated? |
|
|
50
|
+
| P | Contradiction penalty | Global contradiction rate |
|
|
51
|
+
| V | Variance penalty | Confidence variance |
|
|
52
|
+
| γ | Time decay factor | Default: 0.80 |
|
|
53
|
+
| λ | Contradiction weight | Default: 0.15 |
|
|
54
|
+
| δ | Variance weight | Default: 0.10 |
|
|
55
|
+
| α | T·C interaction | Default: 0.05 |
|
|
56
|
+
| β | A·R interaction | Default: 0.05 |
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Install
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
pip install trace-score
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## Quick Start
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from trace_score import compute_TRACE
|
|
72
|
+
|
|
73
|
+
conversation = [
|
|
74
|
+
("user", "I am diabetic and hate spicy food"),
|
|
75
|
+
("assistant", "I will suggest low sugar mild options."),
|
|
76
|
+
("user", "Actually I eat fish too. I am pescatarian."),
|
|
77
|
+
("assistant", "Spicy chicken with cashews!"), # failure turn
|
|
78
|
+
]
|
|
79
|
+
|
|
80
|
+
result = compute_TRACE(conversation, verbose=True)
|
|
81
|
+
|
|
82
|
+
print(result["trace_score"]) # 0.41 — catches failures
|
|
83
|
+
print(result["T"]) # 0.50 — forgot user facts
|
|
84
|
+
print(result["A"]) # 0.00 — ignored correction
|
|
85
|
+
print(result["formula_breakdown"]) # full formula with values
|
|
86
|
+
print(result["interpretation"]) # "Poor consistency"
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## Batch Evaluation
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from trace_score import TRACEEvaluator
|
|
95
|
+
|
|
96
|
+
# Models loaded once, reused across all calls — much faster
|
|
97
|
+
evaluator = TRACEEvaluator()
|
|
98
|
+
results = [evaluator.evaluate(conv) for conv in conversations]
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## Adaptive Weights
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
# Equal weights (default)
|
|
107
|
+
result = compute_TRACE(conv, preset="equal")
|
|
108
|
+
|
|
109
|
+
# Medical chatbot — memory and reliability weighted more
|
|
110
|
+
result = compute_TRACE(conv, preset="medical_chatbot")
|
|
111
|
+
|
|
112
|
+
# Custom weights — must sum to 1.0
|
|
113
|
+
result = compute_TRACE(conv, weights={
|
|
114
|
+
"w_T": 0.35, "w_R": 0.25,
|
|
115
|
+
"w_A": 0.20, "w_C": 0.10, "w_E": 0.10
|
|
116
|
+
})
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Available presets: `equal`, `customer_service`, `technical_qa`,
|
|
120
|
+
`medical_chatbot`, `education_tutor`
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## Benchmark Results
|
|
125
|
+
|
|
126
|
+
Evaluated on **30 multi-turn conversations** across 3 categories
|
|
127
|
+
(Fact Memory, Correction Retention, Contradiction Detection).
|
|
128
|
+
Conversations generated by **Llama-3.1-8B via Groq API**.
|
|
129
|
+
|
|
130
|
+
### Overall Metric Comparison
|
|
131
|
+
|
|
132
|
+
| Metric | Overall | Fact Memory | Correction | Contradiction |
|
|
133
|
+
|--------|---------|-------------|------------|---------------|
|
|
134
|
+
| **TRACE** | **0.699** | **0.703** | **0.550** | **0.843** |
|
|
135
|
+
| BLEU | 0.102 | 0.046 | 0.149 | 0.110 |
|
|
136
|
+
| ROUGE-L | 0.239 | 0.177 | 0.301 | 0.239 |
|
|
137
|
+
| BERTScore | 0.822 | 0.800 | 0.842 | 0.823 |
|
|
138
|
+
|
|
139
|
+
**Key finding:** BLEU and ROUGE-L show similar low scores across all categories
|
|
140
|
+
— they cannot distinguish between different types of consistency failures.
|
|
141
|
+
BERTScore appears high but provides no diagnostic breakdown.
|
|
142
|
+
**TRACE clearly separates Correction (0.550) from Contradiction (0.843)**,
|
|
143
|
+
revealing that Llama-3.1-8B struggles most with retaining user corrections.
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
### TRACE Component Breakdown by Category
|
|
148
|
+
|
|
149
|
+
| Category | T | R | A | C | E |
|
|
150
|
+
|----------|---|---|---|---|---|
|
|
151
|
+
| Fact Memory | 0.137 | 0.955 | **1.000** | 0.503 | 0.697 |
|
|
152
|
+
| Correction | 0.491 | 0.927 | **0.144** | 0.465 | 0.712 |
|
|
153
|
+
| Contradiction | **0.973** | 0.875 | 0.900 | 0.510 | 0.696 |
|
|
154
|
+
|
|
155
|
+
**Diagnostic insight:**
|
|
156
|
+
|
|
157
|
+
- Fact Memory: T=0.137 — model **forgets user-stated facts** (A=1.0 means
|
|
158
|
+
no corrections were needed, so A is vacuously true here)
|
|
159
|
+
- Correction: A=0.144 — model **ignores user corrections** (critical failure)
|
|
160
|
+
- Contradiction: T=0.973, A=0.900 — model handles these well
|
|
161
|
+
|
|
162
|
+
No existing metric (BLEU, ROUGE, BERTScore) can produce this breakdown.
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
### The Gap TRACE Reveals — BERTScore vs TRACE
|
|
167
|
+
|
|
168
|
+
Conversations where **BERTScore is high but TRACE is low**
|
|
169
|
+
(failures invisible to BERTScore, caught by TRACE):
|
|
170
|
+
|
|
171
|
+
| Conversation | Category | TRACE | BERTScore | Gap |
|
|
172
|
+
|---|---|---|---|---|
|
|
173
|
+
| CR_006 | Correction | 0.314 | 0.876 | +0.562 |
|
|
174
|
+
| CR_009 | Correction | 0.381 | 0.861 | +0.480 |
|
|
175
|
+
| CR_004 | Correction | 0.535 | 0.884 | +0.349 |
|
|
176
|
+
| CR_003 | Correction | 0.494 | 0.864 | +0.370 |
|
|
177
|
+
| CR_002 | Correction | 0.442 | 0.822 | +0.380 |
|
|
178
|
+
|
|
179
|
+
**In all 5 cases:** BERTScore ≥ 0.82 (looks good), TRACE < 0.55 (failures detected).
|
|
180
|
+
The A component reveals why — user corrections completely ignored (A=0.00).
|
|
181
|
+
This is invisible to any per-turn metric.
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## Why TRACE?
|
|
186
|
+
|
|
187
|
+
| Metric | Multi-turn | Reference-free | Deterministic | Time-decay | Diagnostic |
|
|
188
|
+
|--------|-----------|----------------|---------------|-----------|-----------|
|
|
189
|
+
| BLEU | No | No | Yes | No | No |
|
|
190
|
+
| ROUGE | No | No | Yes | No | No |
|
|
191
|
+
| BERTScore | No | No | Yes | No | No |
|
|
192
|
+
| RAGAS | No | Yes | No | No | Partial |
|
|
193
|
+
| **TRACE** | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** |
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## Models Used
|
|
198
|
+
|
|
199
|
+
| Model | Purpose | Size |
|
|
200
|
+
|-------|---------|------|
|
|
201
|
+
| `all-MiniLM-L6-v2` | Semantic similarity (T, A, C, E) | 80MB |
|
|
202
|
+
| `cross-encoder/nli-deberta-v3-small` | Contradiction detection (R, A) | 184MB |
|
|
203
|
+
|
|
204
|
+
Models downloaded automatically on first use (~264MB total).
|
|
205
|
+
CPU-friendly — no GPU required.
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
## Citation
|
|
210
|
+
|
|
211
|
+
```bibtex
|
|
212
|
+
@article{girinathv2026trace,
|
|
213
|
+
title = {TRACE: A Unified Deterministic Metric for Multi-turn
|
|
214
|
+
Conversational Consistency in Large Language Models},
|
|
215
|
+
author = {Girinath, V},
|
|
216
|
+
year = {2026}
|
|
217
|
+
}
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
*Author: Girinath V*
|
|
223
|
+
*GitHub: https://github.com/Giri530/trace-score*
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=42", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "trace-score"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Multi-turn LLM Conversation Consistency Metric"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {file = "LICENSE"}
|
|
11
|
+
requires-python = ">=3.8"
|
|
12
|
+
authors = [
|
|
13
|
+
    {name = "Girinath V", email = "girinathv48@gmail.com"}
|
|
14
|
+
]
|
|
15
|
+
dependencies = [
|
|
16
|
+
"sentence-transformers>=2.2.0",
|
|
17
|
+
"numpy>=1.21.0",
|
|
18
|
+
"torch>=1.11.0",
|
|
19
|
+
]
|
|
20
|
+
keywords = [
|
|
21
|
+
"nlp", "llm", "evaluation",
|
|
22
|
+
"multi-turn", "consistency", "trace-score"
|
|
23
|
+
]
|
|
24
|
+
classifiers = [
|
|
25
|
+
"Development Status :: 3 - Alpha",
|
|
26
|
+
"Intended Audience :: Science/Research",
|
|
27
|
+
"License :: OSI Approved :: MIT License",
|
|
28
|
+
"Programming Language :: Python :: 3",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.urls]
|
|
32
|
+
Homepage = "https://github.com/Giri530/trace-score"
|
|
33
|
+
Repository = "https://github.com/Giri530/trace-score"
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from setuptools import setup, find_packages
|
|
2
|
+
with open("README.md", "r", encoding="utf-8") as f:
|
|
3
|
+
long_description = f.read()
|
|
4
|
+
setup(
|
|
5
|
+
name = "trace-score",
|
|
6
|
+
version="0.1.0",
|
|
7
|
+
author = "Girinath V",
|
|
8
|
+
author_email = "girinathv48@gmail.com",
|
|
9
|
+
description = "Multi-turn LLM Conversation Consistency Metric",
|
|
10
|
+
long_description = long_description,
|
|
11
|
+
long_description_content_type = "text/markdown",
|
|
12
|
+
url = "https://github.com/Giri530/trace-score",
|
|
13
|
+
packages = find_packages(),
|
|
14
|
+
python_requires = ">=3.8",
|
|
15
|
+
install_requires = [
|
|
16
|
+
"sentence-transformers>=2.2.0",
|
|
17
|
+
"numpy>=1.21.0",
|
|
18
|
+
"torch>=1.11.0",
|
|
19
|
+
],
|
|
20
|
+
classifiers = [
|
|
21
|
+
"Development Status :: 3 - Alpha",
|
|
22
|
+
"Intended Audience :: Science/Research",
|
|
23
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
24
|
+
"License :: OSI Approved :: MIT License",
|
|
25
|
+
"Programming Language :: Python :: 3",
|
|
26
|
+
"Programming Language :: Python :: 3.8",
|
|
27
|
+
"Programming Language :: Python :: 3.9",
|
|
28
|
+
"Programming Language :: Python :: 3.10",
|
|
29
|
+
"Programming Language :: Python :: 3.11",
|
|
30
|
+
],
|
|
31
|
+
keywords = [
|
|
32
|
+
"nlp", "llm", "evaluation", "metrics",
|
|
33
|
+
"multi-turn", "consistency", "dialogue",
|
|
34
|
+
"trace-score", "conversational-ai",
|
|
35
|
+
],
|
|
36
|
+
)
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
from .trace import compute_TRACE, TRACEEvaluator, WEIGHT_PRESETS
|
|
2
|
+
from .components.temporal import compute_T
|
|
3
|
+
from .components.reliability import compute_R
|
|
4
|
+
from .components.adaptive import compute_A
|
|
5
|
+
from .components.coherence import compute_C
|
|
6
|
+
from .components.epistemic import compute_E
|
|
7
|
+
|
|
8
|
+
__version__ = "0.1.0"
|
|
9
|
+
__author__ = "Girinath V"
|