trace-score 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Girinath V
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,270 @@
1
+ Metadata-Version: 2.4
2
+ Name: trace-score
3
+ Version: 0.1.0
4
+ Summary: Multi-turn LLM Conversation Consistency Metric
5
+ Home-page: https://github.com/Giri530/trace-score
6
+ Author: Girinath V
7
+ Author-email: Girinath V <girinathv48@gmail.com>
8
+ License: MIT License
9
+
10
+ Copyright (c) 2026 Girinath V
11
+
12
+ Permission is hereby granted, free of charge, to any person obtaining a copy
13
+ of this software and associated documentation files (the "Software"), to deal
14
+ in the Software without restriction, including without limitation the rights
15
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the Software is
17
+ furnished to do so, subject to the following conditions:
18
+
19
+ The above copyright notice and this permission notice shall be included in all
20
+ copies or substantial portions of the Software.
21
+
22
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28
+ SOFTWARE.
29
+
30
+ Project-URL: Homepage, https://github.com/Giri530/trace-score
31
+ Project-URL: Repository, https://github.com/Giri530/trace-score
32
+ Keywords: nlp,llm,evaluation,multi-turn,consistency,trace-score
33
+ Classifier: Development Status :: 3 - Alpha
34
+ Classifier: Intended Audience :: Science/Research
35
+ Classifier: License :: OSI Approved :: MIT License
36
+ Classifier: Programming Language :: Python :: 3
37
+ Requires-Python: >=3.8
38
+ Description-Content-Type: text/markdown
39
+ License-File: LICENSE
40
+ Requires-Dist: sentence-transformers>=2.2.0
41
+ Requires-Dist: numpy>=1.21.0
42
+ Requires-Dist: torch>=1.11.0
43
+ Dynamic: author
44
+ Dynamic: home-page
45
+ Dynamic: license-file
46
+ Dynamic: requires-python
47
+
48
+ # TRACE Score
49
+
50
+ **Multi-turn LLM Conversation Consistency Metric**
51
+
52
+ > The first unified, deterministic, reference-free evaluation metric for
53
+ > multi-turn conversational consistency in Large Language Models.
54
+
55
+ [![PyPI version](https://badge.fury.io/py/trace-score.svg)](https://pypi.org/project/trace-score/)
56
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
57
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
58
+
59
+ ---
60
+
61
+ ## The Problem
62
+
63
+ Existing metrics (BLEU, ROUGE, BERTScore, RAGAS) evaluate each conversation
64
+ turn **in isolation**. They cannot detect failures that only become visible
65
+ **across multiple turns**:
66
+
67
+ | Failure Type | Example | BLEU | ROUGE | BERTScore | TRACE |
68
+ |---|---|---|---|---|---|
69
+ | Fact forgotten | User says "I am diabetic" → model recommends sugar-rich food 5 turns later | Miss | Miss | Miss | **Catch** |
70
+ | Correction ignored | User corrects model → model reverts to old behavior | Miss | Miss | Miss | **Catch** |
71
+ | Self-contradiction | Model says X at turn 2, contradicts X at turn 7 | Miss | Miss | Miss | **Catch** |
72
+ | Topic drift | Conversation gradually drifts off-topic | Miss | Miss | Miss | **Catch** |
73
+ | Confidence drift | Model says "definitely" then "perhaps" about same claim | Miss | Miss | Miss | **Catch** |
74
+
75
+ ---
76
+
77
+ ## Formula
78
+
79
+ ```
80
+ TRACE(C) = Σ(wᵢ · Sᵢ) − λ·P − δ·V + α·(T·C) + β·(A·R)
81
+ ```
82
+
83
+ Each component uses **time-decay aggregation** — recent turns weighted more:
84
+
85
+ ```
86
+ Sᵢ = (1/Z) · Σ γ^(N-t) · Sᵢ,ₜ
87
+ Z = Σ γ^(N-t)
88
+ ```
89
+
90
+ | Symbol | Component | Measures |
91
+ |--------|-----------|---------|
92
+ | **T** | Temporal Retention | Did assistant remember user-stated facts? |
93
+ | **R** | Reliability Consistency | Did assistant contradict itself? |
94
+ | **A** | Adaptive Correction | Did assistant retain user corrections? |
95
+ | **C** | Context Coherence | Did conversation stay on topic? |
96
+ | **E** | Epistemic Stability | Did confidence stay calibrated? |
97
+ | P | Contradiction penalty | Global contradiction rate |
98
+ | V | Variance penalty | Confidence variance |
99
+ | γ | Time decay factor | Default: 0.80 |
100
+ | λ | Contradiction weight | Default: 0.15 |
101
+ | δ | Variance weight | Default: 0.10 |
102
+ | α | T·C interaction | Default: 0.05 |
103
+ | β | A·R interaction | Default: 0.05 |
104
+
105
+ ---
106
+
107
+ ## Install
108
+
109
+ ```bash
110
+ pip install trace-score
111
+ ```
112
+
113
+ ---
114
+
115
+ ## Quick Start
116
+
117
+ ```python
118
+ from trace_score import compute_TRACE
119
+
120
+ conversation = [
121
+ ("user", "I am diabetic and hate spicy food"),
122
+ ("assistant", "I will suggest low sugar mild options."),
123
+ ("user", "Actually I eat fish too. I am pescatarian."),
124
+ ("assistant", "Spicy chicken with cashews!"), # failure turn
125
+ ]
126
+
127
+ result = compute_TRACE(conversation, verbose=True)
128
+
129
+ print(result["trace_score"]) # 0.41 — catches failures
130
+ print(result["T"]) # 0.50 — forgot user facts
131
+ print(result["A"]) # 0.00 — ignored correction
132
+ print(result["formula_breakdown"]) # full formula with values
133
+ print(result["interpretation"]) # "Poor consistency"
134
+ ```
135
+
136
+ ---
137
+
138
+ ## Batch Evaluation
139
+
140
+ ```python
141
+ from trace_score import TRACEEvaluator
142
+
143
+ # Models loaded once, reused across all calls — much faster
144
+ evaluator = TRACEEvaluator()
145
+ results = [evaluator.evaluate(conv) for conv in conversations]
146
+ ```
147
+
148
+ ---
149
+
150
+ ## Adaptive Weights
151
+
152
+ ```python
153
+ # Equal weights (default)
154
+ result = compute_TRACE(conv, preset="equal")
155
+
156
+ # Medical chatbot — memory and reliability weighted more
157
+ result = compute_TRACE(conv, preset="medical_chatbot")
158
+
159
+ # Custom weights — must sum to 1.0
160
+ result = compute_TRACE(conv, weights={
161
+ "w_T": 0.35, "w_R": 0.25,
162
+ "w_A": 0.20, "w_C": 0.10, "w_E": 0.10
163
+ })
164
+ ```
165
+
166
+ Available presets: `equal`, `customer_service`, `technical_qa`,
167
+ `medical_chatbot`, `education_tutor`
168
+
169
+ ---
170
+
171
+ ## Benchmark Results
172
+
173
+ Evaluated on **30 multi-turn conversations** across 3 categories
174
+ (Fact Memory, Correction Retention, Contradiction Detection).
175
+ Conversations generated by **Llama-3.1-8B via Groq API**.
176
+
177
+ ### Overall Metric Comparison
178
+
179
+ | Metric | Overall | Fact Memory | Correction | Contradiction |
180
+ |--------|---------|-------------|------------|---------------|
181
+ | **TRACE** | **0.699** | **0.703** | **0.550** | **0.843** |
182
+ | BLEU | 0.102 | 0.046 | 0.149 | 0.110 |
183
+ | ROUGE-L | 0.239 | 0.177 | 0.301 | 0.239 |
184
+ | BERTScore | 0.822 | 0.800 | 0.842 | 0.823 |
185
+
186
+ **Key finding:** BLEU and ROUGE-L show similar low scores across all categories
187
+ — they cannot distinguish between different types of consistency failures.
188
+ BERTScore appears high but provides no diagnostic breakdown.
189
+ **TRACE clearly separates Correction (0.550) from Contradiction (0.843)**,
190
+ revealing that Llama-3.1-8B struggles most with retaining user corrections.
191
+
192
+ ---
193
+
194
+ ### TRACE Component Breakdown by Category
195
+
196
+ | Category | T | R | A | C | E |
197
+ |----------|---|---|---|---|---|
198
+ | Fact Memory | 0.137 | 0.955 | **1.000** | 0.503 | 0.697 |
199
+ | Correction | 0.491 | 0.927 | **0.144** | 0.465 | 0.712 |
200
+ | Contradiction | **0.973** | 0.875 | 0.900 | 0.510 | 0.696 |
201
+
202
+ **Diagnostic insight:**
203
+
204
+ - Fact Memory: T=0.137 — model **forgets user-stated facts** (A=1.0 means
205
+ no corrections were needed, so A is vacuously true here)
206
+ - Correction: A=0.144 — model **ignores user corrections** (critical failure)
207
+ - Contradiction: T=0.973, A=0.900 — model handles these well
208
+
209
+ No existing metric (BLEU, ROUGE, BERTScore) can produce this breakdown.
210
+
211
+ ---
212
+
213
+ ### The Gap TRACE Reveals — BERTScore vs TRACE
214
+
215
+ Conversations where **BERTScore is high but TRACE is low**
216
+ (failures invisible to BERTScore, caught by TRACE):
217
+
218
+ | Conversation | Category | TRACE | BERTScore | Gap |
219
+ |---|---|---|---|---|
220
+ | CR_006 | Correction | 0.314 | 0.876 | +0.562 |
221
+ | CR_009 | Correction | 0.381 | 0.861 | +0.480 |
222
+ | CR_004 | Correction | 0.535 | 0.884 | +0.349 |
223
+ | CR_003 | Correction | 0.494 | 0.864 | +0.370 |
224
+ | CR_002 | Correction | 0.442 | 0.822 | +0.380 |
225
+
226
+ **In all 5 cases:** BERTScore ≥ 0.82 (looks good), TRACE < 0.55 (failures detected).
227
+ The A component reveals why — user corrections completely ignored (A=0.00).
228
+ This is invisible to any per-turn metric.
229
+
230
+ ---
231
+
232
+ ## Why TRACE?
233
+
234
+ | Metric | Multi-turn | Reference-free | Deterministic | Time-decay | Diagnostic |
235
+ |--------|-----------|----------------|---------------|-----------|-----------|
236
+ | BLEU | No | No | Yes | No | No |
237
+ | ROUGE | No | No | Yes | No | No |
238
+ | BERTScore | No | No | Yes | No | No |
239
+ | RAGAS | No | Yes | No | No | Partial |
240
+ | **TRACE** | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** |
241
+
242
+ ---
243
+
244
+ ## Models Used
245
+
246
+ | Model | Purpose | Size |
247
+ |-------|---------|------|
248
+ | `all-MiniLM-L6-v2` | Semantic similarity (T, A, C, E) | 80MB |
249
+ | `cross-encoder/nli-deberta-v3-small` | Contradiction detection (R, A) | 184MB |
250
+
251
+ Models downloaded automatically on first use (~264MB total).
252
+ CPU-friendly — no GPU required.
253
+
254
+ ---
255
+
256
+ ## Citation
257
+
258
+ ```bibtex
259
+ @article{girinathv2026trace,
260
+ title = {TRACE: A Unified Deterministic Metric for Multi-turn
261
+ Conversational Consistency in Large Language Models},
262
+ author = {Girinath, V},
263
+ year = {2026}
264
+ }
265
+ ```
266
+
267
+ ---
268
+
269
+ *Author: Girinath V*
270
+ *GitHub: https://github.com/Giri530/trace-score*
@@ -0,0 +1,223 @@
1
+ # TRACE Score
2
+
3
+ **Multi-turn LLM Conversation Consistency Metric**
4
+
5
+ > The first unified, deterministic, reference-free evaluation metric for
6
+ > multi-turn conversational consistency in Large Language Models.
7
+
8
+ [![PyPI version](https://badge.fury.io/py/trace-score.svg)](https://pypi.org/project/trace-score/)
9
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
10
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
11
+
12
+ ---
13
+
14
+ ## The Problem
15
+
16
+ Existing metrics (BLEU, ROUGE, BERTScore, RAGAS) evaluate each conversation
17
+ turn **in isolation**. They cannot detect failures that only become visible
18
+ **across multiple turns**:
19
+
20
+ | Failure Type | Example | BLEU | ROUGE | BERTScore | TRACE |
21
+ |---|---|---|---|---|---|
22
+ | Fact forgotten | User says "I am diabetic" → model recommends sugar-rich food 5 turns later | Miss | Miss | Miss | **Catch** |
23
+ | Correction ignored | User corrects model → model reverts to old behavior | Miss | Miss | Miss | **Catch** |
24
+ | Self-contradiction | Model says X at turn 2, contradicts X at turn 7 | Miss | Miss | Miss | **Catch** |
25
+ | Topic drift | Conversation gradually drifts off-topic | Miss | Miss | Miss | **Catch** |
26
+ | Confidence drift | Model says "definitely" then "perhaps" about same claim | Miss | Miss | Miss | **Catch** |
27
+
28
+ ---
29
+
30
+ ## Formula
31
+
32
+ ```
33
+ TRACE(C) = Σ(wᵢ · Sᵢ) − λ·P − δ·V + α·(T·C) + β·(A·R)
34
+ ```
35
+
36
+ Each component uses **time-decay aggregation** — recent turns weighted more:
37
+
38
+ ```
39
+ Sᵢ = (1/Z) · Σ γ^(N-t) · Sᵢ,ₜ
40
+ Z = Σ γ^(N-t)
41
+ ```
42
+
43
+ | Symbol | Component | Measures |
44
+ |--------|-----------|---------|
45
+ | **T** | Temporal Retention | Did assistant remember user-stated facts? |
46
+ | **R** | Reliability Consistency | Did assistant contradict itself? |
47
+ | **A** | Adaptive Correction | Did assistant retain user corrections? |
48
+ | **C** | Context Coherence | Did conversation stay on topic? |
49
+ | **E** | Epistemic Stability | Did confidence stay calibrated? |
50
+ | P | Contradiction penalty | Global contradiction rate |
51
+ | V | Variance penalty | Confidence variance |
52
+ | γ | Time decay factor | Default: 0.80 |
53
+ | λ | Contradiction weight | Default: 0.15 |
54
+ | δ | Variance weight | Default: 0.10 |
55
+ | α | T·C interaction | Default: 0.05 |
56
+ | β | A·R interaction | Default: 0.05 |
57
+
58
+ ---
59
+
60
+ ## Install
61
+
62
+ ```bash
63
+ pip install trace-score
64
+ ```
65
+
66
+ ---
67
+
68
+ ## Quick Start
69
+
70
+ ```python
71
+ from trace_score import compute_TRACE
72
+
73
+ conversation = [
74
+ ("user", "I am diabetic and hate spicy food"),
75
+ ("assistant", "I will suggest low sugar mild options."),
76
+ ("user", "Actually I eat fish too. I am pescatarian."),
77
+ ("assistant", "Spicy chicken with cashews!"), # failure turn
78
+ ]
79
+
80
+ result = compute_TRACE(conversation, verbose=True)
81
+
82
+ print(result["trace_score"]) # 0.41 — catches failures
83
+ print(result["T"]) # 0.50 — forgot user facts
84
+ print(result["A"]) # 0.00 — ignored correction
85
+ print(result["formula_breakdown"]) # full formula with values
86
+ print(result["interpretation"]) # "Poor consistency"
87
+ ```
88
+
89
+ ---
90
+
91
+ ## Batch Evaluation
92
+
93
+ ```python
94
+ from trace_score import TRACEEvaluator
95
+
96
+ # Models loaded once, reused across all calls — much faster
97
+ evaluator = TRACEEvaluator()
98
+ results = [evaluator.evaluate(conv) for conv in conversations]
99
+ ```
100
+
101
+ ---
102
+
103
+ ## Adaptive Weights
104
+
105
+ ```python
106
+ # Equal weights (default)
107
+ result = compute_TRACE(conv, preset="equal")
108
+
109
+ # Medical chatbot — memory and reliability weighted more
110
+ result = compute_TRACE(conv, preset="medical_chatbot")
111
+
112
+ # Custom weights — must sum to 1.0
113
+ result = compute_TRACE(conv, weights={
114
+ "w_T": 0.35, "w_R": 0.25,
115
+ "w_A": 0.20, "w_C": 0.10, "w_E": 0.10
116
+ })
117
+ ```
118
+
119
+ Available presets: `equal`, `customer_service`, `technical_qa`,
120
+ `medical_chatbot`, `education_tutor`
121
+
122
+ ---
123
+
124
+ ## Benchmark Results
125
+
126
+ Evaluated on **30 multi-turn conversations** across 3 categories
127
+ (Fact Memory, Correction Retention, Contradiction Detection).
128
+ Conversations generated by **Llama-3.1-8B via Groq API**.
129
+
130
+ ### Overall Metric Comparison
131
+
132
+ | Metric | Overall | Fact Memory | Correction | Contradiction |
133
+ |--------|---------|-------------|------------|---------------|
134
+ | **TRACE** | **0.699** | **0.703** | **0.550** | **0.843** |
135
+ | BLEU | 0.102 | 0.046 | 0.149 | 0.110 |
136
+ | ROUGE-L | 0.239 | 0.177 | 0.301 | 0.239 |
137
+ | BERTScore | 0.822 | 0.800 | 0.842 | 0.823 |
138
+
139
+ **Key finding:** BLEU and ROUGE-L show similar low scores across all categories
140
+ — they cannot distinguish between different types of consistency failures.
141
+ BERTScore appears high but provides no diagnostic breakdown.
142
+ **TRACE clearly separates Correction (0.550) from Contradiction (0.843)**,
143
+ revealing that Llama-3.1-8B struggles most with retaining user corrections.
144
+
145
+ ---
146
+
147
+ ### TRACE Component Breakdown by Category
148
+
149
+ | Category | T | R | A | C | E |
150
+ |----------|---|---|---|---|---|
151
+ | Fact Memory | 0.137 | 0.955 | **1.000** | 0.503 | 0.697 |
152
+ | Correction | 0.491 | 0.927 | **0.144** | 0.465 | 0.712 |
153
+ | Contradiction | **0.973** | 0.875 | 0.900 | 0.510 | 0.696 |
154
+
155
+ **Diagnostic insight:**
156
+
157
+ - Fact Memory: T=0.137 — model **forgets user-stated facts** (A=1.0 means
158
+ no corrections were needed, so A is vacuously true here)
159
+ - Correction: A=0.144 — model **ignores user corrections** (critical failure)
160
+ - Contradiction: T=0.973, A=0.900 — model handles these well
161
+
162
+ No existing metric (BLEU, ROUGE, BERTScore) can produce this breakdown.
163
+
164
+ ---
165
+
166
+ ### The Gap TRACE Reveals — BERTScore vs TRACE
167
+
168
+ Conversations where **BERTScore is high but TRACE is low**
169
+ (failures invisible to BERTScore, caught by TRACE):
170
+
171
+ | Conversation | Category | TRACE | BERTScore | Gap |
172
+ |---|---|---|---|---|
173
+ | CR_006 | Correction | 0.314 | 0.876 | +0.562 |
174
+ | CR_009 | Correction | 0.381 | 0.861 | +0.480 |
175
+ | CR_004 | Correction | 0.535 | 0.884 | +0.349 |
176
+ | CR_003 | Correction | 0.494 | 0.864 | +0.370 |
177
+ | CR_002 | Correction | 0.442 | 0.822 | +0.380 |
178
+
179
+ **In all 5 cases:** BERTScore ≥ 0.82 (looks good), TRACE < 0.55 (failures detected).
180
+ The A component reveals why — user corrections completely ignored (A=0.00).
181
+ This is invisible to any per-turn metric.
182
+
183
+ ---
184
+
185
+ ## Why TRACE?
186
+
187
+ | Metric | Multi-turn | Reference-free | Deterministic | Time-decay | Diagnostic |
188
+ |--------|-----------|----------------|---------------|-----------|-----------|
189
+ | BLEU | No | No | Yes | No | No |
190
+ | ROUGE | No | No | Yes | No | No |
191
+ | BERTScore | No | No | Yes | No | No |
192
+ | RAGAS | No | Yes | No | No | Partial |
193
+ | **TRACE** | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** |
194
+
195
+ ---
196
+
197
+ ## Models Used
198
+
199
+ | Model | Purpose | Size |
200
+ |-------|---------|------|
201
+ | `all-MiniLM-L6-v2` | Semantic similarity (T, A, C, E) | 80MB |
202
+ | `cross-encoder/nli-deberta-v3-small` | Contradiction detection (R, A) | 184MB |
203
+
204
+ Models downloaded automatically on first use (~264MB total).
205
+ CPU-friendly — no GPU required.
206
+
207
+ ---
208
+
209
+ ## Citation
210
+
211
+ ```bibtex
212
+ @article{girinathv2026trace,
213
+ title = {TRACE: A Unified Deterministic Metric for Multi-turn
214
+ Conversational Consistency in Large Language Models},
215
+ author = {Girinath, V},
216
+ year = {2026}
217
+ }
218
+ ```
219
+
220
+ ---
221
+
222
+ *Author: Girinath V*
223
+ *GitHub: https://github.com/Giri530/trace-score*
@@ -0,0 +1,33 @@
1
+ [build-system]
2
+ requires = ["setuptools>=42", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "trace-score"
7
+ version = "0.1.0"
8
+ description = "Multi-turn LLM Conversation Consistency Metric"
9
+ readme = "README.md"
10
+ license = {file = "LICENSE"}
11
+ requires-python = ">=3.8"
12
+ authors = [
13
+ {name = "Girinath V", email = "girinathv48@gmail.com"}
14
+ ]
15
+ dependencies = [
16
+ "sentence-transformers>=2.2.0",
17
+ "numpy>=1.21.0",
18
+ "torch>=1.11.0",
19
+ ]
20
+ keywords = [
21
+ "nlp", "llm", "evaluation",
22
+ "multi-turn", "consistency", "trace-score"
23
+ ]
24
+ classifiers = [
25
+ "Development Status :: 3 - Alpha",
26
+ "Intended Audience :: Science/Research",
27
+ "License :: OSI Approved :: MIT License",
28
+ "Programming Language :: Python :: 3",
29
+ ]
30
+
31
+ [project.urls]
32
+ Homepage = "https://github.com/Giri530/trace-score"
33
+ Repository = "https://github.com/Giri530/trace-score"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,36 @@
1
+ from setuptools import setup, find_packages
2
+ with open("README.md", "r", encoding="utf-8") as f:
3
+ long_description = f.read()
4
+ setup(
5
+ name = "trace-score",
6
+ version="0.1.0",
7
+ author = "Girinath V",
8
+ author_email = "girinathv48@gmail.com",
9
+ description = "Multi-turn LLM Conversation Consistency Metric",
10
+ long_description = long_description,
11
+ long_description_content_type = "text/markdown",
12
+ url = "https://github.com/Giri530/trace-score",
13
+ packages = find_packages(),
14
+ python_requires = ">=3.8",
15
+ install_requires = [
16
+ "sentence-transformers>=2.2.0",
17
+ "numpy>=1.21.0",
18
+ "torch>=1.11.0",
19
+ ],
20
+ classifiers = [
21
+ "Development Status :: 3 - Alpha",
22
+ "Intended Audience :: Science/Research",
23
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
24
+ "License :: OSI Approved :: MIT License",
25
+ "Programming Language :: Python :: 3",
26
+ "Programming Language :: Python :: 3.8",
27
+ "Programming Language :: Python :: 3.9",
28
+ "Programming Language :: Python :: 3.10",
29
+ "Programming Language :: Python :: 3.11",
30
+ ],
31
+ keywords = [
32
+ "nlp", "llm", "evaluation", "metrics",
33
+ "multi-turn", "consistency", "dialogue",
34
+ "trace-score", "conversational-ai",
35
+ ],
36
+ )
@@ -0,0 +1,9 @@
1
+ from .trace import compute_TRACE, TRACEEvaluator, WEIGHT_PRESETS
2
+ from .components.temporal import compute_T
3
+ from .components.reliability import compute_R
4
+ from .components.adaptive import compute_A
5
+ from .components.coherence import compute_C
6
+ from .components.epistemic import compute_E
7
+
8
+ __version__ = "0.1.0"
9
+ __author__ = "Girinath V"
@@ -0,0 +1,5 @@
1
+ from .temporal import compute_T
2
+ from .reliability import compute_R
3
+ from .adaptive import compute_A
4
+ from .coherence import compute_C
5
+ from .epistemic import compute_E