llmflowstack 1.2.3.tar.gz → 1.2.4.tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/PKG-INFO +16 -15
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/README.md +15 -14
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/rag/VectorDatabase.py +9 -4
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/utils/__init__.py +2 -2
- llmflowstack-1.2.4/llmflowstack/utils/evaluation_methods.py +165 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/pyproject.toml +1 -1
- llmflowstack-1.2.3/llmflowstack/utils/evaluation_methods.py +0 -92
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/.github/workflows/python-publish.yml +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/.gitignore +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/LICENSE +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/__init__.py +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/callbacks/__init__.py +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/callbacks/log_collector.py +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/callbacks/stop_on_token.py +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/BaseDecoder.py +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/GPT_OSS.py +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/Gemma.py +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/LLaMA3.py +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/LLaMA4.py +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/MedGemma.py +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/__init__.py +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/rag/__init__.py +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/schemas/__init__.py +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/schemas/params.py +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/utils/exceptions.py +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/utils/generation_utils.py +0 -0
- {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/utils/logging.py +0 -0
{llmflowstack-1.2.3 → llmflowstack-1.2.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmflowstack
-Version: 1.2.3
+Version: 1.2.4
 Summary: LLMFlowStack is a framework for training and using LLMs (LLaMA, GPT-OSS, Gemma, ...). Supports DAPT, fine-tuning, and distributed inference. Public fork without institution-specific components.
 Author-email: Gustavo Henrique Ferreira Cruz <gustavohferreiracruz@gmail.com>
 License: MIT
@@ -277,34 +277,35 @@ print(query_result)

 ### NLP Evaluation

+> **Disclaimer**
+> These evaluation functions are designed for batch processing. Models and encoders are loaded internally on each call, which may be inefficient for per-sample or streaming evaluation.
+
 ```python
 > from llmflowstack import text_evaluation
-> from llmflowstack.utils import (bert_score_evaluation, cosine_similarity_evaluation, rouge_evaluation)
+> from llmflowstack.utils import (bert_score_evaluation, bleu_score_evaluation, cosine_similarity_evaluation, rouge_evaluation)

 # Predictions from some model
 > predictions = ["Chico is a dog, and he is orange!", "Fred is a cat, and he is white!"]
 # References text (ground truth)
 > references = ["Chico is a cat, and he is black!", "Fred is a dog, and he is white!"]

-#
+# BERT Score Evaluation
 > bert_score_evaluation(predictions, references)
-{'bertscore_precision': 0.
+{'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773}
+
+# Bleu Score Evaluation
+> bleu_score_evaluation(predictions, references)
+{'bleu_score': 0.3656}

 # Cosine Similarity Evaluation
 > cosine_similarity_evaluation(predictions, references)
-{'cosine_similarity': 0.
+{'cosine_similarity': 0.7443}

-#
+# Rouge Score Evaluation
 > rouge_evaluation(predictions, references)
-{'rouge1': 0.8125, 'rouge2': 0.
+{'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}

 # All-in-one function
-> text_evaluation(predictions, references
-{'
+> text_evaluation(predictions, references)
+{'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773, 'bleu_score': 0.3656, 'cosine_similarity': 0.7443, 'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}
 ```
-
----
-
-> **Disclaimer**
-> This is a public fork of a framework originally developed in a research setting.
-> Institution-specific components have been removed for confidentiality reasons.
{llmflowstack-1.2.3 → llmflowstack-1.2.4}/README.md

@@ -241,34 +241,35 @@ print(query_result)

 ### NLP Evaluation

+> **Disclaimer**
+> These evaluation functions are designed for batch processing. Models and encoders are loaded internally on each call, which may be inefficient for per-sample or streaming evaluation.
+
 ```python
 > from llmflowstack import text_evaluation
-> from llmflowstack.utils import (bert_score_evaluation, cosine_similarity_evaluation, rouge_evaluation)
+> from llmflowstack.utils import (bert_score_evaluation, bleu_score_evaluation, cosine_similarity_evaluation, rouge_evaluation)

 # Predictions from some model
 > predictions = ["Chico is a dog, and he is orange!", "Fred is a cat, and he is white!"]
 # References text (ground truth)
 > references = ["Chico is a cat, and he is black!", "Fred is a dog, and he is white!"]

-#
+# BERT Score Evaluation
 > bert_score_evaluation(predictions, references)
-{'bertscore_precision': 0.
+{'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773}
+
+# Bleu Score Evaluation
+> bleu_score_evaluation(predictions, references)
+{'bleu_score': 0.3656}

 # Cosine Similarity Evaluation
 > cosine_similarity_evaluation(predictions, references)
-{'cosine_similarity': 0.
+{'cosine_similarity': 0.7443}

-#
+# Rouge Score Evaluation
 > rouge_evaluation(predictions, references)
-{'rouge1': 0.8125, 'rouge2': 0.
+{'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}

 # All-in-one function
-> text_evaluation(predictions, references
-{'
+> text_evaluation(predictions, references)
+{'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773, 'bleu_score': 0.3656, 'cosine_similarity': 0.7443, 'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}
 ```
-
----
-
-> **Disclaimer**
-> This is a public fork of a framework originally developed in a research setting.
-> Institution-specific components have been removed for confidentiality reasons.
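The new disclaimer warns that each evaluation call loads its models internally. For repeated evaluation, that cost can be amortized by accumulating outputs and scoring once. A minimal sketch of that pattern, where `my_batches` is a hypothetical stand-in for your data source, not part of the package:

```python
from llmflowstack import text_evaluation

# Hypothetical iterable of (predictions, references) pairs; replace
# with whatever produces model outputs in your pipeline.
my_batches = [
    (["Chico is a dog!"], ["Chico is a cat!"]),
    (["Fred is white!"], ["Fred is white!"]),
]

# Accumulate first, score once: BERTScore and the sentence encoder
# are then loaded a single time instead of once per batch.
all_preds: list[str] = []
all_refs: list[str] = []
for preds, refs in my_batches:
    all_preds.extend(preds)
    all_refs.extend(refs)

print(text_evaluation(all_preds, all_refs))
```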
{llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/rag/VectorDatabase.py

@@ -9,10 +9,9 @@ from langchain_chroma import Chroma
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-from sentence_transformers import SentenceTransformer
-
 from llmflowstack.utils.exceptions import MissingEssentialProp
 from llmflowstack.utils.logging import LogLevel
+from sentence_transformers import SentenceTransformer


 class EncoderWrapper(Embeddings):
@@ -26,14 +25,20 @@ class EncoderWrapper(Embeddings):
         self,
         texts: list[str]
     ) -> list[list[float]]:
-        vectors = self.model.encode(texts, task="retrieval", show_progress_bar=False)
+        try:
+            vectors = self.model.encode(texts, task="retrieval", show_progress_bar=False)
+        except TypeError:
+            vectors = self.model.encode(texts, show_progress_bar=False)
         return vectors.tolist()

     def embed_query(
         self,
         text: str
     ) -> list[float]:
-        vectors = self.model.encode(text, task="retrieval", show_progress_bar=False)
+        try:
+            vectors = self.model.encode(text, task="retrieval", show_progress_bar=False)
+        except TypeError:
+            vectors = self.model.encode(text, show_progress_bar=False)
         return vectors.tolist()

 class VectorDatabase:
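The change above makes `EncoderWrapper` tolerant of both encoder families: task-aware models whose `encode()` accepts a `task` keyword (for example Jina-style encoders loaded with `trust_remote_code`), and plain SentenceTransformer checkpoints, which may reject the unknown keyword with a `TypeError` depending on the installed sentence-transformers version. A standalone sketch of the same fallback; the model name is purely illustrative, not one the package pins:

```python
from sentence_transformers import SentenceTransformer

# Any plain sentence-transformers checkpoint works here.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

texts = ["a red cat", "a blue dog"]
try:
    # Task-aware encoders accept a retrieval hint via `task`.
    vectors = model.encode(texts, task="retrieval", show_progress_bar=False)
except TypeError:
    # Models that reject the keyword fall back to a plain encode.
    vectors = model.encode(texts, show_progress_bar=False)

print(vectors.shape)  # (2, <embedding dim>)
```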
{llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/utils/__init__.py

@@ -1,11 +1,11 @@
-from .evaluation_methods import (bert_score_evaluation,
+from .evaluation_methods import (bert_score_evaluation, bleu_score_evaluation,
                                  cosine_similarity_evaluation,
                                  rouge_evaluation, text_evaluation)

 __all__ = [
     "bert_score_evaluation",
+    "bleu_score_evaluation",
     "cosine_similarity_evaluation",
     "rouge_evaluation",
-    "evaluation_methods",
     "text_evaluation"
 ]
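With the re-export list updated, all five evaluation entry points resolve from `llmflowstack.utils`, and the stray `"evaluation_methods"` string (the module name, not a function) no longer appears in `__all__`. A quick import check:

```python
from llmflowstack.utils import (
    bert_score_evaluation,
    bleu_score_evaluation,
    cosine_similarity_evaluation,
    rouge_evaluation,
    text_evaluation,
)

# BLEU is a pure string metric, so this runs without loading a model.
print(bleu_score_evaluation(["a small cat sat here"], ["a small cat sat here"]))
# -> {'bleu_score': 1.0} for identical five-token strings
```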
llmflowstack-1.2.4/llmflowstack/utils/evaluation_methods.py (new file)

@@ -0,0 +1,165 @@
+from typing import Literal
+
+from evaluate import load
+from nltk.stem.snowball import SnowballStemmer
+from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
+from rouge_score import rouge_scorer
+from sentence_transformers import SentenceTransformer, util
+
+
+def avg(
+    values: list[float] | None
+) -> float:
+    return sum(values) / len(values) if values else 0.0
+
+def stem_texts(texts: list[str]) -> list[str]:
+    stemmer = SnowballStemmer("portuguese")
+
+    stemmed_texts: list[str] = []
+    for text in texts:
+        stemmed_text = " ".join([stemmer.stem(word) for word in text.split()])
+        stemmed_texts.append(stemmed_text)
+
+    return stemmed_texts
+
+def rouge_evaluation(
+    preds: list[str],
+    refs: list[str]
+) -> dict[Literal["rouge1", "rouge2", "rougeL"], float]:
+    preds_stemmed = stem_texts(preds)
+    refs_stemmed = stem_texts(refs)
+
+    rouge_metrics = {"rouge1": [], "rouge2": [], "rougeL": []}
+    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=False)
+
+    for ref, pred in zip(refs_stemmed, preds_stemmed):
+        scores = scorer.score(
+            target=ref,
+            prediction=pred
+        )
+        for key in rouge_metrics:
+            rouge_metrics[key].append(scores[key].fmeasure)
+
+    rouge1 = round(avg(rouge_metrics["rouge1"]), 4)
+    rouge2 = round(avg(rouge_metrics["rouge2"]), 4)
+    rougeL = round(avg(rouge_metrics["rougeL"]), 4)
+
+    return {
+        "rouge1": rouge1,
+        "rouge2": rouge2,
+        "rougeL": rougeL
+    }
+
+def bert_score_evaluation(
+    preds: list[str],
+    refs: list[str],
+    encoder: str | None = None,
+    lang: str = "pt"
+) -> dict[Literal["bertscore_precision", "bertscore_recall", "bertscore_f1"], float]:
+    bertscore = load("bertscore")
+
+    bert_score = bertscore.compute(
+        predictions=preds,
+        references=refs,
+        model_type=encoder,
+        lang=lang
+    )
+
+    assert bert_score is not None
+
+    precision = round(avg(bert_score["precision"]), 4)
+    recall = round(avg(bert_score["recall"]), 4)
+    f1 = round(avg(bert_score["f1"]), 4)
+
+    return {
+        "bertscore_precision": precision,
+        "bertscore_recall": recall,
+        "bertscore_f1": f1
+    }
+
+def cosine_similarity_evaluation(
+    preds: list[str],
+    refs: list[str],
+    encoder: str | None = None
+) -> dict[Literal["cosine_similarity"], float]:
+    if not encoder:
+        encoder = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
+
+    model = SentenceTransformer(
+        encoder,
+        trust_remote_code=True
+    )
+
+    try:
+        emb_preds = model.encode(preds, task="retrieval", convert_to_tensor=True)
+        emb_refs = model.encode(refs, task="retrieval", convert_to_tensor=True)
+    except TypeError:
+        emb_preds = model.encode(preds, convert_to_tensor=True)
+        emb_refs = model.encode(refs, convert_to_tensor=True)
+
+    cos_sim_matrix = util.cos_sim(emb_preds, emb_refs)
+
+    cos_sim_scores = cos_sim_matrix.diag()
+    avg_cos_sim = round(float(cos_sim_scores.mean().item()), 4)
+
+    return {"cosine_similarity": float(avg_cos_sim)}
+
+def bleu_score_evaluation(
+    preds: list[str],
+    refs: list[str]
+) -> dict[Literal["bleu_score"], float]:
+    smooth = SmoothingFunction().method1
+
+    scores = []
+    for pred, ref in zip(preds, refs):
+        if not pred.strip() or not ref.strip():
+            scores.append(0.0)
+            continue
+        scores.append(sentence_bleu(
+            references=[ref.split()],
+            hypothesis=pred.split(),
+            smoothing_function=smooth
+        ))
+
+    bleu_score = round(avg(scores), 4)
+
+    return {
+        "bleu_score": bleu_score
+    }
+
+def text_evaluation(
+    preds: list[str],
+    refs: list[str],
+    rouge: bool = True,
+    bert: bool = True,
+    cosine: bool = True,
+    bleu: bool = True,
+    encoder: str | None = None,
+    lang: str = "pt"
+) -> dict[str, float]:
+    result = {}
+    if bert:
+        result.update(bert_score_evaluation(
+            preds=preds,
+            refs=refs,
+            encoder=encoder,
+            lang=lang
+        ))
+    if bleu:
+        result.update(bleu_score_evaluation(
+            preds=preds,
+            refs=refs
+        ))
+    if cosine:
+        result.update(cosine_similarity_evaluation(
+            preds=preds,
+            refs=refs,
+            encoder=encoder
+        ))
+    if rouge:
+        result.update(rouge_evaluation(
+            preds=preds,
+            refs=refs
+        ))
+
+    return result
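The rewritten module gives `text_evaluation` per-metric toggles plus `encoder`/`lang` passthrough (forwarded to BERTScore and the cosine-similarity encoder). Note that `rouge_evaluation` stems with the Portuguese Snowball stemmer regardless of `lang`. A short usage sketch of the toggles:

```python
from llmflowstack.utils import text_evaluation

preds = ["Chico is a dog, and he is orange!"]
refs = ["Chico is a cat, and he is black!"]

# Disable the two model-backed metrics; only BLEU and ROUGE run,
# so no encoder or BERTScore checkpoint needs to be loaded.
scores = text_evaluation(preds, refs, bert=False, cosine=False)
print(scores)  # {'bleu_score': ..., 'rouge1': ..., 'rouge2': ..., 'rougeL': ...}
```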
llmflowstack-1.2.3/llmflowstack/utils/evaluation_methods.py (removed)

@@ -1,92 +0,0 @@
-from evaluate import load
-from nltk.stem.snowball import SnowballStemmer
-from rouge_score import rouge_scorer
-from sentence_transformers import SentenceTransformer, util
-
-
-def stem_texts(texts: list[str]) -> list[str]:
-    stemmer = SnowballStemmer("portuguese")
-
-    stemmed_texts: list[str] = []
-    for text in texts:
-        stemmed_text = " ".join([stemmer.stem(word) for word in text.split()])
-        stemmed_texts.append(stemmed_text)
-
-    return stemmed_texts
-
-def rouge_evaluation(
-    preds: list[str],
-    refs: list[str]
-) -> dict[str, float]:
-    preds_stemmed = stem_texts(preds)
-    refs_stemmed = stem_texts(refs)
-
-    rouge_metrics = {"rouge1": [], "rouge2": [], "rougeL": []}
-    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=False)
-
-    for ref, pred in zip(refs_stemmed, preds_stemmed):
-        scores = scorer.score(ref, pred)
-        for key in rouge_metrics:
-            rouge_metrics[key].append(scores[key].fmeasure)
-
-    return {k: sum(v)/len(v) for k, v in rouge_metrics.items()}
-
-def bert_score_evaluation(
-    preds: list[str],
-    refs: list[str]
-) -> dict[str, float]:
-    bertscore = load("bertscore")
-
-    bert_result = bertscore.compute(predictions=preds, references=refs, lang="pt")
-
-    bert_avg = {}
-    if bert_result:
-        bert_avg = {
-            "bertscore_precision": sum(bert_result["precision"]) / len(bert_result["precision"]),
-            "bertscore_recall": sum(bert_result["recall"]) / len(bert_result["recall"]),
-            "bertscore_f1": sum(bert_result["f1"]) / len(bert_result["f1"])
-        }
-
-    return bert_avg
-
-def cosine_similarity_evaluation(
-    preds: list[str],
-    refs: list[str]
-) -> dict[str, float]:
-    model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
-
-    emb_preds = model.encode(preds, convert_to_tensor=True)
-    emb_refs = model.encode(refs, convert_to_tensor=True)
-
-    cos_sim_matrix = util.cos_sim(emb_preds, emb_refs)
-
-    cos_sim_scores = cos_sim_matrix.diag()
-    avg_cos_sim = cos_sim_scores.mean().item()
-
-    return {"cosine_similarity": float(avg_cos_sim)}
-
-def text_evaluation(
-    preds: list[str],
-    refs: list[str],
-    rouge: bool = True,
-    bert: bool = True,
-    cosine: bool = True
-) -> dict[str, float]:
-    result = {}
-    if rouge:
-        result.update(rouge_evaluation(
-            preds=preds,
-            refs=refs
-        ))
-    if bert:
-        result.update(bert_score_evaluation(
-            preds=preds,
-            refs=refs
-        ))
-    if cosine:
-        result.update(cosine_similarity_evaluation(
-            preds=preds,
-            refs=refs
-        ))
-
-    return result