llmflowstack 1.2.3__tar.gz → 1.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/PKG-INFO +16 -15
  2. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/README.md +15 -14
  3. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/rag/VectorDatabase.py +9 -4
  4. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/utils/__init__.py +2 -2
  5. llmflowstack-1.2.4/llmflowstack/utils/evaluation_methods.py +165 -0
  6. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/pyproject.toml +1 -1
  7. llmflowstack-1.2.3/llmflowstack/utils/evaluation_methods.py +0 -92
  8. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/.github/workflows/python-publish.yml +0 -0
  9. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/.gitignore +0 -0
  10. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/LICENSE +0 -0
  11. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/__init__.py +0 -0
  12. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/callbacks/__init__.py +0 -0
  13. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/callbacks/log_collector.py +0 -0
  14. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/callbacks/stop_on_token.py +0 -0
  15. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/BaseDecoder.py +0 -0
  16. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/GPT_OSS.py +0 -0
  17. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/Gemma.py +0 -0
  18. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/LLaMA3.py +0 -0
  19. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/LLaMA4.py +0 -0
  20. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/MedGemma.py +0 -0
  21. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/__init__.py +0 -0
  22. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/rag/__init__.py +0 -0
  23. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/schemas/__init__.py +0 -0
  24. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/schemas/params.py +0 -0
  25. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/utils/exceptions.py +0 -0
  26. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/utils/generation_utils.py +0 -0
  27. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/utils/logging.py +0 -0
{llmflowstack-1.2.3 → llmflowstack-1.2.4}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: llmflowstack
- Version: 1.2.3
+ Version: 1.2.4
  Summary: LLMFlowStack is a framework for training and using LLMs (LLaMA, GPT-OSS, Gemma, ...). Supports DAPT, fine-tuning, and distributed inference. Public fork without institution-specific components.
  Author-email: Gustavo Henrique Ferreira Cruz <gustavohferreiracruz@gmail.com>
  License: MIT
@@ -277,34 +277,35 @@ print(query_result)

  ### NLP Evaluation

+ > **Disclaimer**
+ > These evaluation functions are designed for batch processing. Models and encoders are loaded internally on each call, which may be inefficient for per-sample or streaming evaluation.
+
  ```python
  > from llmflowstack import text_evaluation
- > from llmflowstack.utils import (bert_score_evaluation, cosine_similarity_evaluation, rouge_evaluation)
+ > from llmflowstack.utils import (bert_score_evaluation, bleu_score_evaluation, cosine_similarity_evaluation, rouge_evaluation)

  # Predictions from some model
  > predictions = ["Chico is a dog, and he is orange!", "Fred is a cat, and he is white!"]
  # References text (ground truth)
  > references = ["Chico is a cat, and he is black!", "Fred is a dog, and he is white!"]

- # BERTScore Evaluation
+ # BERT Score Evaluation
  > bert_score_evaluation(predictions, references)
- {'bertscore_precision': 0.9772549867630005, 'bertscore_recall': 0.9772549867630005, 'bertscore_f1': 0.9772549867630005}
+ {'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773}
+
+ # Bleu Score Evaluation
+ > bleu_score_evaluation(predictions, references)
+ {'bleu_score': 0.3656}

  # Cosine Similarity Evaluation
  > cosine_similarity_evaluation(predictions, references)
- {'cosine_similarity': 0.7443363666534424}
+ {'cosine_similarity': 0.7443}

- # RougeScore Evaluation
+ # Rouge Score Evaluation
  > rouge_evaluation(predictions, references)
- {'rouge1': 0.8125, 'rouge2': 0.6428571428571428, 'rougeL': 0.8125}
+ {'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}

  # All-in-one function
- > text_evaluation(predictions, references, bert=True, cosine=True, rouge=True)
- {'rouge1': 0.8125, 'rouge2': 0.6428571428571428, 'rougeL': 0.8125, 'bertscore_precision': 0.9772549867630005, 'bertscore_recall': 0.9772549867630005, 'bertscore_f1': 0.9772549867630005, 'cosine_similarity': 0.7443363666534424}
+ > text_evaluation(predictions, references)
+ {'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773, 'bleu_score': 0.3656, 'cosine_similarity': 0.7443, 'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}
  ```
-
- ---
-
- > **Disclaimer**
- > This is a public fork of a framework originally developed in a research setting.
- > Institution-specific components have been removed for confidentiality reasons.
{llmflowstack-1.2.3 → llmflowstack-1.2.4}/README.md
@@ -241,34 +241,35 @@ print(query_result)

  ### NLP Evaluation

+ > **Disclaimer**
+ > These evaluation functions are designed for batch processing. Models and encoders are loaded internally on each call, which may be inefficient for per-sample or streaming evaluation.
+
  ```python
  > from llmflowstack import text_evaluation
- > from llmflowstack.utils import (bert_score_evaluation, cosine_similarity_evaluation, rouge_evaluation)
+ > from llmflowstack.utils import (bert_score_evaluation, bleu_score_evaluation, cosine_similarity_evaluation, rouge_evaluation)

  # Predictions from some model
  > predictions = ["Chico is a dog, and he is orange!", "Fred is a cat, and he is white!"]
  # References text (ground truth)
  > references = ["Chico is a cat, and he is black!", "Fred is a dog, and he is white!"]

- # BERTScore Evaluation
+ # BERT Score Evaluation
  > bert_score_evaluation(predictions, references)
- {'bertscore_precision': 0.9772549867630005, 'bertscore_recall': 0.9772549867630005, 'bertscore_f1': 0.9772549867630005}
+ {'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773}
+
+ # Bleu Score Evaluation
+ > bleu_score_evaluation(predictions, references)
+ {'bleu_score': 0.3656}

  # Cosine Similarity Evaluation
  > cosine_similarity_evaluation(predictions, references)
- {'cosine_similarity': 0.7443363666534424}
+ {'cosine_similarity': 0.7443}

- # RougeScore Evaluation
+ # Rouge Score Evaluation
  > rouge_evaluation(predictions, references)
- {'rouge1': 0.8125, 'rouge2': 0.6428571428571428, 'rougeL': 0.8125}
+ {'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}

  # All-in-one function
- > text_evaluation(predictions, references, bert=True, cosine=True, rouge=True)
- {'rouge1': 0.8125, 'rouge2': 0.6428571428571428, 'rougeL': 0.8125, 'bertscore_precision': 0.9772549867630005, 'bertscore_recall': 0.9772549867630005, 'bertscore_f1': 0.9772549867630005, 'cosine_similarity': 0.7443363666534424}
+ > text_evaluation(predictions, references)
+ {'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773, 'bleu_score': 0.3656, 'cosine_similarity': 0.7443, 'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}
  ```
-
- ---
-
- > **Disclaimer**
- > This is a public fork of a framework originally developed in a research setting.
- > Institution-specific components have been removed for confidentiality reasons.
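The README disclaimer added above notes that each evaluation call loads its model or encoder internally, so the functions are meant to score whole batches rather than single samples. Below is a hedged sketch of that usage; the dataset and the generation helper are illustrative stand-ins, not part of the package.

```python
# Hedged sketch of batch evaluation per the disclaimer above: collect all outputs
# first, then score them in a single call so each metric's model is loaded once.
from llmflowstack import text_evaluation

# Illustrative eval set (stand-in data, not from the package).
eval_dataset = [
    {"prompt": "Describe Chico.", "reference": "Chico is a cat, and he is black!"},
    {"prompt": "Describe Fred.", "reference": "Fred is a dog, and he is white!"},
]

def my_model_generate(prompt: str) -> str:
    # Placeholder for a real model call (e.g. one of the llmflowstack decoders).
    return "Chico is a dog, and he is orange!" if "Chico" in prompt else "Fred is a cat, and he is white!"

predictions = [my_model_generate(sample["prompt"]) for sample in eval_dataset]
references = [sample["reference"] for sample in eval_dataset]

# One call over the whole batch, instead of calling text_evaluation per sample.
scores = text_evaluation(predictions, references)
print(scores)
```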
{llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/rag/VectorDatabase.py
@@ -9,10 +9,9 @@ from langchain_chroma import Chroma
  from langchain_core.documents import Document
  from langchain_core.embeddings import Embeddings
  from langchain_text_splitters import RecursiveCharacterTextSplitter
- from sentence_transformers import SentenceTransformer
-
  from llmflowstack.utils.exceptions import MissingEssentialProp
  from llmflowstack.utils.logging import LogLevel
+ from sentence_transformers import SentenceTransformer


  class EncoderWrapper(Embeddings):
@@ -26,14 +25,20 @@ class EncoderWrapper(Embeddings):
          self,
          texts: list[str]
      ) -> list[list[float]]:
-         vectors = self.model.encode(texts, task="retrieval", show_progress_bar=False)
+         try:
+             vectors = self.model.encode(texts, task="retrieval", show_progress_bar=False)
+         except TypeError:
+             vectors = self.model.encode(texts, show_progress_bar=False)
          return vectors.tolist()

      def embed_query(
          self,
          text: str
      ) -> list[float]:
-         vectors = self.model.encode(text, task="retrieval", show_progress_bar=False)
+         try:
+             vectors = self.model.encode(text, task="retrieval", show_progress_bar=False)
+         except TypeError:
+             vectors = self.model.encode(text, show_progress_bar=False)
          return vectors.tolist()

  class VectorDatabase:
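The `EncoderWrapper` change wraps `encode()` in a try/except: encoders whose `encode()` accepts a `task` keyword get the `"retrieval"` hint, while models with a fixed `encode()` signature fall back to a plain call. A standalone sketch of the same pattern follows; the model id and sample text are placeholders, and it assumes the unsupported keyword surfaces as a `TypeError`, as in the diff.

```python
# Hedged sketch of the encode() fallback introduced above; any
# SentenceTransformer-compatible model id can be substituted.
from sentence_transformers import SentenceTransformer

def encode_for_retrieval(model: SentenceTransformer, texts: list[str]) -> list[list[float]]:
    try:
        # Task-aware encoders accept a retrieval hint.
        vectors = model.encode(texts, task="retrieval", show_progress_bar=False)
    except TypeError:
        # Models whose encode() rejects the extra keyword: retry without it.
        vectors = model.encode(texts, show_progress_bar=False)
    return vectors.tolist()

model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
print(len(encode_for_retrieval(model, ["a short test sentence"])[0]))
```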
{llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/utils/__init__.py
@@ -1,11 +1,11 @@
- from .evaluation_methods import (bert_score_evaluation,
+ from .evaluation_methods import (bert_score_evaluation, bleu_score_evaluation,
                                   cosine_similarity_evaluation,
                                   rouge_evaluation, text_evaluation)

  __all__ = [
      "bert_score_evaluation",
+     "bleu_score_evaluation",
      "cosine_similarity_evaluation",
      "rouge_evaluation",
-     "evaluation_methods",
      "text_evaluation"
  ]
llmflowstack-1.2.4/llmflowstack/utils/evaluation_methods.py
@@ -0,0 +1,165 @@
+ from typing import Literal
+
+ from evaluate import load
+ from nltk.stem.snowball import SnowballStemmer
+ from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
+ from rouge_score import rouge_scorer
+ from sentence_transformers import SentenceTransformer, util
+
+
+ def avg(
+     values: list[float] | None
+ ) -> float:
+     return sum(values) / len(values) if values else 0.0
+
+ def stem_texts(texts: list[str]) -> list[str]:
+     stemmer = SnowballStemmer("portuguese")
+
+     stemmed_texts: list[str] = []
+     for text in texts:
+         stemmed_text = " ".join([stemmer.stem(word) for word in text.split()])
+         stemmed_texts.append(stemmed_text)
+
+     return stemmed_texts
+
+ def rouge_evaluation(
+     preds: list[str],
+     refs: list[str]
+ ) -> dict[Literal["rouge1", "rouge2", "rougeL"], float]:
+     preds_stemmed = stem_texts(preds)
+     refs_stemmed = stem_texts(refs)
+
+     rouge_metrics = {"rouge1": [], "rouge2": [], "rougeL": []}
+     scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=False)
+
+     for ref, pred in zip(refs_stemmed, preds_stemmed):
+         scores = scorer.score(
+             target=ref,
+             prediction=pred
+         )
+         for key in rouge_metrics:
+             rouge_metrics[key].append(scores[key].fmeasure)
+
+     rouge1 = round(avg(rouge_metrics["rouge1"]), 4)
+     rouge2 = round(avg(rouge_metrics["rouge2"]), 4)
+     rougeL = round(avg(rouge_metrics["rougeL"]), 4)
+
+     return {
+         "rouge1": rouge1,
+         "rouge2": rouge2,
+         "rougeL": rougeL
+     }
+
+ def bert_score_evaluation(
+     preds: list[str],
+     refs: list[str],
+     encoder: str | None = None,
+     lang: str = "pt"
+ ) -> dict[Literal["bertscore_precision", "bertscore_recall", "bertscore_f1"], float]:
+     bertscore = load("bertscore")
+
+     bert_score = bertscore.compute(
+         predictions=preds,
+         references=refs,
+         model_type=encoder,
+         lang=lang
+     )
+
+     assert bert_score is not None
+
+     precision = round(avg(bert_score["precision"]), 4)
+     recall = round(avg(bert_score["recall"]), 4)
+     f1 = round(avg(bert_score["f1"]), 4)
+
+     return {
+         "bertscore_precision": precision,
+         "bertscore_recall": recall,
+         "bertscore_f1": f1
+     }
+
+ def cosine_similarity_evaluation(
+     preds: list[str],
+     refs: list[str],
+     encoder: str | None = None
+ ) -> dict[Literal["cosine_similarity"], float]:
+     if not encoder:
+         encoder = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
+
+     model = SentenceTransformer(
+         encoder,
+         trust_remote_code=True
+     )
+
+     try:
+         emb_preds = model.encode(preds, task="retrieval", convert_to_tensor=True)
+         emb_refs = model.encode(refs, task="retrieval", convert_to_tensor=True)
+     except TypeError:
+         emb_preds = model.encode(preds, convert_to_tensor=True)
+         emb_refs = model.encode(refs, convert_to_tensor=True)
+
+     cos_sim_matrix = util.cos_sim(emb_preds, emb_refs)
+
+     cos_sim_scores = cos_sim_matrix.diag()
+     avg_cos_sim = round(float(cos_sim_scores.mean().item()), 4)
+
+     return {"cosine_similarity": float(avg_cos_sim)}
+
+ def bleu_score_evaluation(
+     preds: list[str],
+     refs: list[str]
+ ) -> dict[Literal["bleu_score"], float]:
+     smooth = SmoothingFunction().method1
+
+     scores = []
+     for pred, ref in zip(preds, refs):
+         if not pred.strip() or not ref.strip():
+             scores.append(0.0)
+             continue
+         scores.append(sentence_bleu(
+             references=[ref.split()],
+             hypothesis=pred.split(),
+             smoothing_function=smooth
+         ))
+
+     bleu_score = round(avg(scores), 4)
+
+     return {
+         "bleu_score": bleu_score
+     }
+
+ def text_evaluation(
+     preds: list[str],
+     refs: list[str],
+     rouge: bool = True,
+     bert: bool = True,
+     cosine: bool = True,
+     bleu: bool = True,
+     encoder: str | None = None,
+     lang: str = "pt"
+ ) -> dict[str, float]:
+     result = {}
+     if bert:
+         result.update(bert_score_evaluation(
+             preds=preds,
+             refs=refs,
+             encoder=encoder,
+             lang=lang
+         ))
+     if bleu:
+         result.update(bleu_score_evaluation(
+             preds=preds,
+             refs=refs
+         ))
+     if cosine:
+         result.update(cosine_similarity_evaluation(
+             preds=preds,
+             refs=refs,
+             encoder=encoder
+         ))
+     if rouge:
+         result.update(rouge_evaluation(
+             preds=preds,
+             refs=refs
+         ))
+
+     return result
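In the rewritten module, `text_evaluation` takes per-metric switches (`rouge`, `bert`, `cosine`, `bleu`, all defaulting to `True`) plus optional `encoder` and `lang` arguments forwarded to BERTScore and the cosine-similarity encoder. A hedged usage sketch based only on the signatures shown above; the encoder id repeats the module's own default and can be swapped for any other model id.

```python
# Hedged sketch: run only the cheaper metrics and pin the encoder/language explicitly.
from llmflowstack.utils import text_evaluation

predictions = ["Chico is a dog, and he is orange!", "Fred is a cat, and he is white!"]
references = ["Chico is a cat, and he is black!", "Fred is a dog, and he is white!"]

scores = text_evaluation(
    predictions,
    references,
    bert=False,   # skip BERTScore (it loads a separate scoring model via `evaluate`)
    bleu=True,
    cosine=True,
    rouge=True,
    encoder="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    lang="pt",    # only forwarded to BERTScore, harmless to pass when bert=False
)
print(scores)  # expected keys: bleu_score, cosine_similarity, rouge1, rouge2, rougeL
```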
{llmflowstack-1.2.3 → llmflowstack-1.2.4}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

  [project]
  name = "llmflowstack"
- version = "1.2.3"
+ version = "1.2.4"
  authors = [
      { name = "Gustavo Henrique Ferreira Cruz", email = "gustavohferreiracruz@gmail.com" }
  ]
llmflowstack-1.2.3/llmflowstack/utils/evaluation_methods.py
@@ -1,92 +0,0 @@
- from evaluate import load
- from nltk.stem.snowball import SnowballStemmer
- from rouge_score import rouge_scorer
- from sentence_transformers import SentenceTransformer, util
-
-
- def stem_texts(texts: list[str]) -> list[str]:
-     stemmer = SnowballStemmer("portuguese")
-
-     stemmed_texts: list[str] = []
-     for text in texts:
-         stemmed_text = " ".join([stemmer.stem(word) for word in text.split()])
-         stemmed_texts.append(stemmed_text)
-
-     return stemmed_texts
-
- def rouge_evaluation(
-     preds: list[str],
-     refs: list[str]
- ) -> dict[str, float]:
-     preds_stemmed = stem_texts(preds)
-     refs_stemmed = stem_texts(refs)
-
-     rouge_metrics = {"rouge1": [], "rouge2": [], "rougeL": []}
-     scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=False)
-
-     for ref, pred in zip(refs_stemmed, preds_stemmed):
-         scores = scorer.score(ref, pred)
-         for key in rouge_metrics:
-             rouge_metrics[key].append(scores[key].fmeasure)
-
-     return {k: sum(v)/len(v) for k, v in rouge_metrics.items()}
-
- def bert_score_evaluation(
-     preds: list[str],
-     refs: list[str]
- ) -> dict[str, float]:
-     bertscore = load("bertscore")
-
-     bert_result = bertscore.compute(predictions=preds, references=refs, lang="pt")
-
-     bert_avg = {}
-     if bert_result:
-         bert_avg = {
-             "bertscore_precision": sum(bert_result["precision"]) / len(bert_result["precision"]),
-             "bertscore_recall": sum(bert_result["recall"]) / len(bert_result["recall"]),
-             "bertscore_f1": sum(bert_result["f1"]) / len(bert_result["f1"])
-         }
-
-     return bert_avg
-
- def cosine_similarity_evaluation(
-     preds: list[str],
-     refs: list[str]
- ) -> dict[str, float]:
-     model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
-
-     emb_preds = model.encode(preds, convert_to_tensor=True)
-     emb_refs = model.encode(refs, convert_to_tensor=True)
-
-     cos_sim_matrix = util.cos_sim(emb_preds, emb_refs)
-
-     cos_sim_scores = cos_sim_matrix.diag()
-     avg_cos_sim = cos_sim_scores.mean().item()
-
-     return {"cosine_similarity": float(avg_cos_sim)}
-
- def text_evaluation(
-     preds: list[str],
-     refs: list[str],
-     rouge: bool = True,
-     bert: bool = True,
-     cosine: bool = True
- ) -> dict[str, float]:
-     result = {}
-     if rouge:
-         result.update(rouge_evaluation(
-             preds=preds,
-             refs=refs
-         ))
-     if bert:
-         result.update(bert_score_evaluation(
-             preds=preds,
-             refs=refs
-         ))
-     if cosine:
-         result.update(cosine_similarity_evaluation(
-             preds=preds,
-             refs=refs
-         ))
-
-     return result