llmflowstack 1.2.3__tar.gz → 1.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/PKG-INFO +16 -15
  2. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/README.md +15 -14
  3. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/rag/VectorDatabase.py +9 -4
  4. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/utils/__init__.py +2 -2
  5. llmflowstack-1.2.4/llmflowstack/utils/evaluation_methods.py +165 -0
  6. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/pyproject.toml +1 -1
  7. llmflowstack-1.2.3/llmflowstack/utils/evaluation_methods.py +0 -92
  8. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/.github/workflows/python-publish.yml +0 -0
  9. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/.gitignore +0 -0
  10. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/LICENSE +0 -0
  11. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/__init__.py +0 -0
  12. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/callbacks/__init__.py +0 -0
  13. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/callbacks/log_collector.py +0 -0
  14. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/callbacks/stop_on_token.py +0 -0
  15. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/BaseDecoder.py +0 -0
  16. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/GPT_OSS.py +0 -0
  17. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/Gemma.py +0 -0
  18. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/LLaMA3.py +0 -0
  19. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/LLaMA4.py +0 -0
  20. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/MedGemma.py +0 -0
  21. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/decoders/__init__.py +0 -0
  22. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/rag/__init__.py +0 -0
  23. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/schemas/__init__.py +0 -0
  24. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/schemas/params.py +0 -0
  25. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/utils/exceptions.py +0 -0
  26. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/utils/generation_utils.py +0 -0
  27. {llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/utils/logging.py +0 -0
{llmflowstack-1.2.3 → llmflowstack-1.2.4}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: llmflowstack
- Version: 1.2.3
+ Version: 1.2.4
  Summary: LLMFlowStack is a framework for training and using LLMs (LLaMA, GPT-OSS, Gemma, ...). Supports DAPT, fine-tuning, and distributed inference. Public fork without institution-specific components.
  Author-email: Gustavo Henrique Ferreira Cruz <gustavohferreiracruz@gmail.com>
  License: MIT
@@ -277,34 +277,35 @@ print(query_result)

  ### NLP Evaluation

+ > **Disclaimer**
+ > These evaluation functions are designed for batch processing. Models and encoders are loaded internally on each call, which may be inefficient for per-sample or streaming evaluation.
+
  ```python
  > from llmflowstack import text_evaluation
- > from llmflowstack.utils import (bert_score_evaluation, cosine_similarity_evaluation, rouge_evaluation)
+ > from llmflowstack.utils import (bert_score_evaluation, bleu_score_evaluation, cosine_similarity_evaluation, rouge_evaluation)

  # Predictions from some model
  > predictions = ["Chico is a dog, and he is orange!", "Fred is a cat, and he is white!"]
  # References text (ground truth)
  > references = ["Chico is a cat, and he is black!", "Fred is a dog, and he is white!"]

- # BERTScore Evaluation
+ # BERT Score Evaluation
  > bert_score_evaluation(predictions, references)
- {'bertscore_precision': 0.9772549867630005, 'bertscore_recall': 0.9772549867630005, 'bertscore_f1': 0.9772549867630005}
+ {'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773}
+
+ # Bleu Score Evaluation
+ > bleu_score_evaluation(predictions, references)
+ {'bleu_score': 0.3656}

  # Cosine Similarity Evaluation
  > cosine_similarity_evaluation(predictions, references)
- {'cosine_similarity': 0.7443363666534424}
+ {'cosine_similarity': 0.7443}

- # RougeScore Evaluation
+ # Rouge Score Evaluation
  > rouge_evaluation(predictions, references)
- {'rouge1': 0.8125, 'rouge2': 0.6428571428571428, 'rougeL': 0.8125}
+ {'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}

  # All-in-one function
- > text_evaluation(predictions, references, bert=True, cosine=True, rouge=True)
- {'rouge1': 0.8125, 'rouge2': 0.6428571428571428, 'rougeL': 0.8125, 'bertscore_precision': 0.9772549867630005, 'bertscore_recall': 0.9772549867630005, 'bertscore_f1': 0.9772549867630005, 'cosine_similarity': 0.7443363666534424}
+ > text_evaluation(predictions, references)
+ {'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773, 'bleu_score': 0.3656, 'cosine_similarity': 0.7443, 'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}
  ```
-
- ---
-
- > **Disclaimer**
- > This is a public fork of a framework originally developed in a research setting.
- > Institution-specific components have been removed for confidentiality reasons.
{llmflowstack-1.2.3 → llmflowstack-1.2.4}/README.md
@@ -241,34 +241,35 @@ print(query_result)

  ### NLP Evaluation

+ > **Disclaimer**
+ > These evaluation functions are designed for batch processing. Models and encoders are loaded internally on each call, which may be inefficient for per-sample or streaming evaluation.
+
  ```python
  > from llmflowstack import text_evaluation
- > from llmflowstack.utils import (bert_score_evaluation, cosine_similarity_evaluation, rouge_evaluation)
+ > from llmflowstack.utils import (bert_score_evaluation, bleu_score_evaluation, cosine_similarity_evaluation, rouge_evaluation)

  # Predictions from some model
  > predictions = ["Chico is a dog, and he is orange!", "Fred is a cat, and he is white!"]
  # References text (ground truth)
  > references = ["Chico is a cat, and he is black!", "Fred is a dog, and he is white!"]

- # BERTScore Evaluation
+ # BERT Score Evaluation
  > bert_score_evaluation(predictions, references)
- {'bertscore_precision': 0.9772549867630005, 'bertscore_recall': 0.9772549867630005, 'bertscore_f1': 0.9772549867630005}
+ {'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773}
+
+ # Bleu Score Evaluation
+ > bleu_score_evaluation(predictions, references)
+ {'bleu_score': 0.3656}

  # Cosine Similarity Evaluation
  > cosine_similarity_evaluation(predictions, references)
- {'cosine_similarity': 0.7443363666534424}
+ {'cosine_similarity': 0.7443}

- # RougeScore Evaluation
+ # Rouge Score Evaluation
  > rouge_evaluation(predictions, references)
- {'rouge1': 0.8125, 'rouge2': 0.6428571428571428, 'rougeL': 0.8125}
+ {'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}

  # All-in-one function
- > text_evaluation(predictions, references, bert=True, cosine=True, rouge=True)
- {'rouge1': 0.8125, 'rouge2': 0.6428571428571428, 'rougeL': 0.8125, 'bertscore_precision': 0.9772549867630005, 'bertscore_recall': 0.9772549867630005, 'bertscore_f1': 0.9772549867630005, 'cosine_similarity': 0.7443363666534424}
+ > text_evaluation(predictions, references)
+ {'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773, 'bleu_score': 0.3656, 'cosine_similarity': 0.7443, 'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}
  ```
-
- ---
-
- > **Disclaimer**
- > This is a public fork of a framework originally developed in a research setting.
- > Institution-specific components have been removed for confidentiality reasons.
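The README disclaimer added above notes that each evaluation call loads its model or encoder internally, so the functions are meant to score whole batches rather than single samples. Below is a hedged sketch of that usage; the dataset and the generation helper are illustrative stand-ins, not part of the package.

```python
# Hedged sketch of batch evaluation per the disclaimer above: collect all outputs
# first, then score them in a single call so each metric's model is loaded once.
from llmflowstack import text_evaluation

# Illustrative eval set (stand-in data, not from the package).
eval_dataset = [
    {"prompt": "Describe Chico.", "reference": "Chico is a cat, and he is black!"},
    {"prompt": "Describe Fred.", "reference": "Fred is a dog, and he is white!"},
]

def my_model_generate(prompt: str) -> str:
    # Placeholder for a real model call (e.g. one of the llmflowstack decoders).
    return "Chico is a dog, and he is orange!" if "Chico" in prompt else "Fred is a cat, and he is white!"

predictions = [my_model_generate(sample["prompt"]) for sample in eval_dataset]
references = [sample["reference"] for sample in eval_dataset]

# One call over the whole batch, instead of calling text_evaluation per sample.
scores = text_evaluation(predictions, references)
print(scores)
```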
{llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/rag/VectorDatabase.py
@@ -9,10 +9,9 @@ from langchain_chroma import Chroma
  from langchain_core.documents import Document
  from langchain_core.embeddings import Embeddings
  from langchain_text_splitters import RecursiveCharacterTextSplitter
- from sentence_transformers import SentenceTransformer
-
  from llmflowstack.utils.exceptions import MissingEssentialProp
  from llmflowstack.utils.logging import LogLevel
+ from sentence_transformers import SentenceTransformer


  class EncoderWrapper(Embeddings):
@@ -26,14 +25,20 @@ class EncoderWrapper(Embeddings):
          self,
          texts: list[str]
      ) -> list[list[float]]:
-         vectors = self.model.encode(texts, task="retrieval", show_progress_bar=False)
+         try:
+             vectors = self.model.encode(texts, task="retrieval", show_progress_bar=False)
+         except TypeError:
+             vectors = self.model.encode(texts, show_progress_bar=False)
          return vectors.tolist()

      def embed_query(
          self,
          text: str
      ) -> list[float]:
-         vectors = self.model.encode(text, task="retrieval", show_progress_bar=False)
+         try:
+             vectors = self.model.encode(text, task="retrieval", show_progress_bar=False)
+         except TypeError:
+             vectors = self.model.encode(text, show_progress_bar=False)
          return vectors.tolist()

  class VectorDatabase:
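The `EncoderWrapper` change wraps `encode()` in a try/except: encoders whose `encode()` accepts a `task` keyword get the `"retrieval"` hint, while models with a fixed `encode()` signature fall back to a plain call. A standalone sketch of the same pattern follows; the model id and sample text are placeholders, and it assumes the unsupported keyword surfaces as a `TypeError`, as in the diff.

```python
# Hedged sketch of the encode() fallback introduced above; any
# SentenceTransformer-compatible model id can be substituted.
from sentence_transformers import SentenceTransformer

def encode_for_retrieval(model: SentenceTransformer, texts: list[str]) -> list[list[float]]:
    try:
        # Task-aware encoders accept a retrieval hint.
        vectors = model.encode(texts, task="retrieval", show_progress_bar=False)
    except TypeError:
        # Models whose encode() rejects the extra keyword: retry without it.
        vectors = model.encode(texts, show_progress_bar=False)
    return vectors.tolist()

model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
print(len(encode_for_retrieval(model, ["a short test sentence"])[0]))
```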
{llmflowstack-1.2.3 → llmflowstack-1.2.4}/llmflowstack/utils/__init__.py
@@ -1,11 +1,11 @@
- from .evaluation_methods import (bert_score_evaluation,
+ from .evaluation_methods import (bert_score_evaluation, bleu_score_evaluation,
                                   cosine_similarity_evaluation,
                                   rouge_evaluation, text_evaluation)

  __all__ = [
      "bert_score_evaluation",
+     "bleu_score_evaluation",
      "cosine_similarity_evaluation",
      "rouge_evaluation",
-     "evaluation_methods",
      "text_evaluation"
  ]
llmflowstack-1.2.4/llmflowstack/utils/evaluation_methods.py
@@ -0,0 +1,165 @@
+ from typing import Literal
+
+ from evaluate import load
+ from nltk.stem.snowball import SnowballStemmer
+ from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
+ from rouge_score import rouge_scorer
+ from sentence_transformers import SentenceTransformer, util
+
+
+ def avg(
+     values: list[float] | None
+ ) -> float:
+     return sum(values) / len(values) if values else 0.0
+
+ def stem_texts(texts: list[str]) -> list[str]:
+     stemmer = SnowballStemmer("portuguese")
+
+     stemmed_texts: list[str] = []
+     for text in texts:
+         stemmed_text = " ".join([stemmer.stem(word) for word in text.split()])
+         stemmed_texts.append(stemmed_text)
+
+     return stemmed_texts
+
+ def rouge_evaluation(
+     preds: list[str],
+     refs: list[str]
+ ) -> dict[Literal["rouge1", "rouge2", "rougeL"], float]:
+     preds_stemmed = stem_texts(preds)
+     refs_stemmed = stem_texts(refs)
+
+     rouge_metrics = {"rouge1": [], "rouge2": [], "rougeL": []}
+     scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=False)
+
+     for ref, pred in zip(refs_stemmed, preds_stemmed):
+         scores = scorer.score(
+             target=ref,
+             prediction=pred
+         )
+         for key in rouge_metrics:
+             rouge_metrics[key].append(scores[key].fmeasure)
+
+     rouge1 = round(avg(rouge_metrics["rouge1"]), 4)
+     rouge2 = round(avg(rouge_metrics["rouge2"]), 4)
+     rougeL = round(avg(rouge_metrics["rougeL"]), 4)
+
+     return {
+         "rouge1": rouge1,
+         "rouge2": rouge2,
+         "rougeL": rougeL
+     }
+
+ def bert_score_evaluation(
+     preds: list[str],
+     refs: list[str],
+     encoder: str | None = None,
+     lang: str = "pt"
+ ) -> dict[Literal["bertscore_precision", "bertscore_recall", "bertscore_f1"], float]:
+     bertscore = load("bertscore")
+
+     bert_score = bertscore.compute(
+         predictions=preds,
+         references=refs,
+         model_type=encoder,
+         lang=lang
+     )
+
+     assert bert_score is not None
+
+     precision = round(avg(bert_score["precision"]), 4)
+     recall = round(avg(bert_score["recall"]), 4)
+     f1 = round(avg(bert_score["f1"]), 4)
+
+     return {
+         "bertscore_precision": precision,
+         "bertscore_recall": recall,
+         "bertscore_f1": f1
+     }
+
+ def cosine_similarity_evaluation(
+     preds: list[str],
+     refs: list[str],
+     encoder: str | None = None
+ ) -> dict[Literal["cosine_similarity"], float]:
+     if not encoder:
+         encoder = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
+
+     model = SentenceTransformer(
+         encoder,
+         trust_remote_code=True
+     )
+
+     try:
+         emb_preds = model.encode(preds, task="retrieval", convert_to_tensor=True)
+         emb_refs = model.encode(refs, task="retrieval", convert_to_tensor=True)
+     except TypeError:
+         emb_preds = model.encode(preds, convert_to_tensor=True)
+         emb_refs = model.encode(refs, convert_to_tensor=True)
+
+     cos_sim_matrix = util.cos_sim(emb_preds, emb_refs)
+
+     cos_sim_scores = cos_sim_matrix.diag()
+     avg_cos_sim = round(float(cos_sim_scores.mean().item()), 4)
+
+     return {"cosine_similarity": float(avg_cos_sim)}
+
+ def bleu_score_evaluation(
+     preds: list[str],
+     refs: list[str]
+ ) -> dict[Literal["bleu_score"], float]:
+     smooth = SmoothingFunction().method1
+
+     scores = []
+     for pred, ref in zip(preds, refs):
+         if not pred.strip() or not ref.strip():
+             scores.append(0.0)
+             continue
+         scores.append(sentence_bleu(
+             references=[ref.split()],
+             hypothesis=pred.split(),
+             smoothing_function=smooth
+         ))
+
+     bleu_score = round(avg(scores), 4)
+
+     return {
+         "bleu_score": bleu_score
+     }
+
+ def text_evaluation(
+     preds: list[str],
+     refs: list[str],
+     rouge: bool = True,
+     bert: bool = True,
+     cosine: bool = True,
+     bleu: bool = True,
+     encoder: str | None = None,
+     lang: str = "pt"
+ ) -> dict[str, float]:
+     result = {}
+     if bert:
+         result.update(bert_score_evaluation(
+             preds=preds,
+             refs=refs,
+             encoder=encoder,
+             lang=lang
+         ))
+     if bleu:
+         result.update(bleu_score_evaluation(
+             preds=preds,
+             refs=refs
+         ))
+     if cosine:
+         result.update(cosine_similarity_evaluation(
+             preds=preds,
+             refs=refs,
+             encoder=encoder
+         ))
+     if rouge:
+         result.update(rouge_evaluation(
+             preds=preds,
+             refs=refs
+         ))
+
+     return result
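In the rewritten module, `text_evaluation` takes per-metric switches (`rouge`, `bert`, `cosine`, `bleu`, all defaulting to `True`) plus optional `encoder` and `lang` arguments forwarded to BERTScore and the cosine-similarity encoder. A hedged usage sketch based only on the signatures shown above; the encoder id repeats the module's own default and can be swapped for any other model id.

```python
# Hedged sketch: run only the cheaper metrics and pin the encoder/language explicitly.
from llmflowstack.utils import text_evaluation

predictions = ["Chico is a dog, and he is orange!", "Fred is a cat, and he is white!"]
references = ["Chico is a cat, and he is black!", "Fred is a dog, and he is white!"]

scores = text_evaluation(
    predictions,
    references,
    bert=False,   # skip BERTScore (it loads a separate scoring model via `evaluate`)
    bleu=True,
    cosine=True,
    rouge=True,
    encoder="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    lang="pt",    # only forwarded to BERTScore, harmless to pass when bert=False
)
print(scores)  # expected keys: bleu_score, cosine_similarity, rouge1, rouge2, rougeL
```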
{llmflowstack-1.2.3 → llmflowstack-1.2.4}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

  [project]
  name = "llmflowstack"
- version = "1.2.3"
+ version = "1.2.4"
  authors = [
      { name = "Gustavo Henrique Ferreira Cruz", email = "gustavohferreiracruz@gmail.com" }
  ]
llmflowstack-1.2.3/llmflowstack/utils/evaluation_methods.py
@@ -1,92 +0,0 @@
- from evaluate import load
- from nltk.stem.snowball import SnowballStemmer
- from rouge_score import rouge_scorer
- from sentence_transformers import SentenceTransformer, util
-
-
- def stem_texts(texts: list[str]) -> list[str]:
-     stemmer = SnowballStemmer("portuguese")
-
-     stemmed_texts: list[str] = []
-     for text in texts:
-         stemmed_text = " ".join([stemmer.stem(word) for word in text.split()])
-         stemmed_texts.append(stemmed_text)
-
-     return stemmed_texts
-
- def rouge_evaluation(
-     preds: list[str],
-     refs: list[str]
- ) -> dict[str, float]:
-     preds_stemmed = stem_texts(preds)
-     refs_stemmed = stem_texts(refs)
-
-     rouge_metrics = {"rouge1": [], "rouge2": [], "rougeL": []}
-     scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=False)
-
-     for ref, pred in zip(refs_stemmed, preds_stemmed):
-         scores = scorer.score(ref, pred)
-         for key in rouge_metrics:
-             rouge_metrics[key].append(scores[key].fmeasure)
-
-     return {k: sum(v)/len(v) for k, v in rouge_metrics.items()}
-
- def bert_score_evaluation(
-     preds: list[str],
-     refs: list[str]
- ) -> dict[str, float]:
-     bertscore = load("bertscore")
-
-     bert_result = bertscore.compute(predictions=preds, references=refs, lang="pt")
-
-     bert_avg = {}
-     if bert_result:
-         bert_avg = {
-             "bertscore_precision": sum(bert_result["precision"]) / len(bert_result["precision"]),
-             "bertscore_recall": sum(bert_result["recall"]) / len(bert_result["recall"]),
-             "bertscore_f1": sum(bert_result["f1"]) / len(bert_result["f1"])
-         }
-
-     return bert_avg
-
- def cosine_similarity_evaluation(
-     preds: list[str],
-     refs: list[str]
- ) -> dict[str, float]:
-     model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
-
-     emb_preds = model.encode(preds, convert_to_tensor=True)
-     emb_refs = model.encode(refs, convert_to_tensor=True)
-
-     cos_sim_matrix = util.cos_sim(emb_preds, emb_refs)
-
-     cos_sim_scores = cos_sim_matrix.diag()
-     avg_cos_sim = cos_sim_scores.mean().item()
-
-     return {"cosine_similarity": float(avg_cos_sim)}
-
- def text_evaluation(
-     preds: list[str],
-     refs: list[str],
-     rouge: bool = True,
-     bert: bool = True,
-     cosine: bool = True
- ) -> dict[str, float]:
-     result = {}
-     if rouge:
-         result.update(rouge_evaluation(
-             preds=preds,
-             refs=refs
-         ))
-     if bert:
-         result.update(bert_score_evaluation(
-             preds=preds,
-             refs=refs
-         ))
-     if cosine:
-         result.update(cosine_similarity_evaluation(
-             preds=preds,
-             refs=refs
-         ))
-
-     return result