OntoLearner 1.4.5.tar.gz → 1.4.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {ontolearner-1.4.5 → ontolearner-1.4.7}/PKG-INFO +2 -1
  2. ontolearner-1.4.7/ontolearner/VERSION +1 -0
  3. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/base/learner.py +8 -5
  4. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/evaluation/metrics.py +26 -13
  5. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/learner/__init__.py +1 -1
  6. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/learner/llm.py +73 -3
  7. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/learner/retriever.py +2 -3
  8. {ontolearner-1.4.5 → ontolearner-1.4.7}/pyproject.toml +2 -1
  9. ontolearner-1.4.5/ontolearner/VERSION +0 -1
  10. {ontolearner-1.4.5 → ontolearner-1.4.7}/LICENSE +0 -0
  11. {ontolearner-1.4.5 → ontolearner-1.4.7}/README.md +0 -0
  12. {ontolearner-1.4.5 → ontolearner-1.4.7}/images/logo.png +0 -0
  13. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/__init__.py +0 -0
  14. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/_learner.py +0 -0
  15. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/_ontology.py +0 -0
  16. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/base/__init__.py +0 -0
  17. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/base/ontology.py +0 -0
  18. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/base/text2onto.py +0 -0
  19. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/data_structure/__init__.py +0 -0
  20. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/data_structure/data.py +0 -0
  21. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/data_structure/metric.py +0 -0
  22. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/evaluation/__init__.py +0 -0
  23. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/evaluation/evaluate.py +0 -0
  24. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/learner/label_mapper.py +0 -0
  25. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/learner/prompt.py +0 -0
  26. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/learner/rag.py +0 -0
  27. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/__init__.py +0 -0
  28. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/agriculture.py +0 -0
  29. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/arts_humanities.py +0 -0
  30. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/biology.py +0 -0
  31. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/chemistry.py +0 -0
  32. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/ecology_environment.py +0 -0
  33. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/education.py +0 -0
  34. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/events.py +0 -0
  35. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/finance.py +0 -0
  36. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/food_beverage.py +0 -0
  37. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/general.py +0 -0
  38. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/geography.py +0 -0
  39. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/industry.py +0 -0
  40. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/law.py +0 -0
  41. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/library_cultural_heritage.py +0 -0
  42. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/material_science_engineering.py +0 -0
  43. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/medicine.py +0 -0
  44. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/news_media.py +0 -0
  45. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/scholarly_knowledge.py +0 -0
  46. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/social_sciences.py +0 -0
  47. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/units_measurements.py +0 -0
  48. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/upper_ontologies.py +0 -0
  49. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/web.py +0 -0
  50. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/processor.py +0 -0
  51. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/text2onto/__init__.py +0 -0
  52. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/text2onto/batchifier.py +0 -0
  53. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/text2onto/general.py +0 -0
  54. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/text2onto/splitter.py +0 -0
  55. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/text2onto/synthesizer.py +0 -0
  56. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/tools/__init__.py +0 -0
  57. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/tools/analyzer.py +0 -0
  58. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/tools/visualizer.py +0 -0
  59. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/utils/__init__.py +0 -0
  60. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/utils/io.py +0 -0
  61. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/utils/train_test_split.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: OntoLearner
3
- Version: 1.4.5
3
+ Version: 1.4.7
4
4
  Summary: OntoLearner: A Modular Python Library for Ontology Learning with LLMs.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -17,6 +17,7 @@ Requires-Dist: bitsandbytes (>=0.45.1,<0.46.0)
17
17
  Requires-Dist: dspy (>=2.6.14,<3.0.0)
18
18
  Requires-Dist: huggingface-hub (>=0.34.4,<0.35.0)
19
19
  Requires-Dist: matplotlib
20
+ Requires-Dist: mistral-common[sentencepiece] (>=1.8.5,<2.0.0)
20
21
  Requires-Dist: networkx (==3.2.1)
21
22
  Requires-Dist: numpy
22
23
  Requires-Dist: openpyxl
@@ -0,0 +1 @@
1
+ 1.4.7
@@ -13,7 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  from abc import ABC
16
- from typing import Any, List, Optional
16
+ from typing import Any, List, Optional, Dict
17
17
  from transformers import AutoModelForCausalLM, AutoTokenizer
18
18
  import torch
19
19
  import torch.nn.functional as F
@@ -147,7 +147,7 @@ class AutoLearner(ABC):
147
147
  def _non_taxonomic_re(self, data: Any, test: bool = False) -> Optional[Any]:
148
148
  pass
149
149
 
150
- def tasks_data_former(self, data: Any, task: str, test: bool = False) -> Any:
150
+ def tasks_data_former(self, data: Any, task: str, test: bool = False) -> List[str | Dict[str, str]]:
151
151
  formatted_data = []
152
152
  if task == "term-typing":
153
153
  for typing in data.term_typings:
@@ -173,7 +173,7 @@ class AutoLearner(ABC):
173
173
  formatted_data = {"types": non_taxonomic_types, "relations": non_taxonomic_res}
174
174
  return formatted_data
175
175
 
176
- def tasks_ground_truth_former(self, data: Any, task: str) -> Any:
176
+ def tasks_ground_truth_former(self, data: Any, task: str) -> List[Dict[str, str]]:
177
177
  formatted_data = []
178
178
  if task == "term-typing":
179
179
  for typing in data.term_typings:
@@ -238,7 +238,7 @@ class AutoLLM(ABC):
238
238
  if self.device == "cpu":
239
239
  device_map = "cpu"
240
240
  else:
241
- device_map = "auto"
241
+ device_map = "balanced"
242
242
  self.model = AutoModelForCausalLM.from_pretrained(
243
243
  model_id,
244
244
  device_map=device_map,
@@ -271,7 +271,10 @@ class AutoLLM(ABC):
271
271
  Responses include the original input plus generated continuation.
272
272
  """
273
273
  # Tokenize inputs and move to device
274
- encoded_inputs = self.tokenizer(inputs, return_tensors="pt", padding=True).to(self.model.device)
274
+ encoded_inputs = self.tokenizer(inputs,
275
+ return_tensors="pt",
276
+ padding=True,
277
+ truncation=True).to(self.model.device)
275
278
  input_ids = encoded_inputs["input_ids"]
276
279
  input_length = input_ids.shape[1]
277
280
 
@@ -11,13 +11,12 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
-
15
- from typing import Dict
14
+ from typing import List, Dict, Tuple, Set
16
15
 
17
16
  SYMMETRIC_RELATIONS = {"equivalentclass", "sameas", "disjointwith"}
18
17
 
19
- def text2onto_metrics(y_true, y_pred, similarity_threshold: float = 0.8) -> Dict:
20
- def jaccard_similarity(a, b):
18
+ def text2onto_metrics(y_true: List[str], y_pred: List[str], similarity_threshold: float = 0.8) -> Dict[str, float | int]:
19
+ def jaccard_similarity(a: str, b: str) -> float:
21
20
  set_a = set(a.lower().split())
22
21
  set_b = set(b.lower().split())
23
22
  if not set_a and not set_b:
@@ -46,10 +45,13 @@ def text2onto_metrics(y_true, y_pred, similarity_threshold: float = 0.8) -> Dict
46
45
  return {
47
46
  "f1_score": f1_score,
48
47
  "precision": precision,
49
- "recall": recall
48
+ "recall": recall,
49
+ "total_correct": total_correct,
50
+ "total_predicted": total_predicted,
51
+ "total_ground_truth": total_ground_truth
50
52
  }
51
53
 
52
- def term_typing_metrics(y_true, y_pred) -> Dict:
54
+ def term_typing_metrics(y_true: List[Dict[str, List[str]]], y_pred: List[Dict[str, List[str]]]) -> Dict[str, float | int]:
53
55
  """
54
56
  Compute precision, recall, and F1-score for term typing
55
57
  using (term, type) pair-level matching instead of ID-based lookups.
@@ -77,13 +79,17 @@ def term_typing_metrics(y_true, y_pred) -> Dict:
77
79
  precision = total_correct / total_predicted if total_predicted > 0 else 0.0
78
80
  recall = total_correct / total_ground_truth if total_ground_truth > 0 else 0.0
79
81
  f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
82
+
80
83
  return {
81
84
  "f1_score": f1_score,
82
85
  "precision": precision,
83
- "recall": recall
86
+ "recall": recall,
87
+ "total_correct": total_correct,
88
+ "total_predicted": total_predicted,
89
+ "total_ground_truth": total_ground_truth
84
90
  }
85
91
 
86
- def taxonomy_discovery_metrics(y_true, y_pred) -> Dict:
92
+ def taxonomy_discovery_metrics(y_true: List[Dict[str, str]], y_pred: List[Dict[str, str]]) -> Dict[str, float | int]:
87
93
  total_predicted = len(y_pred)
88
94
  total_ground_truth = len(y_true)
89
95
  # Convert ground truth and predictions to sets of tuples for easy comparison
@@ -102,18 +108,22 @@ def taxonomy_discovery_metrics(y_true, y_pred) -> Dict:
102
108
  return {
103
109
  "f1_score": f1_score,
104
110
  "precision": precision,
105
- "recall": recall
111
+ "recall": recall,
112
+ "total_correct": total_correct,
113
+ "total_predicted": total_predicted,
114
+ "total_ground_truth": total_ground_truth
106
115
  }
107
116
 
108
- def non_taxonomic_re_metrics(y_true, y_pred) -> Dict:
109
- def normalize_triple(item):
117
+
118
+ def non_taxonomic_re_metrics(y_true: List[Dict[str, str]], y_pred: List[Dict[str, str]]) -> Dict[str, float | int]:
119
+ def normalize_triple(item: Dict[str, str]) -> Tuple[str, str, str]:
110
120
  return (
111
121
  item["head"].strip().lower(),
112
122
  item["relation"].strip().lower(),
113
123
  item["tail"].strip().lower()
114
124
  )
115
125
 
116
- def expand_symmetric(triples):
126
+ def expand_symmetric(triples: Set[Tuple[str, str, str]]) -> Set[Tuple[str, str, str]]:
117
127
  expanded = set()
118
128
  for h, r, t in triples:
119
129
  expanded.add((h, r, t))
@@ -136,5 +146,8 @@ def non_taxonomic_re_metrics(y_true, y_pred) -> Dict:
136
146
  return {
137
147
  "f1_score": f1_score,
138
148
  "precision": precision,
139
- "recall": recall
149
+ "recall": recall,
150
+ "total_correct": total_correct,
151
+ "total_predicted": total_predicted,
152
+ "total_ground_truth": total_ground_truth
140
153
  }
@@ -12,7 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from .llm import AutoLLMLearner
15
+ from .llm import AutoLLMLearner, FalconLLM, MistralLLM
16
16
  from .retriever import AutoRetrieverLearner
17
17
  from .rag import AutoRAGLearner
18
18
  from .prompt import StandardizedPrompting
@@ -13,23 +13,27 @@
13
13
  # limitations under the License.
14
14
 
15
15
  from ..base import AutoLLM, AutoLearner
16
- from typing import Any
16
+ from typing import Any, List
17
17
  import warnings
18
18
  from tqdm import tqdm
19
19
  from torch.utils.data import DataLoader
20
-
20
+ import torch
21
+ from transformers import Mistral3ForConditionalGeneration
22
+ from mistral_common.protocol.instruct.request import ChatCompletionRequest
23
+ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
21
24
 
22
25
  class AutoLLMLearner(AutoLearner):
23
26
 
24
27
  def __init__(self,
25
28
  prompting,
26
29
  label_mapper,
30
+ llm: AutoLLM = AutoLLM,
27
31
  token: str = "",
28
32
  max_new_tokens: int = 5,
29
33
  batch_size: int = 10,
30
34
  device='cpu') -> None:
31
35
  super().__init__()
32
- self.llm = AutoLLM(token=token, label_mapper=label_mapper, device=device)
36
+ self.llm = llm(token=token, label_mapper=label_mapper, device=device)
33
37
  self.prompting = prompting
34
38
  self.batch_size = batch_size
35
39
  self.max_new_tokens = max_new_tokens
@@ -136,3 +140,69 @@ class AutoLLMLearner(AutoLearner):
136
140
  return self._non_taxonomic_re_predict(dataset=dataset)
137
141
  else:
138
142
  warnings.warn("No requirement for fiting the non-taxonomic-re model, the predict module will use the input data to do the task.")
143
+
144
+
145
+ class FalconLLM(AutoLLM):
146
+
147
+ def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
148
+ encoded_inputs = self.tokenizer(inputs,
149
+ return_tensors="pt",
150
+ padding=True,
151
+ truncation=True).to(self.model.device)
152
+ input_ids = encoded_inputs["input_ids"]
153
+ input_length = input_ids.shape[1]
154
+ outputs = self.model.generate(
155
+ input_ids,
156
+ max_new_tokens=max_new_tokens,
157
+ pad_token_id=self.tokenizer.eos_token_id
158
+ )
159
+ generated_tokens = outputs[:, input_length:]
160
+ decoded_outputs = [self.tokenizer.decode(g, skip_special_tokens=True).strip() for g in generated_tokens]
161
+ return self.label_mapper.predict(decoded_outputs)
162
+
163
+ class MistralLLM(AutoLLM):
164
+
165
+ def load(self, model_id: str) -> None:
166
+ self.tokenizer = MistralTokenizer.from_hf_hub(model_id)
167
+ if self.device == "cpu":
168
+ device_map = "cpu"
169
+ else:
170
+ device_map = "balanced"
171
+ self.model = Mistral3ForConditionalGeneration.from_pretrained(
172
+ model_id,
173
+ device_map=device_map,
174
+ torch_dtype=torch.bfloat16,
175
+ token=self.token
176
+ )
177
+ if not hasattr(self.tokenizer, "pad_token_id") or self.tokenizer.pad_token_id is None:
178
+ self.tokenizer.pad_token_id = self.model.generation_config.eos_token_id
179
+ self.label_mapper.fit()
180
+
181
+ def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
182
+ tokenized_list = []
183
+ for prompt in inputs:
184
+ messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
185
+ tokenized = self.tokenizer.encode_chat_completion(ChatCompletionRequest(messages=messages))
186
+ tokenized_list.append(tokenized.tokens)
187
+ max_len = max(len(tokens) for tokens in tokenized_list)
188
+ input_ids, attention_masks = [], []
189
+ for tokens in tokenized_list:
190
+ pad_length = max_len - len(tokens)
191
+ input_ids.append(tokens + [self.tokenizer.pad_token_id] * pad_length)
192
+ attention_masks.append([1] * len(tokens) + [0] * pad_length)
193
+
194
+ input_ids = torch.tensor(input_ids).to(self.model.device)
195
+ attention_masks = torch.tensor(attention_masks).to(self.model.device)
196
+
197
+ outputs =self.model.generate(
198
+ input_ids=input_ids,
199
+ attention_mask=attention_masks,
200
+ eos_token_id=self.model.generation_config.eos_token_id,
201
+ pad_token_id=self.tokenizer.pad_token_id,
202
+ max_new_tokens=max_new_tokens,
203
+ )
204
+ decoded_outputs = []
205
+ for i, tokens in enumerate(outputs):
206
+ output_text = self.tokenizer.decode(tokens[len(tokenized_list[i]):])
207
+ decoded_outputs.append(output_text)
208
+ return self.label_mapper.predict(decoded_outputs)
@@ -22,7 +22,6 @@ class AutoRetrieverLearner(AutoLearner):
22
22
  self.retriever = base_retriever
23
23
  self.top_k = top_k
24
24
  self._is_term_typing_fit = False
25
- self._is_taxonomy_discovery_fit = False
26
25
  self._batch_size = batch_size
27
26
 
28
27
  def load(self, model_id: str = "sentence-transformers/all-MiniLM-L6-v2"):
@@ -64,9 +63,9 @@ class AutoRetrieverLearner(AutoLearner):
64
63
  if test:
65
64
  self._retriever_fit(data=data)
66
65
  candidates_lst = self._retriever_predict(data=data, top_k=self.top_k + 1)
67
- taxonomic_pairs = [{"parent": query, "child": candidate}
66
+ taxonomic_pairs = [{"parent": candidate, "child": query}
68
67
  for query, candidates in zip(data, candidates_lst)
69
- for candidate in candidates if candidate != query]
68
+ for candidate in candidates if candidate.lower() != query.lower()]
70
69
  return taxonomic_pairs
71
70
  else:
72
71
  warnings.warn("No requirement for fiting the taxonomy discovery model, the predict module will use the input data to do the fit as well.")
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "OntoLearner"
3
- version = "1.4.5"
3
+ version = "1.4.7"
4
4
  description = "OntoLearner: A Modular Python Library for Ontology Learning with LLMs."
5
5
  authors = ["Hamed Babaei Giglou <hamedbabaeigiglou@gmail.com>", "Andrei C. Aioanei <andrei.c.aioanei@gmail.com>"]
6
6
  license = "MIT License"
@@ -29,6 +29,7 @@ transformers = "^4.56.0"
29
29
  sentence-transformers = "^5.1.0"
30
30
  dspy = "^2.6.14"
31
31
  bitsandbytes="^0.45.1"
32
+ mistral-common = { version = "^1.8.5", extras = ["sentencepiece"] }
32
33
 
33
34
  [tool.poetry.dev-dependencies]
34
35
  ruff = "*"
@@ -1 +0,0 @@
1
- 1.4.5
File without changes
File without changes
File without changes