OntoLearner 1.4.6-py3-none-any.whl → 1.4.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ontolearner/VERSION +1 -1
- ontolearner/base/learner.py +20 -14
- ontolearner/learner/__init__.py +1 -1
- ontolearner/learner/label_mapper.py +1 -1
- ontolearner/learner/llm.py +73 -3
- ontolearner/learner/retriever.py +24 -3
- ontolearner/learner/taxonomy_discovery/__init__.py +18 -0
- ontolearner/learner/taxonomy_discovery/alexbek.py +500 -0
- ontolearner/learner/taxonomy_discovery/rwthdbis.py +1082 -0
- ontolearner/learner/taxonomy_discovery/sbunlp.py +402 -0
- ontolearner/learner/taxonomy_discovery/skhnlp.py +1138 -0
- ontolearner/learner/term_typing/__init__.py +17 -0
- ontolearner/learner/term_typing/alexbek.py +1262 -0
- ontolearner/learner/term_typing/rwthdbis.py +379 -0
- ontolearner/learner/term_typing/sbunlp.py +478 -0
- ontolearner/learner/text2onto/__init__.py +16 -0
- ontolearner/learner/text2onto/alexbek.py +1219 -0
- ontolearner/learner/text2onto/sbunlp.py +598 -0
- {ontolearner-1.4.6.dist-info → ontolearner-1.4.8.dist-info}/METADATA +5 -1
- {ontolearner-1.4.6.dist-info → ontolearner-1.4.8.dist-info}/RECORD +22 -10
- {ontolearner-1.4.6.dist-info → ontolearner-1.4.8.dist-info}/WHEEL +0 -0
- {ontolearner-1.4.6.dist-info → ontolearner-1.4.8.dist-info}/licenses/LICENSE +0 -0
ontolearner/VERSION
CHANGED

@@ -1 +1 @@
-1.4.6
+1.4.8
ontolearner/base/learner.py
CHANGED

@@ -236,15 +236,21 @@ class AutoLLM(ABC):
         self.tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left', token=self.token)
         self.tokenizer.pad_token = self.tokenizer.eos_token
         if self.device == "cpu":
-            device_map = "cpu"
+            # device_map = "cpu"
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                # device_map=device_map,
+                torch_dtype=torch.bfloat16,
+                token=self.token
+            )
         else:
-            device_map = "balanced"
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            device_map=device_map,
-            torch_dtype=torch.bfloat16,
-            token=self.token
-        )
+            device_map = "balanced"
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                device_map=device_map,
+                torch_dtype=torch.bfloat16,
+                token=self.token
+            )
         self.label_mapper.fit()

     def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:

@@ -271,7 +277,10 @@
         Responses include the original input plus generated continuation.
         """
         # Tokenize inputs and move to device
-        encoded_inputs = self.tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(self.model.device)
+        encoded_inputs = self.tokenizer(inputs,
+                                        return_tensors="pt",
+                                        padding=True,
+                                        truncation=True).to(self.model.device)
         input_ids = encoded_inputs["input_ids"]
         input_length = input_ids.shape[1]

@@ -287,7 +296,8 @@

         # Decode only the generated part
         decoded_outputs = [self.tokenizer.decode(g, skip_special_tokens=True).strip() for g in generated_tokens]
-
+        print(decoded_outputs)
+        print(self.label_mapper.predict(decoded_outputs))
         # Map the decoded text to labels
         return self.label_mapper.predict(decoded_outputs)

@@ -298,9 +308,6 @@ class AutoRetriever(ABC):
     This class defines the interface for retrieval components used in ontology learning.
     Retrievers are responsible for finding semantically similar examples from training
     data to provide context for language models or to make direct predictions.
-
-    Attributes:
-        model: The loaded retrieval/embedding model instance.
     """

@@ -310,7 +317,6 @@
        Sets up the basic structure with a model attribute that will be
        populated when load() is called.
        """
-        self.model: Optional[Any] = None
        self.embedding_model = None
        self.documents = []
        self.embeddings = None
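The load() change above builds the model inside each branch: on CPU it now omits device_map entirely, while the multi-GPU path keeps device_map="balanced" (which requires the accelerate package alongside transformers). The reworked generate() keeps the left-pad-then-slice decoding pattern. A minimal standalone sketch of that pattern, using gpt2 purely as a stand-in model rather than an OntoLearner default:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "gpt2"  # stand-in; AutoLLM.load() receives the caller's model_id

tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token  # GPT-style models ship without a pad token
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = ["The capital of France is", "Water boils at"]
encoded = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True)
input_length = encoded["input_ids"].shape[1]

outputs = model.generate(**encoded, max_new_tokens=5, pad_token_id=tokenizer.eos_token_id)
# Left padding keeps every prompt flush against its continuation, so slicing at
# input_length isolates exactly the newly generated tokens for each batch row.
continuations = [tokenizer.decode(g, skip_special_tokens=True).strip()
                 for g in outputs[:, input_length:]]
print(continuations)
```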
ontolearner/learner/__init__.py
CHANGED

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from .llm import AutoLLMLearner
+from .llm import AutoLLMLearner, FalconLLM, MistralLLM
 from .retriever import AutoRetrieverLearner
 from .rag import AutoRAGLearner
 from .prompt import StandardizedPrompting
ontolearner/learner/llm.py
CHANGED

@@ -13,23 +13,27 @@
 # limitations under the License.

 from ..base import AutoLLM, AutoLearner
-from typing import Any
+from typing import Any, List
 import warnings
 from tqdm import tqdm
 from torch.utils.data import DataLoader
-
+import torch
+from transformers import Mistral3ForConditionalGeneration
+from mistral_common.protocol.instruct.request import ChatCompletionRequest
+from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

 class AutoLLMLearner(AutoLearner):

     def __init__(self,
                  prompting,
                  label_mapper,
+                 llm: AutoLLM = AutoLLM,
                  token: str = "",
                  max_new_tokens: int = 5,
                  batch_size: int = 10,
                  device='cpu') -> None:
         super().__init__()
-        self.llm = AutoLLM(token=token, label_mapper=label_mapper, device=device)
+        self.llm = llm(token=token, label_mapper=label_mapper, device=device)
         self.prompting = prompting
         self.batch_size = batch_size
         self.max_new_tokens = max_new_tokens

@@ -136,3 +140,69 @@ class AutoLLMLearner(AutoLearner):
             return self._non_taxonomic_re_predict(dataset=dataset)
         else:
             warnings.warn("No requirement for fiting the non-taxonomic-re model, the predict module will use the input data to do the task.")
+
+
+class FalconLLM(AutoLLM):
+
+    def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
+        encoded_inputs = self.tokenizer(inputs,
+                                        return_tensors="pt",
+                                        padding=True,
+                                        truncation=True).to(self.model.device)
+        input_ids = encoded_inputs["input_ids"]
+        input_length = input_ids.shape[1]
+        outputs = self.model.generate(
+            input_ids,
+            max_new_tokens=max_new_tokens,
+            pad_token_id=self.tokenizer.eos_token_id
+        )
+        generated_tokens = outputs[:, input_length:]
+        decoded_outputs = [self.tokenizer.decode(g, skip_special_tokens=True).strip() for g in generated_tokens]
+        return self.label_mapper.predict(decoded_outputs)
+
+
+class MistralLLM(AutoLLM):
+
+    def load(self, model_id: str) -> None:
+        self.tokenizer = MistralTokenizer.from_hf_hub(model_id)
+        if self.device == "cpu":
+            device_map = "cpu"
+        else:
+            device_map = "balanced"
+        self.model = Mistral3ForConditionalGeneration.from_pretrained(
+            model_id,
+            device_map=device_map,
+            torch_dtype=torch.bfloat16,
+            token=self.token
+        )
+        if not hasattr(self.tokenizer, "pad_token_id") or self.tokenizer.pad_token_id is None:
+            self.tokenizer.pad_token_id = self.model.generation_config.eos_token_id
+        self.label_mapper.fit()
+
+    def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
+        tokenized_list = []
+        for prompt in inputs:
+            messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
+            tokenized = self.tokenizer.encode_chat_completion(ChatCompletionRequest(messages=messages))
+            tokenized_list.append(tokenized.tokens)
+        max_len = max(len(tokens) for tokens in tokenized_list)
+        input_ids, attention_masks = [], []
+        for tokens in tokenized_list:
+            pad_length = max_len - len(tokens)
+            input_ids.append(tokens + [self.tokenizer.pad_token_id] * pad_length)
+            attention_masks.append([1] * len(tokens) + [0] * pad_length)
+
+        input_ids = torch.tensor(input_ids).to(self.model.device)
+        attention_masks = torch.tensor(attention_masks).to(self.model.device)
+
+        outputs = self.model.generate(
+            input_ids=input_ids,
+            attention_mask=attention_masks,
+            eos_token_id=self.model.generation_config.eos_token_id,
+            pad_token_id=self.tokenizer.pad_token_id,
+            max_new_tokens=max_new_tokens,
+        )
+        decoded_outputs = []
+        for i, tokens in enumerate(outputs):
+            output_text = self.tokenizer.decode(tokens[len(tokenized_list[i]):])
+            decoded_outputs.append(output_text)
+        return self.label_mapper.predict(decoded_outputs)
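Combined with the re-exports in learner/__init__.py above, the new llm parameter makes the generation backend a pluggable class. A hedged usage sketch; the stub label mapper below is a hypothetical stand-in for OntoLearner's real label-mapper objects, whose constructors this diff does not show:

```python
from ontolearner.learner import AutoLLMLearner, MistralLLM

class StubLabelMapper:
    # Hypothetical stand-in; real mappers come from ontolearner.learner.label_mapper.
    def fit(self):
        pass
    def predict(self, texts):
        return texts

learner = AutoLLMLearner(
    prompting=None,                 # placeholder; callers pass e.g. StandardizedPrompting
    label_mapper=StubLabelMapper(),
    llm=MistralLLM,                 # a class, not an instance; the default stays AutoLLM
    token="",                       # Hugging Face token, needed for gated checkpoints
    device="cpu",
)
```

Passing FalconLLM (or keeping the default AutoLLM) swaps the backend without touching the learner code.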
ontolearner/learner/retriever.py
CHANGED

@@ -66,7 +66,16 @@ class AutoRetrieverLearner(AutoLearner):
             taxonomic_pairs = [{"parent": candidate, "child": query}
                                for query, candidates in zip(data, candidates_lst)
                                for candidate in candidates if candidate.lower() != query.lower()]
-            return taxonomic_pairs
+            taxonomic_pairs += [{"parent": query, "child": candidate}
+                                for query, candidates in zip(data, candidates_lst)
+                                for candidate in candidates if candidate.lower() != query.lower()]
+            unique_taxonomic_pairs, seen = [], set()
+            for pair in taxonomic_pairs:
+                key = (pair["parent"].lower(), pair["child"].lower())  # Directional key (parent, child)
+                if key not in seen:
+                    seen.add(key)
+                    unique_taxonomic_pairs.append(pair)
+            return unique_taxonomic_pairs
         else:
             warnings.warn("No requirement for fiting the taxonomy discovery model, the predict module will use the input data to do the fit as well.")

@@ -86,11 +95,23 @@
             candidates_lst = self._retriever_predict(data=data['types'], top_k=self.top_k + 1)
             taxonomic_pairs = []
             taxonomic_pairs_query = []
+            seen = set()
             for query, candidates in zip(data['types'], candidates_lst):
                 for candidate in candidates:
                     if candidate != query:
-                        taxonomic_pairs.append((query, candidate))
-                        taxonomic_pairs_query.append(f"Head: {query}\nTail: {candidate}")
+                        # Directional pair 1: query -> candidate
+                        key1 = (query.lower(), candidate.lower())
+                        if key1 not in seen:
+                            seen.add(key1)
+                            taxonomic_pairs.append((query, candidate))
+                            taxonomic_pairs_query.append(f"Head: {query}\nTail: {candidate}")
+                        # Directional pair 2: candidate -> query
+                        key2 = (candidate.lower(), query.lower())
+                        if key2 not in seen:
+                            seen.add(key2)
+                            taxonomic_pairs.append((candidate, query))
+                            taxonomic_pairs_query.append(f"Head: {candidate}\nTail: {query}")
+
             self._retriever_fit(data=data['relations'])
             candidate_relations_lst = self._retriever_predict(data=taxonomic_pairs_query, top_k=self.top_k)
             non_taxonomic_re = [{"head": head, "tail": tail, "relation": relation}
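The de-duplication key added in both hunks is directional: (parent, child) and (child, parent) survive as distinct pairs, while case variants of the same directed pair collapse. A self-contained illustration with made-up pair values:

```python
pairs = [
    {"parent": "Animal", "child": "dog"},
    {"parent": "animal", "child": "Dog"},  # same direction, case variant -> dropped
    {"parent": "dog", "child": "Animal"},  # reversed direction -> kept
]
unique, seen = [], set()
for pair in pairs:
    key = (pair["parent"].lower(), pair["child"].lower())  # direction-sensitive key
    if key not in seen:
        seen.add(key)
        unique.append(pair)
print(unique)
# [{'parent': 'Animal', 'child': 'dog'}, {'parent': 'dog', 'child': 'Animal'}]
```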
ontolearner/learner/taxonomy_discovery/__init__.py
ADDED

@@ -0,0 +1,18 @@
+# Copyright (c) 2025 SciKnowOrg
+#
+# Licensed under the MIT License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/MIT
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .alexbek import AlexbekCrossAttnLearner
+from .rwthdbis import RWTHDBISSFTLearner
+from .sbunlp import SBUNLPFewShotLearner
+from .skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner