OntoLearner 1.4.10__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ontolearner/VERSION +1 -1
- ontolearner/base/learner.py +41 -18
- ontolearner/evaluation/metrics.py +72 -32
- ontolearner/learner/__init__.py +3 -2
- ontolearner/learner/label_mapper.py +5 -4
- ontolearner/learner/llm.py +257 -0
- ontolearner/learner/prompt.py +40 -5
- ontolearner/learner/rag/__init__.py +14 -0
- ontolearner/learner/{rag.py → rag/rag.py} +7 -2
- ontolearner/learner/retriever/__init__.py +1 -1
- ontolearner/learner/retriever/{llm_retriever.py → augmented_retriever.py} +48 -39
- ontolearner/learner/retriever/learner.py +3 -4
- ontolearner/learner/taxonomy_discovery/alexbek.py +632 -310
- ontolearner/learner/taxonomy_discovery/skhnlp.py +216 -156
- ontolearner/learner/text2onto/__init__.py +1 -1
- ontolearner/learner/text2onto/alexbek.py +484 -1105
- ontolearner/learner/text2onto/sbunlp.py +498 -493
- ontolearner/ontology/biology.py +2 -3
- ontolearner/ontology/chemistry.py +16 -18
- ontolearner/ontology/ecology_environment.py +2 -3
- ontolearner/ontology/general.py +4 -6
- ontolearner/ontology/material_science_engineering.py +64 -45
- ontolearner/ontology/medicine.py +2 -3
- ontolearner/ontology/scholarly_knowledge.py +6 -9
- ontolearner/processor.py +3 -3
- ontolearner/text2onto/splitter.py +69 -6
- {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/METADATA +2 -2
- {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/RECORD +30 -29
- {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/WHEEL +1 -1
- {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/licenses/LICENSE +0 -0
ontolearner/VERSION
CHANGED
@@ -1 +1 @@
-1.4.10
+1.5.0
ontolearner/base/learner.py
CHANGED
@@ -18,6 +18,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import torch.nn.functional as F
 from sentence_transformers import SentenceTransformer
+from collections import defaultdict

 class AutoLearner(ABC):
     """
@@ -70,6 +71,7 @@ class AutoLearner(ABC):
             - "term-typing": Predict semantic types for terms
             - "taxonomy-discovery": Identify hierarchical relationships
             - "non-taxonomy-discovery": Identify non-hierarchical relationships
+            - "text2onto": Extract ontology terms and their semantic types from documents

         Raises:
             NotImplementedError: If not implemented by concrete class.
@@ -81,6 +83,8 @@ class AutoLearner(ABC):
             self._taxonomy_discovery(train_data, test=False)
         elif task == 'non-taxonomic-re':
             self._non_taxonomic_re(train_data, test=False)
+        elif task == 'text2onto':
+            self._text2onto(train_data, test=False)
         else:
             raise ValueError(f"{task} is not a valid task.")

@@ -103,6 +107,7 @@ class AutoLearner(ABC):
             - term-typing: List of predicted types for each term
             - taxonomy-discovery: Boolean predictions for relationships
             - non-taxonomy-discovery: Predicted relation types
+            - text2onto: Extract ontology terms and their semantic types from documents

         Raises:
             NotImplementedError: If not implemented by concrete class.
@@ -115,6 +120,8 @@ class AutoLearner(ABC):
             return self._taxonomy_discovery(eval_data, test=True)
         elif task == 'non-taxonomic-re':
             return self._non_taxonomic_re(eval_data, test=True)
+        elif task == 'text2onto':
+            return self._text2onto(eval_data, test=True)
         else:
             raise ValueError(f"{task} is not a valid task.")

@@ -147,6 +154,9 @@ class AutoLearner(ABC):
     def _non_taxonomic_re(self, data: Any, test: bool = False) -> Optional[Any]:
         pass

+    def _text2onto(self, data: Any, test: bool = False) -> Optional[Any]:
+        pass
+
     def tasks_data_former(self, data: Any, task: str, test: bool = False) -> List[str | Dict[str, str]]:
         formatted_data = []
         if task == "term-typing":
@@ -171,6 +181,7 @@ class AutoLearner(ABC):
             non_taxonomic_types = list(set(non_taxonomic_types))
             non_taxonomic_res = list(set(non_taxonomic_res))
             formatted_data = {"types": non_taxonomic_types, "relations": non_taxonomic_res}
+
         return formatted_data

     def tasks_ground_truth_former(self, data: Any, task: str) -> List[Dict[str, str]]:
@@ -186,6 +197,26 @@ class AutoLearner(ABC):
                 formatted_data.append({"head": non_taxonomic_triplets.head,
                                        "tail": non_taxonomic_triplets.tail,
                                        "relation": non_taxonomic_triplets.relation})
+        if task == "text2onto":
+            terms2docs = data.get("terms2docs", {}) or {}
+            terms2types = data.get("terms2types", {}) or {}
+
+            # gold doc→terms
+            gold_terms = []
+            for term, doc_ids in terms2docs.items():
+                for doc_id in doc_ids or []:
+                    gold_terms.append({"doc_id": doc_id, "term": term})
+
+            # gold doc→types derived via doc→terms + term→types
+            doc2types = defaultdict(set)
+            for term, doc_ids in terms2docs.items():
+                for doc_id in doc_ids or []:
+                    for ty in (terms2types.get(term, []) or []):
+                        if isinstance(ty, str) and ty.strip():
+                            doc2types[doc_id].add(ty.strip())
+            gold_types = [{"doc_id": doc_id, "type": ty} for doc_id, tys in doc2types.items() for ty in tys]
+            return {"terms": gold_terms, "types": gold_types}
+
         return formatted_data

 class AutoLLM(ABC):
@@ -201,7 +232,7 @@ class AutoLLM(ABC):
         tokenizer: The tokenizer associated with the model.
     """

-    def __init__(self, label_mapper: Any, device: str='cpu', token: str="") -> None:
+    def __init__(self, label_mapper: Any, device: str='cpu', token: str="", max_length: int = 512) -> None:
         """
         Initialize the LLM component.

@@ -213,6 +244,7 @@ class AutoLLM(ABC):
         self.device=device
         self.model: Optional[Any] = None
         self.tokenizer: Optional[Any] = None
+        self.max_length = max_length

     def load(self, model_id: str) -> None:
@@ -236,10 +268,8 @@ class AutoLLM(ABC):
         self.tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left', token=self.token)
         self.tokenizer.pad_token = self.tokenizer.eos_token
         if self.device == "cpu":
-            # device_map = "cpu"
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_id,
-                # device_map=device_map,
                 torch_dtype=torch.bfloat16,
                 token=self.token
             )
@@ -248,11 +278,12 @@ class AutoLLM(ABC):
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_id,
                 device_map=device_map,
-
-
+                token=self.token,
+                trust_remote_code=True,
             )
         self.label_mapper.fit()

+    @torch.no_grad()
     def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
         """
         Generate text responses for the given input prompts.
@@ -276,29 +307,21 @@ class AutoLLM(ABC):
         List of generated text responses, one for each input prompt.
         Responses include the original input plus generated continuation.
         """
-        # Tokenize inputs and move to device
         encoded_inputs = self.tokenizer(inputs,
                                         return_tensors="pt",
-
-                                        truncation=True
+                                        max_length=self.max_length,
+                                        truncation=True,
+                                        padding=True).to(self.model.device)
         input_ids = encoded_inputs["input_ids"]
         input_length = input_ids.shape[1]
-
-        # Generate output
         outputs = self.model.generate(
             **encoded_inputs,
             max_new_tokens=max_new_tokens,
-            pad_token_id=self.tokenizer.eos_token_id
+            pad_token_id=self.tokenizer.eos_token_id,
+            eos_token_id=self.tokenizer.eos_token_id
         )
-
-        # Extract only the newly generated tokens (excluding prompt)
         generated_tokens = outputs[:, input_length:]
-
-        # Decode only the generated part
         decoded_outputs = [self.tokenizer.decode(g, skip_special_tokens=True).strip() for g in generated_tokens]
-        print(decoded_outputs)
-        print(self.label_mapper.predict(decoded_outputs))
-        # Map the decoded text to labels
         return self.label_mapper.predict(decoded_outputs)

 class AutoRetriever(ABC):
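The `text2onto` branch added to `tasks_ground_truth_former` flattens `terms2docs` into document-level gold terms and derives document-level gold types by composing doc→terms with term→types. A minimal standalone sketch of that construction, using made-up input data (only the `terms2docs`/`terms2types` keys come from the diff above):

# Minimal sketch of the gold-label construction shown above; the input
# dictionaries are hypothetical examples, not taken from the package.
from collections import defaultdict

terms2docs = {"polymer": ["doc1", "doc2"], "alloy": ["doc2"]}
terms2types = {"polymer": ["Material"], "alloy": ["Material", "Metal"]}

gold_terms = [{"doc_id": d, "term": t} for t, docs in terms2docs.items() for d in docs]

doc2types = defaultdict(set)
for term, docs in terms2docs.items():
    for doc_id in docs:
        for ty in terms2types.get(term, []):
            doc2types[doc_id].add(ty.strip())
gold_types = [{"doc_id": d, "type": ty} for d, tys in doc2types.items() for ty in tys]

# gold_terms -> [{'doc_id': 'doc1', 'term': 'polymer'}, ...]
# gold_types -> [{'doc_id': 'doc2', 'type': 'Metal'}, ...]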
ontolearner/evaluation/metrics.py
CHANGED
@@ -11,44 +11,84 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import List, Dict, Tuple, Set
+from typing import List, Dict, Tuple, Set, Any, Union

 SYMMETRIC_RELATIONS = {"equivalentclass", "sameas", "disjointwith"}

-def text2onto_metrics(
+def text2onto_metrics(
+        y_true: Dict[str, Any],
+        y_pred: Dict[str, Any],
+        similarity_threshold: float = 0.8
+) -> Dict[str, Any]:
+    """
+    Expects:
+        y_true = {"terms": [{"doc_id": str, "term": str}, ...],
+                  "types": [{"doc_id": str, "type": str}, ...]}
+        y_pred = same shape
+
+    Returns:
+        {"terms": {...}, "types": {...}}
+    """
+
+    def jaccard_similarity(text_a: str, text_b: str) -> float:
+        tokens_a = set(text_a.lower().split())
+        tokens_b = set(text_b.lower().split())
+        if not tokens_a and not tokens_b:
             return 1.0
-        return len(
+        return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)
+
+    def pairs_to_strings(rows: List[Dict[str, str]], value_key: str) -> List[str]:
+        paired_strings: List[str] = []
+        for row in rows or []:
+            doc_id = (row.get("doc_id") or "").strip()
+            value = (row.get(value_key) or "").strip()
+            if doc_id and value:
+                # keep doc association + allow token Jaccard
+                paired_strings.append(f"{doc_id} {value}")
+        return paired_strings
+
+    def score_list(ground_truth_items: List[str], predicted_items: List[str]) -> Dict[str, Union[float, int]]:
+        matched_ground_truth_indices: Set[int] = set()
+        matched_predicted_indices: Set[int] = set()
+
+        for predicted_index, predicted_item in enumerate(predicted_items):
+            for ground_truth_index, ground_truth_item in enumerate(ground_truth_items):
+                if ground_truth_index in matched_ground_truth_indices:
+                    continue
+
+                if jaccard_similarity(predicted_item, ground_truth_item) >= similarity_threshold:
+                    matched_predicted_indices.add(predicted_index)
+                    matched_ground_truth_indices.add(ground_truth_index)
+                    break
+
+        total_correct = len(matched_predicted_indices)
+        total_predicted = len(predicted_items)
+        total_ground_truth = len(ground_truth_items)
+
+        precision = total_correct / total_predicted if total_predicted else 0.0
+        recall = total_correct / total_ground_truth if total_ground_truth else 0.0
+        f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0
+
+        return {
+            "f1_score": f1,
+            "precision": precision,
+            "recall": recall,
+            "total_correct": total_correct,
+            "total_predicted": total_predicted,
+            "total_ground_truth": total_ground_truth,
+        }
+
+    ground_truth_terms = pairs_to_strings(y_true.get("terms", []), "term")
+    predicted_terms = pairs_to_strings(y_pred.get("terms", []), "term")
+    ground_truth_types = pairs_to_strings(y_true.get("types", []), "type")
+    predicted_types = pairs_to_strings(y_pred.get("types", []), "type")
+
+    terms_metrics = score_list(ground_truth_terms, predicted_terms)
+    types_metrics = score_list(ground_truth_types, predicted_types)

-    precision = total_correct / total_predicted if total_predicted > 0 else 0
-    recall = total_correct / total_ground_truth if total_ground_truth > 0 else 0
-    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
     return {
-        "recall": recall,
-        "total_correct": total_correct,
-        "total_predicted": total_predicted,
-        "total_ground_truth": total_ground_truth
+        "terms": terms_metrics,
+        "types": types_metrics,
     }

 def term_typing_metrics(y_true: List[Dict[str, List[str]]], y_pred: List[Dict[str, List[str]]]) -> Dict[str, float | int]:
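The rewritten `text2onto_metrics` scores terms and types separately: each gold and predicted pair is rendered as a "doc_id value" string and matched greedily against the ground truth by token-level Jaccard similarity at the given threshold. A usage sketch based on the signature and shapes documented above; the example data is invented:

# Usage sketch; example data is invented. Exact string matches have Jaccard 1.0.
from ontolearner.evaluation.metrics import text2onto_metrics

y_true = {"terms": [{"doc_id": "doc1", "term": "shape memory alloy"}],
          "types": [{"doc_id": "doc1", "type": "Material"}]}
y_pred = {"terms": [{"doc_id": "doc1", "term": "shape memory alloy"},
                    {"doc_id": "doc1", "term": "nickel"}],
          "types": [{"doc_id": "doc1", "type": "Material"}]}

scores = text2onto_metrics(y_true, y_pred, similarity_threshold=0.8)
# scores["terms"] -> precision 0.5, recall 1.0 (one spurious predicted term)
# scores["types"] -> precision 1.0, recall 1.0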
ontolearner/learner/__init__.py
CHANGED
@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from .llm import AutoLLMLearner, FalconLLM, MistralLLM
+from .llm import AutoLLMLearner, FalconLLM, MistralLLM, LogitMistralLLM, \
+    QwenInstructLLM, QwenThinkingLLM, LogitAutoLLM, LogitQuantAutoLLM
 from .retriever import AutoRetrieverLearner, LLMAugmentedRetrieverLearner
-from .rag import AutoRAGLearner
+from .rag import AutoRAGLearner, LLMAugmentedRAGLearner
 from .prompt import StandardizedPrompting
 from .label_mapper import LabelMapper
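With these exports, the new learners are importable directly from the `ontolearner.learner` package, for example:

# Import paths follow the re-exports declared in this __init__.
from ontolearner.learner import (
    AutoLLMLearner, LogitAutoLLM, LogitQuantAutoLLM,
    QwenInstructLLM, QwenThinkingLLM, LLMAugmentedRAGLearner,
)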
ontolearner/learner/label_mapper.py
CHANGED
@@ -31,7 +31,7 @@ class LabelMapper:
                  ngram_range: Tuple=(1, 1),
                  label_dict: Dict[str, List[str]]=None,
                  analyzer: str = 'word',
-                 iterator_no: int =
+                 iterator_no: int = 1000):
         """
         Initializes the TFIDFLabelMapper with a specified classifier and TF-IDF configuration.

@@ -45,11 +45,12 @@ class LabelMapper:
         if label_dict is None:
             label_dict = {
                 "yes": ["yes", "true"],
-                "no": ["no", "false"
+                "no": ["no", "false"]
             }
-        self.
+        self.label_dict = label_dict
+        self.labels = [label.lower() for label in list(self.label_dict.keys())]
         self.x_train, self.y_train = [], []
-        for label, candidates in label_dict.items():
+        for label, candidates in self.label_dict.items():
             self.x_train += [label] + candidates
             self.y_train += [label] * (len(candidates) + 1)
         self.x_train = iterator_no * self.x_train
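The constructor now closes the previously truncated defaults (`iterator_no = 1000`, the bracket on the "no" synonyms) and keeps `label_dict`/`labels` on the instance; the training lists pair each canonical label with its synonyms and replicate the result `iterator_no` times. A standalone sketch of that expansion (the matching `y_train` replication is assumed, since the hunk ends at the `x_train` line):

# Standalone sketch of the expansion shown above; the y_train replication is assumed.
label_dict = {"yes": ["yes", "true"], "no": ["no", "false"]}
iterator_no = 1000

x_train, y_train = [], []
for label, candidates in label_dict.items():
    x_train += [label] + candidates               # surface forms: the label itself + synonyms
    y_train += [label] * (len(candidates) + 1)    # canonical label for each surface form

x_train = iterator_no * x_train                   # replicate the small seed set
y_train = iterator_no * y_train                   # assumed to mirror x_train

assert len(x_train) == len(y_train) == 6 * iterator_no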
ontolearner/learner/llm.py
CHANGED
@@ -18,9 +18,11 @@ import warnings
 from tqdm import tqdm
 from torch.utils.data import DataLoader
 import torch
+import torch.nn.functional as F
 from transformers import Mistral3ForConditionalGeneration
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

 class AutoLLMLearner(AutoLearner):

@@ -144,6 +146,7 @@ class AutoLLMLearner(AutoLearner):

 class FalconLLM(AutoLLM):

+    @torch.no_grad()
     def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
         encoded_inputs = self.tokenizer(inputs,
                                         return_tensors="pt",
@@ -160,6 +163,7 @@ class FalconLLM(AutoLLM):
         decoded_outputs = [self.tokenizer.decode(g, skip_special_tokens=True).strip() for g in generated_tokens]
         return self.label_mapper.predict(decoded_outputs)

+
 class MistralLLM(AutoLLM):

     def load(self, model_id: str) -> None:
@@ -178,6 +182,7 @@ class MistralLLM(AutoLLM):
         self.tokenizer.pad_token_id = self.model.generation_config.eos_token_id
         self.label_mapper.fit()

+    @torch.no_grad()
     def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
         tokenized_list = []
         for prompt in inputs:
@@ -206,3 +211,255 @@ class MistralLLM(AutoLLM):
             output_text = self.tokenizer.decode(tokens[len(tokenized_list[i]):])
             decoded_outputs.append(output_text)
         return self.label_mapper.predict(decoded_outputs)
+
+
+class LogitMistralLLM(AutoLLM):
+    label_dict = {
+        "yes": ["yes", "true", " yes", "Yes"],
+        "no": ["no", "false", " no", "No"]
+    }
+
+    def _get_label_token_ids(self):
+        label_token_ids = {}
+
+        for label, words in self.label_dict.items():
+            ids = []
+            for w in words:
+                messages = [{"role": "user", "content": [{"type": "text", "text": w}]}]
+                tokenized = self.tokenizer.encode_chat_completion(ChatCompletionRequest(messages=messages))
+                token_ids = tokenized.tokens[2:-1]
+                ids.append(token_ids)
+            label_token_ids[label] = ids
+        return label_token_ids
+
+    def load(self, model_id: str) -> None:
+        self.tokenizer = MistralTokenizer.from_hf_hub(model_id)
+        self.tokenizer.padding_side = 'left'
+        device_map = "cpu" if self.device == "cpu" else "balanced"
+        self.model = Mistral3ForConditionalGeneration.from_pretrained(
+            model_id,
+            device_map=device_map,
+            torch_dtype=torch.bfloat16,
+            token=self.token
+        )
+        self.pad_token_id = self.model.generation_config.eos_token_id
+        self.label_token_ids = self._get_label_token_ids()
+
+    @torch.no_grad()
+    def generate(self, inputs: List[str], max_new_tokens: int = 1) -> List[str]:
+        tokenized_list = []
+        for prompt in inputs:
+            messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
+            req = ChatCompletionRequest(messages=messages)
+            tokenized = self.tokenizer.encode_chat_completion(req)
+            tokenized_list.append(tokenized.tokens)
+
+        max_len = max(len(t) for t in tokenized_list)
+        input_ids, attention_masks = [], []
+        for tokens in tokenized_list:
+            pad_len = max_len - len(tokens)
+            input_ids.append(tokens + [self.pad_token_id] * pad_len)
+            attention_masks.append([1] * len(tokens) + [0] * pad_len)
+
+        input_ids = torch.tensor(input_ids).to(self.model.device)
+        attention_masks = torch.tensor(attention_masks).to(self.model.device)
+
+        outputs = self.model(input_ids=input_ids, attention_mask=attention_masks)
+        # logits: [batch, seq_len, vocab]
+        logits = outputs.logits
+        # next-token prediction
+        last_logits = logits[:, -1, :]
+        probs = torch.softmax(last_logits, dim=-1)
+        predictions = []
+        for i in range(probs.size(0)):
+            label_scores = {}
+            for label, token_id_lists in self.label_token_ids.items():
+                score = 0.0
+                for token_ids in token_id_lists:
+                    # single-token in practice, but safe
+                    score += probs[i, token_ids[0]].item()
+                label_scores[label] = score
+            predictions.append(max(label_scores, key=label_scores.get))
+        return predictions
+
+
+class QwenInstructLLM(AutoLLM):
+
+    def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
+        messages = [[{"role": "user", "content": prompt + " Please show your final response with 'answer': 'label'."}]
+                    for prompt in inputs]
+
+        texts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+        encoded_inputs = self.tokenizer(texts, return_tensors="pt", padding="max_length", truncation=True,
+                                        max_length=256).to(self.model.device)
+
+        generated_ids = self.model.generate(**encoded_inputs,
+                                            max_new_tokens=max_new_tokens,
+                                            use_cache=False,
+                                            pad_token_id=self.tokenizer.pad_token_id,
+                                            eos_token_id=self.tokenizer.eos_token_id)
+        decoded_outputs = []
+        for i in range(len(generated_ids)):
+            prompt_len = encoded_inputs.attention_mask[i].sum().item()
+            output_ids = generated_ids[i][prompt_len:].tolist()
+            output_content = self.tokenizer.decode(output_ids, skip_special_tokens=True).strip()
+            decoded_outputs.append(output_content)
+        return self.label_mapper.predict(decoded_outputs)
+
+
+class QwenThinkingLLM(AutoLLM):
+
+    @torch.no_grad()
+    def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
+        messages = [[{"role": "user", "content": prompt + " Please show your final response with 'answer': 'label'."}]
+                    for prompt in inputs]
+        texts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        encoded_inputs = self.tokenizer(texts, return_tensors="pt", padding=True).to(self.model.device)
+        generated_ids = self.model.generate(**encoded_inputs, max_new_tokens=max_new_tokens)
+        decoded_outputs = []
+        for i in range(len(generated_ids)):
+            prompt_len = encoded_inputs.attention_mask[i].sum().item()
+            output_ids = generated_ids[i][prompt_len:].tolist()
+            try:
+                end = len(output_ids) - output_ids[::-1].index(151668)
+                thinking_ids = output_ids[:end]
+            except ValueError:
+                thinking_ids = output_ids
+            thinking_content = self.tokenizer.decode(thinking_ids, skip_special_tokens=True).strip()
+            decoded_outputs.append(thinking_content)
+        return self.label_mapper.predict(decoded_outputs)
+
+
+class LogitAutoLLM(AutoLLM):
+    def _get_label_token_ids(self):
+        label_token_ids = {}
+        for label, words in self.label_mapper.label_dict.items():
+            ids = []
+            for w in words:
+                token_ids = self.tokenizer.encode(w, add_special_tokens=False)
+                ids.append(token_ids)
+            label_token_ids[label] = ids
+        return label_token_ids
+
+    def load(self, model_id: str) -> None:
+        super().load(model_id)
+        self.label_token_ids = self._get_label_token_ids()
+
+    @torch.no_grad()
+    def generate(self, inputs: List[str], max_new_tokens: int = 1) -> List[str]:
+        encoded = self.tokenizer(inputs, return_tensors="pt", truncation=True, padding=True).to(self.model.device)
+        outputs = self.model(**encoded)
+        logits = outputs.logits  # logits: [batch, seq_len, vocab]
+        last_logits = logits[:, -1, :]  # [batch, vocab]  # we only care about the NEXT token prediction
+        probs = F.softmax(last_logits, dim=-1)
+        predictions = []
+        for i in range(probs.size(0)):
+            label_scores = {}
+            for label, token_id_lists in self.label_token_ids.items():
+                score = 0.0
+                for token_ids in token_id_lists:
+                    if len(token_ids) == 1:
+                        score += probs[i, token_ids[0]].item()
+                    else:
+                        score += probs[i, token_ids[0]].item()  # multi-token fallback (rare but safe)
+                label_scores[label] = score
+            predictions.append(max(label_scores, key=label_scores.get))
+        return predictions
+
+
+class LogitQuantAutoLLM(AutoLLM):
+    label_dict = {
+        "yes": ["yes", "true", " yes", "Yes"],
+        "no": ["no", "false", " no", "No"]
+    }
+
+    def _get_label_token_ids(self):
+        label_token_ids = {}
+
+        for label, words in self.label_dict.items():
+            ids = []
+            for w in words:
+                token_ids = self.tokenizer.encode(
+                    w,
+                    add_special_tokens=False
+                )
+                # usually single-token, but be safe
+                ids.append(token_ids)
+            label_token_ids[label] = ids
+
+        return label_token_ids
+
+    def load(self, model_id: str) -> None:
+
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left', token=self.token)
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+        if self.device == "cpu":
+            # device_map = "cpu"
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                # device_map=device_map,
+                torch_dtype=torch.bfloat16,
+                token=self.token
+            )
+        else:
+            device_map = "balanced"
+            # self.model = AutoModelForCausalLM.from_pretrained(
+            #     model_id,
+            #     device_map=device_map,
+            #     torch_dtype=torch.bfloat16,
+            #     token=self.token
+            # )
+            bnb_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_use_double_quant=True
+            )
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                quantization_config=bnb_config,
+                device_map=device_map,
+                token=self.token,
+                # trust_remote_code=True,
+                # attn_implementation="flash_attention_2"
+            )
+        self.label_token_ids = self._get_label_token_ids()
+
+    @torch.no_grad()
+    def generate(self, inputs: List[str], max_new_tokens: int = 1) -> List[str]:
+        encoded = self.tokenizer(
+            inputs,
+            return_tensors="pt",
+            max_length=256,
+            truncation=True,
+            padding=True
+        ).to(self.model.device)
+
+        outputs = self.model(**encoded)
+
+        # logits: [batch, seq_len, vocab]
+        logits = outputs.logits
+
+        # we only care about the NEXT token prediction
+        last_logits = logits[:, -1, :]  # [batch, vocab]
+
+        probs = F.softmax(last_logits, dim=-1)
+
+        predictions = []
+
+        for i in range(probs.size(0)):
+            label_scores = {}
+
+            for label, token_id_lists in self.label_token_ids.items():
+                score = 0.0
+                for token_ids in token_id_lists:
+                    if len(token_ids) == 1:
+                        score += probs[i, token_ids[0]].item()
+                    else:
+                        # multi-token fallback (rare but safe)
+                        score += probs[i, token_ids[0]].item()
+                label_scores[label] = score
+
+            predictions.append(max(label_scores, key=label_scores.get))
+        return predictions
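The three logit-based classes above (`LogitMistralLLM`, `LogitAutoLLM`, `LogitQuantAutoLLM`) share the same decision rule: run one forward pass, take the next-token probability distribution, and pick the label whose candidate tokens receive the most probability mass. A minimal sketch of that scoring step with a synthetic probability tensor and placeholder token ids:

# Minimal sketch of the label-scoring rule used by the Logit* classes; the
# probability tensor and token ids below are synthetic placeholders.
import torch

probs = torch.softmax(torch.randn(2, 10), dim=-1)        # [batch, vocab]
label_token_ids = {"yes": [[3], [7]], "no": [[1], [5]]}  # first-token ids per surface form

predictions = []
for i in range(probs.size(0)):
    label_scores = {
        label: sum(probs[i, ids[0]].item() for ids in token_id_lists)
        for label, token_id_lists in label_token_ids.items()
    }
    predictions.append(max(label_scores, key=label_scores.get))

print(predictions)  # e.g. ['yes', 'no']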
ontolearner/learner/prompt.py
CHANGED
@@ -17,15 +17,50 @@ from ..base import AutoPrompt
 class StandardizedPrompting(AutoPrompt):
     def __init__(self, task: str = None):
         if task == "term-typing":
-            prompt_template = """
+            prompt_template = """You are performing term typing.
+
+Determine whether the given term is a clear and unambiguous instance of the specified high-level type.
+
+Rules:
+- Answer "yes" only if the term commonly and directly belongs to the type.
+- Answer "no" if the term does not belong to the type, is ambiguous, or only weakly related.
+- Use the most common meaning of the term.
+- Do not explain your answer.
+
 Term: {term}
 Type: {type}
-Answer:
+Answer (yes or no):"""
         elif task == "taxonomy-discovery":
-            prompt_template =
+            prompt_template = """You are identifying taxonomic (is-a) relationships.
+
+Question:
+Is "{parent}" a superclass (direct or indirect) of "{child}" in a standard conceptual or ontological hierarchy?
+
+Rules:
+- A superclass means: "{child}" is a type or instance of "{parent}".
+- Answer "yes" only if the relationship is a true is-a relationship.
+- Answer "no" for part-of, related-to, or associative relationships.
+- Use general world knowledge.
+- Do not explain.
+
+Parent: {parent}
+Child: {child}
+Answer (yes or no):"""
         elif task == "non-taxonomic-re":
-            prompt_template = """
+            prompt_template = """You are identifying non-taxonomic conceptual relationships.
+
+Given two conceptual types, determine whether the specified relation typically holds between them.
+
+Rules:
+- Answer "yes" only if the relation commonly and meaningfully applies.
+- Answer "no" if the relation is rare, indirect, or context-dependent.
+- Do not infer relations that require specific situations.
+- Do not explain.
+
+Head type: {head}
+Tail type: {tail}
+Relation: {relation}
+Answer (yes or no):"""
         else:
             raise ValueError("Unknown task! Current tasks are: 'term-typing', 'taxonomy-discovery', 'non-taxonomic-re'")
         super().__init__(prompt_template)