PyPI - OntoLearner - Versions diffs - 1.4.10__tar.gz → 1.5.0__tar.gz - Mend

OntoLearner 1.4.10__tar.gz → 1.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (84) hide show

{ontolearner-1.4.10 → ontolearner-1.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: OntoLearner
-Version: 1.4.10
+Version: 1.5.0
 Summary: OntoLearner: A Modular Python Library for Ontology Learning with LLMs.
 License: MIT
 License-File: LICENSE
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: Levenshtein
-Requires-Dist: bitsandbytes (>=0.45.1,<0.46.0)
+Requires-Dist: bitsandbytes (>=0.45.1,<1.0.0) ; platform_system == "Linux"
 Requires-Dist: dspy (>=2.6.14,<3.0.0)
 Requires-Dist: g4f
 Requires-Dist: gensim

ontolearner-1.5.0/ontolearner/VERSION ADDED Viewed

@@ -0,0 +1 @@
+1.5.0

{ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/base/learner.py RENAMED Viewed

@@ -18,6 +18,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import torch.nn.functional as F
 from sentence_transformers import SentenceTransformer
+from collections import defaultdict
 class AutoLearner(ABC):
     """
@@ -70,6 +71,7 @@ class AutoLearner(ABC):
                  - "term-typing": Predict semantic types for terms
                  - "taxonomy-discovery": Identify hierarchical relationships
                  - "non-taxonomy-discovery": Identify non-hierarchical relationships
+                 - "text2onto" : Extract ontology terms and their semantic types from documents
         Raises:
             NotImplementedError: If not implemented by concrete class.
@@ -81,6 +83,8 @@ class AutoLearner(ABC):
             self._taxonomy_discovery(train_data, test=False)
         elif task == 'non-taxonomic-re':
             self._non_taxonomic_re(train_data, test=False)
+        elif task == 'text2onto':
+            self._text2onto(train_data, test=False)
         else:
             raise ValueError(f"{task} is not a valid task.")
@@ -103,6 +107,7 @@ class AutoLearner(ABC):
             - term-typing: List of predicted types for each term
             - taxonomy-discovery: Boolean predictions for relationships
             - non-taxonomy-discovery: Predicted relation types
+            - text2onto : Extract ontology terms and their semantic types from documents
         Raises:
             NotImplementedError: If not implemented by concrete class.
@@ -115,6 +120,8 @@ class AutoLearner(ABC):
             return self._taxonomy_discovery(eval_data, test=True)
         elif task == 'non-taxonomic-re':
             return self._non_taxonomic_re(eval_data, test=True)
+        elif task == 'text2onto':
+            return self._text2onto(eval_data, test=True)
         else:
             raise ValueError(f"{task} is not a valid task.")
@@ -147,6 +154,9 @@ class AutoLearner(ABC):
     def _non_taxonomic_re(self, data: Any, test: bool = False) -> Optional[Any]:
         pass
+    def _text2onto(self, data: Any, test: bool = False) -> Optional[Any]:
+        pass
     def tasks_data_former(self, data: Any, task: str, test: bool = False) -> List[str | Dict[str, str]]:
         formatted_data = []
         if task == "term-typing":
@@ -171,6 +181,7 @@ class AutoLearner(ABC):
             non_taxonomic_types = list(set(non_taxonomic_types))
             non_taxonomic_res = list(set(non_taxonomic_res))
             formatted_data = {"types": non_taxonomic_types, "relations": non_taxonomic_res}
         return formatted_data
     def tasks_ground_truth_former(self, data: Any, task: str) -> List[Dict[str, str]]:
@@ -186,6 +197,26 @@ class AutoLearner(ABC):
                 formatted_data.append({"head": non_taxonomic_triplets.head,
                                        "tail": non_taxonomic_triplets.tail,
                                        "relation": non_taxonomic_triplets.relation})
+        if task == "text2onto":
+            terms2docs = data.get("terms2docs", {}) or {}
+            terms2types = data.get("terms2types", {}) or {}
+            # gold doc→terms
+            gold_terms = []
+            for term, doc_ids in terms2docs.items():
+                for doc_id in doc_ids or []:
+                    gold_terms.append({"doc_id": doc_id, "term": term})
+            # gold doc→types derived via doc→terms + term→types
+            doc2types = defaultdict(set)
+            for term, doc_ids in terms2docs.items():
+                for doc_id in doc_ids or []:
+                    for ty in (terms2types.get(term, []) or []):
+                        if isinstance(ty, str) and ty.strip():
+                            doc2types[doc_id].add(ty.strip())
+            gold_types = [{"doc_id": doc_id, "type": ty} for doc_id, tys in doc2types.items() for ty in tys]
+            return {"terms": gold_terms, "types": gold_types}
         return formatted_data
 class AutoLLM(ABC):
@@ -201,7 +232,7 @@ class AutoLLM(ABC):
         tokenizer: The tokenizer associated with the model.
     """
-    def __init__(self, label_mapper: Any, device: str='cpu', token: str="") -> None:
+    def __init__(self, label_mapper: Any, device: str='cpu', token: str="", max_length: int = 512) -> None:
         """
         Initialize the LLM component.
@@ -213,6 +244,7 @@ class AutoLLM(ABC):
         self.device=device
         self.model: Optional[Any] = None
         self.tokenizer: Optional[Any] = None
+        self.max_length = max_length
     def load(self, model_id: str) -> None:
@@ -236,10 +268,8 @@ class AutoLLM(ABC):
         self.tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left', token=self.token)
         self.tokenizer.pad_token = self.tokenizer.eos_token
         if self.device == "cpu":
-            # device_map = "cpu"
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_id,
-                # device_map=device_map,
                 torch_dtype=torch.bfloat16,
                 token=self.token
             )
@@ -248,11 +278,12 @@ class AutoLLM(ABC):
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_id,
                 device_map=device_map,
-                torch_dtype=torch.bfloat16,
-                token=self.token
+                token=self.token,
+                trust_remote_code=True,
             )
         self.label_mapper.fit()
+    @torch.no_grad()
     def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
         """
         Generate text responses for the given input prompts.
@@ -276,29 +307,21 @@ class AutoLLM(ABC):
             List of generated text responses, one for each input prompt.
             Responses include the original input plus generated continuation.
         """
-        # Tokenize inputs and move to device
         encoded_inputs = self.tokenizer(inputs,
                                         return_tensors="pt",
-                                        padding=True,
-                                        truncation=True).to(self.model.device)
+                                        max_length=self.max_length,
+                                        truncation=True,
+                                        padding=True).to(self.model.device)
         input_ids = encoded_inputs["input_ids"]
         input_length = input_ids.shape[1]
-        # Generate output
         outputs = self.model.generate(
             **encoded_inputs,
             max_new_tokens=max_new_tokens,
-            pad_token_id=self.tokenizer.eos_token_id
+            pad_token_id=self.tokenizer.eos_token_id,
+            eos_token_id=self.tokenizer.eos_token_id
         )
-        # Extract only the newly generated tokens (excluding prompt)
         generated_tokens = outputs[:, input_length:]
-        # Decode only the generated part
         decoded_outputs = [self.tokenizer.decode(g, skip_special_tokens=True).strip() for g in generated_tokens]
-        print(decoded_outputs)
-        print(self.label_mapper.predict(decoded_outputs))
-        # Map the decoded text to labels
         return self.label_mapper.predict(decoded_outputs)
 class AutoRetriever(ABC):

{ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/evaluation/metrics.py RENAMED Viewed

@@ -11,44 +11,84 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import List, Dict, Tuple, Set
+from typing import List, Dict, Tuple, Set, Any, Union
 SYMMETRIC_RELATIONS = {"equivalentclass", "sameas", "disjointwith"}
-def text2onto_metrics(y_true: List[str], y_pred: List[str], similarity_threshold: float = 0.8) -> Dict[str, float | int]:
-    def jaccard_similarity(a: str, b: str) -> float:
-        set_a = set(a.lower().split())
-        set_b = set(b.lower().split())
-        if not set_a and not set_b:
+def text2onto_metrics(
+    y_true: Dict[str, Any],
+    y_pred: Dict[str, Any],
+    similarity_threshold: float = 0.8
+) -> Dict[str, Any]:
+    """
+    Expects:
+      y_true = {"terms": [{"doc_id": str, "term": str}, ...],
+               "types": [{"doc_id": str, "type": str}, ...]}
+      y_pred = same shape
+    Returns:
+      {"terms": {...}, "types": {...}}
+    """
+    def jaccard_similarity(text_a: str, text_b: str) -> float:
+        tokens_a = set(text_a.lower().split())
+        tokens_b = set(text_b.lower().split())
+        if not tokens_a and not tokens_b:
             return 1.0
-        return len(set_a & set_b) / len(set_a | set_b)
-    matched_gt_indices = set()
-    matched_pred_indices = set()
-    for i, pred_label in enumerate(y_pred):
-        for j, gt_label in enumerate(y_true):
-            if j in matched_gt_indices:
-                continue
-            sim = jaccard_similarity(pred_label, gt_label)
-            if sim >= similarity_threshold:
-                matched_pred_indices.add(i)
-                matched_gt_indices.add(j)
-                break  # each gt matched once
-    total_correct = len(matched_pred_indices)
-    total_predicted = len(y_pred)
-    total_ground_truth = len(y_true)
+        return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)
+    def pairs_to_strings(rows: List[Dict[str, str]], value_key: str) -> List[str]:
+        paired_strings: List[str] = []
+        for row in rows or []:
+            doc_id = (row.get("doc_id") or "").strip()
+            value = (row.get(value_key) or "").strip()
+            if doc_id and value:
+                # keep doc association + allow token Jaccard
+                paired_strings.append(f"{doc_id} {value}")
+        return paired_strings
+    def score_list(ground_truth_items: List[str], predicted_items: List[str]) -> Dict[str, Union[float, int]]:
+        matched_ground_truth_indices: Set[int] = set()
+        matched_predicted_indices: Set[int] = set()
+        for predicted_index, predicted_item in enumerate(predicted_items):
+            for ground_truth_index, ground_truth_item in enumerate(ground_truth_items):
+                if ground_truth_index in matched_ground_truth_indices:
+                    continue
+                if jaccard_similarity(predicted_item, ground_truth_item) >= similarity_threshold:
+                    matched_predicted_indices.add(predicted_index)
+                    matched_ground_truth_indices.add(ground_truth_index)
+                    break
+        total_correct = len(matched_predicted_indices)
+        total_predicted = len(predicted_items)
+        total_ground_truth = len(ground_truth_items)
+        precision = total_correct / total_predicted if total_predicted else 0.0
+        recall = total_correct / total_ground_truth if total_ground_truth else 0.0
+        f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0
+        return {
+            "f1_score": f1,
+            "precision": precision,
+            "recall": recall,
+            "total_correct": total_correct,
+            "total_predicted": total_predicted,
+            "total_ground_truth": total_ground_truth,
+        }
+    ground_truth_terms = pairs_to_strings(y_true.get("terms", []), "term")
+    predicted_terms = pairs_to_strings(y_pred.get("terms", []), "term")
+    ground_truth_types = pairs_to_strings(y_true.get("types", []), "type")
+    predicted_types = pairs_to_strings(y_pred.get("types", []), "type")
+    terms_metrics = score_list(ground_truth_terms, predicted_terms)
+    types_metrics = score_list(ground_truth_types, predicted_types)
-    precision = total_correct / total_predicted if total_predicted > 0 else 0
-    recall = total_correct / total_ground_truth if total_ground_truth > 0 else 0
-    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
     return {
-        "f1_score": f1_score,
-        "precision": precision,
-        "recall": recall,
-        "total_correct": total_correct,
-        "total_predicted": total_predicted,
-        "total_ground_truth": total_ground_truth
+        "terms": terms_metrics,
+        "types": types_metrics,
     }
 def term_typing_metrics(y_true: List[Dict[str, List[str]]], y_pred: List[Dict[str, List[str]]]) -> Dict[str, float | int]:

{ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/__init__.py RENAMED Viewed

@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .llm import AutoLLMLearner, FalconLLM, MistralLLM
+from .llm import AutoLLMLearner, FalconLLM, MistralLLM, LogitMistralLLM, \
+                 QwenInstructLLM, QwenThinkingLLM, LogitAutoLLM, LogitQuantAutoLLM
 from .retriever import AutoRetrieverLearner, LLMAugmentedRetrieverLearner
-from .rag import AutoRAGLearner
+from .rag import AutoRAGLearner, LLMAugmentedRAGLearner
 from .prompt import StandardizedPrompting
 from .label_mapper import LabelMapper

{ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/label_mapper.py RENAMED Viewed

@@ -31,7 +31,7 @@ class LabelMapper:
                  ngram_range: Tuple=(1, 1),
                  label_dict: Dict[str, List[str]]=None,
                  analyzer: str = 'word',
-                 iterator_no: int = 100):
+                 iterator_no: int = 1000):
         """
         Initializes the TFIDFLabelMapper with a specified classifier and TF-IDF configuration.
@@ -45,11 +45,12 @@ class LabelMapper:
         if label_dict is None:
             label_dict = {
                 "yes": ["yes", "true"],
-                "no": ["no",  "false", " "]
+                "no": ["no",  "false"]
             }
-        self.labels = [label.lower() for label in list(label_dict.keys())]
+        self.label_dict = label_dict
+        self.labels = [label.lower() for label in list(self.label_dict.keys())]
         self.x_train, self.y_train = [], []
-        for label, candidates in label_dict.items():
+        for label, candidates in self.label_dict.items():
             self.x_train += [label] + candidates
             self.y_train += [label] * (len(candidates) + 1)
         self.x_train = iterator_no * self.x_train

OntoLearner 1.4.10__tar.gz → 1.5.0__tar.gz

OntoLearner 1.4.10__tar.gz → 1.5.0__tar.gz