OntoLearner 1.4.5.tar.gz → 1.4.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {ontolearner-1.4.5 → ontolearner-1.4.7}/PKG-INFO +2 -1
  2. ontolearner-1.4.7/ontolearner/VERSION +1 -0
  3. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/base/learner.py +8 -5
  4. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/evaluation/metrics.py +26 -13
  5. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/learner/__init__.py +1 -1
  6. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/learner/llm.py +73 -3
  7. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/learner/retriever.py +2 -3
  8. {ontolearner-1.4.5 → ontolearner-1.4.7}/pyproject.toml +2 -1
  9. ontolearner-1.4.5/ontolearner/VERSION +0 -1
  10. {ontolearner-1.4.5 → ontolearner-1.4.7}/LICENSE +0 -0
  11. {ontolearner-1.4.5 → ontolearner-1.4.7}/README.md +0 -0
  12. {ontolearner-1.4.5 → ontolearner-1.4.7}/images/logo.png +0 -0
  13. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/__init__.py +0 -0
  14. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/_learner.py +0 -0
  15. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/_ontology.py +0 -0
  16. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/base/__init__.py +0 -0
  17. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/base/ontology.py +0 -0
  18. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/base/text2onto.py +0 -0
  19. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/data_structure/__init__.py +0 -0
  20. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/data_structure/data.py +0 -0
  21. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/data_structure/metric.py +0 -0
  22. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/evaluation/__init__.py +0 -0
  23. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/evaluation/evaluate.py +0 -0
  24. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/learner/label_mapper.py +0 -0
  25. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/learner/prompt.py +0 -0
  26. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/learner/rag.py +0 -0
  27. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/__init__.py +0 -0
  28. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/agriculture.py +0 -0
  29. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/arts_humanities.py +0 -0
  30. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/biology.py +0 -0
  31. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/chemistry.py +0 -0
  32. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/ecology_environment.py +0 -0
  33. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/education.py +0 -0
  34. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/events.py +0 -0
  35. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/finance.py +0 -0
  36. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/food_beverage.py +0 -0
  37. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/general.py +0 -0
  38. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/geography.py +0 -0
  39. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/industry.py +0 -0
  40. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/law.py +0 -0
  41. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/library_cultural_heritage.py +0 -0
  42. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/material_science_engineering.py +0 -0
  43. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/medicine.py +0 -0
  44. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/news_media.py +0 -0
  45. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/scholarly_knowledge.py +0 -0
  46. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/social_sciences.py +0 -0
  47. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/units_measurements.py +0 -0
  48. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/upper_ontologies.py +0 -0
  49. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/ontology/web.py +0 -0
  50. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/processor.py +0 -0
  51. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/text2onto/__init__.py +0 -0
  52. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/text2onto/batchifier.py +0 -0
  53. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/text2onto/general.py +0 -0
  54. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/text2onto/splitter.py +0 -0
  55. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/text2onto/synthesizer.py +0 -0
  56. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/tools/__init__.py +0 -0
  57. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/tools/analyzer.py +0 -0
  58. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/tools/visualizer.py +0 -0
  59. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/utils/__init__.py +0 -0
  60. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/utils/io.py +0 -0
  61. {ontolearner-1.4.5 → ontolearner-1.4.7}/ontolearner/utils/train_test_split.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: OntoLearner
3
- Version: 1.4.5
3
+ Version: 1.4.7
4
4
  Summary: OntoLearner: A Modular Python Library for Ontology Learning with LLMs.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -17,6 +17,7 @@ Requires-Dist: bitsandbytes (>=0.45.1,<0.46.0)
17
17
  Requires-Dist: dspy (>=2.6.14,<3.0.0)
18
18
  Requires-Dist: huggingface-hub (>=0.34.4,<0.35.0)
19
19
  Requires-Dist: matplotlib
20
+ Requires-Dist: mistral-common[sentencepiece] (>=1.8.5,<2.0.0)
20
21
  Requires-Dist: networkx (==3.2.1)
21
22
  Requires-Dist: numpy
22
23
  Requires-Dist: openpyxl
@@ -0,0 +1 @@
1
+ 1.4.7
@@ -13,7 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  from abc import ABC
16
- from typing import Any, List, Optional
16
+ from typing import Any, List, Optional, Dict
17
17
  from transformers import AutoModelForCausalLM, AutoTokenizer
18
18
  import torch
19
19
  import torch.nn.functional as F
@@ -147,7 +147,7 @@ class AutoLearner(ABC):
147
147
  def _non_taxonomic_re(self, data: Any, test: bool = False) -> Optional[Any]:
148
148
  pass
149
149
 
150
- def tasks_data_former(self, data: Any, task: str, test: bool = False) -> Any:
150
+ def tasks_data_former(self, data: Any, task: str, test: bool = False) -> List[str | Dict[str, str]]:
151
151
  formatted_data = []
152
152
  if task == "term-typing":
153
153
  for typing in data.term_typings:
@@ -173,7 +173,7 @@ class AutoLearner(ABC):
173
173
  formatted_data = {"types": non_taxonomic_types, "relations": non_taxonomic_res}
174
174
  return formatted_data
175
175
 
176
- def tasks_ground_truth_former(self, data: Any, task: str) -> Any:
176
+ def tasks_ground_truth_former(self, data: Any, task: str) -> List[Dict[str, str]]:
177
177
  formatted_data = []
178
178
  if task == "term-typing":
179
179
  for typing in data.term_typings:
@@ -238,7 +238,7 @@ class AutoLLM(ABC):
238
238
  if self.device == "cpu":
239
239
  device_map = "cpu"
240
240
  else:
241
- device_map = "auto"
241
+ device_map = "balanced"
242
242
  self.model = AutoModelForCausalLM.from_pretrained(
243
243
  model_id,
244
244
  device_map=device_map,
@@ -271,7 +271,10 @@ class AutoLLM(ABC):
271
271
  Responses include the original input plus generated continuation.
272
272
  """
273
273
  # Tokenize inputs and move to device
274
- encoded_inputs = self.tokenizer(inputs, return_tensors="pt", padding=True).to(self.model.device)
274
+ encoded_inputs = self.tokenizer(inputs,
275
+ return_tensors="pt",
276
+ padding=True,
277
+ truncation=True).to(self.model.device)
275
278
  input_ids = encoded_inputs["input_ids"]
276
279
  input_length = input_ids.shape[1]
277
280
 
@@ -11,13 +11,12 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
-
15
- from typing import Dict
14
+ from typing import List, Dict, Tuple, Set
16
15
 
17
16
  SYMMETRIC_RELATIONS = {"equivalentclass", "sameas", "disjointwith"}
18
17
 
19
- def text2onto_metrics(y_true, y_pred, similarity_threshold: float = 0.8) -> Dict:
20
- def jaccard_similarity(a, b):
18
+ def text2onto_metrics(y_true: List[str], y_pred: List[str], similarity_threshold: float = 0.8) -> Dict[str, float | int]:
19
+ def jaccard_similarity(a: str, b: str) -> float:
21
20
  set_a = set(a.lower().split())
22
21
  set_b = set(b.lower().split())
23
22
  if not set_a and not set_b:
@@ -46,10 +45,13 @@ def text2onto_metrics(y_true, y_pred, similarity_threshold: float = 0.8) -> Dict
46
45
  return {
47
46
  "f1_score": f1_score,
48
47
  "precision": precision,
49
- "recall": recall
48
+ "recall": recall,
49
+ "total_correct": total_correct,
50
+ "total_predicted": total_predicted,
51
+ "total_ground_truth": total_ground_truth
50
52
  }
51
53
 
52
- def term_typing_metrics(y_true, y_pred) -> Dict:
54
+ def term_typing_metrics(y_true: List[Dict[str, List[str]]], y_pred: List[Dict[str, List[str]]]) -> Dict[str, float | int]:
53
55
  """
54
56
  Compute precision, recall, and F1-score for term typing
55
57
  using (term, type) pair-level matching instead of ID-based lookups.
@@ -77,13 +79,17 @@ def term_typing_metrics(y_true, y_pred) -> Dict:
77
79
  precision = total_correct / total_predicted if total_predicted > 0 else 0.0
78
80
  recall = total_correct / total_ground_truth if total_ground_truth > 0 else 0.0
79
81
  f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
82
+
80
83
  return {
81
84
  "f1_score": f1_score,
82
85
  "precision": precision,
83
- "recall": recall
86
+ "recall": recall,
87
+ "total_correct": total_correct,
88
+ "total_predicted": total_predicted,
89
+ "total_ground_truth": total_ground_truth
84
90
  }
85
91
 
86
- def taxonomy_discovery_metrics(y_true, y_pred) -> Dict:
92
+ def taxonomy_discovery_metrics(y_true: List[Dict[str, str]], y_pred: List[Dict[str, str]]) -> Dict[str, float | int]:
87
93
  total_predicted = len(y_pred)
88
94
  total_ground_truth = len(y_true)
89
95
  # Convert ground truth and predictions to sets of tuples for easy comparison
@@ -102,18 +108,22 @@ def taxonomy_discovery_metrics(y_true, y_pred) -> Dict:
102
108
  return {
103
109
  "f1_score": f1_score,
104
110
  "precision": precision,
105
- "recall": recall
111
+ "recall": recall,
112
+ "total_correct": total_correct,
113
+ "total_predicted": total_predicted,
114
+ "total_ground_truth": total_ground_truth
106
115
  }
107
116
 
108
- def non_taxonomic_re_metrics(y_true, y_pred) -> Dict:
109
- def normalize_triple(item):
117
+
118
+ def non_taxonomic_re_metrics(y_true: List[Dict[str, str]], y_pred: List[Dict[str, str]]) -> Dict[str, float | int]:
119
+ def normalize_triple(item: Dict[str, str]) -> Tuple[str, str, str]:
110
120
  return (
111
121
  item["head"].strip().lower(),
112
122
  item["relation"].strip().lower(),
113
123
  item["tail"].strip().lower()
114
124
  )
115
125
 
116
- def expand_symmetric(triples):
126
+ def expand_symmetric(triples: Set[Tuple[str, str, str]]) -> Set[Tuple[str, str, str]]:
117
127
  expanded = set()
118
128
  for h, r, t in triples:
119
129
  expanded.add((h, r, t))
@@ -136,5 +146,8 @@ def non_taxonomic_re_metrics(y_true, y_pred) -> Dict:
136
146
  return {
137
147
  "f1_score": f1_score,
138
148
  "precision": precision,
139
- "recall": recall
149
+ "recall": recall,
150
+ "total_correct": total_correct,
151
+ "total_predicted": total_predicted,
152
+ "total_ground_truth": total_ground_truth
140
153
  }
@@ -12,7 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from .llm import AutoLLMLearner
15
+ from .llm import AutoLLMLearner, FalconLLM, MistralLLM
16
16
  from .retriever import AutoRetrieverLearner
17
17
  from .rag import AutoRAGLearner
18
18
  from .prompt import StandardizedPrompting
@@ -13,23 +13,27 @@
13
13
  # limitations under the License.
14
14
 
15
15
  from ..base import AutoLLM, AutoLearner
16
- from typing import Any
16
+ from typing import Any, List
17
17
  import warnings
18
18
  from tqdm import tqdm
19
19
  from torch.utils.data import DataLoader
20
-
20
+ import torch
21
+ from transformers import Mistral3ForConditionalGeneration
22
+ from mistral_common.protocol.instruct.request import ChatCompletionRequest
23
+ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
21
24
 
22
25
  class AutoLLMLearner(AutoLearner):
23
26
 
24
27
  def __init__(self,
25
28
  prompting,
26
29
  label_mapper,
30
+ llm: AutoLLM = AutoLLM,
27
31
  token: str = "",
28
32
  max_new_tokens: int = 5,
29
33
  batch_size: int = 10,
30
34
  device='cpu') -> None:
31
35
  super().__init__()
32
- self.llm = AutoLLM(token=token, label_mapper=label_mapper, device=device)
36
+ self.llm = llm(token=token, label_mapper=label_mapper, device=device)
33
37
  self.prompting = prompting
34
38
  self.batch_size = batch_size
35
39
  self.max_new_tokens = max_new_tokens
@@ -136,3 +140,69 @@ class AutoLLMLearner(AutoLearner):
136
140
  return self._non_taxonomic_re_predict(dataset=dataset)
137
141
  else:
138
142
  warnings.warn("No requirement for fiting the non-taxonomic-re model, the predict module will use the input data to do the task.")
143
+
144
+
145
+ class FalconLLM(AutoLLM):
146
+
147
+ def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
148
+ encoded_inputs = self.tokenizer(inputs,
149
+ return_tensors="pt",
150
+ padding=True,
151
+ truncation=True).to(self.model.device)
152
+ input_ids = encoded_inputs["input_ids"]
153
+ input_length = input_ids.shape[1]
154
+ outputs = self.model.generate(
155
+ input_ids,
156
+ max_new_tokens=max_new_tokens,
157
+ pad_token_id=self.tokenizer.eos_token_id
158
+ )
159
+ generated_tokens = outputs[:, input_length:]
160
+ decoded_outputs = [self.tokenizer.decode(g, skip_special_tokens=True).strip() for g in generated_tokens]
161
+ return self.label_mapper.predict(decoded_outputs)
162
+
163
+ class MistralLLM(AutoLLM):
164
+
165
+ def load(self, model_id: str) -> None:
166
+ self.tokenizer = MistralTokenizer.from_hf_hub(model_id)
167
+ if self.device == "cpu":
168
+ device_map = "cpu"
169
+ else:
170
+ device_map = "balanced"
171
+ self.model = Mistral3ForConditionalGeneration.from_pretrained(
172
+ model_id,
173
+ device_map=device_map,
174
+ torch_dtype=torch.bfloat16,
175
+ token=self.token
176
+ )
177
+ if not hasattr(self.tokenizer, "pad_token_id") or self.tokenizer.pad_token_id is None:
178
+ self.tokenizer.pad_token_id = self.model.generation_config.eos_token_id
179
+ self.label_mapper.fit()
180
+
181
+ def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
182
+ tokenized_list = []
183
+ for prompt in inputs:
184
+ messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
185
+ tokenized = self.tokenizer.encode_chat_completion(ChatCompletionRequest(messages=messages))
186
+ tokenized_list.append(tokenized.tokens)
187
+ max_len = max(len(tokens) for tokens in tokenized_list)
188
+ input_ids, attention_masks = [], []
189
+ for tokens in tokenized_list:
190
+ pad_length = max_len - len(tokens)
191
+ input_ids.append(tokens + [self.tokenizer.pad_token_id] * pad_length)
192
+ attention_masks.append([1] * len(tokens) + [0] * pad_length)
193
+
194
+ input_ids = torch.tensor(input_ids).to(self.model.device)
195
+ attention_masks = torch.tensor(attention_masks).to(self.model.device)
196
+
197
+ outputs =self.model.generate(
198
+ input_ids=input_ids,
199
+ attention_mask=attention_masks,
200
+ eos_token_id=self.model.generation_config.eos_token_id,
201
+ pad_token_id=self.tokenizer.pad_token_id,
202
+ max_new_tokens=max_new_tokens,
203
+ )
204
+ decoded_outputs = []
205
+ for i, tokens in enumerate(outputs):
206
+ output_text = self.tokenizer.decode(tokens[len(tokenized_list[i]):])
207
+ decoded_outputs.append(output_text)
208
+ return self.label_mapper.predict(decoded_outputs)
@@ -22,7 +22,6 @@ class AutoRetrieverLearner(AutoLearner):
22
22
  self.retriever = base_retriever
23
23
  self.top_k = top_k
24
24
  self._is_term_typing_fit = False
25
- self._is_taxonomy_discovery_fit = False
26
25
  self._batch_size = batch_size
27
26
 
28
27
  def load(self, model_id: str = "sentence-transformers/all-MiniLM-L6-v2"):
@@ -64,9 +63,9 @@ class AutoRetrieverLearner(AutoLearner):
64
63
  if test:
65
64
  self._retriever_fit(data=data)
66
65
  candidates_lst = self._retriever_predict(data=data, top_k=self.top_k + 1)
67
- taxonomic_pairs = [{"parent": query, "child": candidate}
66
+ taxonomic_pairs = [{"parent": candidate, "child": query}
68
67
  for query, candidates in zip(data, candidates_lst)
69
- for candidate in candidates if candidate != query]
68
+ for candidate in candidates if candidate.lower() != query.lower()]
70
69
  return taxonomic_pairs
71
70
  else:
72
71
  warnings.warn("No requirement for fiting the taxonomy discovery model, the predict module will use the input data to do the fit as well.")
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "OntoLearner"
3
- version = "1.4.5"
3
+ version = "1.4.7"
4
4
  description = "OntoLearner: A Modular Python Library for Ontology Learning with LLMs."
5
5
  authors = ["Hamed Babaei Giglou <hamedbabaeigiglou@gmail.com>", "Andrei C. Aioanei <andrei.c.aioanei@gmail.com>"]
6
6
  license = "MIT License"
@@ -29,6 +29,7 @@ transformers = "^4.56.0"
29
29
  sentence-transformers = "^5.1.0"
30
30
  dspy = "^2.6.14"
31
31
  bitsandbytes="^0.45.1"
32
+ mistral-common = { version = "^1.8.5", extras = ["sentencepiece"] }
32
33
 
33
34
  [tool.poetry.dev-dependencies]
34
35
  ruff = "*"
@@ -1 +0,0 @@
1
- 1.4.5
File without changes
File without changes
File without changes