OntoLearner 1.4.6.tar.gz → 1.4.8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. {ontolearner-1.4.6 → ontolearner-1.4.8}/PKG-INFO +5 -1
  2. ontolearner-1.4.8/ontolearner/VERSION +1 -0
  3. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/base/learner.py +20 -14
  4. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/learner/__init__.py +1 -1
  5. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/learner/label_mapper.py +1 -1
  6. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/learner/llm.py +73 -3
  7. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/learner/retriever.py +24 -3
  8. ontolearner-1.4.8/ontolearner/learner/taxonomy_discovery/__init__.py +18 -0
  9. ontolearner-1.4.8/ontolearner/learner/taxonomy_discovery/alexbek.py +500 -0
  10. ontolearner-1.4.8/ontolearner/learner/taxonomy_discovery/rwthdbis.py +1082 -0
  11. ontolearner-1.4.8/ontolearner/learner/taxonomy_discovery/sbunlp.py +402 -0
  12. ontolearner-1.4.8/ontolearner/learner/taxonomy_discovery/skhnlp.py +1138 -0
  13. ontolearner-1.4.8/ontolearner/learner/term_typing/__init__.py +17 -0
  14. ontolearner-1.4.8/ontolearner/learner/term_typing/alexbek.py +1262 -0
  15. ontolearner-1.4.8/ontolearner/learner/term_typing/rwthdbis.py +379 -0
  16. ontolearner-1.4.8/ontolearner/learner/term_typing/sbunlp.py +478 -0
  17. ontolearner-1.4.8/ontolearner/learner/text2onto/__init__.py +16 -0
  18. ontolearner-1.4.8/ontolearner/learner/text2onto/alexbek.py +1219 -0
  19. ontolearner-1.4.8/ontolearner/learner/text2onto/sbunlp.py +598 -0
  20. {ontolearner-1.4.6 → ontolearner-1.4.8}/pyproject.toml +5 -1
  21. ontolearner-1.4.6/ontolearner/VERSION +0 -1
  22. {ontolearner-1.4.6 → ontolearner-1.4.8}/LICENSE +0 -0
  23. {ontolearner-1.4.6 → ontolearner-1.4.8}/README.md +0 -0
  24. {ontolearner-1.4.6 → ontolearner-1.4.8}/images/logo.png +0 -0
  25. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/__init__.py +0 -0
  26. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/_learner.py +0 -0
  27. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/_ontology.py +0 -0
  28. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/base/__init__.py +0 -0
  29. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/base/ontology.py +0 -0
  30. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/base/text2onto.py +0 -0
  31. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/data_structure/__init__.py +0 -0
  32. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/data_structure/data.py +0 -0
  33. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/data_structure/metric.py +0 -0
  34. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/evaluation/__init__.py +0 -0
  35. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/evaluation/evaluate.py +0 -0
  36. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/evaluation/metrics.py +0 -0
  37. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/learner/prompt.py +0 -0
  38. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/learner/rag.py +0 -0
  39. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/__init__.py +0 -0
  40. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/agriculture.py +0 -0
  41. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/arts_humanities.py +0 -0
  42. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/biology.py +0 -0
  43. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/chemistry.py +0 -0
  44. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/ecology_environment.py +0 -0
  45. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/education.py +0 -0
  46. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/events.py +0 -0
  47. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/finance.py +0 -0
  48. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/food_beverage.py +0 -0
  49. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/general.py +0 -0
  50. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/geography.py +0 -0
  51. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/industry.py +0 -0
  52. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/law.py +0 -0
  53. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/library_cultural_heritage.py +0 -0
  54. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/material_science_engineering.py +0 -0
  55. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/medicine.py +0 -0
  56. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/news_media.py +0 -0
  57. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/scholarly_knowledge.py +0 -0
  58. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/social_sciences.py +0 -0
  59. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/units_measurements.py +0 -0
  60. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/upper_ontologies.py +0 -0
  61. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/ontology/web.py +0 -0
  62. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/processor.py +0 -0
  63. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/text2onto/__init__.py +0 -0
  64. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/text2onto/batchifier.py +0 -0
  65. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/text2onto/general.py +0 -0
  66. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/text2onto/splitter.py +0 -0
  67. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/text2onto/synthesizer.py +0 -0
  68. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/tools/__init__.py +0 -0
  69. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/tools/analyzer.py +0 -0
  70. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/tools/visualizer.py +0 -0
  71. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/utils/__init__.py +0 -0
  72. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/utils/io.py +0 -0
  73. {ontolearner-1.4.6 → ontolearner-1.4.8}/ontolearner/utils/train_test_split.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: OntoLearner
3
- Version: 1.4.6
3
+ Version: 1.4.8
4
4
  Summary: OntoLearner: A Modular Python Library for Ontology Learning with LLMs.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -13,15 +13,19 @@ Classifier: Programming Language :: Python :: 3.10
13
13
  Classifier: Programming Language :: Python :: 3.11
14
14
  Classifier: Programming Language :: Python :: 3.12
15
15
  Classifier: Programming Language :: Python :: 3.13
16
+ Requires-Dist: Levenshtein
16
17
  Requires-Dist: bitsandbytes (>=0.45.1,<0.46.0)
17
18
  Requires-Dist: dspy (>=2.6.14,<3.0.0)
19
+ Requires-Dist: g4f
18
20
  Requires-Dist: huggingface-hub (>=0.34.4,<0.35.0)
19
21
  Requires-Dist: matplotlib
22
+ Requires-Dist: mistral-common[sentencepiece] (>=1.8.5,<2.0.0)
20
23
  Requires-Dist: networkx (==3.2.1)
21
24
  Requires-Dist: numpy
22
25
  Requires-Dist: openpyxl
23
26
  Requires-Dist: pandas
24
27
  Requires-Dist: pathlib (==1.0.1)
28
+ Requires-Dist: protobuf (<5)
25
29
  Requires-Dist: pydantic (==2.11.3)
26
30
  Requires-Dist: python-dotenv
27
31
  Requires-Dist: rdflib (==7.1.1)
@@ -0,0 +1 @@
1
+ 1.4.8
@@ -236,15 +236,21 @@ class AutoLLM(ABC):
236
236
  self.tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left', token=self.token)
237
237
  self.tokenizer.pad_token = self.tokenizer.eos_token
238
238
  if self.device == "cpu":
239
- device_map = "cpu"
239
+ # device_map = "cpu"
240
+ self.model = AutoModelForCausalLM.from_pretrained(
241
+ model_id,
242
+ # device_map=device_map,
243
+ torch_dtype=torch.bfloat16,
244
+ token=self.token
245
+ )
240
246
  else:
241
- device_map = "auto"
242
- self.model = AutoModelForCausalLM.from_pretrained(
243
- model_id,
244
- device_map=device_map,
245
- torch_dtype=torch.bfloat16,
246
- token=self.token
247
- )
247
+ device_map = "balanced"
248
+ self.model = AutoModelForCausalLM.from_pretrained(
249
+ model_id,
250
+ device_map=device_map,
251
+ torch_dtype=torch.bfloat16,
252
+ token=self.token
253
+ )
248
254
  self.label_mapper.fit()
249
255
 
250
256
  def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
@@ -271,7 +277,10 @@ class AutoLLM(ABC):
271
277
  Responses include the original input plus generated continuation.
272
278
  """
273
279
  # Tokenize inputs and move to device
274
- encoded_inputs = self.tokenizer(inputs, return_tensors="pt", padding=True).to(self.model.device)
280
+ encoded_inputs = self.tokenizer(inputs,
281
+ return_tensors="pt",
282
+ padding=True,
283
+ truncation=True).to(self.model.device)
275
284
  input_ids = encoded_inputs["input_ids"]
276
285
  input_length = input_ids.shape[1]
277
286
 
@@ -287,7 +296,8 @@ class AutoLLM(ABC):
287
296
 
288
297
  # Decode only the generated part
289
298
  decoded_outputs = [self.tokenizer.decode(g, skip_special_tokens=True).strip() for g in generated_tokens]
290
-
299
+ print(decoded_outputs)
300
+ print(self.label_mapper.predict(decoded_outputs))
291
301
  # Map the decoded text to labels
292
302
  return self.label_mapper.predict(decoded_outputs)
293
303
 
@@ -298,9 +308,6 @@ class AutoRetriever(ABC):
298
308
  This class defines the interface for retrieval components used in ontology learning.
299
309
  Retrievers are responsible for finding semantically similar examples from training
300
310
  data to provide context for language models or to make direct predictions.
301
-
302
- Attributes:
303
- model: The loaded retrieval/embedding model instance.
304
311
  """
305
312
 
306
313
  def __init__(self) -> None:
@@ -310,7 +317,6 @@ class AutoRetriever(ABC):
310
317
  Sets up the basic structure with a model attribute that will be
311
318
  populated when load() is called.
312
319
  """
313
- self.model: Optional[Any] = None
314
320
  self.embedding_model = None
315
321
  self.documents = []
316
322
  self.embeddings = None
@@ -12,7 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from .llm import AutoLLMLearner
15
+ from .llm import AutoLLMLearner, FalconLLM, MistralLLM
16
16
  from .retriever import AutoRetrieverLearner
17
17
  from .rag import AutoRAGLearner
18
18
  from .prompt import StandardizedPrompting
@@ -85,6 +85,6 @@ class LabelMapper:
85
85
  Returns:
86
86
  List[str]: Predicted labels.
87
87
  """
88
- predictions = list(self.model.predict(X))
88
+ predictions = self.model.predict(X).tolist()
89
89
  self.validate_predicts(predictions)
90
90
  return predictions
@@ -13,23 +13,27 @@
13
13
  # limitations under the License.
14
14
 
15
15
  from ..base import AutoLLM, AutoLearner
16
- from typing import Any
16
+ from typing import Any, List
17
17
  import warnings
18
18
  from tqdm import tqdm
19
19
  from torch.utils.data import DataLoader
20
-
20
+ import torch
21
+ from transformers import Mistral3ForConditionalGeneration
22
+ from mistral_common.protocol.instruct.request import ChatCompletionRequest
23
+ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
21
24
 
22
25
  class AutoLLMLearner(AutoLearner):
23
26
 
24
27
  def __init__(self,
25
28
  prompting,
26
29
  label_mapper,
30
+ llm: AutoLLM = AutoLLM,
27
31
  token: str = "",
28
32
  max_new_tokens: int = 5,
29
33
  batch_size: int = 10,
30
34
  device='cpu') -> None:
31
35
  super().__init__()
32
- self.llm = AutoLLM(token=token, label_mapper=label_mapper, device=device)
36
+ self.llm = llm(token=token, label_mapper=label_mapper, device=device)
33
37
  self.prompting = prompting
34
38
  self.batch_size = batch_size
35
39
  self.max_new_tokens = max_new_tokens
@@ -136,3 +140,69 @@ class AutoLLMLearner(AutoLearner):
136
140
  return self._non_taxonomic_re_predict(dataset=dataset)
137
141
  else:
138
142
  warnings.warn("No requirement for fiting the non-taxonomic-re model, the predict module will use the input data to do the task.")
143
+
144
+
145
+ class FalconLLM(AutoLLM):
146
+
147
+ def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
148
+ encoded_inputs = self.tokenizer(inputs,
149
+ return_tensors="pt",
150
+ padding=True,
151
+ truncation=True).to(self.model.device)
152
+ input_ids = encoded_inputs["input_ids"]
153
+ input_length = input_ids.shape[1]
154
+ outputs = self.model.generate(
155
+ input_ids,
156
+ max_new_tokens=max_new_tokens,
157
+ pad_token_id=self.tokenizer.eos_token_id
158
+ )
159
+ generated_tokens = outputs[:, input_length:]
160
+ decoded_outputs = [self.tokenizer.decode(g, skip_special_tokens=True).strip() for g in generated_tokens]
161
+ return self.label_mapper.predict(decoded_outputs)
162
+
163
+ class MistralLLM(AutoLLM):
164
+
165
+ def load(self, model_id: str) -> None:
166
+ self.tokenizer = MistralTokenizer.from_hf_hub(model_id)
167
+ if self.device == "cpu":
168
+ device_map = "cpu"
169
+ else:
170
+ device_map = "balanced"
171
+ self.model = Mistral3ForConditionalGeneration.from_pretrained(
172
+ model_id,
173
+ device_map=device_map,
174
+ torch_dtype=torch.bfloat16,
175
+ token=self.token
176
+ )
177
+ if not hasattr(self.tokenizer, "pad_token_id") or self.tokenizer.pad_token_id is None:
178
+ self.tokenizer.pad_token_id = self.model.generation_config.eos_token_id
179
+ self.label_mapper.fit()
180
+
181
+ def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
182
+ tokenized_list = []
183
+ for prompt in inputs:
184
+ messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
185
+ tokenized = self.tokenizer.encode_chat_completion(ChatCompletionRequest(messages=messages))
186
+ tokenized_list.append(tokenized.tokens)
187
+ max_len = max(len(tokens) for tokens in tokenized_list)
188
+ input_ids, attention_masks = [], []
189
+ for tokens in tokenized_list:
190
+ pad_length = max_len - len(tokens)
191
+ input_ids.append(tokens + [self.tokenizer.pad_token_id] * pad_length)
192
+ attention_masks.append([1] * len(tokens) + [0] * pad_length)
193
+
194
+ input_ids = torch.tensor(input_ids).to(self.model.device)
195
+ attention_masks = torch.tensor(attention_masks).to(self.model.device)
196
+
197
+ outputs =self.model.generate(
198
+ input_ids=input_ids,
199
+ attention_mask=attention_masks,
200
+ eos_token_id=self.model.generation_config.eos_token_id,
201
+ pad_token_id=self.tokenizer.pad_token_id,
202
+ max_new_tokens=max_new_tokens,
203
+ )
204
+ decoded_outputs = []
205
+ for i, tokens in enumerate(outputs):
206
+ output_text = self.tokenizer.decode(tokens[len(tokenized_list[i]):])
207
+ decoded_outputs.append(output_text)
208
+ return self.label_mapper.predict(decoded_outputs)
@@ -66,7 +66,16 @@ class AutoRetrieverLearner(AutoLearner):
66
66
  taxonomic_pairs = [{"parent": candidate, "child": query}
67
67
  for query, candidates in zip(data, candidates_lst)
68
68
  for candidate in candidates if candidate.lower() != query.lower()]
69
- return taxonomic_pairs
69
+ taxonomic_pairs += [{"parent": query, "child": candidate}
70
+ for query, candidates in zip(data, candidates_lst)
71
+ for candidate in candidates if candidate.lower() != query.lower()]
72
+ unique_taxonomic_pairs, seen = [], set()
73
+ for pair in taxonomic_pairs:
74
+ key = (pair["parent"].lower(), pair["child"].lower()) # Directional key (parent, child)
75
+ if key not in seen:
76
+ seen.add(key)
77
+ unique_taxonomic_pairs.append(pair)
78
+ return unique_taxonomic_pairs
70
79
  else:
71
80
  warnings.warn("No requirement for fiting the taxonomy discovery model, the predict module will use the input data to do the fit as well.")
72
81
 
@@ -86,11 +95,23 @@ class AutoRetrieverLearner(AutoLearner):
86
95
  candidates_lst = self._retriever_predict(data=data['types'], top_k=self.top_k + 1)
87
96
  taxonomic_pairs = []
88
97
  taxonomic_pairs_query = []
98
+ seen = set()
89
99
  for query, candidates in zip(data['types'], candidates_lst):
90
100
  for candidate in candidates:
91
101
  if candidate != query:
92
- taxonomic_pairs.append((query, candidate))
93
- taxonomic_pairs_query.append(f"Head: {query} \n Tail: {candidate}")
102
+ # Directional pair 1: query -> candidate
103
+ key1 = (query.lower(), candidate.lower())
104
+ if key1 not in seen:
105
+ seen.add(key1)
106
+ taxonomic_pairs.append((query, candidate))
107
+ taxonomic_pairs_query.append(f"Head: {query}\nTail: {candidate}")
108
+ # Directional pair 2: candidate -> query
109
+ key2 = (candidate.lower(), query.lower())
110
+ if key2 not in seen:
111
+ seen.add(key2)
112
+ taxonomic_pairs.append((candidate, query))
113
+ taxonomic_pairs_query.append(f"Head: {candidate}\nTail: {query}")
114
+
94
115
  self._retriever_fit(data=data['relations'])
95
116
  candidate_relations_lst = self._retriever_predict(data=taxonomic_pairs_query, top_k=self.top_k)
96
117
  non_taxonomic_re = [{"head": head, "tail": tail, "relation": relation}
@@ -0,0 +1,18 @@
1
+ # Copyright (c) 2025 SciKnowOrg
2
+ #
3
+ # Licensed under the MIT License (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://opensource.org/licenses/MIT
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .alexbek import AlexbekCrossAttnLearner
16
+ from .rwthdbis import RWTHDBISSFTLearner
17
+ from .sbunlp import SBUNLPFewShotLearner
18
+ from .skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner