OntoLearner 1.4.6-py3-none-any.whl → 1.4.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ontolearner/VERSION +1 -1
- ontolearner/base/learner.py +20 -14
- ontolearner/learner/__init__.py +1 -1
- ontolearner/learner/label_mapper.py +1 -1
- ontolearner/learner/llm.py +73 -3
- ontolearner/learner/retriever.py +24 -3
- ontolearner/learner/taxonomy_discovery/__init__.py +18 -0
- ontolearner/learner/taxonomy_discovery/alexbek.py +500 -0
- ontolearner/learner/taxonomy_discovery/rwthdbis.py +1082 -0
- ontolearner/learner/taxonomy_discovery/sbunlp.py +402 -0
- ontolearner/learner/taxonomy_discovery/skhnlp.py +1138 -0
- ontolearner/learner/term_typing/__init__.py +17 -0
- ontolearner/learner/term_typing/alexbek.py +1262 -0
- ontolearner/learner/term_typing/rwthdbis.py +379 -0
- ontolearner/learner/term_typing/sbunlp.py +478 -0
- ontolearner/learner/text2onto/__init__.py +16 -0
- ontolearner/learner/text2onto/alexbek.py +1219 -0
- ontolearner/learner/text2onto/sbunlp.py +598 -0
- {ontolearner-1.4.6.dist-info → ontolearner-1.4.8.dist-info}/METADATA +5 -1
- {ontolearner-1.4.6.dist-info → ontolearner-1.4.8.dist-info}/RECORD +22 -10
- {ontolearner-1.4.6.dist-info → ontolearner-1.4.8.dist-info}/WHEEL +0 -0
- {ontolearner-1.4.6.dist-info → ontolearner-1.4.8.dist-info}/licenses/LICENSE +0 -0
ontolearner/VERSION
CHANGED

@@ -1 +1 @@
-1.4.6
+1.4.8
ontolearner/base/learner.py
CHANGED

@@ -236,15 +236,21 @@ class AutoLLM(ABC):
         self.tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left', token=self.token)
         self.tokenizer.pad_token = self.tokenizer.eos_token
         if self.device == "cpu":
-            device_map = "cpu"
+            # device_map = "cpu"
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                # device_map=device_map,
+                torch_dtype=torch.bfloat16,
+                token=self.token
+            )
         else:
-            device_map = "balanced"
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            device_map=device_map,
-            torch_dtype=torch.bfloat16,
-            token=self.token
-        )
+            device_map = "balanced"
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                device_map=device_map,
+                torch_dtype=torch.bfloat16,
+                token=self.token
+            )
         self.label_mapper.fit()

     def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:

@@ -271,7 +277,10 @@
         Responses include the original input plus generated continuation.
         """
         # Tokenize inputs and move to device
-        encoded_inputs = self.tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(self.model.device)
+        encoded_inputs = self.tokenizer(inputs,
+                                        return_tensors="pt",
+                                        padding=True,
+                                        truncation=True).to(self.model.device)
         input_ids = encoded_inputs["input_ids"]
         input_length = input_ids.shape[1]

@@ -287,7 +296,8 @@

         # Decode only the generated part
         decoded_outputs = [self.tokenizer.decode(g, skip_special_tokens=True).strip() for g in generated_tokens]
-
+        print(decoded_outputs)
+        print(self.label_mapper.predict(decoded_outputs))
         # Map the decoded text to labels
         return self.label_mapper.predict(decoded_outputs)

@@ -298,9 +308,6 @@ class AutoRetriever(ABC):
     This class defines the interface for retrieval components used in ontology learning.
     Retrievers are responsible for finding semantically similar examples from training
     data to provide context for language models or to make direct predictions.
-
-    Attributes:
-        model: The loaded retrieval/embedding model instance.
     """

@@ -310,7 +317,6 @@
        Sets up the basic structure with a model attribute that will be
        populated when load() is called.
        """
-        self.model: Optional[Any] = None
        self.embedding_model = None
        self.documents = []
        self.embeddings = None
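The load() change above builds the model inside each branch: on CPU it now omits device_map entirely, while the multi-GPU path keeps device_map="balanced" (which requires the accelerate package alongside transformers). The reworked generate() keeps the left-pad-then-slice decoding pattern. A minimal standalone sketch of that pattern, using gpt2 purely as a stand-in model rather than an OntoLearner default:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "gpt2"  # stand-in; AutoLLM.load() receives the caller's model_id

tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token  # GPT-style models ship without a pad token
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = ["The capital of France is", "Water boils at"]
encoded = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True)
input_length = encoded["input_ids"].shape[1]

outputs = model.generate(**encoded, max_new_tokens=5, pad_token_id=tokenizer.eos_token_id)
# Left padding keeps every prompt flush against its continuation, so slicing at
# input_length isolates exactly the newly generated tokens for each batch row.
continuations = [tokenizer.decode(g, skip_special_tokens=True).strip()
                 for g in outputs[:, input_length:]]
print(continuations)
```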
ontolearner/learner/__init__.py
CHANGED

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from .llm import AutoLLMLearner
+from .llm import AutoLLMLearner, FalconLLM, MistralLLM
 from .retriever import AutoRetrieverLearner
 from .rag import AutoRAGLearner
 from .prompt import StandardizedPrompting
ontolearner/learner/llm.py
CHANGED

@@ -13,23 +13,27 @@
 # limitations under the License.

 from ..base import AutoLLM, AutoLearner
-from typing import Any
+from typing import Any, List
 import warnings
 from tqdm import tqdm
 from torch.utils.data import DataLoader
-
+import torch
+from transformers import Mistral3ForConditionalGeneration
+from mistral_common.protocol.instruct.request import ChatCompletionRequest
+from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

 class AutoLLMLearner(AutoLearner):

     def __init__(self,
                  prompting,
                  label_mapper,
+                 llm: AutoLLM = AutoLLM,
                  token: str = "",
                  max_new_tokens: int = 5,
                  batch_size: int = 10,
                  device='cpu') -> None:
         super().__init__()
-        self.llm = AutoLLM(token=token, label_mapper=label_mapper, device=device)
+        self.llm = llm(token=token, label_mapper=label_mapper, device=device)
         self.prompting = prompting
         self.batch_size = batch_size
         self.max_new_tokens = max_new_tokens

@@ -136,3 +140,69 @@ class AutoLLMLearner(AutoLearner):
             return self._non_taxonomic_re_predict(dataset=dataset)
         else:
             warnings.warn("No requirement for fiting the non-taxonomic-re model, the predict module will use the input data to do the task.")
+
+
+class FalconLLM(AutoLLM):
+
+    def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
+        encoded_inputs = self.tokenizer(inputs,
+                                        return_tensors="pt",
+                                        padding=True,
+                                        truncation=True).to(self.model.device)
+        input_ids = encoded_inputs["input_ids"]
+        input_length = input_ids.shape[1]
+        outputs = self.model.generate(
+            input_ids,
+            max_new_tokens=max_new_tokens,
+            pad_token_id=self.tokenizer.eos_token_id
+        )
+        generated_tokens = outputs[:, input_length:]
+        decoded_outputs = [self.tokenizer.decode(g, skip_special_tokens=True).strip() for g in generated_tokens]
+        return self.label_mapper.predict(decoded_outputs)
+
+
+class MistralLLM(AutoLLM):
+
+    def load(self, model_id: str) -> None:
+        self.tokenizer = MistralTokenizer.from_hf_hub(model_id)
+        if self.device == "cpu":
+            device_map = "cpu"
+        else:
+            device_map = "balanced"
+        self.model = Mistral3ForConditionalGeneration.from_pretrained(
+            model_id,
+            device_map=device_map,
+            torch_dtype=torch.bfloat16,
+            token=self.token
+        )
+        if not hasattr(self.tokenizer, "pad_token_id") or self.tokenizer.pad_token_id is None:
+            self.tokenizer.pad_token_id = self.model.generation_config.eos_token_id
+        self.label_mapper.fit()
+
+    def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
+        tokenized_list = []
+        for prompt in inputs:
+            messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
+            tokenized = self.tokenizer.encode_chat_completion(ChatCompletionRequest(messages=messages))
+            tokenized_list.append(tokenized.tokens)
+        max_len = max(len(tokens) for tokens in tokenized_list)
+        input_ids, attention_masks = [], []
+        for tokens in tokenized_list:
+            pad_length = max_len - len(tokens)
+            input_ids.append(tokens + [self.tokenizer.pad_token_id] * pad_length)
+            attention_masks.append([1] * len(tokens) + [0] * pad_length)
+
+        input_ids = torch.tensor(input_ids).to(self.model.device)
+        attention_masks = torch.tensor(attention_masks).to(self.model.device)
+
+        outputs = self.model.generate(
+            input_ids=input_ids,
+            attention_mask=attention_masks,
+            eos_token_id=self.model.generation_config.eos_token_id,
+            pad_token_id=self.tokenizer.pad_token_id,
+            max_new_tokens=max_new_tokens,
+        )
+        decoded_outputs = []
+        for i, tokens in enumerate(outputs):
+            output_text = self.tokenizer.decode(tokens[len(tokenized_list[i]):])
+            decoded_outputs.append(output_text)
+        return self.label_mapper.predict(decoded_outputs)
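Combined with the re-exports in learner/__init__.py above, the new llm parameter makes the generation backend a pluggable class. A hedged usage sketch; the stub label mapper below is a hypothetical stand-in for OntoLearner's real label-mapper objects, whose constructors this diff does not show:

```python
from ontolearner.learner import AutoLLMLearner, MistralLLM

class StubLabelMapper:
    # Hypothetical stand-in; real mappers come from ontolearner.learner.label_mapper.
    def fit(self):
        pass
    def predict(self, texts):
        return texts

learner = AutoLLMLearner(
    prompting=None,                 # placeholder; callers pass e.g. StandardizedPrompting
    label_mapper=StubLabelMapper(),
    llm=MistralLLM,                 # a class, not an instance; the default stays AutoLLM
    token="",                       # Hugging Face token, needed for gated checkpoints
    device="cpu",
)
```

Passing FalconLLM (or keeping the default AutoLLM) swaps the backend without touching the learner code.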
ontolearner/learner/retriever.py
CHANGED

@@ -66,7 +66,16 @@ class AutoRetrieverLearner(AutoLearner):
             taxonomic_pairs = [{"parent": candidate, "child": query}
                                for query, candidates in zip(data, candidates_lst)
                                for candidate in candidates if candidate.lower() != query.lower()]
-            return taxonomic_pairs
+            taxonomic_pairs += [{"parent": query, "child": candidate}
+                                for query, candidates in zip(data, candidates_lst)
+                                for candidate in candidates if candidate.lower() != query.lower()]
+            unique_taxonomic_pairs, seen = [], set()
+            for pair in taxonomic_pairs:
+                key = (pair["parent"].lower(), pair["child"].lower())  # Directional key (parent, child)
+                if key not in seen:
+                    seen.add(key)
+                    unique_taxonomic_pairs.append(pair)
+            return unique_taxonomic_pairs
         else:
             warnings.warn("No requirement for fiting the taxonomy discovery model, the predict module will use the input data to do the fit as well.")

@@ -86,11 +95,23 @@
             candidates_lst = self._retriever_predict(data=data['types'], top_k=self.top_k + 1)
             taxonomic_pairs = []
             taxonomic_pairs_query = []
+            seen = set()
             for query, candidates in zip(data['types'], candidates_lst):
                 for candidate in candidates:
                     if candidate != query:
-                        taxonomic_pairs.append((query, candidate))
-                        taxonomic_pairs_query.append(f"Head: {query}\nTail: {candidate}")
+                        # Directional pair 1: query -> candidate
+                        key1 = (query.lower(), candidate.lower())
+                        if key1 not in seen:
+                            seen.add(key1)
+                            taxonomic_pairs.append((query, candidate))
+                            taxonomic_pairs_query.append(f"Head: {query}\nTail: {candidate}")
+                        # Directional pair 2: candidate -> query
+                        key2 = (candidate.lower(), query.lower())
+                        if key2 not in seen:
+                            seen.add(key2)
+                            taxonomic_pairs.append((candidate, query))
+                            taxonomic_pairs_query.append(f"Head: {candidate}\nTail: {query}")
+
             self._retriever_fit(data=data['relations'])
             candidate_relations_lst = self._retriever_predict(data=taxonomic_pairs_query, top_k=self.top_k)
             non_taxonomic_re = [{"head": head, "tail": tail, "relation": relation}
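The de-duplication key added in both hunks is directional: (parent, child) and (child, parent) survive as distinct pairs, while case variants of the same directed pair collapse. A self-contained illustration with made-up pair values:

```python
pairs = [
    {"parent": "Animal", "child": "dog"},
    {"parent": "animal", "child": "Dog"},  # same direction, case variant -> dropped
    {"parent": "dog", "child": "Animal"},  # reversed direction -> kept
]
unique, seen = [], set()
for pair in pairs:
    key = (pair["parent"].lower(), pair["child"].lower())  # direction-sensitive key
    if key not in seen:
        seen.add(key)
        unique.append(pair)
print(unique)
# [{'parent': 'Animal', 'child': 'dog'}, {'parent': 'dog', 'child': 'Animal'}]
```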
ontolearner/learner/taxonomy_discovery/__init__.py
ADDED

@@ -0,0 +1,18 @@
+# Copyright (c) 2025 SciKnowOrg
+#
+# Licensed under the MIT License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/MIT
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .alexbek import AlexbekCrossAttnLearner
+from .rwthdbis import RWTHDBISSFTLearner
+from .sbunlp import SBUNLPFewShotLearner
+from .skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner