OntoLearner 1.4.7-py3-none-any.whl → 1.4.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ontolearner/VERSION +1 -1
- ontolearner/base/learner.py +15 -12
- ontolearner/learner/label_mapper.py +1 -1
- ontolearner/learner/retriever.py +24 -3
- ontolearner/learner/taxonomy_discovery/__init__.py +18 -0
- ontolearner/learner/taxonomy_discovery/alexbek.py +500 -0
- ontolearner/learner/taxonomy_discovery/rwthdbis.py +1082 -0
- ontolearner/learner/taxonomy_discovery/sbunlp.py +402 -0
- ontolearner/learner/taxonomy_discovery/skhnlp.py +1138 -0
- ontolearner/learner/term_typing/__init__.py +17 -0
- ontolearner/learner/term_typing/alexbek.py +1262 -0
- ontolearner/learner/term_typing/rwthdbis.py +379 -0
- ontolearner/learner/term_typing/sbunlp.py +478 -0
- ontolearner/learner/text2onto/__init__.py +16 -0
- ontolearner/learner/text2onto/alexbek.py +1219 -0
- ontolearner/learner/text2onto/sbunlp.py +598 -0
- {ontolearner-1.4.7.dist-info → ontolearner-1.4.8.dist-info}/METADATA +4 -1
- {ontolearner-1.4.7.dist-info → ontolearner-1.4.8.dist-info}/RECORD +20 -8
- {ontolearner-1.4.7.dist-info → ontolearner-1.4.8.dist-info}/WHEEL +0 -0
- {ontolearner-1.4.7.dist-info → ontolearner-1.4.8.dist-info}/licenses/LICENSE +0 -0
ontolearner/learner/term_typing/rwthdbis.py (new file)

@@ -0,0 +1,379 @@
+# Copyright (c) 2025 SciKnowOrg
+#
+# Licensed under the MIT License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/MIT
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+from datasets import Dataset, DatasetDict
+from tqdm.auto import tqdm
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    DataCollatorWithPadding,
+    Trainer,
+    TrainingArguments,
+    set_seed,
+)
+
+from ...base import AutoLearner
+
+
+class RWTHDBISSFTLearner(AutoLearner):
+    """
+    Supervised term-typing
+
+    Training expands multi-label examples into multiple single-label rows.
+    Inference returns: [{"term": "<text>", "types": ["<label_str>"]}, ...]
+    """
+
+    def __init__(
+        self,
+        model_name: str = "microsoft/deberta-v3-small",
+        trained_model_path: Optional[str] = None,
+        output_dir: Optional[str] = None,
+        device: str = "cpu",
+        max_length: int = 64,
+        per_device_train_batch_size: int = 16,
+        gradient_accumulation_steps: int = 2,
+        num_train_epochs: int = 3,
+        learning_rate: float = 2e-5,
+        weight_decay: float = 0.01,
+        logging_steps: int = 50,
+        save_strategy: str = "epoch",
+        save_total_limit: int = 1,
+        fp16: bool = False,
+        bf16: bool = False,
+        seed: int = 42,
+    ) -> None:
+        """Initialize the term-typing learner and configure training defaults.
+
+        Args:
+            model_name: Backbone HF model identifier (used if `trained_model_path` is None).
+            trained_model_path: Optional path to a fine-tuned checkpoint for loading.
+            output_dir: Directory to write checkpoints and tokenizer; defaults to './term_typing'.
+            device: user-defined argument as 'cuda' or 'cpu'.
+            max_length: Maximum tokenized sequence length.
+            per_device_train_batch_size: Per-device batch size during training.
+            gradient_accumulation_steps: Number of update accumulation steps.
+            num_train_epochs: Training epochs.
+            learning_rate: Optimizer learning rate.
+            weight_decay: Weight decay coefficient.
+            logging_steps: Logging interval (steps) for the Trainer.
+            save_strategy: Checkpoint save strategy (e.g., 'epoch', 'steps', 'no').
+            save_total_limit: Maximum number of checkpoints to keep.
+            fp16: Enable mixed precision (FP16) if supported.
+            bf16: Enable mixed precision (BF16) if supported.
+            seed: Random seed for reproducibility.
+
+        Side Effects:
+            Creates `output_dir` if it does not exist.
+
+        Notes:
+            The learner predicts exactly one label per term at inference time
+            (argmax over logits).
+        """
+        super().__init__()
+        self.model_name = model_name
+        self.trained_model_path = trained_model_path
+        self.output_dir = output_dir or "./term_typing"
+        os.makedirs(self.output_dir, exist_ok=True)
+
+        self.max_length = max_length
+        self.per_device_train_batch_size = per_device_train_batch_size
+        self.gradient_accumulation_steps = gradient_accumulation_steps
+        self.num_train_epochs = num_train_epochs
+        self.learning_rate = learning_rate
+        self.weight_decay = weight_decay
+        self.logging_steps = logging_steps
+        self.save_strategy = save_strategy
+        self.save_total_limit = save_total_limit
+        self.fp16 = fp16
+        self.bf16 = bf16
+        self.seed = seed
+
+        self.device = device
+        self.model: Optional[AutoModelForSequenceClassification] = None
+        self.tokenizer: Optional[AutoTokenizer] = None
+        self.id2label: Dict[int, str] = {}
+        self.label2id: Dict[str, int] = {}
+
+    def _term_typing(self, data: Any, test: bool = False) -> Optional[Any]:
+        """
+        Train or run inference for term typing, depending on `test`.
+
+        When `test=False`, trains on `data.term_typings`.
+        When `test=True`, predicts labels for provided terms.
+
+        Args:
+            data: If training, an object with `.term_typings` where each item has
+                `term` and `types` (list[str]). If testing, either a `List[str]`
+                of raw term texts or an object with `.term_typings`.
+            test: If True, runs inference; otherwise trains.
+
+        Returns:
+            If `test=True`: a list of dicts like
+            `[{"term": "<text>", "types": ["<label_str>"]}, ...]`.
+            If `test=False`: None.
+
+        Raises:
+            ValueError: If required fields are missing from `data`.
+        """
+        if test:
+            terms = self._collect_eval_terms(data)
+            return self._predict_structured_output(terms)
+        else:
+            self._train_from_term_typings(train_data=data)
+            return None
+
+    def _expand_multilabel_training_rows(
+        self, term_typings: List[Any]
+    ) -> Tuple[List[str], List[int], Dict[int, str], Dict[str, int]]:
+        """
+        Expand multi-label instances into single-label rows and derive label maps.
+
+        Each training instance with fields:
+          - `term`: str-like
+          - `types`: list of label strings
+        is expanded into len(types) rows with the same `term` and individual labels.
+
+        Args:
+            term_typings: Sequence of objects (e.g., dataclasses) exposing
+                `.term` and `.types`.
+
+        Returns:
+            A tuple `(texts, label_ids, id2label, label2id)`:
+              - texts: Flattened list of term strings (one per label).
+              - label_ids: Parallel list of integer label ids.
+              - id2label: Mapping from id -> label string.
+              - label2id: Mapping from label string -> id.
+        """
+        label_strings: List[str] = []
+        for instance in term_typings:
+            label_strings.extend([str(label) for label in instance.types])
+
+        unique_labels = sorted(set(label_strings))
+        id2label = {i: label for i, label in enumerate(unique_labels)}
+        label2id = {label: i for i, label in enumerate(unique_labels)}
+
+        texts: List[str] = []
+        label_ids: List[int] = []
+        for instance in term_typings:
+            term_text = str(instance.term)
+            for label in instance.types:
+                texts.append(term_text)
+                label_ids.append(label2id[str(label)])
+
+        return texts, label_ids, id2label, label2id
+
+    def _collect_eval_terms(self, eval_data: Any) -> List[str]:
+        """
+        Collect the list of term texts to predict for evaluation.
+
+        Accepts either:
+          - A `List[str]` of raw term texts, or
+          - An object with `.term_typings`, from which `.term` is extracted.
+
+        Args:
+            eval_data: Input carrier for terms.
+
+        Returns:
+            List of term strings.
+
+        Raises:
+            ValueError: If `eval_data` lacks the expected structure.
+        """
+        if isinstance(eval_data, list) and all(isinstance(x, str) for x in eval_data):
+            terms = eval_data
+        else:
+            term_typings = getattr(eval_data, "term_typings", None)
+            if term_typings is None:
+                raise ValueError(
+                    "Provide a List[str] OR an object with .term_typings for test=True."
+                )
+            terms = [str(instance.term) for instance in term_typings]
+        return terms
+
+    def _train_from_term_typings(self, train_data: Any) -> None:
+        """Train the term-typing classifier from `.term_typings`.
+
+        Steps:
+          1) Seed RNGs for reproducibility.
+          2) Expand multi-label examples into single-label rows.
+          3) Build HF `DatasetDict`, tokenizer, and data collator.
+          4) Initialize `AutoModelForSequenceClassification`.
+          5) Train with `Trainer` and save model/tokenizer to `output_dir`.
+
+        Args:
+            train_data: Object with `.term_typings`; each item exposes
+                `.term` (text) and `.types` (list[str]).
+
+        Raises:
+            ValueError: If `train_data` does not provide `.term_typings`.
+
+        Side Effects:
+            Writes a trained model to `self.output_dir` and updates
+            `self.id2label` / `self.label2id`.
+        """
+        set_seed(self.seed)
+        random.seed(self.seed)
+        torch.manual_seed(self.seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(self.seed)
+
+        term_typings: List[Any] = getattr(train_data, "term_typings", None)
+        if term_typings is None:
+            raise ValueError("train_data must provide .term_typings for term-typing.")
+
+        texts, label_ids, self.id2label, self.label2id = (
+            self._expand_multilabel_training_rows(term_typings)
+        )
+
+        dataset = DatasetDict(
+            {"train": Dataset.from_dict({"labels": label_ids, "text": texts})}
+        )
+
+        backbone = self.trained_model_path or self.model_name
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=True)
+        except Exception:
+            # fallback if fast tokenizer isn't available
+            self.tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=False)
+
+        def tokenize_batch(batch: Dict[str, List[str]]):
+            """Tokenize a batch of texts with truncation and max length."""
+            return self.tokenizer(
+                batch["text"], truncation=True, max_length=self.max_length
+            )
+
+        tokenized = dataset.map(tokenize_batch, batched=True, remove_columns=["text"])
+        data_collator = DataCollatorWithPadding(self.tokenizer)
+
+        self.model = AutoModelForSequenceClassification.from_pretrained(
+            backbone,
+            num_labels=len(self.id2label),
+            id2label=self.id2label,
+            label2id=self.label2id,
+        )
+
+        if (
+            getattr(self.model.config, "pad_token_id", None) is None
+            and self.tokenizer.pad_token_id is not None
+        ):
+            self.model.config.pad_token_id = self.tokenizer.pad_token_id
+
+        training_args = TrainingArguments(
+            output_dir=self.output_dir,
+            learning_rate=self.learning_rate,
+            per_device_train_batch_size=self.per_device_train_batch_size,
+            gradient_accumulation_steps=self.gradient_accumulation_steps,
+            num_train_epochs=self.num_train_epochs,
+            weight_decay=self.weight_decay,
+            save_strategy=self.save_strategy,
+            save_total_limit=self.save_total_limit,
+            logging_steps=self.logging_steps,
+            fp16=self.fp16,
+            bf16=self.bf16,
+            report_to=[],
+        )
+
+        trainer = Trainer(
+            model=self.model,
+            args=training_args,
+            train_dataset=tokenized["train"],
+            tokenizer=self.tokenizer,
+            data_collator=data_collator,
+        )
+
+        trainer.train()
+        trainer.save_model(self.output_dir)
+        self.tokenizer.save_pretrained(self.output_dir)
+
+    def _ensure_loaded_for_inference(self) -> None:
+        """Load model/tokenizer for inference if not already loaded.
+
+        Loads from `trained_model_path` if set, otherwise from `output_dir`.
+        Also restores `id2label`/`label2id` from the model config when present,
+        moves the model to the configured device, and sets eval mode.
+        """
+        if self.model is not None and self.tokenizer is not None:
+            return
+        model_path = self.trained_model_path or self.output_dir
+        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
+        except Exception:
+            self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+
+        cfg = self.model.config
+        if hasattr(cfg, "id2label") and hasattr(cfg, "label2id"):
+            self.id2label = dict(cfg.id2label)
+            self.label2id = dict(cfg.label2id)
+
+        self.model.to(self.device).eval()
+
+    def _predict_label_ids(self, terms: List[str]) -> List[int]:
+        """Predict label ids (argmax) for a list of term strings.
+
+        Ensures model/tokenizer are loaded, then performs forward passes
+        term-by-term and collects the argmax label id.
+
+        Args:
+            terms: List of raw term texts.
+
+        Returns:
+            List of integer label ids corresponding to `terms`.
+        """
+        self._ensure_loaded_for_inference()
+        predictions: List[int] = []
+        for term_text in tqdm(
+            terms, desc="Inference", bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}"
+        ):
+            inputs = self.tokenizer(
+                term_text,
+                return_tensors="pt",
+                truncation=True,
+                max_length=self.max_length,
+            )
+            inputs = {name: tensor.to(self.device) for name, tensor in inputs.items()}
+            with torch.no_grad():
+                logits = self.model(**inputs).logits
+            predictions.append(int(torch.argmax(logits, dim=-1).item()))
+        return predictions
+
+    def _predict_structured_output(
+        self, terms: List[str]
+    ) -> List[Dict[str, List[str]]]:
+        """
+        Convert predicted label IDs into evaluator-friendly structured outputs.
+
+        The output format is:
+            [{"term": "<text>", "types": ["<label_str>"]}, ...]
+
+        Args:
+            terms: Raw term texts to classify.
+
+        Returns:
+            List of dicts mapping each input term to a list with its predicted
+            label string. Falls back to stringified id if label mapping is absent.
+        """
+        label_ids = self._predict_label_ids(terms)
+        id2label_map = self.id2label or {}  # fallback handled below
+
+        results: List[Dict[str, List[str]]] = []
+        for term_text, label_id in zip(terms, label_ids):
+            label_str = id2label_map.get(int(label_id), str(int(label_id)))
+            results.append({"term": term_text, "types": [label_str]})
+        return results
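
For orientation, here is a minimal usage sketch of the new RWTHDBISSFTLearner. It is not taken from the package: the TypingExample and TrainData containers are hypothetical stand-ins for whatever object exposes .term_typings in an OntoLearner pipeline, and it calls the private _term_typing hook directly only to illustrate the data contract visible in this hunk (the public entry point on AutoLearner is not part of this diff).

from ontolearner.learner.term_typing.rwthdbis import RWTHDBISSFTLearner


# Hypothetical stand-ins: any objects exposing .term / .types and .term_typings work.
class TypingExample:
    def __init__(self, term, types):
        self.term = term
        self.types = types


class TrainData:
    def __init__(self, term_typings):
        self.term_typings = term_typings


train_data = TrainData([
    TypingExample("aspirin", ["Drug"]),
    # Multi-label instance: expanded into two single-label training rows.
    TypingExample("ibuprofen", ["Drug", "AntiInflammatoryAgent"]),
])

learner = RWTHDBISSFTLearner(
    model_name="microsoft/deberta-v3-small",
    output_dir="./term_typing",
    device="cpu",
    num_train_epochs=1,
)

# Train: fine-tunes the classifier and saves model + tokenizer to output_dir.
learner._term_typing(train_data, test=False)

# Predict: accepts a List[str] and returns evaluator-friendly dicts.
predictions = learner._term_typing(["paracetamol"], test=True)
# e.g. [{"term": "paracetamol", "types": ["Drug"]}]

Because training expands each multi-label instance into one row per type while inference takes a single argmax over the logits, a term with several gold types still comes back with exactly one predicted type.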