OntoLearner 1.4.6-py3-none-any.whl → 1.4.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,379 @@
+ # Copyright (c) 2025 SciKnowOrg
+ #
+ # Licensed under the MIT License (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # https://opensource.org/licenses/MIT
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+ import random
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import torch
+ from datasets import Dataset, DatasetDict
+ from tqdm.auto import tqdm
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForSequenceClassification,
+     DataCollatorWithPadding,
+     Trainer,
+     TrainingArguments,
+     set_seed,
+ )
+
+ from ...base import AutoLearner
+
+
+ class RWTHDBISSFTLearner(AutoLearner):
+     """
+     Supervised term typing via sequence-classification fine-tuning.
+
+     Training expands multi-label examples into multiple single-label rows.
+     Inference returns: [{"term": "<text>", "types": ["<label_str>"]}, ...]
+     """
+
+     def __init__(
+         self,
+         model_name: str = "microsoft/deberta-v3-small",
+         trained_model_path: Optional[str] = None,
+         output_dir: Optional[str] = None,
+         device: str = "cpu",
+         max_length: int = 64,
+         per_device_train_batch_size: int = 16,
+         gradient_accumulation_steps: int = 2,
+         num_train_epochs: int = 3,
+         learning_rate: float = 2e-5,
+         weight_decay: float = 0.01,
+         logging_steps: int = 50,
+         save_strategy: str = "epoch",
+         save_total_limit: int = 1,
+         fp16: bool = False,
+         bf16: bool = False,
+         seed: int = 42,
+     ) -> None:
+         """Initialize the term-typing learner and configure training defaults.
+
+         Args:
+             model_name: Backbone HF model identifier (used if `trained_model_path` is None).
+             trained_model_path: Optional path to a fine-tuned checkpoint for loading.
+             output_dir: Directory to write checkpoints and tokenizer; defaults to './term_typing'.
+             device: Device to run training and inference on, e.g. 'cuda' or 'cpu'.
+             max_length: Maximum tokenized sequence length.
+             per_device_train_batch_size: Per-device batch size during training.
+             gradient_accumulation_steps: Number of gradient-accumulation steps per optimizer update.
+             num_train_epochs: Training epochs.
+             learning_rate: Optimizer learning rate.
+             weight_decay: Weight decay coefficient.
+             logging_steps: Logging interval (steps) for the Trainer.
+             save_strategy: Checkpoint save strategy (e.g., 'epoch', 'steps', 'no').
+             save_total_limit: Maximum number of checkpoints to keep.
+             fp16: Enable mixed precision (FP16) if supported.
+             bf16: Enable mixed precision (BF16) if supported.
+             seed: Random seed for reproducibility.
+
+         Side Effects:
+             Creates `output_dir` if it does not exist.
+
+         Notes:
+             The learner predicts exactly one label per term at inference time
+             (argmax over logits).
+         """
+         super().__init__()
+         self.model_name = model_name
+         self.trained_model_path = trained_model_path
+         self.output_dir = output_dir or "./term_typing"
+         os.makedirs(self.output_dir, exist_ok=True)
+
+         self.max_length = max_length
+         self.per_device_train_batch_size = per_device_train_batch_size
+         self.gradient_accumulation_steps = gradient_accumulation_steps
+         self.num_train_epochs = num_train_epochs
+         self.learning_rate = learning_rate
+         self.weight_decay = weight_decay
+         self.logging_steps = logging_steps
+         self.save_strategy = save_strategy
+         self.save_total_limit = save_total_limit
+         self.fp16 = fp16
+         self.bf16 = bf16
+         self.seed = seed
+
+         self.device = device
+         self.model: Optional[AutoModelForSequenceClassification] = None
+         self.tokenizer: Optional[AutoTokenizer] = None
+         self.id2label: Dict[int, str] = {}
+         self.label2id: Dict[str, int] = {}
+
+     def _term_typing(self, data: Any, test: bool = False) -> Optional[Any]:
+         """
+         Train or run inference for term typing, depending on `test`.
+
+         When `test=False`, trains on `data.term_typings`.
+         When `test=True`, predicts labels for provided terms.
+
+         Args:
+             data: If training, an object with `.term_typings` where each item has
+                 `term` and `types` (list[str]). If testing, either a `List[str]`
+                 of raw term texts or an object with `.term_typings`.
+             test: If True, runs inference; otherwise trains.
+
+         Returns:
+             If `test=True`: a list of dicts like
+             `[{"term": "<text>", "types": ["<label_str>"]}, ...]`.
+             If `test=False`: None.
+
+         Raises:
+             ValueError: If required fields are missing from `data`.
+         """
+         if test:
+             terms = self._collect_eval_terms(data)
+             return self._predict_structured_output(terms)
+         else:
+             self._train_from_term_typings(train_data=data)
+             return None
+
+     def _expand_multilabel_training_rows(
+         self, term_typings: List[Any]
+     ) -> Tuple[List[str], List[int], Dict[int, str], Dict[str, int]]:
+         """
+         Expand multi-label instances into single-label rows and derive label maps.
+
+         Each training instance with fields:
+           - `term`: str-like
+           - `types`: list of label strings
+         is expanded into len(types) rows with the same `term` and individual labels.
+
+         Args:
+             term_typings: Sequence of objects (e.g., dataclasses) exposing
+                 `.term` and `.types`.
+
+         Returns:
+             A tuple `(texts, label_ids, id2label, label2id)`:
+               - texts: Flattened list of term strings (one per label).
+               - label_ids: Parallel list of integer label ids.
+               - id2label: Mapping from id -> label string.
+               - label2id: Mapping from label string -> id.
+         """
+         label_strings: List[str] = []
+         for instance in term_typings:
+             label_strings.extend([str(label) for label in instance.types])
+
+         unique_labels = sorted(set(label_strings))
+         id2label = {i: label for i, label in enumerate(unique_labels)}
+         label2id = {label: i for i, label in enumerate(unique_labels)}
+
+         texts: List[str] = []
+         label_ids: List[int] = []
+         for instance in term_typings:
+             term_text = str(instance.term)
+             for label in instance.types:
+                 texts.append(term_text)
+                 label_ids.append(label2id[str(label)])
+
+         return texts, label_ids, id2label, label2id
+
+     def _collect_eval_terms(self, eval_data: Any) -> List[str]:
+         """
+         Collect the list of term texts to predict for evaluation.
+
+         Accepts either:
+           - A `List[str]` of raw term texts, or
+           - An object with `.term_typings`, from which `.term` is extracted.
+
+         Args:
+             eval_data: Input carrier for terms.
+
+         Returns:
+             List of term strings.
+
+         Raises:
+             ValueError: If `eval_data` lacks the expected structure.
+         """
+         if isinstance(eval_data, list) and all(isinstance(x, str) for x in eval_data):
+             terms = eval_data
+         else:
+             term_typings = getattr(eval_data, "term_typings", None)
+             if term_typings is None:
+                 raise ValueError(
+                     "Provide a List[str] OR an object with .term_typings for test=True."
+                 )
+             terms = [str(instance.term) for instance in term_typings]
+         return terms
+
+     def _train_from_term_typings(self, train_data: Any) -> None:
+         """Train the term-typing classifier from `.term_typings`.
+
+         Steps:
+             1) Seed RNGs for reproducibility.
+             2) Expand multi-label examples into single-label rows.
+             3) Build HF `DatasetDict`, tokenizer, and data collator.
+             4) Initialize `AutoModelForSequenceClassification`.
+             5) Train with `Trainer` and save model/tokenizer to `output_dir`.
+
+         Args:
+             train_data: Object with `.term_typings`; each item exposes
+                 `.term` (text) and `.types` (list[str]).
+
+         Raises:
+             ValueError: If `train_data` does not provide `.term_typings`.
+
+         Side Effects:
+             Writes a trained model to `self.output_dir` and updates
+             `self.id2label` / `self.label2id`.
+         """
+         set_seed(self.seed)
+         random.seed(self.seed)
+         torch.manual_seed(self.seed)
+         if torch.cuda.is_available():
+             torch.cuda.manual_seed_all(self.seed)
+
+         term_typings: Optional[List[Any]] = getattr(train_data, "term_typings", None)
+         if term_typings is None:
+             raise ValueError("train_data must provide .term_typings for term-typing.")
+
+         texts, label_ids, self.id2label, self.label2id = (
+             self._expand_multilabel_training_rows(term_typings)
+         )
+
+         dataset = DatasetDict(
+             {"train": Dataset.from_dict({"labels": label_ids, "text": texts})}
+         )
+
+         backbone = self.trained_model_path or self.model_name
+         try:
+             self.tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=True)
+         except Exception:
+             # Fall back if a fast tokenizer isn't available.
+             self.tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=False)
+
+         def tokenize_batch(batch: Dict[str, List[str]]):
+             """Tokenize a batch of texts with truncation and max length."""
+             return self.tokenizer(
+                 batch["text"], truncation=True, max_length=self.max_length
+             )
+
+         tokenized = dataset.map(tokenize_batch, batched=True, remove_columns=["text"])
+         data_collator = DataCollatorWithPadding(self.tokenizer)
+
+         self.model = AutoModelForSequenceClassification.from_pretrained(
+             backbone,
+             num_labels=len(self.id2label),
+             id2label=self.id2label,
+             label2id=self.label2id,
+         )
+
+         if (
+             getattr(self.model.config, "pad_token_id", None) is None
+             and self.tokenizer.pad_token_id is not None
+         ):
+             self.model.config.pad_token_id = self.tokenizer.pad_token_id
+
+         training_args = TrainingArguments(
+             output_dir=self.output_dir,
+             learning_rate=self.learning_rate,
+             per_device_train_batch_size=self.per_device_train_batch_size,
+             gradient_accumulation_steps=self.gradient_accumulation_steps,
+             num_train_epochs=self.num_train_epochs,
+             weight_decay=self.weight_decay,
+             save_strategy=self.save_strategy,
+             save_total_limit=self.save_total_limit,
+             logging_steps=self.logging_steps,
+             fp16=self.fp16,
+             bf16=self.bf16,
+             report_to=[],
+         )
+
+         trainer = Trainer(
+             model=self.model,
+             args=training_args,
+             train_dataset=tokenized["train"],
+             tokenizer=self.tokenizer,
+             data_collator=data_collator,
+         )
+
+         trainer.train()
+         trainer.save_model(self.output_dir)
+         self.tokenizer.save_pretrained(self.output_dir)
+
+     def _ensure_loaded_for_inference(self) -> None:
+         """Load model/tokenizer for inference if not already loaded.
+
+         Loads from `trained_model_path` if set, otherwise from `output_dir`.
+         Also restores `id2label`/`label2id` from the model config when present,
+         moves the model to the configured device, and sets eval mode.
+         """
+         if self.model is not None and self.tokenizer is not None:
+             return
+         model_path = self.trained_model_path or self.output_dir
+         self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
+         try:
+             self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
+         except Exception:
+             self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+
+         cfg = self.model.config
+         if hasattr(cfg, "id2label") and hasattr(cfg, "label2id"):
+             self.id2label = dict(cfg.id2label)
+             self.label2id = dict(cfg.label2id)
+
+         self.model.to(self.device).eval()
+
+     def _predict_label_ids(self, terms: List[str]) -> List[int]:
+         """Predict label ids (argmax) for a list of term strings.
+
+         Ensures model/tokenizer are loaded, then performs forward passes
+         term-by-term and collects the argmax label id.
+
+         Args:
+             terms: List of raw term texts.
+
+         Returns:
+             List of integer label ids corresponding to `terms`.
+         """
+         self._ensure_loaded_for_inference()
+         predictions: List[int] = []
+         for term_text in tqdm(
+             terms, desc="Inference", bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}"
+         ):
+             inputs = self.tokenizer(
+                 term_text,
+                 return_tensors="pt",
+                 truncation=True,
+                 max_length=self.max_length,
+             )
+             inputs = {name: tensor.to(self.device) for name, tensor in inputs.items()}
+             with torch.no_grad():
+                 logits = self.model(**inputs).logits
+             predictions.append(int(torch.argmax(logits, dim=-1).item()))
+         return predictions
+
+     def _predict_structured_output(
+         self, terms: List[str]
+     ) -> List[Dict[str, List[str]]]:
+         """
+         Convert predicted label IDs into evaluator-friendly structured outputs.
+
+         The output format is:
+             [{"term": "<text>", "types": ["<label_str>"]}, ...]
+
+         Args:
+             terms: Raw term texts to classify.
+
+         Returns:
+             List of dicts mapping each input term to a list with its predicted
+             label string. Falls back to the stringified id if the label mapping is absent.
+         """
+         label_ids = self._predict_label_ids(terms)
+         id2label_map = self.id2label or {}  # fallback handled below
+
+         results: List[Dict[str, List[str]]] = []
+         for term_text, label_id in zip(terms, label_ids):
+             label_str = id2label_map.get(int(label_id), str(int(label_id)))
+             results.append({"term": term_text, "types": [label_str]})
+         return results
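
For orientation, here is a minimal usage sketch of the learner added above. It is illustrative only and not part of the package: the import path and the SimpleNamespace stand-ins for OntoLearner's data objects are assumptions, and the sketch relies only on the `_term_typing` entry point and the `.term` / `.types` fields the code above expects.

# Illustrative usage sketch (not part of the released package).
from types import SimpleNamespace

# Assumed import path; adjust to wherever OntoLearner exposes the class.
from ontolearner.learner import RWTHDBISSFTLearner

# Toy stand-ins for the training object: anything exposing `.term_typings`,
# where each item has `.term` and `.types` (a list of label strings).
train_data = SimpleNamespace(
    term_typings=[
        SimpleNamespace(term="aspirin", types=["Drug"]),
        # Multi-label example: expanded into two single-label training rows.
        SimpleNamespace(term="ibuprofen", types=["Drug", "Analgesic"]),
        SimpleNamespace(term="fever", types=["Symptom"]),
    ]
)

learner = RWTHDBISSFTLearner(
    model_name="microsoft/deberta-v3-small",
    output_dir="./term_typing",
    device="cpu",
    num_train_epochs=1,
)

# Fine-tune: expands multi-label examples, trains the sequence classifier,
# and saves the model and tokenizer to `output_dir`.
learner._term_typing(train_data, test=False)

# Inference: pass raw term strings; one predicted type is returned per term.
predictions = learner._term_typing(["paracetamol", "cough"], test=True)
# -> [{"term": "paracetamol", "types": ["<predicted label>"]}, ...]

Whether the AutoLearner base class routes calls through `_term_typing` or a public wrapper is not visible in this diff, so the private call above is only a stand-in for the actual entry point.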