OntoLearner 1.4.7__py3-none-any.whl → 1.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,598 @@
+ # Copyright (c) 2025 SciKnowOrg
+ #
+ # Licensed under the MIT License (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #      https://opensource.org/licenses/MIT
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import json
+ import random
+ import re
+ import ast
+ import gc
+ from typing import Any, Dict, List, Optional, Set, Tuple
+ from collections import defaultdict
+
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
+ from ...base import AutoLearner, AutoLLM
+
+
+ # -----------------------------------------------------------------------------
+ # Concrete AutoLLM: local HF wrapper that follows the AutoLLM interface
+ # -----------------------------------------------------------------------------
+ class LocalAutoLLM(AutoLLM):
+     """
+     Handles loading and generation for a Hugging Face Causal Language Model (Qwen/TinyLlama).
+     Supports optional 4-bit quantization for efficiency and uses greedy decoding by default.
+     """
+
+     def __init__(
+         self, label_mapper: Any = None, device: str = "cpu", token: str = ""
+     ) -> None:
+         super().__init__(label_mapper=label_mapper, device=device, token=token)
+         self.model = None
+         self.tokenizer = None
+
+     def load(
+         self,
+         model_id: str,
+         load_in_4bit: bool = False,
+         dtype: str = "auto",
+         trust_remote_code: bool = True,
+     ):
+         """Load tokenizer + model, applying 4-bit quantization if specified and possible."""
+
+         # Determine the target data type (default to float32 for CPU, float16 for GPU)
+         torch_dtype_val = torch.float16 if torch.cuda.is_available() else torch.float32
+
+         # Load the tokenizer
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             model_id, trust_remote_code=trust_remote_code
+         )
+         if self.tokenizer.pad_token is None:
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+
+         quant_config = None
+         if load_in_4bit:
+             # Configure BitsAndBytes for 4-bit loading
+             quant_config = BitsAndBytesConfig(
+                 load_in_4bit=True,
+                 bnb_4bit_compute_dtype=torch.float16,
+                 bnb_4bit_use_double_quant=True,
+                 bnb_4bit_quant_type="nf4",
+             )
+             if torch_dtype_val is None:
+                 torch_dtype_val = torch.float16
+
+         # Set device mapping (auto for multi-GPU or single GPU, explicit CPU otherwise)
+         device_map = "auto" if (self.device != "cpu") else {"": "cpu"}
+
+         # Load the Causal Language Model
+         self.model = AutoModelForCausalLM.from_pretrained(
+             model_id,
+             device_map=device_map,
+             torch_dtype=torch_dtype_val,
+             quantization_config=quant_config,
+             trust_remote_code=trust_remote_code,
+         )
+
+         # Ensure model is on the correct device (redundant if device_map="auto" but safe)
+         if self.device == "cpu":
+             self.model.to("cpu")
+
+     def generate(
+         self,
+         inputs: List[str],
+         max_new_tokens: int = 64,
+         temperature: float = 0.0,
+         top_p: float = 1.0,
+     ) -> List[str]:
+         """Generate continuations for a list of prompts, returning only the generated part."""
+         if self.model is None or self.tokenizer is None:
+             raise RuntimeError("Model/tokenizer not loaded. Call .load() first.")
+
+         # --- Generation Setup ---
+         # Tokenize batch (padding is essential for batch inference)
+         enc = self.tokenizer(inputs, return_tensors="pt", padding=True, truncation=True)
+         input_ids = enc["input_ids"]
+         attention_mask = enc["attention_mask"]
+
+         # Move tensors to the model's device (e.g., cuda:0)
+         model_device = next(self.model.parameters()).device
+         input_ids = input_ids.to(model_device)
+         attention_mask = attention_mask.to(model_device)
+
+         # --- Generate ---
+         with torch.no_grad():
+             outputs = self.model.generate(
+                 input_ids=input_ids,
+                 attention_mask=attention_mask,
+                 max_new_tokens=max_new_tokens,
+                 do_sample=(
+                     temperature > 0.0
+                 ),  # Use greedy decoding if temperature is 0.0
+                 temperature=temperature,
+                 top_p=top_p,
+                 pad_token_id=self.tokenizer.eos_token_id,
+             )
+
+         # --- Post-processing: Extract only the generated tail ---
+         decoded_outputs: List[str] = []
+         for i, output_ids in enumerate(outputs):
+             full_decoded_text = self.tokenizer.decode(
+                 output_ids, skip_special_tokens=True
+             )
+             prompt_text = self.tokenizer.decode(input_ids[i], skip_special_tokens=True)
+
+             # Safely strip the prompt text from the full output
+             if full_decoded_text.startswith(prompt_text):
+                 generated_tail = full_decoded_text[len(prompt_text) :].strip()
+             else:
+                 # Fallback extraction (less robust if padding affects token indices)
+                 prompt_len = input_ids.shape[1]
+                 generated_tail = self.tokenizer.decode(
+                     output_ids[prompt_len:], skip_special_tokens=True
+                 ).strip()
+             decoded_outputs.append(generated_tail)
+
+         return decoded_outputs
+
+
+ # -----------------------------------------------------------------------------
+ # Main Learner: SBUNLPFewShotLearner (Task A Text2Onto)
+ # -----------------------------------------------------------------------------
+ class SBUNLPFewShotLearner(AutoLearner):
+     """
+     Concrete learner implementing the Task A Text2Onto pipeline (Term and Type Extraction).
+     It uses Few-Shot prompts generated from training data for inference.
+     """
+
+     def __init__(self, model: Optional[AutoLLM] = None, device: str = "cpu"):
+         super().__init__()
+         # self.model is an instance of LocalAutoLLM
+         self.model = model or LocalAutoLLM(device=device)
+         self.device = device
+         # Cached in-memory prompt blocks built during the fit phase
+         self.fewshot_terms_block: str = ""
+         self.fewshot_types_block: str = ""
+
+     # --- Few-shot construction (terms) ---
+     def build_stratified_fewshot_prompt(
+         self,
+         documents_path: str,
+         terms_path: str,
+         sample_size: int = 28,
+         seed: int = 123,
+         max_chars_per_text: int = 1200,
+     ) -> str:
+         """
+         Builds the few-shot exemplar block for Term Extraction using stratified sampling.
+         """
+         random.seed(seed)
+
+         # Read documents (JSONL) into a list
+         corpus_documents: List[Dict[str, Any]] = []
+         with open(documents_path, "r", encoding="utf-8") as file_handle:
+             for line in file_handle:
+                 if line.strip():
+                     corpus_documents.append(json.loads(line))
+
+         num_total_docs = len(corpus_documents)
+         num_sample_docs = min(sample_size, num_total_docs)
+
+         # Load the map of term -> [list of document IDs]
+         with open(terms_path, "r", encoding="utf-8") as file_handle:
+             term_to_doc_map = json.load(file_handle)
+
+         # Invert map: document ID -> [list of terms]
+         doc_id_to_terms_map = defaultdict(list)
+         for term, doc_ids in term_to_doc_map.items():
+             for doc_id in doc_ids:
+                 doc_id_to_terms_map[doc_id].append(term)
+
+         # Define strata (groups of documents associated with specific terms)
+         strata_map = defaultdict(list)
+         for doc in corpus_documents:
+             doc_id = doc.get("id", "")
+             associated_terms = doc_id_to_terms_map.get(doc_id, ["no_term"])
+             for term in associated_terms:
+                 strata_map[term].append(doc)
+
+         # Perform proportional sampling across strata
+         sampled_documents: List[Dict[str, Any]] = []
+         for term_str, stratum_docs in strata_map.items():
+             num_stratum_docs = len(stratum_docs)
+             if num_stratum_docs == 0:
+                 continue
+
+             # Calculate proportional sample size
+             proportion = num_stratum_docs / num_total_docs
+             num_to_sample_from_stratum = int(num_sample_docs * proportion)
+
+             if num_to_sample_from_stratum > 0:
+                 sampled_documents.extend(
+                     random.sample(
+                         stratum_docs, min(num_to_sample_from_stratum, num_stratum_docs)
+                     )
+                 )
+
+         # Deduplicate sampled documents by ID and adjust count to exactly 'sample_size'
+         unique_docs_by_id = {}
+         for doc in sampled_documents:
+             unique_docs_by_id[doc.get("id", "")] = doc
+
+         final_sample_docs = list(unique_docs_by_id.values())
+
+         if len(final_sample_docs) > num_sample_docs:
+             final_sample_docs = random.sample(final_sample_docs, num_sample_docs)
+         elif len(final_sample_docs) < num_sample_docs:
+             remaining_docs = [
+                 d for d in corpus_documents if d.get("id", "") not in unique_docs_by_id
+             ]
+             needed_count = min(
+                 num_sample_docs - len(final_sample_docs), len(remaining_docs)
+             )
+             final_sample_docs.extend(random.sample(remaining_docs, needed_count))
+
+         # Format the few-shot exemplar text block
+         prompt_lines: List[str] = []
+         for doc in final_sample_docs:
+             doc_id = doc.get("id", "")
+             title = doc.get("title", "")
+             text = doc.get("text", "")
+
+             # Truncate text if it exceeds the maximum character limit
+             if max_chars_per_text and len(text) > max_chars_per_text:
+                 text = text[:max_chars_per_text] + "…"
+
+             associated_terms = doc_id_to_terms_map.get(doc_id, [])
+             prompt_lines.append(
+                 f"Document ID: {doc_id}\nTitle: {title}\nText: {text}\nAssociated Terms: {associated_terms}\n----------------------------------------"
+             )
+
+         prompt_block = "\n".join(prompt_lines)
+         self.fewshot_terms_block = prompt_block
+         return prompt_block
+
+     # --- Few-shot construction (types) ---
+     def build_types_fewshot_block(
+         self,
+         docs_jsonl: str,
+         terms2doc_json: str,
+         sample_per_term: int = 1,
+         full_word: bool = True,
+         case_sensitive: bool = True,
+         max_chars_per_text: int = 800,
+     ) -> str:
+         """
+         Builds the few-shot block for Type Extraction.
+         This method samples documents based on finding an associated term/type within the text.
+         """
+         # Load documents into dict by ID
+         docs_by_id = {}
+         with open(docs_jsonl, "r", encoding="utf-8") as file_handle:
+             for line in file_handle:
+                 line_stripped = line.strip()
+                 if line_stripped:
+                     try:
+                         doc = json.loads(line_stripped)
+                         doc_id = doc.get("id", "")
+                         if doc_id:
+                             docs_by_id[doc_id] = doc
+                     except json.JSONDecodeError:
+                         continue
+
+         # Load term -> [doc_id,...] map
+         with open(terms2doc_json, "r", encoding="utf-8") as file_handle:
+             term_to_doc_map = json.load(file_handle)
+
+         flags = 0 if case_sensitive else re.IGNORECASE
+         prompt_lines: List[str] = []
+
+         # Iterate over terms (which act as types in this context)
+         for term, doc_ids in term_to_doc_map.items():
+             escaped_term = re.escape(term)
+             # Create regex pattern for matching the term in the text
+             pattern = rf"\b{escaped_term}\b" if full_word else escaped_term
+             term_regex = re.compile(pattern, flags=flags)
+
+             picked_count = 0
+             for doc_id in doc_ids:
+                 doc = docs_by_id.get(doc_id)
+                 if not doc:
+                     continue
+
+                 title = doc.get("title", "")
+                 text = doc.get("text", "")
+
+                 # Check if the term/type is actually present in the document text/title
+                 if term_regex.search(f"{title} {text}"):
+                     text_content = text
+
+                     # Truncate text if necessary
+                     if max_chars_per_text and len(text_content) > max_chars_per_text:
+                         text_content = text_content[:max_chars_per_text] + "…"
+
+                     # Escape single quotes in the term for Python list formatting in the prompt
+                     term_for_prompt = term.replace("'", "\\'")
+
+                     prompt_lines.append(
+                         f"Document ID: {doc_id}\nTitle: {title}\nText: {text_content}\nAssociated Types: ['{term_for_prompt}']\n----------------------------------------"
+                     )
+                     picked_count += 1
+
+                     if picked_count >= sample_per_term:
+                         break  # Move to the next term
+
+         prompt_block = "\n".join(prompt_lines)
+         self.fewshot_types_block = prompt_block
+         return prompt_block
+
+     def fit(
+         self,
+         train_docs_jsonl: str,
+         terms2doc_json: str,
+         sample_size: int = 28,
+         seed: int = 123,
+     ) -> None:
+         """
+         Fit phase: Builds and caches the few-shot prompt blocks from the training files.
+         No model training occurs (Few-Shot/In-Context Learning).
+         """
+         # Build prompt block for Term extraction
+         _ = self.build_stratified_fewshot_prompt(
+             train_docs_jsonl, terms2doc_json, sample_size=sample_size, seed=seed
+         )
+         # Build prompt block for Type extraction
+         _ = self.build_types_fewshot_block(
+             train_docs_jsonl, terms2doc_json, sample_per_term=1
+         )
+
+     # -------------------------
+     # Inference helpers (prompt construction and output parsing)
+     # -------------------------
+     def _build_term_prompt(self, example_block: str, title: str, text: str) -> str:
+         """Constructs the full prompt for Term Extraction."""
+         return f"""{example_block}
+ [var]
+ Title: {title}
+ Text: {text}
+ [var]
+ Extract all relevant terms that could form the basis of an ontology from the above document.
+ Return ONLY a Python list like ['term1', 'term2', ...] and nothing else.
+ If no terms are found, return [].
+ """
+
+     def _build_type_prompt(self, example_block: str, title: str, text: str) -> str:
+         """Constructs the full prompt for Type Extraction."""
+         return f"""{example_block}
+ [var]
+ Title: {title}
+ Text: {text}
+ [var]
+ Extract all relevant TYPES mentioned in the above document that could serve as ontology classes.
+ Only consider content inside the [var] ... [var] block.
+ Return ONLY a valid Python list like ['type1', 'type2'] and nothing else. If none, return [].
+ """
+
+     def _parse_list_like(self, raw_string: str) -> List[str]:
+         """Try to extract a Python list of strings from model output robustly."""
+         processed_string = raw_string.strip()
+         if processed_string in ("[]", ""):
+             return []
+
+         # 1. Try direct evaluation
+         try:
+             parsed_value = ast.literal_eval(processed_string)
+             if isinstance(parsed_value, list):
+                 # Filter to ensure only strings are returned
+                 return [item for item in parsed_value if isinstance(item, str)]
+         except Exception:
+             pass
+
+         # 2. Try finding and evaluating text within outermost brackets [ ... ]
+         bracket_match = re.search(r"\[[\s\S]*?\]", processed_string)
+         if bracket_match:
+             try:
+                 parsed_value = ast.literal_eval(bracket_match.group(0))
+                 if isinstance(parsed_value, list):
+                     return [item for item in parsed_value if isinstance(item, str)]
+             except Exception:
+                 pass
+
+         # 3. Fallback: Find comma-separated quoted substrings (less robust, but catches errors)
+         # Finds content inside either single quotes ('...') or double quotes ("...")
+         quoted_matches = re.findall(r"'([^']+)'|\"([^\"]+)\"", processed_string)
+         flattened_list = [a_match or b_match for a_match, b_match in quoted_matches]
+         return flattened_list
+
+     def _call_model_one(self, prompt: str, max_new_tokens: int = 120) -> str:
+         """Calls the underlying LocalAutoLLM for a single prompt. Returns the raw tail output."""
+         # self.model is an instance of LocalAutoLLM
+         model_output = self.model.generate(
+             [prompt], max_new_tokens=max_new_tokens, temperature=0.0, top_p=1.0
+         )
+         return model_output[0] if model_output else ""
+
+     def predict_terms(
+         self,
+         docs_test_jsonl: str,
+         out_jsonl: str,
+         max_lines: int = -1,
+         max_new_tokens: int = 120,
+     ) -> int:
+         """
+         Runs Term Extraction on the test documents and saves results to a JSONL file.
+         Returns: The count of individual terms written.
+         """
+         if not self.fewshot_terms_block:
+             raise RuntimeError("Few-shot block for terms is empty. Call fit() first.")
+
+         num_written_terms = 0
+         with (
+             open(docs_test_jsonl, "r", encoding="utf-8") as file_in,
+             open(out_jsonl, "w", encoding="utf-8") as file_out,
+         ):
+             for line_index, line in enumerate(file_in, start=1):
+                 if 0 < max_lines < line_index:
+                     break
+
+                 try:
+                     document = json.loads(line.strip())
+                 except Exception:
+                     continue  # Skip malformed JSON lines
+
+                 doc_id = document.get("id", "unknown")
+                 title = document.get("title", "")
+                 text = document.get("text", "")
+
+                 # Construct and call model
+                 prompt = self._build_term_prompt(self.fewshot_terms_block, title, text)
+                 raw_output = self._call_model_one(prompt, max_new_tokens=max_new_tokens)
+                 predicted_terms = self._parse_list_like(raw_output)
+
+                 # Write extracted terms
+                 for term_or_type in predicted_terms:
+                     if isinstance(term_or_type, str) and term_or_type.strip():
+                         file_out.write(
+                             json.dumps({"doc_id": doc_id, "term": term_or_type.strip()})
+                             + "\n"
+                         )
+                         num_written_terms += 1
+
+                 # Lightweight memory management for long runs
+                 if line_index % 50 == 0:
+                     gc.collect()
+                     if torch.cuda.is_available():
+                         torch.cuda.empty_cache()
+
+         return num_written_terms
+
+     def predict_types(
+         self,
+         docs_test_jsonl: str,
+         out_jsonl: str,
+         max_lines: int = -1,
+         max_new_tokens: int = 120,
+     ) -> int:
+         """
+         Runs Type Extraction on the test documents and saves results to a JSONL file.
+         Returns: The count of individual types written.
+         """
+         if not self.fewshot_types_block:
+             raise RuntimeError("Few-shot block for types is empty. Call fit() first.")
+
+         num_written_types = 0
+         with (
+             open(docs_test_jsonl, "r", encoding="utf-8") as file_in,
+             open(out_jsonl, "w", encoding="utf-8") as file_out,
+         ):
+             for line_index, line in enumerate(file_in, start=1):
+                 if 0 < max_lines < line_index:
+                     break
+
+                 try:
+                     document = json.loads(line.strip())
+                 except Exception:
+                     continue  # Skip malformed JSON lines
+
+                 doc_id = document.get("id", "unknown")
+                 title = document.get("title", "")
+                 text = document.get("text", "")
+
+                 # Construct and call model using the dedicated type prompt block
+                 prompt = self._build_type_prompt(self.fewshot_types_block, title, text)
+                 raw_output = self._call_model_one(prompt, max_new_tokens=max_new_tokens)
+                 predicted_types = self._parse_list_like(raw_output)
+
+                 # Write extracted types
+                 for term_or_type in predicted_types:
+                     if isinstance(term_or_type, str) and term_or_type.strip():
+                         file_out.write(
+                             json.dumps({"doc_id": doc_id, "type": term_or_type.strip()})
+                             + "\n"
+                         )
+                         num_written_types += 1
+
+                 if line_index % 50 == 0:
+                     gc.collect()
+                     if torch.cuda.is_available():
+                         torch.cuda.empty_cache()
+
+         return num_written_types
+
+     # --- Evaluation utilities (unchanged from prior definition, added docstrings) ---
+     def load_gold_pairs(self, terms2doc_path: str) -> Set[Tuple[str, str]]:
+         """Convert terms2docs JSON into a set of unique (doc_id, term) pairs, lowercased."""
+         gold_pairs = set()
+         with open(terms2doc_path, "r", encoding="utf-8") as file_handle:
+             term_to_doc_map = json.load(file_handle)
+
+         for term, doc_ids in term_to_doc_map.items():
+             clean_term = term.strip().lower()
+             for doc_id in doc_ids:
+                 gold_pairs.add((doc_id, clean_term))
+         return gold_pairs
+
+     def load_predicted_pairs(
+         self, predicted_jsonl_path: str, key: str = "term"
+     ) -> Set[Tuple[str, str]]:
+         """Load predicted (doc_id, term/type) pairs from a JSONL file, lowercased."""
+         predicted_pairs = set()
+         with open(predicted_jsonl_path, "r", encoding="utf-8") as file_handle:
+             for line in file_handle:
+                 try:
+                     entry = json.loads(line.strip())
+                 except Exception:
+                     continue
+                 doc_id = entry.get("doc_id")
+                 value = entry.get(key)
+                 if doc_id and value:
+                     predicted_pairs.add((doc_id, value.strip().lower()))
+         return predicted_pairs
+
+     def evaluate_extraction_f1(
+         self, terms2doc_path: str, predicted_jsonl: str, key: str = "term"
+     ) -> float:
+         """
+         Computes set-based binary Precision, Recall, and F1 score against the gold pairs.
+         """
+         # Load the ground truth and predictions
+         gold_set = self.load_gold_pairs(terms2doc_path)
+         predicted_set = self.load_predicted_pairs(predicted_jsonl, key=key)
+
+         # Build combined universe of all pairs for score calculation
+         all_pairs = sorted(gold_set | predicted_set)
+
+         # Create binary labels (1=present, 0=absent)
+         y_true = [1 if pair in gold_set else 0 for pair in all_pairs]
+         y_pred = [1 if pair in predicted_set else 0 for pair in all_pairs]
+
+         # Use scikit-learn for metric calculation
+         from sklearn.metrics import precision_recall_fscore_support
+
+         precision, recall, f1, _ = precision_recall_fscore_support(
+             y_true, y_pred, average="binary", zero_division=0
+         )
+
+         # Display results
+         num_true_positives = len(gold_set & predicted_set)
+
+         print("\n📊 Evaluation Results:")
+         print(f" ✅ Precision: {precision:.4f}")
+         print(f" ✅ Recall: {recall:.4f}")
+         print(f" ✅ F1 Score: {f1:.4f}")
+         print(f" 📌 Gold pairs: {len(gold_set)}")
+         print(f" 📌 Predicted pairs:{len(predicted_set)}")
+         print(f" 🎯 True Positives: {num_true_positives}")
+
+         return float(f1)
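
For orientation, the sketch below shows one way the new few-shot Text2Onto learner could be driven end to end. It is not taken from the package documentation: the import location, the model ID, and all file paths are placeholders (assumptions), and only the methods defined in the diff above are used.

```python
# Hypothetical end-to-end run of the few-shot Text2Onto learner added in 1.4.9.
# Assumptions: the import path, the model ID, and every file name below are placeholders.
from ontolearner.learner import LocalAutoLLM, SBUNLPFewShotLearner  # assumed import location

llm = LocalAutoLLM(device="cuda")
llm.load("Qwen/Qwen2.5-1.5B-Instruct", load_in_4bit=True)  # any HF causal LM id

learner = SBUNLPFewShotLearner(model=llm, device="cuda")

# fit() only builds and caches the few-shot prompt blocks; no weights are trained.
learner.fit("train_documents.jsonl", "terms2docs.json", sample_size=28, seed=123)

# Inference writes one {"doc_id": ..., "term"/"type": ...} JSON object per output line.
learner.predict_terms("test_documents.jsonl", "predicted_terms.jsonl")
learner.predict_types("test_documents.jsonl", "predicted_types.jsonl")

# Set-based precision/recall/F1 over (doc_id, term) pairs.
f1 = learner.evaluate_extraction_f1("terms2docs.json", "predicted_terms.jsonl", key="term")
```
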
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: OntoLearner
- Version: 1.4.7
+ Version: 1.4.9
  Summary: OntoLearner: A Modular Python Library for Ontology Learning with LLMs.
  License: MIT
  License-File: LICENSE
@@ -13,8 +13,11 @@ Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: Levenshtein
  Requires-Dist: bitsandbytes (>=0.45.1,<0.46.0)
  Requires-Dist: dspy (>=2.6.14,<3.0.0)
+ Requires-Dist: g4f
+ Requires-Dist: gensim
  Requires-Dist: huggingface-hub (>=0.34.4,<0.35.0)
  Requires-Dist: matplotlib
  Requires-Dist: mistral-common[sentencepiece] (>=1.8.5,<2.0.0)
@@ -23,6 +26,7 @@ Requires-Dist: numpy
  Requires-Dist: openpyxl
  Requires-Dist: pandas
  Requires-Dist: pathlib (==1.0.1)
+ Requires-Dist: protobuf (<5)
  Requires-Dist: pydantic (==2.11.3)
  Requires-Dist: python-dotenv
  Requires-Dist: rdflib (==7.1.1)
@@ -77,16 +81,16 @@ Please refer to [Installation](https://ontolearner.readthedocs.io/installation.h

  ## 🔗 Essential Resources

- | Resource | Info |
- |:---------|:-----|
- | **[📚 OntoLearner Documentation](https://ontolearner.readthedocs.io/)** | OntoLearner's extensive documentation website. |
- | **[🤗 Datasets on Hugging Face](https://huggingface.co/collections/SciKnowOrg/ontolearner-benchmarking-6823bcd051300c210b7ef68a)** | Access curated, machine-readable ontologies. |
- | **Quick Tour on OntoLearner** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1DuElAyEFzd1vtqTjDEXWcc0zCbiV2Yee?usp=sharing) ``version=1.2.1`` | OntoLearner hands-on Colab tutorials. |
- | **[🚀 Quickstart](https://ontolearner.readthedocs.io/quickstart.html)** | Get started quickly with OntoLearner’s main features and workflow. |
- | **[🕸️ Learning Tasks](https://ontolearner.readthedocs.io/learning_tasks/learning_tasks.html)** | Explore supported ontology learning tasks like LLMs4OL Paradigm tasks and Text2Onto. | |
- | **[🧠 Learner Models](https://ontolearner.readthedocs.io/learners/llm.html)** | Browse and configure various learner models, including LLMs, Retrieval, or RAG approaches. |
- | **[📚 Ontologies Documentations](https://ontolearner.readthedocs.io/benchmarking/benchmark.html)** | Review benchmark ontologies and datasets used for evaluation and training. |
- | **[🧩 How to work with Ontologizer?](https://ontolearner.readthedocs.io/ontologizer/ontology_modularization.html)** | Learn how to modularize and preprocess ontologies using the Ontologizer module. |
+ | Resource | Info |
+ |:---------|:-----|
+ | **[📚 OntoLearner Documentation](https://ontolearner.readthedocs.io/)** | OntoLearner's extensive documentation website. |
+ | **[🤗 Datasets on Hugging Face](https://huggingface.co/collections/SciKnowOrg/ontolearner-benchmarking-6823bcd051300c210b7ef68a)** | Access curated, machine-readable ontologies. |
+ | **[🚀 Quickstart](https://ontolearner.readthedocs.io/quickstart.html)** | Get started quickly with OntoLearner’s main features and workflow. |
+ | **[🕸️ Learning Tasks](https://ontolearner.readthedocs.io/learning_tasks/learning_tasks.html)** | Explore supported ontology learning tasks like LLMs4OL Paradigm tasks and Text2Onto. | |
+ | **[🧠 Learner Models](https://ontolearner.readthedocs.io/learners/llm.html)** | Browse and configure various learner models, including LLMs, Retrieval, or RAG approaches. |
+ | **[📚 Ontologies Documentations](https://ontolearner.readthedocs.io/benchmarking/benchmark.html)** | Review benchmark ontologies and datasets used for evaluation and training. |
+ | **[🧩 How to work with Ontologizer?](https://ontolearner.readthedocs.io/ontologizer/ontology_modularization.html)** | Learn how to modularize and preprocess ontologies using the Ontologizer module. |
+ | **[🤗 Ontology Metrics Dashboard](https://huggingface.co/spaces/SciKnowOrg/OntoLearner-Benchmark-Metrics)** | Benchmark ontologies with their metrics and complexity scores. |

  ## 🚀 Quick Tour
  Get started with OntoLearner in just a few lines of code. This guide demonstrates how to initialize ontologies, load datasets, and train an LLM-assisted learner for ontology engineering tasks.
@@ -132,7 +136,7 @@ task = 'non-taxonomic-re'
  ret_learner = AutoRetrieverLearner(top_k=5)
  ret_learner.load(model_id='sentence-transformers/all-MiniLM-L6-v2')

- # 5. Fit the model to training data and do the predict
+ # 5. Fit the model to training data and then predict over the test data
  ret_learner.fit(train_data, task=task)
  predicts = ret_learner.predict(test_data, task=task)