OntoLearner 1.4.7-py3-none-any.whl → 1.4.8-py3-none-any.whl

This diff shows the content changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the versions as they appear in their respective public registries.
@@ -0,0 +1,598 @@
1
+ # Copyright (c) 2025 SciKnowOrg
2
+ #
3
+ # Licensed under the MIT License (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ #      https://opensource.org/licenses/MIT
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+ import random
17
+ import re
18
+ import ast
19
+ import gc
20
+ from typing import Any, Dict, List, Optional, Set, Tuple
21
+ from collections import defaultdict
22
+
23
+ import torch
24
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
25
+
26
+ from ...base import AutoLearner, AutoLLM
27
+
28
+
29
+ # -----------------------------------------------------------------------------
30
+ # Concrete AutoLLM: local HF wrapper that follows the AutoLLM interface
31
+ # -----------------------------------------------------------------------------
32
+ class LocalAutoLLM(AutoLLM):
33
+ """
34
+ Handles loading and generation for a Hugging Face causal language model (e.g., Qwen or TinyLlama).
35
+ Supports optional 4-bit quantization for efficiency and uses greedy decoding by default.
36
+ """
37
+
38
+ def __init__(
39
+ self, label_mapper: Any = None, device: str = "cpu", token: str = ""
40
+ ) -> None:
41
+ super().__init__(label_mapper=label_mapper, device=device, token=token)
42
+ self.model = None
43
+ self.tokenizer = None
44
+
45
+ def load(
46
+ self,
47
+ model_id: str,
48
+ load_in_4bit: bool = False,
49
+ dtype: str = "auto",
50
+ trust_remote_code: bool = True,
51
+ ):
52
+ """Load tokenizer + model, applying 4-bit quantization if specified and possible."""
53
+
54
+ # Choose the compute dtype: float16 when a GPU is available, float32 otherwise (the 'dtype' argument is accepted but not consulted here)
55
+ torch_dtype_val = torch.float16 if torch.cuda.is_available() else torch.float32
56
+
57
+ # Load the tokenizer
58
+ self.tokenizer = AutoTokenizer.from_pretrained(
59
+ model_id, trust_remote_code=trust_remote_code
60
+ )
61
+ if self.tokenizer.pad_token is None:
62
+ self.tokenizer.pad_token = self.tokenizer.eos_token
63
+
64
+ quant_config = None
65
+ if load_in_4bit:
66
+ # Configure BitsAndBytes for 4-bit loading
67
+ quant_config = BitsAndBytesConfig(
68
+ load_in_4bit=True,
69
+ bnb_4bit_compute_dtype=torch.float16,
70
+ bnb_4bit_use_double_quant=True,
71
+ bnb_4bit_quant_type="nf4",
72
+ )
73
+ if torch_dtype_val is None:
74
+ torch_dtype_val = torch.float16
75
+
76
+ # Set device mapping (auto for multi-GPU or single GPU, explicit CPU otherwise)
77
+ device_map = "auto" if (self.device != "cpu") else {"": "cpu"}
78
+
79
+ # Load the Causal Language Model
80
+ self.model = AutoModelForCausalLM.from_pretrained(
81
+ model_id,
82
+ device_map=device_map,
83
+ torch_dtype=torch_dtype_val,
84
+ quantization_config=quant_config,
85
+ trust_remote_code=trust_remote_code,
86
+ )
87
+
88
+ # Ensure model is on the correct device (redundant if device_map="auto" but safe)
89
+ if self.device == "cpu":
90
+ self.model.to("cpu")
91
+
92
+ def generate(
93
+ self,
94
+ inputs: List[str],
95
+ max_new_tokens: int = 64,
96
+ temperature: float = 0.0,
97
+ top_p: float = 1.0,
98
+ ) -> List[str]:
99
+ """Generate continuations for a list of prompts, returning only the generated part."""
100
+ if self.model is None or self.tokenizer is None:
101
+ raise RuntimeError("Model/tokenizer not loaded. Call .load() first.")
102
+
103
+ # --- Generation Setup ---
104
+ # Tokenize batch (padding is essential for batch inference)
105
+ enc = self.tokenizer(inputs, return_tensors="pt", padding=True, truncation=True)
106
+ input_ids = enc["input_ids"]
107
+ attention_mask = enc["attention_mask"]
108
+
109
+ # Move tensors to the model's device (e.g., cuda:0)
110
+ model_device = next(self.model.parameters()).device
111
+ input_ids = input_ids.to(model_device)
112
+ attention_mask = attention_mask.to(model_device)
113
+
114
+ # --- Generate ---
115
+ with torch.no_grad():
116
+ outputs = self.model.generate(
117
+ input_ids=input_ids,
118
+ attention_mask=attention_mask,
119
+ max_new_tokens=max_new_tokens,
120
+ do_sample=(
121
+ temperature > 0.0
122
+ ), # Use greedy decoding if temperature is 0.0
123
+ temperature=temperature,
124
+ top_p=top_p,
125
+ pad_token_id=self.tokenizer.eos_token_id,
126
+ )
127
+
128
+ # --- Post-processing: Extract only the generated tail ---
129
+ decoded_outputs: List[str] = []
130
+ for i, output_ids in enumerate(outputs):
131
+ full_decoded_text = self.tokenizer.decode(
132
+ output_ids, skip_special_tokens=True
133
+ )
134
+ prompt_text = self.tokenizer.decode(input_ids[i], skip_special_tokens=True)
135
+
136
+ # Safely strip the prompt text from the full output
137
+ if full_decoded_text.startswith(prompt_text):
138
+ generated_tail = full_decoded_text[len(prompt_text) :].strip()
139
+ else:
140
+ # Fallback extraction (less robust if padding affects token indices)
141
+ prompt_len = input_ids.shape[1]
142
+ generated_tail = self.tokenizer.decode(
143
+ output_ids[prompt_len:], skip_special_tokens=True
144
+ ).strip()
145
+ decoded_outputs.append(generated_tail)
146
+
147
+ return decoded_outputs
148
+
149
+
150
+ # -----------------------------------------------------------------------------
151
+ # Main Learner: SBUNLPFewShotLearner (Task A Text2Onto)
152
+ # -----------------------------------------------------------------------------
153
+ class SBUNLPFewShotLearner(AutoLearner):
154
+ """
155
+ Concrete learner implementing the Task A Text2Onto pipeline (Term and Type Extraction).
156
+ It uses few-shot prompts built from the training data at inference time (in-context learning).
157
+ """
158
+
159
+ def __init__(self, model: Optional[AutoLLM] = None, device: str = "cpu"):
160
+ super().__init__()
161
+ # self.model is an instance of LocalAutoLLM
162
+ self.model = model or LocalAutoLLM(device=device)
163
+ self.device = device
164
+ # Cached in-memory prompt blocks built during the fit phase
165
+ self.fewshot_terms_block: str = ""
166
+ self.fewshot_types_block: str = ""
167
+
168
+ # --- Few-shot construction (terms) ---
169
+ def build_stratified_fewshot_prompt(
170
+ self,
171
+ documents_path: str,
172
+ terms_path: str,
173
+ sample_size: int = 28,
174
+ seed: int = 123,
175
+ max_chars_per_text: int = 1200,
176
+ ) -> str:
177
+ """
178
+ Builds the few-shot exemplar block for Term Extraction using stratified sampling.
179
+ """
180
+ random.seed(seed)
181
+
182
+ # Read documents (JSONL) into a list
183
+ corpus_documents: List[Dict[str, Any]] = []
184
+ with open(documents_path, "r", encoding="utf-8") as file_handle:
185
+ for line in file_handle:
186
+ if line.strip():
187
+ corpus_documents.append(json.loads(line))
188
+
189
+ num_total_docs = len(corpus_documents)
190
+ num_sample_docs = min(sample_size, num_total_docs)
191
+
192
+ # Load the map of term -> [list of document IDs]
193
+ with open(terms_path, "r", encoding="utf-8") as file_handle:
194
+ term_to_doc_map = json.load(file_handle)
195
+
196
+ # Invert map: document ID -> [list of terms]
197
+ doc_id_to_terms_map = defaultdict(list)
198
+ for term, doc_ids in term_to_doc_map.items():
199
+ for doc_id in doc_ids:
200
+ doc_id_to_terms_map[doc_id].append(term)
201
+
202
+ # Define strata (groups of documents associated with specific terms)
203
+ strata_map = defaultdict(list)
204
+ for doc in corpus_documents:
205
+ doc_id = doc.get("id", "")
206
+ associated_terms = doc_id_to_terms_map.get(doc_id, ["no_term"])
207
+ for term in associated_terms:
208
+ strata_map[term].append(doc)
209
+
210
+ # Perform proportional sampling across strata
211
+ sampled_documents: List[Dict[str, Any]] = []
212
+ for term_str, stratum_docs in strata_map.items():
213
+ num_stratum_docs = len(stratum_docs)
214
+ if num_stratum_docs == 0:
215
+ continue
216
+
217
+ # Calculate proportional sample size
218
+ proportion = num_stratum_docs / num_total_docs
219
+ num_to_sample_from_stratum = int(num_sample_docs * proportion)
220
+
221
+ if num_to_sample_from_stratum > 0:
222
+ sampled_documents.extend(
223
+ random.sample(
224
+ stratum_docs, min(num_to_sample_from_stratum, num_stratum_docs)
225
+ )
226
+ )
227
+
228
+ # Deduplicate sampled documents by ID and adjust count to exactly 'sample_size'
229
+ unique_docs_by_id = {}
230
+ for doc in sampled_documents:
231
+ unique_docs_by_id[doc.get("id", "")] = doc
232
+
233
+ final_sample_docs = list(unique_docs_by_id.values())
234
+
235
+ if len(final_sample_docs) > num_sample_docs:
236
+ final_sample_docs = random.sample(final_sample_docs, num_sample_docs)
237
+ elif len(final_sample_docs) < num_sample_docs:
238
+ remaining_docs = [
239
+ d for d in corpus_documents if d.get("id", "") not in unique_docs_by_id
240
+ ]
241
+ needed_count = min(
242
+ num_sample_docs - len(final_sample_docs), len(remaining_docs)
243
+ )
244
+ final_sample_docs.extend(random.sample(remaining_docs, needed_count))
245
+
246
+ # Format the few-shot exemplar text block
247
+ prompt_lines: List[str] = []
248
+ for doc in final_sample_docs:
249
+ doc_id = doc.get("id", "")
250
+ title = doc.get("title", "")
251
+ text = doc.get("text", "")
252
+
253
+ # Truncate text if it exceeds the maximum character limit
254
+ if max_chars_per_text and len(text) > max_chars_per_text:
255
+ text = text[:max_chars_per_text] + "…"
256
+
257
+ associated_terms = doc_id_to_terms_map.get(doc_id, [])
258
+ prompt_lines.append(
259
+ f"Document ID: {doc_id}\nTitle: {title}\nText: {text}\nAssociated Terms: {associated_terms}\n----------------------------------------"
260
+ )
261
+
262
+ prompt_block = "\n".join(prompt_lines)
263
+ self.fewshot_terms_block = prompt_block
264
+ return prompt_block
265
+
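# Worked example of the proportional sampling above (illustrative numbers):
# with 100 training documents and sample_size=28, a stratum holding 25 documents
# contributes int(28 * 25/100) = 7 exemplars; strata too small to earn a slot are
# skipped, and the deduplicated sample is then topped up or trimmed to exactly 28.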
266
+ # --- Few-shot construction (types) ---
267
+ def build_types_fewshot_block(
268
+ self,
269
+ docs_jsonl: str,
270
+ terms2doc_json: str,
271
+ sample_per_term: int = 1,
272
+ full_word: bool = True,
273
+ case_sensitive: bool = True,
274
+ max_chars_per_text: int = 800,
275
+ ) -> str:
276
+ """
277
+ Builds the few-shot block for Type Extraction.
278
+ This method samples documents based on finding an associated term/type within the text.
279
+ """
280
+ # Load documents into dict by ID
281
+ docs_by_id = {}
282
+ with open(docs_jsonl, "r", encoding="utf-8") as file_handle:
283
+ for line in file_handle:
284
+ line_stripped = line.strip()
285
+ if line_stripped:
286
+ try:
287
+ doc = json.loads(line_stripped)
288
+ doc_id = doc.get("id", "")
289
+ if doc_id:
290
+ docs_by_id[doc_id] = doc
291
+ except json.JSONDecodeError:
292
+ continue
293
+
294
+ # Load term -> [doc_id,...] map
295
+ with open(terms2doc_json, "r", encoding="utf-8") as file_handle:
296
+ term_to_doc_map = json.load(file_handle)
297
+
298
+ flags = 0 if case_sensitive else re.IGNORECASE
299
+ prompt_lines: List[str] = []
300
+
301
+ # Iterate over terms (which act as types in this context)
302
+ for term, doc_ids in term_to_doc_map.items():
303
+ escaped_term = re.escape(term)
304
+ # Create regex pattern for matching the term in the text
305
+ pattern = rf"\b{escaped_term}\b" if full_word else escaped_term
306
+ term_regex = re.compile(pattern, flags=flags)
307
+
308
+ picked_count = 0
309
+ for doc_id in doc_ids:
310
+ doc = docs_by_id.get(doc_id)
311
+ if not doc:
312
+ continue
313
+
314
+ title = doc.get("title", "")
315
+ text = doc.get("text", "")
316
+
317
+ # Check if the term/type is actually present in the document text/title
318
+ if term_regex.search(f"{title} {text}"):
319
+ text_content = text
320
+
321
+ # Truncate text if necessary
322
+ if max_chars_per_text and len(text_content) > max_chars_per_text:
323
+ text_content = text_content[:max_chars_per_text] + "…"
324
+
325
+ # Escape single quotes in the term for Python list formatting in the prompt
326
+ term_for_prompt = term.replace("'", "\\'")
327
+
328
+ prompt_lines.append(
329
+ f"Document ID: {doc_id}\nTitle: {title}\nText: {text_content}\nAssociated Types: ['{term_for_prompt}']\n----------------------------------------"
330
+ )
331
+ picked_count += 1
332
+
333
+ if picked_count >= sample_per_term:
334
+ break # Move to the next term
335
+
336
+ prompt_block = "\n".join(prompt_lines)
337
+ self.fewshot_types_block = prompt_block
338
+ return prompt_block
339
+
340
+ def fit(
341
+ self,
342
+ train_docs_jsonl: str,
343
+ terms2doc_json: str,
344
+ sample_size: int = 28,
345
+ seed: int = 123,
346
+ ) -> None:
347
+ """
348
+ Fit phase: Builds and caches the few-shot prompt blocks from the training files.
349
+ No model training occurs (Few-Shot/In-Context Learning).
350
+ """
351
+ # Build prompt block for Term extraction
352
+ _ = self.build_stratified_fewshot_prompt(
353
+ train_docs_jsonl, terms2doc_json, sample_size=sample_size, seed=seed
354
+ )
355
+ # Build prompt block for Type extraction
356
+ _ = self.build_types_fewshot_block(
357
+ train_docs_jsonl, terms2doc_json, sample_per_term=1
358
+ )
359
+
360
+ # -------------------------
361
+ # Inference helpers (prompt construction and output parsing)
362
+ # -------------------------
363
+ def _build_term_prompt(self, example_block: str, title: str, text: str) -> str:
364
+ """Constructs the full prompt for Term Extraction."""
365
+ return f"""{example_block}
366
+ [var]
367
+ Title: {title}
368
+ Text: {text}
369
+ [var]
370
+ Extract all relevant terms that could form the basis of an ontology from the above document.
371
+ Return ONLY a Python list like ['term1', 'term2', ...] and nothing else.
372
+ If no terms are found, return [].
373
+ """
374
+
375
+ def _build_type_prompt(self, example_block: str, title: str, text: str) -> str:
376
+ """Constructs the full prompt for Type Extraction."""
377
+ return f"""{example_block}
378
+ [var]
379
+ Title: {title}
380
+ Text: {text}
381
+ [var]
382
+ Extract all relevant TYPES mentioned in the above document that could serve as ontology classes.
383
+ Only consider content inside the [var] ... [var] block.
384
+ Return ONLY a valid Python list like ['type1', 'type2'] and nothing else. If none, return [].
385
+ """
386
+
387
+ def _parse_list_like(self, raw_string: str) -> List[str]:
388
+ """Try to extract a Python list of strings from model output robustly."""
389
+ processed_string = raw_string.strip()
390
+ if processed_string in ("[]", ""):
391
+ return []
392
+
393
+ # 1. Try direct evaluation
394
+ try:
395
+ parsed_value = ast.literal_eval(processed_string)
396
+ if isinstance(parsed_value, list):
397
+ # Filter to ensure only strings are returned
398
+ return [item for item in parsed_value if isinstance(item, str)]
399
+ except Exception:
400
+ pass
401
+
402
+ # 2. Try finding and evaluating the first bracketed span [ ... ] in the output
403
+ bracket_match = re.search(r"\[[\s\S]*?\]", processed_string)
404
+ if bracket_match:
405
+ try:
406
+ parsed_value = ast.literal_eval(bracket_match.group(0))
407
+ if isinstance(parsed_value, list):
408
+ return [item for item in parsed_value if isinstance(item, str)]
409
+ except Exception:
410
+ pass
411
+
412
+ # 3. Fallback: collect quoted substrings (less precise, but recovers items from malformed output)
413
+ # Finds content inside either single quotes ('...') or double quotes ("...")
414
+ quoted_matches = re.findall(r"'([^']+)'|\"([^\"]+)\"", processed_string)
415
+ flattened_list = [a_match or b_match for a_match, b_match in quoted_matches]
416
+ return flattened_list
417
+
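# Worked examples for _parse_list_like above (illustrative model outputs):
#   "['neural network', 'ontology']"           -> ['neural network', 'ontology']   (direct literal_eval)
#   "Sure! Here you go: ['term1', 'term2']"    -> ['term1', 'term2']               (bracketed-span fallback)
#   "I found 'alpha' and \"beta\" in the text" -> ['alpha', 'beta']                (quoted-substring fallback)
#   "no list present"                          -> []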
418
+ def _call_model_one(self, prompt: str, max_new_tokens: int = 120) -> str:
419
+ """Calls the underlying LocalAutoLLM for a single prompt. Returns the raw tail output."""
420
+ # self.model is an instance of LocalAutoLLM
421
+ model_output = self.model.generate(
422
+ [prompt], max_new_tokens=max_new_tokens, temperature=0.0, top_p=1.0
423
+ )
424
+ return model_output[0] if model_output else ""
425
+
426
+ def predict_terms(
427
+ self,
428
+ docs_test_jsonl: str,
429
+ out_jsonl: str,
430
+ max_lines: int = -1,
431
+ max_new_tokens: int = 120,
432
+ ) -> int:
433
+ """
434
+ Runs Term Extraction on the test documents and saves results to a JSONL file.
435
+ Returns: The count of individual terms written.
436
+ """
437
+ if not self.fewshot_terms_block:
438
+ raise RuntimeError("Few-shot block for terms is empty. Call fit() first.")
439
+
440
+ num_written_terms = 0
441
+ with (
442
+ open(docs_test_jsonl, "r", encoding="utf-8") as file_in,
443
+ open(out_jsonl, "w", encoding="utf-8") as file_out,
444
+ ):
445
+ for line_index, line in enumerate(file_in, start=1):
446
+ if 0 < max_lines < line_index:
447
+ break
448
+
449
+ try:
450
+ document = json.loads(line.strip())
451
+ except Exception:
452
+ continue # Skip malformed JSON lines
453
+
454
+ doc_id = document.get("id", "unknown")
455
+ title = document.get("title", "")
456
+ text = document.get("text", "")
457
+
458
+ # Construct and call model
459
+ prompt = self._build_term_prompt(self.fewshot_terms_block, title, text)
460
+ raw_output = self._call_model_one(prompt, max_new_tokens=max_new_tokens)
461
+ predicted_terms = self._parse_list_like(raw_output)
462
+
463
+ # Write extracted terms
464
+ for term_or_type in predicted_terms:
465
+ if isinstance(term_or_type, str) and term_or_type.strip():
466
+ file_out.write(
467
+ json.dumps({"doc_id": doc_id, "term": term_or_type.strip()})
468
+ + "\n"
469
+ )
470
+ num_written_terms += 1
471
+
472
+ # Lightweight memory management for long runs
473
+ if line_index % 50 == 0:
474
+ gc.collect()
475
+ if torch.cuda.is_available():
476
+ torch.cuda.empty_cache()
477
+
478
+ return num_written_terms
479
+
480
+ def predict_types(
481
+ self,
482
+ docs_test_jsonl: str,
483
+ out_jsonl: str,
484
+ max_lines: int = -1,
485
+ max_new_tokens: int = 120,
486
+ ) -> int:
487
+ """
488
+ Runs Type Extraction on the test documents and saves results to a JSONL file.
489
+ Returns: The count of individual types written.
490
+ """
491
+ if not self.fewshot_types_block:
492
+ raise RuntimeError("Few-shot block for types is empty. Call fit() first.")
493
+
494
+ num_written_types = 0
495
+ with (
496
+ open(docs_test_jsonl, "r", encoding="utf-8") as file_in,
497
+ open(out_jsonl, "w", encoding="utf-8") as file_out,
498
+ ):
499
+ for line_index, line in enumerate(file_in, start=1):
500
+ if 0 < max_lines < line_index:
501
+ break
502
+
503
+ try:
504
+ document = json.loads(line.strip())
505
+ except Exception:
506
+ continue # Skip malformed JSON lines
507
+
508
+ doc_id = document.get("id", "unknown")
509
+ title = document.get("title", "")
510
+ text = document.get("text", "")
511
+
512
+ # Construct and call model using the dedicated type prompt block
513
+ prompt = self._build_type_prompt(self.fewshot_types_block, title, text)
514
+ raw_output = self._call_model_one(prompt, max_new_tokens=max_new_tokens)
515
+ predicted_types = self._parse_list_like(raw_output)
516
+
517
+ # Write extracted types
518
+ for term_or_type in predicted_types:
519
+ if isinstance(term_or_type, str) and term_or_type.strip():
520
+ file_out.write(
521
+ json.dumps({"doc_id": doc_id, "type": term_or_type.strip()})
522
+ + "\n"
523
+ )
524
+ num_written_types += 1
525
+
526
+ if line_index % 50 == 0:
527
+ gc.collect()
528
+ if torch.cuda.is_available():
529
+ torch.cuda.empty_cache()
530
+
531
+ return num_written_types
532
+
533
+ # --- Evaluation utilities ---
534
+ def load_gold_pairs(self, terms2doc_path: str) -> Set[Tuple[str, str]]:
535
+ """Convert terms2docs JSON into a set of unique (doc_id, term) pairs, lowercased."""
536
+ gold_pairs = set()
537
+ with open(terms2doc_path, "r", encoding="utf-8") as file_handle:
538
+ term_to_doc_map = json.load(file_handle)
539
+
540
+ for term, doc_ids in term_to_doc_map.items():
541
+ clean_term = term.strip().lower()
542
+ for doc_id in doc_ids:
543
+ gold_pairs.add((doc_id, clean_term))
544
+ return gold_pairs
545
+
546
+ def load_predicted_pairs(
547
+ self, predicted_jsonl_path: str, key: str = "term"
548
+ ) -> Set[Tuple[str, str]]:
549
+ """Load predicted (doc_id, term/type) pairs from a JSONL file, lowercased."""
550
+ predicted_pairs = set()
551
+ with open(predicted_jsonl_path, "r", encoding="utf-8") as file_handle:
552
+ for line in file_handle:
553
+ try:
554
+ entry = json.loads(line.strip())
555
+ except Exception:
556
+ continue
557
+ doc_id = entry.get("doc_id")
558
+ value = entry.get(key)
559
+ if doc_id and value:
560
+ predicted_pairs.add((doc_id, value.strip().lower()))
561
+ return predicted_pairs
562
+
563
+ def evaluate_extraction_f1(
564
+ self, terms2doc_path: str, predicted_jsonl: str, key: str = "term"
565
+ ) -> float:
566
+ """
567
+ Computes set-based binary Precision, Recall, and F1 score against the gold pairs.
568
+ """
569
+ # Load the ground truth and predictions
570
+ gold_set = self.load_gold_pairs(terms2doc_path)
571
+ predicted_set = self.load_predicted_pairs(predicted_jsonl, key=key)
572
+
573
+ # Build the union of gold and predicted pairs; binary scoring over this union makes precision TP/|predicted| and recall TP/|gold|
574
+ all_pairs = sorted(gold_set | predicted_set)
575
+
576
+ # Create binary labels (1=present, 0=absent)
577
+ y_true = [1 if pair in gold_set else 0 for pair in all_pairs]
578
+ y_pred = [1 if pair in predicted_set else 0 for pair in all_pairs]
579
+
580
+ # Use scikit-learn for metric calculation
581
+ from sklearn.metrics import precision_recall_fscore_support
582
+
583
+ precision, recall, f1, _ = precision_recall_fscore_support(
584
+ y_true, y_pred, average="binary", zero_division=0
585
+ )
586
+
587
+ # Display results
588
+ num_true_positives = len(gold_set & predicted_set)
589
+
590
+ print("\n📊 Evaluation Results:")
591
+ print(f" ✅ Precision: {precision:.4f}")
592
+ print(f" ✅ Recall: {recall:.4f}")
593
+ print(f" ✅ F1 Score: {f1:.4f}")
594
+ print(f" 📌 Gold pairs: {len(gold_set)}")
595
+ print(f" 📌 Predicted pairs:{len(predicted_set)}")
596
+ print(f" 🎯 True Positives: {num_true_positives}")
597
+
598
+ return float(f1)
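A minimal end-to-end usage sketch of the module added above, assuming it is importable as ontolearner.learner.text2onto.sbunlp (matching the RECORD entry further below); the model ID and data paths are illustrative placeholders:

    from ontolearner.learner.text2onto.sbunlp import LocalAutoLLM, SBUNLPFewShotLearner

    # Load a small instruction-tuned causal LM; enable 4-bit quantization only on a CUDA machine.
    llm = LocalAutoLLM(device="cpu")
    llm.load("TinyLlama/TinyLlama-1.1B-Chat-v1.0", load_in_4bit=False)

    learner = SBUNLPFewShotLearner(model=llm, device="cpu")

    # fit() only builds and caches the few-shot prompt blocks; no model weights are updated.
    learner.fit("train_docs.jsonl", "terms2docs_train.json", sample_size=28, seed=123)

    # Extract terms for the test split, then score the predicted (doc_id, term) pairs.
    learner.predict_terms("test_docs.jsonl", "pred_terms.jsonl", max_new_tokens=120)
    f1 = learner.evaluate_extraction_f1("terms2docs_test.json", "pred_terms.jsonl", key="term")

predict_types() and evaluation with key="type" follow the same pattern using the cached types block.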
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: OntoLearner
3
- Version: 1.4.7
3
+ Version: 1.4.8
4
4
  Summary: OntoLearner: A Modular Python Library for Ontology Learning with LLMs.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -13,8 +13,10 @@ Classifier: Programming Language :: Python :: 3.10
13
13
  Classifier: Programming Language :: Python :: 3.11
14
14
  Classifier: Programming Language :: Python :: 3.12
15
15
  Classifier: Programming Language :: Python :: 3.13
16
+ Requires-Dist: Levenshtein
16
17
  Requires-Dist: bitsandbytes (>=0.45.1,<0.46.0)
17
18
  Requires-Dist: dspy (>=2.6.14,<3.0.0)
19
+ Requires-Dist: g4f
18
20
  Requires-Dist: huggingface-hub (>=0.34.4,<0.35.0)
19
21
  Requires-Dist: matplotlib
20
22
  Requires-Dist: mistral-common[sentencepiece] (>=1.8.5,<2.0.0)
@@ -23,6 +25,7 @@ Requires-Dist: numpy
23
25
  Requires-Dist: openpyxl
24
26
  Requires-Dist: pandas
25
27
  Requires-Dist: pathlib (==1.0.1)
28
+ Requires-Dist: protobuf (<5)
26
29
  Requires-Dist: pydantic (==2.11.3)
27
30
  Requires-Dist: python-dotenv
28
31
  Requires-Dist: rdflib (==7.1.1)
@@ -1,9 +1,9 @@
1
- ontolearner/VERSION,sha256=eU2b_S0GS-IEHLoICSOz45ujvshMNXlHgEgoUrX0AH4,6
1
+ ontolearner/VERSION,sha256=ffNZhUJ2DAagaoMtfLwSvSCz_Rvq5cjAOZ6WQvcHNxc,6
2
2
  ontolearner/__init__.py,sha256=E4yukFv2PV4uyztTPDWljCySY9AVDcDDzabuvxfabYE,1889
3
3
  ontolearner/_learner.py,sha256=2CRQvpsz8akIOdxTs2-KLJ-MssULrjpK-QDD3QXUJXI,5297
4
4
  ontolearner/_ontology.py,sha256=W1mp195SImqLKwaj4ueEaBWuLJg2jUdx1JT20Ds3fmQ,6950
5
5
  ontolearner/base/__init__.py,sha256=5pf-ltxzGp32xhEcPdbtm11wXJrYJMUeWG-mbcAYD8Q,705
6
- ontolearner/base/learner.py,sha256=G2NP8XMMbCPllW35bLXxH9-hQRcVoRto5aNWT5PZsYY,18684
6
+ ontolearner/base/learner.py,sha256=latiGv8p3nyPrxMp7g5B2MSF-JEInRwIlbOn09uh7io,18899
7
7
  ontolearner/base/ontology.py,sha256=JbMJ1-WUyHWQiNJL-DeaqcriUimLdqN3_ESROgqOPTQ,24772
8
8
  ontolearner/base/text2onto.py,sha256=iUXYZoqnwgebQuQzM-XSGTVRfHLlhjUK_z5XUvhRICc,5388
9
9
  ontolearner/data_structure/__init__.py,sha256=1HiKvk8FKjhYeI92RHnJXxyQbUJBi3JFytjQjthsY_s,599
@@ -13,11 +13,23 @@ ontolearner/evaluation/__init__.py,sha256=4BZr3BUXjQDTj4Aqlqy4THa80lZPsMuh1EBTCy
13
13
  ontolearner/evaluation/evaluate.py,sha256=NYCVcmPqpyIxYZrMAim37gL-erdh698RD3t3eNTTgZc,1163
14
14
  ontolearner/evaluation/metrics.py,sha256=3Aw6ycJ3_Q6xfj4tMBJP6QcexUei0G16H0ZQWt87aRU,6286
15
15
  ontolearner/learner/__init__.py,sha256=L54a3uvOeS6hbID6-BFd6fo9rH4WK2Q1XecpXMkEru0,768
16
- ontolearner/learner/label_mapper.py,sha256=-XW8MHafm4ix3e9u-RRwDePJ71D804DNuKzdf1zudtk,3789
16
+ ontolearner/learner/label_mapper.py,sha256=YMPeFKzJxoCYNU5z7QRYPbB88sWdu1iT6iBDpPsjn-4,3792
17
17
  ontolearner/learner/llm.py,sha256=3kq_IrwEPTFgeNVKZH9Er_OydJuDpRBtM3YXNNa8_KA,10343
18
18
  ontolearner/learner/prompt.py,sha256=0ckH7xphIDKczPe7G-rwiOxFGZ7RsLnpPlNW92b-31U,1574
19
19
  ontolearner/learner/rag.py,sha256=eysB2RvcWkVo53s8-kSbZtJv904YVTmdtxplM4ukUKM,4283
20
- ontolearner/learner/retriever.py,sha256=GDXr6l0m_prxnctxQzBpm75xL4jW2Q4b91iyePFcDAs,4988
20
+ ontolearner/learner/retriever.py,sha256=PNDAwsLIOBD3aQW2Ez0q6PqE3CB7d_GN-yLKJ9_D04s,6204
21
+ ontolearner/learner/taxonomy_discovery/__init__.py,sha256=-Hb5Dl6_6c4l1uIT2zWtyBWMq5cjVD4PNjxt5qJePl4,747
22
+ ontolearner/learner/taxonomy_discovery/alexbek.py,sha256=kFEDvoKxLf-sB7-d5REkcC0DqXZpcA6ZSJ2QHrNoC5E,19010
23
+ ontolearner/learner/taxonomy_discovery/rwthdbis.py,sha256=698Gze2cR-QIhpTbuaOFm7Q4p0lCbdWz3rO6rewJZ1s,41644
24
+ ontolearner/learner/taxonomy_discovery/sbunlp.py,sha256=hyTxPMCdS2BIb9R61OQgT9ibZYmPd-vaj7KBCRCAggk,14987
25
+ ontolearner/learner/taxonomy_discovery/skhnlp.py,sha256=nEsA1MJueEs25IC5B-4OAOn5R6mOfz_7C4xIUC6hNN4,45516
26
+ ontolearner/learner/term_typing/__init__.py,sha256=2rBbgp8683GNVgB58T4xe76l4m-NTqL7MwpAnux0IDY,691
27
+ ontolearner/learner/term_typing/alexbek.py,sha256=SzWQbndkhAjxETVbrJ4uyH7ykL_TMIwHozSS08zwjoM,46684
28
+ ontolearner/learner/term_typing/rwthdbis.py,sha256=F6Jr1SrsbDOIe0Ee_FkDVGTG4wRWpM-R2YqrqEQiex0,14576
29
+ ontolearner/learner/term_typing/sbunlp.py,sha256=Xd3UqMO3m_Skn_2geTN22MGQmSD6R8bYfPgubZre3IE,19820
30
+ ontolearner/learner/text2onto/__init__.py,sha256=4-G6iel0Nxcj4nzPxUDqtFf9CMCzi8LghooOSAnbNfc,641
31
+ ontolearner/learner/text2onto/alexbek.py,sha256=MySzxJUR0F3UyeS5rPIN988xxtPaoAxDFkBc-Q0vFTE,45494
32
+ ontolearner/learner/text2onto/sbunlp.py,sha256=5p-s2Ixtntws5eO3gOUyYLpfZpCbOE0hG5gEcCwKHz4,24177
21
33
  ontolearner/ontology/__init__.py,sha256=F9Ta1qCX9mOxIK5CPRypEoglQNkpJ6SJpqziz73xKQE,1328
22
34
  ontolearner/ontology/agriculture.py,sha256=ZaXHNEFjbtsMH8M7HQ8ypnfJS4TUQy_as16fwv-kOKA,5903
23
35
  ontolearner/ontology/arts_humanities.py,sha256=K4ceDJL6PfIfSJZ86uQUkUXOVoiERG6ItgvVE2lhLKk,3996
@@ -53,7 +65,7 @@ ontolearner/tools/visualizer.py,sha256=cwijl4yYaS1SCLM5wbvRTEcbQj9Bjo4fHzZR6q6o8
53
65
  ontolearner/utils/__init__.py,sha256=pSEyU3dlPMADBqygqaaid44RdWf0Lo3Fvz-K_rQ7_Bw,733
54
66
  ontolearner/utils/io.py,sha256=3DqGK2p7c0onKi0Xxs16WB08uHfHUId3bW0dDKwyS0g,2110
55
67
  ontolearner/utils/train_test_split.py,sha256=Zlm42eT6QGWwlySyomCPIiTGmGqeN_h4z4xBY2EAOR8,11530
56
- ontolearner-1.4.7.dist-info/METADATA,sha256=2oMe5F7YGUl3TiOrhgCTAJyVFQ2LRbUu1WPPEcXvHX4,14083
57
- ontolearner-1.4.7.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
58
- ontolearner-1.4.7.dist-info/licenses/LICENSE,sha256=krXMLuMKgzX-UgaufgfJdm9ojIloZot7ZdvJUnNxl4I,1067
59
- ontolearner-1.4.7.dist-info/RECORD,,
68
+ ontolearner-1.4.8.dist-info/METADATA,sha256=B_ULVAw849kBqCpF0-oX8lUuq2d7GFMLPPESYQdWnp8,14158
69
+ ontolearner-1.4.8.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
70
+ ontolearner-1.4.8.dist-info/licenses/LICENSE,sha256=krXMLuMKgzX-UgaufgfJdm9ojIloZot7ZdvJUnNxl4I,1067
71
+ ontolearner-1.4.8.dist-info/RECORD,,