OntoLearner 1.4.7__py3-none-any.whl → 1.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ontolearner/VERSION +1 -1
- ontolearner/base/learner.py +15 -12
- ontolearner/learner/__init__.py +1 -1
- ontolearner/learner/label_mapper.py +1 -1
- ontolearner/learner/retriever/__init__.py +19 -0
- ontolearner/learner/retriever/crossencoder.py +129 -0
- ontolearner/learner/retriever/embedding.py +229 -0
- ontolearner/learner/retriever/learner.py +217 -0
- ontolearner/learner/retriever/llm_retriever.py +356 -0
- ontolearner/learner/retriever/ngram.py +123 -0
- ontolearner/learner/taxonomy_discovery/__init__.py +18 -0
- ontolearner/learner/taxonomy_discovery/alexbek.py +500 -0
- ontolearner/learner/taxonomy_discovery/rwthdbis.py +1082 -0
- ontolearner/learner/taxonomy_discovery/sbunlp.py +402 -0
- ontolearner/learner/taxonomy_discovery/skhnlp.py +1138 -0
- ontolearner/learner/term_typing/__init__.py +17 -0
- ontolearner/learner/term_typing/alexbek.py +1262 -0
- ontolearner/learner/term_typing/rwthdbis.py +379 -0
- ontolearner/learner/term_typing/sbunlp.py +478 -0
- ontolearner/learner/text2onto/__init__.py +16 -0
- ontolearner/learner/text2onto/alexbek.py +1219 -0
- ontolearner/learner/text2onto/sbunlp.py +598 -0
- {ontolearner-1.4.7.dist-info → ontolearner-1.4.9.dist-info}/METADATA +16 -12
- {ontolearner-1.4.7.dist-info → ontolearner-1.4.9.dist-info}/RECORD +26 -9
- ontolearner/learner/retriever.py +0 -101
- {ontolearner-1.4.7.dist-info → ontolearner-1.4.9.dist-info}/WHEEL +0 -0
- {ontolearner-1.4.7.dist-info → ontolearner-1.4.9.dist-info}/licenses/LICENSE +0 -0
ontolearner/learner/text2onto/sbunlp.py (new file)
@@ -0,0 +1,598 @@
+# Copyright (c) 2025 SciKnowOrg
+#
+# Licensed under the MIT License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/MIT
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import random
+import re
+import ast
+import gc
+from typing import Any, Dict, List, Optional, Set, Tuple
+from collections import defaultdict
+
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
+from ...base import AutoLearner, AutoLLM
+
+
+# -----------------------------------------------------------------------------
+# Concrete AutoLLM: local HF wrapper that follows the AutoLLM interface
+# -----------------------------------------------------------------------------
+class LocalAutoLLM(AutoLLM):
+    """
+    Handles loading and generation for a Hugging Face Causal Language Model (Qwen/TinyLlama).
+    Uses 4-bit quantization for efficiency and greedy decoding by default.
+    """
+
+    def __init__(
+        self, label_mapper: Any = None, device: str = "cpu", token: str = ""
+    ) -> None:
+        super().__init__(label_mapper=label_mapper, device=device, token=token)
+        self.model = None
+        self.tokenizer = None
+
+    def load(
+        self,
+        model_id: str,
+        load_in_4bit: bool = False,
+        dtype: str = "auto",
+        trust_remote_code: bool = True,
+    ):
+        """Load tokenizer + model, applying 4-bit quantization if specified and possible."""
+
+        # Determine the target data type (default to float32 for CPU, float16 for GPU)
+        torch_dtype_val = torch.float16 if torch.cuda.is_available() else torch.float32
+
+        # Load the tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_id, trust_remote_code=trust_remote_code
+        )
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        quant_config = None
+        if load_in_4bit:
+            # Configure BitsAndBytes for 4-bit loading
+            quant_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_quant_type="nf4",
+            )
+            if torch_dtype_val is None:
+                torch_dtype_val = torch.float16
+
+        # Set device mapping (auto for multi-GPU or single GPU, explicit CPU otherwise)
+        device_map = "auto" if (self.device != "cpu") else {"": "cpu"}
+
+        # Load the Causal Language Model
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            device_map=device_map,
+            torch_dtype=torch_dtype_val,
+            quantization_config=quant_config,
+            trust_remote_code=trust_remote_code,
+        )
+
+        # Ensure model is on the correct device (redundant if device_map="auto" but safe)
+        if self.device == "cpu":
+            self.model.to("cpu")
+
+    def generate(
+        self,
+        inputs: List[str],
+        max_new_tokens: int = 64,
+        temperature: float = 0.0,
+        top_p: float = 1.0,
+    ) -> List[str]:
+        """Generate continuations for a list of prompts, returning only the generated part."""
+        if self.model is None or self.tokenizer is None:
+            raise RuntimeError("Model/tokenizer not loaded. Call .load() first.")
+
+        # --- Generation Setup ---
+        # Tokenize batch (padding is essential for batch inference)
+        enc = self.tokenizer(inputs, return_tensors="pt", padding=True, truncation=True)
+        input_ids = enc["input_ids"]
+        attention_mask = enc["attention_mask"]
+
+        # Move tensors to the model's device (e.g., cuda:0)
+        model_device = next(self.model.parameters()).device
+        input_ids = input_ids.to(model_device)
+        attention_mask = attention_mask.to(model_device)
+
+        # --- Generate ---
+        with torch.no_grad():
+            outputs = self.model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                max_new_tokens=max_new_tokens,
+                do_sample=(
+                    temperature > 0.0
+                ),  # Use greedy decoding if temperature is 0.0
+                temperature=temperature,
+                top_p=top_p,
+                pad_token_id=self.tokenizer.eos_token_id,
+            )
+
+        # --- Post-processing: Extract only the generated tail ---
+        decoded_outputs: List[str] = []
+        for i, output_ids in enumerate(outputs):
+            full_decoded_text = self.tokenizer.decode(
+                output_ids, skip_special_tokens=True
+            )
+            prompt_text = self.tokenizer.decode(input_ids[i], skip_special_tokens=True)
+
+            # Safely strip the prompt text from the full output
+            if full_decoded_text.startswith(prompt_text):
+                generated_tail = full_decoded_text[len(prompt_text) :].strip()
+            else:
+                # Fallback extraction (less robust if padding affects token indices)
+                prompt_len = input_ids.shape[1]
+                generated_tail = self.tokenizer.decode(
+                    output_ids[prompt_len:], skip_special_tokens=True
+                ).strip()
+            decoded_outputs.append(generated_tail)
+
+        return decoded_outputs
+
+
+# -----------------------------------------------------------------------------
+# Main Learner: SBUNLPFewShotLearner (Task A Text2Onto)
+# -----------------------------------------------------------------------------
+class SBUNLPFewShotLearner(AutoLearner):
+    """
+    Concrete learner implementing the Task A Text2Onto pipeline (Term and Type Extraction).
+    It uses Few-Shot prompts generated from training data for inference.
+    """
+
+    def __init__(self, model: Optional[AutoLLM] = None, device: str = "cpu"):
+        super().__init__()
+        # self.model is an instance of LocalAutoLLM
+        self.model = model or LocalAutoLLM(device=device)
+        self.device = device
+        # Cached in-memory prompt blocks built during the fit phase
+        self.fewshot_terms_block: str = ""
+        self.fewshot_types_block: str = ""
+
+    # --- Few-shot construction (terms) ---
+    def build_stratified_fewshot_prompt(
+        self,
+        documents_path: str,
+        terms_path: str,
+        sample_size: int = 28,
+        seed: int = 123,
+        max_chars_per_text: int = 1200,
+    ) -> str:
+        """
+        Builds the few-shot exemplar block for Term Extraction using stratified sampling.
+        """
+        random.seed(seed)
+
+        # Read documents (JSONL) into a list
+        corpus_documents: List[Dict[str, Any]] = []
+        with open(documents_path, "r", encoding="utf-8") as file_handle:
+            for line in file_handle:
+                if line.strip():
+                    corpus_documents.append(json.loads(line))
+
+        num_total_docs = len(corpus_documents)
+        num_sample_docs = min(sample_size, num_total_docs)
+
+        # Load the map of term -> [list of document IDs]
+        with open(terms_path, "r", encoding="utf-8") as file_handle:
+            term_to_doc_map = json.load(file_handle)
+
+        # Invert map: document ID -> [list of terms]
+        doc_id_to_terms_map = defaultdict(list)
+        for term, doc_ids in term_to_doc_map.items():
+            for doc_id in doc_ids:
+                doc_id_to_terms_map[doc_id].append(term)
+
+        # Define strata (groups of documents associated with specific terms)
+        strata_map = defaultdict(list)
+        for doc in corpus_documents:
+            doc_id = doc.get("id", "")
+            associated_terms = doc_id_to_terms_map.get(doc_id, ["no_term"])
+            for term in associated_terms:
+                strata_map[term].append(doc)
+
+        # Perform proportional sampling across strata
+        sampled_documents: List[Dict[str, Any]] = []
+        for term_str, stratum_docs in strata_map.items():
+            num_stratum_docs = len(stratum_docs)
+            if num_stratum_docs == 0:
+                continue
+
+            # Calculate proportional sample size
+            proportion = num_stratum_docs / num_total_docs
+            num_to_sample_from_stratum = int(num_sample_docs * proportion)
+
+            if num_to_sample_from_stratum > 0:
+                sampled_documents.extend(
+                    random.sample(
+                        stratum_docs, min(num_to_sample_from_stratum, num_stratum_docs)
+                    )
+                )
+
+        # Deduplicate sampled documents by ID and adjust count to exactly 'sample_size'
+        unique_docs_by_id = {}
+        for doc in sampled_documents:
+            unique_docs_by_id[doc.get("id", "")] = doc
+
+        final_sample_docs = list(unique_docs_by_id.values())
+
+        if len(final_sample_docs) > num_sample_docs:
+            final_sample_docs = random.sample(final_sample_docs, num_sample_docs)
+        elif len(final_sample_docs) < num_sample_docs:
+            remaining_docs = [
+                d for d in corpus_documents if d.get("id", "") not in unique_docs_by_id
+            ]
+            needed_count = min(
+                num_sample_docs - len(final_sample_docs), len(remaining_docs)
+            )
+            final_sample_docs.extend(random.sample(remaining_docs, needed_count))
+
+        # Format the few-shot exemplar text block
+        prompt_lines: List[str] = []
+        for doc in final_sample_docs:
+            doc_id = doc.get("id", "")
+            title = doc.get("title", "")
+            text = doc.get("text", "")
+
+            # Truncate text if it exceeds the maximum character limit
+            if max_chars_per_text and len(text) > max_chars_per_text:
+                text = text[:max_chars_per_text] + "…"
+
+            associated_terms = doc_id_to_terms_map.get(doc_id, [])
+            prompt_lines.append(
+                f"Document ID: {doc_id}\nTitle: {title}\nText: {text}\nAssociated Terms: {associated_terms}\n----------------------------------------"
+            )
+
+        prompt_block = "\n".join(prompt_lines)
+        self.fewshot_terms_block = prompt_block
+        return prompt_block
+
+    # --- Few-shot construction (types) ---
+    def build_types_fewshot_block(
+        self,
+        docs_jsonl: str,
+        terms2doc_json: str,
+        sample_per_term: int = 1,
+        full_word: bool = True,
+        case_sensitive: bool = True,
+        max_chars_per_text: int = 800,
+    ) -> str:
+        """
+        Builds the few-shot block for Type Extraction.
+        This method samples documents based on finding an associated term/type within the text.
+        """
+        # Load documents into dict by ID
+        docs_by_id = {}
+        with open(docs_jsonl, "r", encoding="utf-8") as file_handle:
+            for line in file_handle:
+                line_stripped = line.strip()
+                if line_stripped:
+                    try:
+                        doc = json.loads(line_stripped)
+                        doc_id = doc.get("id", "")
+                        if doc_id:
+                            docs_by_id[doc_id] = doc
+                    except json.JSONDecodeError:
+                        continue
+
+        # Load term -> [doc_id,...] map
+        with open(terms2doc_json, "r", encoding="utf-8") as file_handle:
+            term_to_doc_map = json.load(file_handle)
+
+        flags = 0 if case_sensitive else re.IGNORECASE
+        prompt_lines: List[str] = []
+
+        # Iterate over terms (which act as types in this context)
+        for term, doc_ids in term_to_doc_map.items():
+            escaped_term = re.escape(term)
+            # Create regex pattern for matching the term in the text
+            pattern = rf"\b{escaped_term}\b" if full_word else escaped_term
+            term_regex = re.compile(pattern, flags=flags)
+
+            picked_count = 0
+            for doc_id in doc_ids:
+                doc = docs_by_id.get(doc_id)
+                if not doc:
+                    continue
+
+                title = doc.get("title", "")
+                text = doc.get("text", "")
+
+                # Check if the term/type is actually present in the document text/title
+                if term_regex.search(f"{title} {text}"):
+                    text_content = text
+
+                    # Truncate text if necessary
+                    if max_chars_per_text and len(text_content) > max_chars_per_text:
+                        text_content = text_content[:max_chars_per_text] + "…"
+
+                    # Escape single quotes in the term for Python list formatting in the prompt
+                    term_for_prompt = term.replace("'", "\\'")
+
+                    prompt_lines.append(
+                        f"Document ID: {doc_id}\nTitle: {title}\nText: {text_content}\nAssociated Types: ['{term_for_prompt}']\n----------------------------------------"
+                    )
+                    picked_count += 1
+
+                if picked_count >= sample_per_term:
+                    break  # Move to the next term
+
+        prompt_block = "\n".join(prompt_lines)
+        self.fewshot_types_block = prompt_block
+        return prompt_block
+
+    def fit(
+        self,
+        train_docs_jsonl: str,
+        terms2doc_json: str,
+        sample_size: int = 28,
+        seed: int = 123,
+    ) -> None:
+        """
+        Fit phase: Builds and caches the few-shot prompt blocks from the training files.
+        No model training occurs (Few-Shot/In-Context Learning).
+        """
+        # Build prompt block for Term extraction
+        _ = self.build_stratified_fewshot_prompt(
+            train_docs_jsonl, terms2doc_json, sample_size=sample_size, seed=seed
+        )
+        # Build prompt block for Type extraction
+        _ = self.build_types_fewshot_block(
+            train_docs_jsonl, terms2doc_json, sample_per_term=1
+        )
+
+    # -------------------------
+    # Inference helpers (prompt construction and output parsing)
+    # -------------------------
+    def _build_term_prompt(self, example_block: str, title: str, text: str) -> str:
+        """Constructs the full prompt for Term Extraction."""
+        return f"""{example_block}
+[var]
+Title: {title}
+Text: {text}
+[var]
+Extract all relevant terms that could form the basis of an ontology from the above document.
+Return ONLY a Python list like ['term1', 'term2', ...] and nothing else.
+If no terms are found, return [].
+"""
+
+    def _build_type_prompt(self, example_block: str, title: str, text: str) -> str:
+        """Constructs the full prompt for Type Extraction."""
+        return f"""{example_block}
+[var]
+Title: {title}
+Text: {text}
+[var]
+Extract all relevant TYPES mentioned in the above document that could serve as ontology classes.
+Only consider content inside the [var] ... [var] block.
+Return ONLY a valid Python list like ['type1', 'type2'] and nothing else. If none, return [].
+"""
+
+    def _parse_list_like(self, raw_string: str) -> List[str]:
+        """Try to extract a Python list of strings from model output robustly."""
+        processed_string = raw_string.strip()
+        if processed_string in ("[]", ""):
+            return []
+
+        # 1. Try direct evaluation
+        try:
+            parsed_value = ast.literal_eval(processed_string)
+            if isinstance(parsed_value, list):
+                # Filter to ensure only strings are returned
+                return [item for item in parsed_value if isinstance(item, str)]
+        except Exception:
+            pass
+
+        # 2. Try finding and evaluating text within outermost brackets [ ... ]
+        bracket_match = re.search(r"\[[\s\S]*?\]", processed_string)
+        if bracket_match:
+            try:
+                parsed_value = ast.literal_eval(bracket_match.group(0))
+                if isinstance(parsed_value, list):
+                    return [item for item in parsed_value if isinstance(item, str)]
+            except Exception:
+                pass
+
+        # 3. Fallback: Find comma-separated quoted substrings (less robust, but catches errors)
+        # Finds content inside either single quotes ('...') or double quotes ("...")
+        quoted_matches = re.findall(r"'([^']+)'|\"([^\"]+)\"", processed_string)
+        flattened_list = [a_match or b_match for a_match, b_match in quoted_matches]
+        return flattened_list
+
+    def _call_model_one(self, prompt: str, max_new_tokens: int = 120) -> str:
+        """Calls the underlying LocalAutoLLM for a single prompt. Returns the raw tail output."""
+        # self.model is an instance of LocalAutoLLM
+        model_output = self.model.generate(
+            [prompt], max_new_tokens=max_new_tokens, temperature=0.0, top_p=1.0
+        )
+        return model_output[0] if model_output else ""
+
+    def predict_terms(
+        self,
+        docs_test_jsonl: str,
+        out_jsonl: str,
+        max_lines: int = -1,
+        max_new_tokens: int = 120,
+    ) -> int:
+        """
+        Runs Term Extraction on the test documents and saves results to a JSONL file.
+        Returns: The count of individual terms written.
+        """
+        if not self.fewshot_terms_block:
+            raise RuntimeError("Few-shot block for terms is empty. Call fit() first.")
+
+        num_written_terms = 0
+        with (
+            open(docs_test_jsonl, "r", encoding="utf-8") as file_in,
+            open(out_jsonl, "w", encoding="utf-8") as file_out,
+        ):
+            for line_index, line in enumerate(file_in, start=1):
+                if 0 < max_lines < line_index:
+                    break
+
+                try:
+                    document = json.loads(line.strip())
+                except Exception:
+                    continue  # Skip malformed JSON lines
+
+                doc_id = document.get("id", "unknown")
+                title = document.get("title", "")
+                text = document.get("text", "")
+
+                # Construct and call model
+                prompt = self._build_term_prompt(self.fewshot_terms_block, title, text)
+                raw_output = self._call_model_one(prompt, max_new_tokens=max_new_tokens)
+                predicted_terms = self._parse_list_like(raw_output)
+
+                # Write extracted terms
+                for term_or_type in predicted_terms:
+                    if isinstance(term_or_type, str) and term_or_type.strip():
+                        file_out.write(
+                            json.dumps({"doc_id": doc_id, "term": term_or_type.strip()})
+                            + "\n"
+                        )
+                        num_written_terms += 1
+
+                # Lightweight memory management for long runs
+                if line_index % 50 == 0:
+                    gc.collect()
+                    if torch.cuda.is_available():
+                        torch.cuda.empty_cache()
+
+        return num_written_terms
+
+    def predict_types(
+        self,
+        docs_test_jsonl: str,
+        out_jsonl: str,
+        max_lines: int = -1,
+        max_new_tokens: int = 120,
+    ) -> int:
+        """
+        Runs Type Extraction on the test documents and saves results to a JSONL file.
+        Returns: The count of individual types written.
+        """
+        if not self.fewshot_types_block:
+            raise RuntimeError("Few-shot block for types is empty. Call fit() first.")
+
+        num_written_types = 0
+        with (
+            open(docs_test_jsonl, "r", encoding="utf-8") as file_in,
+            open(out_jsonl, "w", encoding="utf-8") as file_out,
+        ):
+            for line_index, line in enumerate(file_in, start=1):
+                if 0 < max_lines < line_index:
+                    break
+
+                try:
+                    document = json.loads(line.strip())
+                except Exception:
+                    continue  # Skip malformed JSON lines
+
+                doc_id = document.get("id", "unknown")
+                title = document.get("title", "")
+                text = document.get("text", "")
+
+                # Construct and call model using the dedicated type prompt block
+                prompt = self._build_type_prompt(self.fewshot_types_block, title, text)
+                raw_output = self._call_model_one(prompt, max_new_tokens=max_new_tokens)
+                predicted_types = self._parse_list_like(raw_output)
+
+                # Write extracted types
+                for term_or_type in predicted_types:
+                    if isinstance(term_or_type, str) and term_or_type.strip():
+                        file_out.write(
+                            json.dumps({"doc_id": doc_id, "type": term_or_type.strip()})
+                            + "\n"
+                        )
+                        num_written_types += 1
+
+                if line_index % 50 == 0:
+                    gc.collect()
+                    if torch.cuda.is_available():
+                        torch.cuda.empty_cache()
+
+        return num_written_types
+
+    # --- Evaluation utilities (unchanged from prior definition, added docstrings) ---
+    def load_gold_pairs(self, terms2doc_path: str) -> Set[Tuple[str, str]]:
+        """Convert terms2docs JSON into a set of unique (doc_id, term) pairs, lowercased."""
+        gold_pairs = set()
+        with open(terms2doc_path, "r", encoding="utf-8") as file_handle:
+            term_to_doc_map = json.load(file_handle)
+
+        for term, doc_ids in term_to_doc_map.items():
+            clean_term = term.strip().lower()
+            for doc_id in doc_ids:
+                gold_pairs.add((doc_id, clean_term))
+        return gold_pairs
+
+    def load_predicted_pairs(
+        self, predicted_jsonl_path: str, key: str = "term"
+    ) -> Set[Tuple[str, str]]:
+        """Load predicted (doc_id, term/type) pairs from a JSONL file, lowercased."""
+        predicted_pairs = set()
+        with open(predicted_jsonl_path, "r", encoding="utf-8") as file_handle:
+            for line in file_handle:
+                try:
+                    entry = json.loads(line.strip())
+                except Exception:
+                    continue
+                doc_id = entry.get("doc_id")
+                value = entry.get(key)
+                if doc_id and value:
+                    predicted_pairs.add((doc_id, value.strip().lower()))
+        return predicted_pairs
+
+    def evaluate_extraction_f1(
+        self, terms2doc_path: str, predicted_jsonl: str, key: str = "term"
+    ) -> float:
+        """
+        Computes set-based binary Precision, Recall, and F1 score against the gold pairs.
+        """
+        # Load the ground truth and predictions
+        gold_set = self.load_gold_pairs(terms2doc_path)
+        predicted_set = self.load_predicted_pairs(predicted_jsonl, key=key)
+
+        # Build combined universe of all pairs for score calculation
+        all_pairs = sorted(gold_set | predicted_set)
+
+        # Create binary labels (1=present, 0=absent)
+        y_true = [1 if pair in gold_set else 0 for pair in all_pairs]
+        y_pred = [1 if pair in predicted_set else 0 for pair in all_pairs]
+
+        # Use scikit-learn for metric calculation
+        from sklearn.metrics import precision_recall_fscore_support
+
+        precision, recall, f1, _ = precision_recall_fscore_support(
+            y_true, y_pred, average="binary", zero_division=0
+        )
+
+        # Display results
+        num_true_positives = len(gold_set & predicted_set)
+
+        print("\n📊 Evaluation Results:")
+        print(f" ✅ Precision: {precision:.4f}")
+        print(f" ✅ Recall: {recall:.4f}")
+        print(f" ✅ F1 Score: {f1:.4f}")
+        print(f" 📌 Gold pairs: {len(gold_set)}")
+        print(f" 📌 Predicted pairs:{len(predicted_set)}")
+        print(f" 🎯 True Positives: {num_true_positives}")
+
+        return float(f1)
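For orientation, the sketch below shows how the new SBUNLP Text2Onto learner could be driven end to end, based purely on the signatures above. It is a minimal sketch, not the package's documented workflow: the model id and the JSONL/JSON file paths are placeholders, and the deep import path is used because this diff does not show what `ontolearner/learner/text2onto/__init__.py` re-exports.

```python
from ontolearner.learner.text2onto.sbunlp import LocalAutoLLM, SBUNLPFewShotLearner

# Wrap a local Hugging Face causal LM; 4-bit loading only makes sense on a CUDA device.
llm = LocalAutoLLM(device="cuda")
llm.load(model_id="Qwen/Qwen2.5-0.5B-Instruct", load_in_4bit=True)  # placeholder model id

learner = SBUNLPFewShotLearner(model=llm, device="cuda")

# fit() only builds and caches the few-shot exemplar blocks; no weights are updated.
learner.fit(train_docs_jsonl="train_documents.jsonl", terms2doc_json="train_terms2docs.json")

# Run extraction over the test split; each output line is {"doc_id": ..., "term"/"type": ...}.
learner.predict_terms("test_documents.jsonl", "predicted_terms.jsonl")
learner.predict_types("test_documents.jsonl", "predicted_types.jsonl")

# Set-based precision/recall/F1 against the gold terms2docs map (prints a summary, returns F1).
f1 = learner.evaluate_extraction_f1("gold_terms2docs.json", "predicted_terms.jsonl", key="term")
```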
{ontolearner-1.4.7.dist-info → ontolearner-1.4.9.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: OntoLearner
-Version: 1.4.7
+Version: 1.4.9
 Summary: OntoLearner: A Modular Python Library for Ontology Learning with LLMs.
 License: MIT
 License-File: LICENSE
@@ -13,8 +13,11 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Requires-Dist: Levenshtein
 Requires-Dist: bitsandbytes (>=0.45.1,<0.46.0)
 Requires-Dist: dspy (>=2.6.14,<3.0.0)
+Requires-Dist: g4f
+Requires-Dist: gensim
 Requires-Dist: huggingface-hub (>=0.34.4,<0.35.0)
 Requires-Dist: matplotlib
 Requires-Dist: mistral-common[sentencepiece] (>=1.8.5,<2.0.0)
@@ -23,6 +26,7 @@ Requires-Dist: numpy
 Requires-Dist: openpyxl
 Requires-Dist: pandas
 Requires-Dist: pathlib (==1.0.1)
+Requires-Dist: protobuf (<5)
 Requires-Dist: pydantic (==2.11.3)
 Requires-Dist: python-dotenv
 Requires-Dist: rdflib (==7.1.1)
@@ -77,16 +81,16 @@ Please refer to [Installation](https://ontolearner.readthedocs.io/installation.h

 ## 🔗 Essential Resources

-| Resource
-|
-| **[📚 OntoLearner Documentation](https://ontolearner.readthedocs.io/)**
-| **[🤗 Datasets on Hugging Face](https://huggingface.co/collections/SciKnowOrg/ontolearner-benchmarking-6823bcd051300c210b7ef68a)**
-| **
-| **[
-| **[
-| **[
-| **[
-| **[
+| Resource | Info |
+|:---------|:-----|
+| **[📚 OntoLearner Documentation](https://ontolearner.readthedocs.io/)** | OntoLearner's extensive documentation website. |
+| **[🤗 Datasets on Hugging Face](https://huggingface.co/collections/SciKnowOrg/ontolearner-benchmarking-6823bcd051300c210b7ef68a)** | Access curated, machine-readable ontologies. |
+| **[🚀 Quickstart](https://ontolearner.readthedocs.io/quickstart.html)** | Get started quickly with OntoLearner’s main features and workflow. |
+| **[🕸️ Learning Tasks](https://ontolearner.readthedocs.io/learning_tasks/learning_tasks.html)** | Explore supported ontology learning tasks like LLMs4OL Paradigm tasks and Text2Onto. |
+| **[🧠 Learner Models](https://ontolearner.readthedocs.io/learners/llm.html)** | Browse and configure various learner models, including LLMs, Retrieval, or RAG approaches. |
+| **[📚 Ontologies Documentations](https://ontolearner.readthedocs.io/benchmarking/benchmark.html)** | Review benchmark ontologies and datasets used for evaluation and training. |
+| **[🧩 How to work with Ontologizer?](https://ontolearner.readthedocs.io/ontologizer/ontology_modularization.html)** | Learn how to modularize and preprocess ontologies using the Ontologizer module. |
+| **[🤗 Ontology Metrics Dashboard](https://huggingface.co/spaces/SciKnowOrg/OntoLearner-Benchmark-Metrics)** | Benchmark ontologies with their metrics and complexity scores. |

 ## 🚀 Quick Tour
 Get started with OntoLearner in just a few lines of code. This guide demonstrates how to initialize ontologies, load datasets, and train an LLM-assisted learner for ontology engineering tasks.
@@ -132,7 +136,7 @@ task = 'non-taxonomic-re'
 ret_learner = AutoRetrieverLearner(top_k=5)
 ret_learner.load(model_id='sentence-transformers/all-MiniLM-L6-v2')

-# 5. Fit the model to training data and
+# 5. Fit the model to training data and then predict over the test data
 ret_learner.fit(train_data, task=task)
 predicts = ret_learner.predict(test_data, task=task)

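A quick way to confirm the metadata changes above after upgrading is the standard-library `importlib.metadata` snippet below; it assumes OntoLearner 1.4.9 is installed in the current environment.

```python
from importlib.metadata import requires, version

# The installed version should report 1.4.9 after the upgrade.
print(version("OntoLearner"))

# The declared dependencies should now include Levenshtein, g4f, gensim and protobuf (<5).
for requirement in requires("OntoLearner") or []:
    print(requirement)
```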