OntoLearner 1.4.10-py3-none-any.whl → 1.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ontolearner/VERSION +1 -1
- ontolearner/base/learner.py +41 -18
- ontolearner/evaluation/metrics.py +72 -32
- ontolearner/learner/__init__.py +3 -2
- ontolearner/learner/label_mapper.py +5 -4
- ontolearner/learner/llm.py +257 -0
- ontolearner/learner/prompt.py +40 -5
- ontolearner/learner/rag/__init__.py +14 -0
- ontolearner/learner/{rag.py → rag/rag.py} +7 -2
- ontolearner/learner/retriever/__init__.py +1 -1
- ontolearner/learner/retriever/{llm_retriever.py → augmented_retriever.py} +48 -39
- ontolearner/learner/retriever/learner.py +3 -4
- ontolearner/learner/taxonomy_discovery/alexbek.py +632 -310
- ontolearner/learner/taxonomy_discovery/skhnlp.py +216 -156
- ontolearner/learner/text2onto/__init__.py +1 -1
- ontolearner/learner/text2onto/alexbek.py +484 -1105
- ontolearner/learner/text2onto/sbunlp.py +498 -493
- ontolearner/ontology/biology.py +2 -3
- ontolearner/ontology/chemistry.py +16 -18
- ontolearner/ontology/ecology_environment.py +2 -3
- ontolearner/ontology/general.py +4 -6
- ontolearner/ontology/material_science_engineering.py +64 -45
- ontolearner/ontology/medicine.py +2 -3
- ontolearner/ontology/scholarly_knowledge.py +6 -9
- ontolearner/processor.py +3 -3
- ontolearner/text2onto/splitter.py +69 -6
- {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/METADATA +2 -2
- {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/RECORD +30 -29
- {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/WHEEL +1 -1
- {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/licenses/LICENSE +0 -0
ontolearner/learner/rag/__init__.py (new file)

@@ -0,0 +1,14 @@
+# Copyright (c) 2025 SciKnowOrg
+#
+# Licensed under the MIT License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://opensource.org/licenses/MIT
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .rag import AutoRAGLearner, LLMAugmentedRAGLearner
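The new subpackage `__init__` re-exports both RAG learners, so they can be imported directly from `ontolearner.learner.rag`. A minimal import sketch (the module path follows from the file layout above; any further wiring is omitted):

```python
# Import the re-exported learners from the new rag subpackage.
from ontolearner.learner.rag import AutoRAGLearner, LLMAugmentedRAGLearner
```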
ontolearner/learner/{rag.py → rag/rag.py}

@@ -14,8 +14,7 @@

 import warnings
 from typing import Any
-from
-
+from ...base import AutoLearner

 class AutoRAGLearner(AutoLearner):
     def __init__(self,

@@ -87,3 +86,9 @@ class AutoRAGLearner:
             return self.llm._non_taxonomic_re_predict(dataset=dataset)
         else:
             warnings.warn("No requirement for fiting the non-taxonomic-re model, the predict module will use the input data to do the fit as well.")
+
+
+class LLMAugmentedRAGLearner(AutoRAGLearner):
+
+    def set_augmenter(self, augmenter):
+        self.retriever.set_augmenter(augmenter=augmenter)
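The only behavior `LLMAugmentedRAGLearner` adds in this hunk is delegation of `set_augmenter` to the underlying retriever. A self-contained sketch of that delegation pattern, with placeholder classes standing in for the real retriever and learner (none of the `_Stub*` names are OntoLearner classes):

```python
# Minimal sketch: the RAG learner holds no augmenter state of its own and simply
# forwards set_augmenter() to its retriever, mirroring LLMAugmentedRAGLearner above.
class _StubRetriever:
    def __init__(self):
        self.augmenter = None

    def set_augmenter(self, augmenter):
        self.augmenter = augmenter


class _StubRAGLearner:
    def __init__(self, retriever):
        self.retriever = retriever

    def set_augmenter(self, augmenter):
        # forward to the retriever, as in LLMAugmentedRAGLearner.set_augmenter
        self.retriever.set_augmenter(augmenter=augmenter)


learner = _StubRAGLearner(_StubRetriever())
learner.set_augmenter(augmenter="any object providing transform()/top_n_candidate")
assert learner.retriever.augmenter is not None
```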
ontolearner/learner/retriever/__init__.py

@@ -16,4 +16,4 @@ from .crossencoder import CrossEncoderRetriever
 from .embedding import GloveRetriever, Word2VecRetriever
 from .ngram import NgramRetriever
 from .learner import AutoRetrieverLearner, LLMAugmentedRetrieverLearner
-from .
+from .augmented_retriever import LLMAugmenterGenerator, LLMAugmenter, LLMAugmentedRetriever
ontolearner/learner/retriever/{llm_retriever.py → augmented_retriever.py}

@@ -17,6 +17,8 @@ from typing import Any, List, Dict
 from openai import OpenAI
 import time
 from tqdm import tqdm
+import torch
+import torch.nn.functional as F

 from ...base import AutoRetriever
 from ...utils import load_json
@@ -125,7 +127,6 @@ class LLMAugmenterGenerator(ABC):
             except Exception:
                 print("sleep for 5 seconds")
                 time.sleep(5)
-
         return inference

     def tasks_data_former(self, data: Any, task: str) -> List[str] | Dict[str, List[str]]:
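The surrounding context shows the generator's retry behavior: an exception during an inference call is caught, the process sleeps for five seconds, and the call is retried before `inference` is returned. A standalone sketch of that pattern; the `max_attempts` cap and the `generate` callable are assumptions, not part of `LLMAugmenterGenerator`'s API:

```python
import time


def call_with_retries(generate, prompt, max_attempts=3, delay_seconds=5):
    """Retry loop in the spirit of LLMAugmenterGenerator: on any exception, sleep and retry."""
    inference = None
    for _ in range(max_attempts):
        try:
            inference = generate(prompt)
            break
        except Exception:
            print(f"sleep for {delay_seconds} seconds")
            time.sleep(delay_seconds)
    return inference
```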
@@ -298,21 +299,12 @@ class LLMAugmentedRetriever(AutoRetriever):
     Attributes:
         augmenter: An augmenter instance that provides transform() and top_n_candidate.
     """
-
-    def __init__(self) -> None:
-        """
-        Initialize the augmented retriever with no augmenter attached.
-        """
+    def __init__(self, threshold: float = 0.0, cutoff_rate: float = 100.0) -> None:
         super().__init__()
-        self.
+        self.threshold = threshold
+        self.cutoff_rate = cutoff_rate

     def set_augmenter(self, augmenter):
-        """
-        Attach an augmenter instance.
-
-        Args:
-            augmenter: An object providing `transform(query, task)` and `top_n_candidate`.
-        """
         self.augmenter = augmenter

     def retrieve(self, query: List[str], top_k: int = 5, batch_size: int = -1, task: str = None) -> List[List[str]]:
@@ -328,29 +320,46 @@
         Returns:
            list[list[str]]: A list of document lists, one per input query.
         """
-        (26 removed lines of the previous retrieve() body; their content is truncated in this diff view)
+        if task != 'taxonomy-discovery':
+            return super().retrieve(query=query, top_k=top_k, batch_size=batch_size)
+        return self.augmented_retrieve(query, top_k=top_k, batch_size=batch_size, task=task)
+
+    def augmented_retrieve(self, query: List[str], top_k: int = 5, batch_size: int = -1, task: str = None):
+        if self.embeddings is None:
+            raise RuntimeError("Retriever model must index documents before prediction.")
+
+        augmented_queries, index_map = [], []
+        for qu_idx, qu in enumerate(query):
+            augmented = self.augmenter.transform(qu, task=task)
+            for aug in augmented:
+                augmented_queries.append(aug)
+                index_map.append(qu_idx)
+
+        doc_norm = F.normalize(self.embeddings, p=2, dim=1)
+        results = [dict() for _ in range(len(query))]
+
+        if batch_size == -1:
+            batch_size = len(augmented_queries)
+
+        for start in range(0, len(augmented_queries), batch_size):
+            batch_aug = augmented_queries[start:start + batch_size]
+            batch_embeddings = self.embedding_model.encode(batch_aug, convert_to_tensor=True)
+            batch_norm = F.normalize(batch_embeddings, p=2, dim=1)
+            similarity_matrix = torch.matmul(batch_norm, doc_norm.T)
+            current_top_k = min(top_k, len(self.documents))
+            topk_similarities, topk_indices = torch.topk(similarity_matrix, k=current_top_k, dim=1)
+
+            for i, (doc_indices, sim_scores) in enumerate(zip(topk_indices, topk_similarities)):
+                original_query_idx = index_map[start + i]
+
+                for doc_idx, score in zip(doc_indices.tolist(), sim_scores.tolist()):
+                    if score >= self.threshold:
+                        doc = self.documents[doc_idx]
+                        prev = results[original_query_idx].get(doc, 0.0)
+                        results[original_query_idx][doc] = prev + score
+
+        final_results = []
+        for doc_score_map in results:
+            sorted_docs = sorted(doc_score_map.items(), key=lambda x: x[1], reverse=True)
+            final_results.append([doc for doc, _ in sorted_docs])
+        return final_results
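The new `augmented_retrieve` expands each query into augmenter-generated variants, scores every variant against the indexed documents by cosine similarity, keeps the per-variant top-k hits above `threshold`, and sums scores per original query before ranking. Below is a self-contained sketch of that scoring step, with random tensors standing in for `self.embeddings` and `self.embedding_model.encode(...)`; the function name and toy data are illustrative only:

```python
import torch
import torch.nn.functional as F


def accumulate_scores(variant_embeddings, index_map, doc_embeddings, documents,
                      n_queries, top_k=5, threshold=0.0):
    """Cosine-similarity scoring with per-query accumulation, mirroring augmented_retrieve()."""
    doc_norm = F.normalize(doc_embeddings, p=2, dim=1)
    var_norm = F.normalize(variant_embeddings, p=2, dim=1)
    sims = var_norm @ doc_norm.T                              # [n_variants, n_docs]
    k = min(top_k, len(documents))
    top_scores, top_idx = torch.topk(sims, k=k, dim=1)

    results = [dict() for _ in range(n_queries)]
    for row, (idx_row, score_row) in enumerate(zip(top_idx, top_scores)):
        q = index_map[row]                                    # original query this variant came from
        for doc_idx, score in zip(idx_row.tolist(), score_row.tolist()):
            if score >= threshold:
                doc = documents[doc_idx]
                results[q][doc] = results[q].get(doc, 0.0) + score

    # rank documents per original query by accumulated score
    return [[d for d, _ in sorted(r.items(), key=lambda x: x[1], reverse=True)] for r in results]


# toy example: 2 queries with 3 augmented variants in total (variants 0-1 belong to query 0)
docs = ["metal", "polymer", "ceramic", "composite"]
print(accumulate_scores(torch.randn(3, 8), [0, 0, 1], torch.randn(4, 8), docs,
                        n_queries=2, top_k=2))
```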
ontolearner/learner/retriever/learner.py

@@ -122,7 +122,6 @@ class AutoRetrieverLearner(AutoLearner):
             warnings.warn("No requirement for fiting the non-taxonomic RE model, the predict module will use the input data to do the fit as well..")


-
 class LLMAugmentedRetrieverLearner(AutoRetrieverLearner):

     def set_augmenter(self, augmenter):
@@ -160,9 +159,9 @@ class LLMAugmentedRetrieverLearner(AutoRetrieverLearner):
         taxonomic_pairs = [{"parent": candidate, "child": query}
                            for query, candidates in zip(data, candidates_lst)
                            for candidate in candidates if candidate.lower() != query.lower()]
-        taxonomic_pairs += [{"parent": query, "child": candidate}
-
-
+        # taxonomic_pairs += [{"parent": query, "child": candidate}
+        #                     for query, candidates in zip(data, candidates_lst)
+        #                     for candidate in candidates if candidate.lower() != query.lower()]
         unique_taxonomic_pairs, seen = [], set()
         for pair in taxonomic_pairs:
             key = (pair["parent"].lower(), pair["child"].lower())  # Directional key (parent, child)
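With the reverse-direction pairs commented out, only (candidate → query) pairs survive, and the directional `(parent, child)` key removes duplicates while keeping direction significant. A small self-contained sketch of that filtering; the input lists and the loop body after the key computation are illustrative assumptions, since the diff stops at the key line:

```python
# Toy inputs standing in for the learner's data / candidates_lst.
data = ["alloy", "steel"]
candidates_lst = [["material", "Material"], ["alloy", "material"]]

# Keep only (candidate -> query) pairs, skipping self-pairs, as in the hunk above.
taxonomic_pairs = [{"parent": candidate, "child": query}
                   for query, candidates in zip(data, candidates_lst)
                   for candidate in candidates if candidate.lower() != query.lower()]

# De-duplicate on a directional (parent, child) key; this loop body is an assumed completion.
unique_taxonomic_pairs, seen = [], set()
for pair in taxonomic_pairs:
    key = (pair["parent"].lower(), pair["child"].lower())
    if key not in seen:
        seen.add(key)
        unique_taxonomic_pairs.append(pair)

print(unique_taxonomic_pairs)
# [{'parent': 'material', 'child': 'alloy'}, {'parent': 'alloy', 'child': 'steel'},
#  {'parent': 'material', 'child': 'steel'}]
```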