OntoLearner 1.4.10__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. ontolearner/VERSION +1 -1
  2. ontolearner/base/learner.py +41 -18
  3. ontolearner/evaluation/metrics.py +72 -32
  4. ontolearner/learner/__init__.py +3 -2
  5. ontolearner/learner/label_mapper.py +5 -4
  6. ontolearner/learner/llm.py +257 -0
  7. ontolearner/learner/prompt.py +40 -5
  8. ontolearner/learner/rag/__init__.py +14 -0
  9. ontolearner/learner/{rag.py → rag/rag.py} +7 -2
  10. ontolearner/learner/retriever/__init__.py +1 -1
  11. ontolearner/learner/retriever/{llm_retriever.py → augmented_retriever.py} +48 -39
  12. ontolearner/learner/retriever/learner.py +3 -4
  13. ontolearner/learner/taxonomy_discovery/alexbek.py +632 -310
  14. ontolearner/learner/taxonomy_discovery/skhnlp.py +216 -156
  15. ontolearner/learner/text2onto/__init__.py +1 -1
  16. ontolearner/learner/text2onto/alexbek.py +484 -1105
  17. ontolearner/learner/text2onto/sbunlp.py +498 -493
  18. ontolearner/ontology/biology.py +2 -3
  19. ontolearner/ontology/chemistry.py +16 -18
  20. ontolearner/ontology/ecology_environment.py +2 -3
  21. ontolearner/ontology/general.py +4 -6
  22. ontolearner/ontology/material_science_engineering.py +64 -45
  23. ontolearner/ontology/medicine.py +2 -3
  24. ontolearner/ontology/scholarly_knowledge.py +6 -9
  25. ontolearner/processor.py +3 -3
  26. ontolearner/text2onto/splitter.py +69 -6
  27. {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/METADATA +2 -2
  28. {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/RECORD +30 -29
  29. {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/WHEEL +1 -1
  30. {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,14 @@
1
+ # Copyright (c) 2025 SciKnowOrg
2
+ #
3
+ # Licensed under the MIT License (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://opensource.org/licenses/MIT
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from .rag import AutoRAGLearner, LLMAugmentedRAGLearner
@@ -14,8 +14,7 @@
14
14
 
15
15
  import warnings
16
16
  from typing import Any
17
- from ..base import AutoLearner
18
-
17
+ from ...base import AutoLearner
19
18
 
20
19
  class AutoRAGLearner(AutoLearner):
21
20
  def __init__(self,
@@ -87,3 +86,9 @@ class AutoRAGLearner(AutoLearner):
87
86
  return self.llm._non_taxonomic_re_predict(dataset=dataset)
88
87
  else:
89
88
  warnings.warn("No requirement for fiting the non-taxonomic-re model, the predict module will use the input data to do the fit as well.")
89
+
90
+
91
+ class LLMAugmentedRAGLearner(AutoRAGLearner):
92
+
93
+ def set_augmenter(self, augmenter):
94
+ self.retriever.set_augmenter(augmenter=augmenter)
@@ -16,4 +16,4 @@ from .crossencoder import CrossEncoderRetriever
16
16
  from .embedding import GloveRetriever, Word2VecRetriever
17
17
  from .ngram import NgramRetriever
18
18
  from .learner import AutoRetrieverLearner, LLMAugmentedRetrieverLearner
19
- from .llm_retriever import LLMAugmenterGenerator, LLMAugmenter, LLMAugmentedRetriever
19
+ from .augmented_retriever import LLMAugmenterGenerator, LLMAugmenter, LLMAugmentedRetriever
@@ -17,6 +17,8 @@ from typing import Any, List, Dict
17
17
  from openai import OpenAI
18
18
  import time
19
19
  from tqdm import tqdm
20
+ import torch
21
+ import torch.nn.functional as F
20
22
 
21
23
  from ...base import AutoRetriever
22
24
  from ...utils import load_json
@@ -125,7 +127,6 @@ class LLMAugmenterGenerator(ABC):
125
127
  except Exception:
126
128
  print("sleep for 5 seconds")
127
129
  time.sleep(5)
128
-
129
130
  return inference
130
131
 
131
132
  def tasks_data_former(self, data: Any, task: str) -> List[str] | Dict[str, List[str]]:
@@ -298,21 +299,12 @@ class LLMAugmentedRetriever(AutoRetriever):
298
299
  Attributes:
299
300
  augmenter: An augmenter instance that provides transform() and top_n_candidate.
300
301
  """
301
-
302
- def __init__(self) -> None:
303
- """
304
- Initialize the augmented retriever with no augmenter attached.
305
- """
302
+ def __init__(self, threshold: float = 0.0, cutoff_rate: float = 100.0) -> None:
306
303
  super().__init__()
307
- self.augmenter = None
304
+ self.threshold = threshold
305
+ self.cutoff_rate = cutoff_rate
308
306
 
309
307
  def set_augmenter(self, augmenter):
310
- """
311
- Attach an augmenter instance.
312
-
313
- Args:
314
- augmenter: An object providing `transform(query, task)` and `top_n_candidate`.
315
- """
316
308
  self.augmenter = augmenter
317
309
 
318
310
  def retrieve(self, query: List[str], top_k: int = 5, batch_size: int = -1, task: str = None) -> List[List[str]]:
@@ -328,29 +320,46 @@ class LLMAugmentedRetriever(AutoRetriever):
328
320
  Returns:
329
321
  list[list[str]]: A list of document lists, one per input query.
330
322
  """
331
- parent_retrieve = super(LLMAugmentedRetriever, self).retrieve
332
-
333
- if task == 'taxonomy-discovery':
334
- query_sets = []
335
- for idx in range(self.augmenter.top_n_candidate):
336
- query_set = []
337
- for qu in query:
338
- query_set.append(self.augmenter.transform(qu, task=task)[idx])
339
- query_sets.append(query_set)
340
-
341
- retrieves = [
342
- parent_retrieve(query=query_set, top_k=top_k, batch_size=batch_size)
343
- for query_set in query_sets
344
- ]
345
-
346
- results = []
347
- for qu_idx, qu in enumerate(query):
348
- qu_result = []
349
- for top_idx in range(self.augmenter.top_n_candidate):
350
- qu_result += retrieves[top_idx][qu_idx]
351
- results.append(list(set(qu_result)))
352
-
353
- return results
354
-
355
- else:
356
- return parent_retrieve(query=query, top_k=top_k, batch_size=batch_size)
323
+ if task != 'taxonomy-discovery':
324
+ return super().retrieve(query=query, top_k=top_k, batch_size=batch_size)
325
+ return self.augmented_retrieve(query, top_k=top_k, batch_size=batch_size, task=task)
326
+
327
+ def augmented_retrieve(self, query: List[str], top_k: int = 5, batch_size: int = -1, task: str = None):
328
+ if self.embeddings is None:
329
+ raise RuntimeError("Retriever model must index documents before prediction.")
330
+
331
+ augmented_queries, index_map = [], []
332
+ for qu_idx, qu in enumerate(query):
333
+ augmented = self.augmenter.transform(qu, task=task)
334
+ for aug in augmented:
335
+ augmented_queries.append(aug)
336
+ index_map.append(qu_idx)
337
+
338
+ doc_norm = F.normalize(self.embeddings, p=2, dim=1)
339
+ results = [dict() for _ in range(len(query))]
340
+
341
+ if batch_size == -1:
342
+ batch_size = len(augmented_queries)
343
+
344
+ for start in range(0, len(augmented_queries), batch_size):
345
+ batch_aug = augmented_queries[start:start + batch_size]
346
+ batch_embeddings = self.embedding_model.encode(batch_aug, convert_to_tensor=True)
347
+ batch_norm = F.normalize(batch_embeddings, p=2, dim=1)
348
+ similarity_matrix = torch.matmul(batch_norm, doc_norm.T)
349
+ current_top_k = min(top_k, len(self.documents))
350
+ topk_similarities, topk_indices = torch.topk(similarity_matrix, k=current_top_k, dim=1)
351
+
352
+ for i, (doc_indices, sim_scores) in enumerate(zip(topk_indices, topk_similarities)):
353
+ original_query_idx = index_map[start + i]
354
+
355
+ for doc_idx, score in zip(doc_indices.tolist(), sim_scores.tolist()):
356
+ if score >= self.threshold:
357
+ doc = self.documents[doc_idx]
358
+ prev = results[original_query_idx].get(doc, 0.0)
359
+ results[original_query_idx][doc] = prev + score
360
+
361
+ final_results = []
362
+ for doc_score_map in results:
363
+ sorted_docs = sorted(doc_score_map.items(), key=lambda x: x[1], reverse=True)
364
+ final_results.append([doc for doc, _ in sorted_docs])
365
+ return final_results
@@ -122,7 +122,6 @@ class AutoRetrieverLearner(AutoLearner):
122
122
  warnings.warn("No requirement for fiting the non-taxonomic RE model, the predict module will use the input data to do the fit as well..")
123
123
 
124
124
 
125
-
126
125
  class LLMAugmentedRetrieverLearner(AutoRetrieverLearner):
127
126
 
128
127
  def set_augmenter(self, augmenter):
@@ -160,9 +159,9 @@ class LLMAugmentedRetrieverLearner(AutoRetrieverLearner):
160
159
  taxonomic_pairs = [{"parent": candidate, "child": query}
161
160
  for query, candidates in zip(data, candidates_lst)
162
161
  for candidate in candidates if candidate.lower() != query.lower()]
163
- taxonomic_pairs += [{"parent": query, "child": candidate}
164
- for query, candidates in zip(data, candidates_lst)
165
- for candidate in candidates if candidate.lower() != query.lower()]
162
+ # taxonomic_pairs += [{"parent": query, "child": candidate}
163
+ # for query, candidates in zip(data, candidates_lst)
164
+ # for candidate in candidates if candidate.lower() != query.lower()]
166
165
  unique_taxonomic_pairs, seen = [], set()
167
166
  for pair in taxonomic_pairs:
168
167
  key = (pair["parent"].lower(), pair["child"].lower()) # Directional key (parent, child)