OntoLearner 1.4.9 (ontolearner-1.4.9-py3-none-any.whl) → 1.4.11 (ontolearner-1.4.11-py3-none-any.whl)

This diff compares the contents of two publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
@@ -200,10 +200,73 @@ class SyntheticDataSplitter:
 
         return terms_splits, types_splits, docs_split, types2docs_splits
 
-    def split(self, train: float = 0.8, val: float = 0.1, test: float = 0.1):
-        split_targets, split_docs_targets = self.set_train_val_test_sizes(train_percentage=train,
-                                                                          val_percentage=val,
-                                                                          test_percentage=test)
+    def split_fine_grained(self, doc_ids):
+        """
+        Build a single split bundle containing only:
+          - docs
+          - terms
+          - types
+          - terms2docs
+          - terms2types
+        """
+        # normalize to string ids (constructor uses str(row.id))
+        doc_ids = {str(d) for d in (doc_ids or [])}
+
+        # docs + collect terms/types from docs
+        docs = []
+        terms_set = set()
+        types_set = set()
+
+        for doc_id in doc_ids:
+            doc = self.doc_id_to_doc[doc_id]
+            docs.append({"id": str(doc.id), "title": doc.title, "text": doc.text})
+
+            terms_set.update(self.doc_id_to_terms[doc_id])
+            types_set.update(self.doc_id_to_types[doc_id])
+
+        terms = sorted(terms_set)
+        types = sorted(types_set)
+
+        # terms2docs: use the constructor-built mapping and restrict to this split's doc_ids
+        terms2docs = {
+            term: sorted(list(self.term_to_doc_id.get(term, set()) & doc_ids))
+            for term in terms
+        }
+
+        # terms2types: ontology lookup (term -> parent types)
+        terms2types = {term: self.child_to_parent.get(term, []) for term in terms}
+
+        return {
+            "documents": docs,
+            "terms": terms,
+            "types": types,
+            "terms2docs": terms2docs,
+            "terms2types": terms2types,
+        }
+
+    def train_test_val_split(self, train: float = 0.8, val: float = 0.1, test: float = 0.1):
+        """
+        Returns:
+            train_split, val_split, test_split
+
+        Each split is a dict with keys:
+          - "docs"
+          - "terms"
+          - "types"
+          - "terms2docs"
+          - "terms2types"
+        """
+        # compute which docs go to which split
+        split_targets, split_docs_targets = self.set_train_val_test_sizes(
+            train_percentage=train,
+            val_percentage=val,
+            test_percentage=test,
+        )
         split_docs = self.create_train_val_test_splits(split_targets, split_docs_targets)
-        terms, types, docs, types2docs = self.generate_split_artefacts(split_docs)
-        return terms, types, docs, types2docs
+        # split_docs: {"train": set(doc_ids), "val": set(doc_ids), "test": set(doc_ids)}
+
+        train_split = self.split_fine_grained(split_docs.get("train", set()))
+        val_split = self.split_fine_grained(split_docs.get("val", set()))
+        test_split = self.split_fine_grained(split_docs.get("test", set()))
+
+        return train_split, val_split, test_split
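For orientation, here is a minimal consumption sketch of the reworked splitter API: train_test_val_split now returns three bundles (train, val, test), each built by split_fine_grained. The bundle below is hand-written with made-up values to illustrate the returned shape only; note that the actual key is "documents", even though the train_test_val_split docstring lists "docs".

    # Hand-made bundle mirroring the shape returned by split_fine_grained
    # (illustrative values, not output from a real run).
    bundle = {
        "documents": [{"id": "1", "title": "Enzymes", "text": "..."}],
        "terms": ["enzyme"],
        "types": ["Protein"],
        "terms2docs": {"enzyme": ["1"]},          # doc ids restricted to this split
        "terms2types": {"enzyme": ["Protein"]},   # parent types from the ontology
    }

    for doc in bundle["documents"]:
        print(doc["id"], doc["title"])
    for term in bundle["terms"]:
        print(term, "->", bundle["terms2types"][term], "in docs", bundle["terms2docs"][term])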
@@ -14,6 +14,7 @@
 
 import logging
 import time
+import numpy as np
 from abc import ABC
 from rdflib import RDF, RDFS, OWL
 from collections import defaultdict
@@ -186,6 +187,56 @@ class Analyzer(ABC):
 
         return metrics
 
+    @staticmethod
+    def compute_complexity_score(
+        topology_metrics: TopologyMetrics,
+        dataset_metrics: DatasetMetrics,
+        a: float = 0.4,
+        b: float = 6.0,
+        eps: float = 1e-12
+    ) -> float:
+        """
+        Compute a single normalized complexity score for an ontology.
+
+        This function combines structural topology metrics and dataset quality metrics
+        into a weighted aggregate score, then applies a logistic transformation to
+        normalize it to the range [0, 1]. The score reflects overall ontology complexity,
+        considering graph structure, hierarchy, breadth, coverage, and dataset richness.
+
+        Args:
+            topology_metrics (TopologyMetrics): Precomputed structural metrics of the ontology graph.
+            dataset_metrics (DatasetMetrics): Precomputed metrics of extracted learning datasets.
+            a (float, optional): Steepness parameter for the logistic normalization function. Default is 0.4.
+            b (float, optional): Centering parameter for the logistic function; should be tuned to match the scale of the aggregated metrics. Default is 6.0.
+            eps (float, optional): Small epsilon to prevent numerical issues in the logistic computation. Default is 1e-12.
+
+        Returns:
+            float: Normalized complexity score in [0, 1], where higher values indicate more complex ontologies.
+
+        Notes:
+            - Weights are assigned to different metric categories: graph metrics, coverage metrics, hierarchy metrics,
+              breadth metrics, and dataset metrics (term-types, taxonomic, non-taxonomic relations).
+            - Metrics are log-normalized before weighting to reduce scale differences.
+            - The logistic transformation ensures the final score is bounded and interpretable.
+        """
+        # Define metric categories with their weights
+        metric_categories = {
+            0.3: ["total_nodes", "total_edges", "num_root_nodes", "num_leaf_nodes"],
+            0.25: ["num_classes", "num_properties", "num_individuals"],
+            0.10: ["max_depth", "min_depth", "avg_depth", "depth_variance"],
+            0.20: ["max_breadth", "min_breadth", "avg_breadth", "breadth_variance"],
+            0.15: ["num_term_types", "num_taxonomic_relations", "num_non_taxonomic_relations", "avg_terms"]
+        }
+        weights = {metric: weight for weight, metrics in metric_categories.items() for metric in metrics}
+        metrics = [metric for _, metric_list in metric_categories.items() for metric in metric_list]
+        onto_metrics = {**topology_metrics.__dict__, **dataset_metrics.__dict__}
+        norm_weighted_values = [np.log1p(onto_metrics[m]) * weights[m] for m in metrics if m in onto_metrics]
+        total_weight = sum(weights[m] for m in metrics if m in onto_metrics)
+        weighted_sum = sum(norm_weighted_values) / total_weight if total_weight > 0 else 0.0
+        complexity_score = 1.0 / (1.0 + np.exp(-a * (weighted_sum - b) + eps))
+        return complexity_score
+
+
     @staticmethod
     def compute_dataset_metrics(ontology: BaseOntology) -> DatasetMetrics:
         """
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: OntoLearner
-Version: 1.4.9
+Version: 1.4.11
 Summary: OntoLearner: A Modular Python Library for Ontology Learning with LLMs.
 License: MIT
 License-File: LICENSE
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: Levenshtein
-Requires-Dist: bitsandbytes (>=0.45.1,<0.46.0)
+Requires-Dist: bitsandbytes (>=0.45.1,<1.0.0) ; platform_system == "Linux"
 Requires-Dist: dspy (>=2.6.14,<3.0.0)
 Requires-Dist: g4f
 Requires-Dist: gensim
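The bitsandbytes requirement is both relaxed (<1.0.0 instead of <0.46.0) and gated behind a PEP 508 environment marker, so it is only resolved on Linux hosts. As a quick sanity check, the marker can be evaluated locally with the packaging library (which pip vendors):

    from packaging.markers import Marker

    # Evaluates the marker against the running interpreter's environment.
    marker = Marker('platform_system == "Linux"')
    print(marker.evaluate())  # True on Linux; elsewhere the requirement is skipped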
@@ -1,27 +1,28 @@
-ontolearner/VERSION,sha256=x-xbkXEIv48hifmVFcVtJDdZj6d_bmXwy3Lp4d5pPVY,6
+ontolearner/VERSION,sha256=IUVii4YAC88nU9izX_pBt1ZZ3pyLpyv3xOe7B9Tzuzo,7
 ontolearner/__init__.py,sha256=E4yukFv2PV4uyztTPDWljCySY9AVDcDDzabuvxfabYE,1889
 ontolearner/_learner.py,sha256=2CRQvpsz8akIOdxTs2-KLJ-MssULrjpK-QDD3QXUJXI,5297
 ontolearner/_ontology.py,sha256=W1mp195SImqLKwaj4ueEaBWuLJg2jUdx1JT20Ds3fmQ,6950
 ontolearner/base/__init__.py,sha256=5pf-ltxzGp32xhEcPdbtm11wXJrYJMUeWG-mbcAYD8Q,705
-ontolearner/base/learner.py,sha256=latiGv8p3nyPrxMp7g5B2MSF-JEInRwIlbOn09uh7io,18899
-ontolearner/base/ontology.py,sha256=JbMJ1-WUyHWQiNJL-DeaqcriUimLdqN3_ESROgqOPTQ,24772
+ontolearner/base/learner.py,sha256=dWMiIBhdvxZLxIWrTq4d4LbyCqDuAmTwfmxwa7UkjfQ,20075
+ontolearner/base/ontology.py,sha256=t7n81Vk8Y5BCK88AYIyNKd7d1LjJnoTlXigyPvrLxR4,24784
 ontolearner/base/text2onto.py,sha256=iUXYZoqnwgebQuQzM-XSGTVRfHLlhjUK_z5XUvhRICc,5388
 ontolearner/data_structure/__init__.py,sha256=1HiKvk8FKjhYeI92RHnJXxyQbUJBi3JFytjQjthsY_s,599
 ontolearner/data_structure/data.py,sha256=jUUDfqsOZcEqIR83SRboiKibPdA_JquI1uOEiQQ_lqY,11273
 ontolearner/data_structure/metric.py,sha256=4QKkZ5L1YK6hDTU-N5Z9I9Ha99DVHmGfYxK7N2qdhfc,7589
 ontolearner/evaluation/__init__.py,sha256=4BZr3BUXjQDTj4Aqlqy4THa80lZPsMuh1EBTCyi9Wig,842
 ontolearner/evaluation/evaluate.py,sha256=NYCVcmPqpyIxYZrMAim37gL-erdh698RD3t3eNTTgZc,1163
-ontolearner/evaluation/metrics.py,sha256=3Aw6ycJ3_Q6xfj4tMBJP6QcexUei0G16H0ZQWt87aRU,6286
-ontolearner/learner/__init__.py,sha256=RKREPrrjzQ5KYvcOwC_2l7yFKwFBd6HoCwhX2H6Spg8,798
-ontolearner/learner/label_mapper.py,sha256=YMPeFKzJxoCYNU5z7QRYPbB88sWdu1iT6iBDpPsjn-4,3792
+ontolearner/evaluation/metrics.py,sha256=rgEHwkvtWIZ8BB8dNN5bjwptV70F4Y9RRLp9j2xeAuE,7914
+ontolearner/learner/__init__.py,sha256=8NOPB8IaU04Ae5aWnIm6B0rcijSYN6z3xJElzzKD34I,822
+ontolearner/learner/label_mapper.py,sha256=SiceknqOW2ORX7K4-ljLJYay8DQrKF6Dwv-wUg7uQ78,3793
 ontolearner/learner/llm.py,sha256=3kq_IrwEPTFgeNVKZH9Er_OydJuDpRBtM3YXNNa8_KA,10343
-ontolearner/learner/prompt.py,sha256=0ckH7xphIDKczPe7G-rwiOxFGZ7RsLnpPlNW92b-31U,1574
-ontolearner/learner/rag.py,sha256=eysB2RvcWkVo53s8-kSbZtJv904YVTmdtxplM4ukUKM,4283
-ontolearner/learner/retriever/__init__.py,sha256=G5XuJcTblqXVWboVW9StJ2Vo2xACp_kG5_w2nrueqlc,854
+ontolearner/learner/prompt.py,sha256=1z8KRLrvRBS8QFoJYGlaajwHi4P4CZezhNQv3WcIfNA,2464
+ontolearner/learner/rag/__init__.py,sha256=NpnBBw5wqZ1MNtpIJ7zT-LWm5IL9aDEzwlbKPo_kCis,612
+ontolearner/learner/rag/rag.py,sha256=apnjK4KvVzFmzF6CmxtZoYoE9NAscRVULTt8Nj5wcWk,4430
+ontolearner/learner/retriever/__init__.py,sha256=ECMEEtwacnugiJ-sADVGidS88pC57nVi299vKb5R16U,860
+ontolearner/learner/retriever/augmented_retriever.py,sha256=tI4z7IbbaShOY-BxOreEGe7fhemz7l48diB2Erri3ek,14004
 ontolearner/learner/retriever/crossencoder.py,sha256=yurzGE4zydlBSwUefi1CugsWv34HEZ61qADG_-nILbo,4996
 ontolearner/learner/retriever/embedding.py,sha256=Lp9oA7LiOYaSWDvzG779KMv5keNl6Xv7hw0WpeaepDE,7875
-ontolearner/learner/retriever/learner.py,sha256=VcarTwwR8HNddJCh0loCQejDzZ_GO4NkdQUjEhLVy48,11181
-ontolearner/learner/retriever/llm_retriever.py,sha256=goInWYxrD9PSo_EsSKbNV8wEaSPvWY3LEC8XM7jlH64,12917
+ontolearner/learner/retriever/learner.py,sha256=bMkXj_MnzBRQDjPloqnOYEj400fsO6CFBfUql7gHIxw,11184
 ontolearner/learner/retriever/ngram.py,sha256=XgS1OeheKEIi7wfJHZgS8mWxKv9MQrP0apOJD_XSOnM,4575
 ontolearner/learner/taxonomy_discovery/__init__.py,sha256=-Hb5Dl6_6c4l1uIT2zWtyBWMq5cjVD4PNjxt5qJePl4,747
 ontolearner/learner/taxonomy_discovery/alexbek.py,sha256=kFEDvoKxLf-sB7-d5REkcC0DqXZpcA6ZSJ2QHrNoC5E,19010
@@ -32,9 +33,9 @@ ontolearner/learner/term_typing/__init__.py,sha256=2rBbgp8683GNVgB58T4xe76l4m-NT
 ontolearner/learner/term_typing/alexbek.py,sha256=SzWQbndkhAjxETVbrJ4uyH7ykL_TMIwHozSS08zwjoM,46684
 ontolearner/learner/term_typing/rwthdbis.py,sha256=F6Jr1SrsbDOIe0Ee_FkDVGTG4wRWpM-R2YqrqEQiex0,14576
 ontolearner/learner/term_typing/sbunlp.py,sha256=Xd3UqMO3m_Skn_2geTN22MGQmSD6R8bYfPgubZre3IE,19820
-ontolearner/learner/text2onto/__init__.py,sha256=4-G6iel0Nxcj4nzPxUDqtFf9CMCzi8LghooOSAnbNfc,641
-ontolearner/learner/text2onto/alexbek.py,sha256=MySzxJUR0F3UyeS5rPIN988xxtPaoAxDFkBc-Q0vFTE,45494
-ontolearner/learner/text2onto/sbunlp.py,sha256=5p-s2Ixtntws5eO3gOUyYLpfZpCbOE0hG5gEcCwKHz4,24177
+ontolearner/learner/text2onto/__init__.py,sha256=bLv25lJmgQymgMfhr6JTezMndpDMk9ihheY-VLE-nRI,644
+ontolearner/learner/text2onto/alexbek.py,sha256=0CE5KHgB47tXASgscYH-W3X17XtI2QBtTuhDNpGGaUI,23347
+ontolearner/learner/text2onto/sbunlp.py,sha256=-ULysm_iFUMqEsxNRwgZVcq-70nBzlXMR5BeMezUjjw,23786
 ontolearner/ontology/__init__.py,sha256=F9Ta1qCX9mOxIK5CPRypEoglQNkpJ6SJpqziz73xKQE,1328
 ontolearner/ontology/agriculture.py,sha256=ZaXHNEFjbtsMH8M7HQ8ypnfJS4TUQy_as16fwv-kOKA,5903
 ontolearner/ontology/arts_humanities.py,sha256=K4ceDJL6PfIfSJZ86uQUkUXOVoiERG6ItgvVE2lhLKk,3996
@@ -62,15 +63,15 @@ ontolearner/processor.py,sha256=LaPUr4BSmPZDINo5t55q9U0i9lLXa77u4pN38usQMBc,4817
 ontolearner/text2onto/__init__.py,sha256=YbbDYpHYSMA4dof-7y40PKYsiRO7wvoXZ2LbsRwpPJE,645
 ontolearner/text2onto/batchifier.py,sha256=2CljvcZo0EDW3sHHcG9d5w26RcRwbMsQdFB1j-vCam4,6646
 ontolearner/text2onto/general.py,sha256=2RUFMbWm7qLq3MJHsyNb3rgYkGcicnkbiH2wdPBsBps,1099
-ontolearner/text2onto/splitter.py,sha256=7SrFeUM5GZTTvbrve9RRKtBjELlkpnMkyPluO614PYM,10941
+ontolearner/text2onto/splitter.py,sha256=PeiVbw5zuNlX3dhtcIJRVCghNizWE8ugIGV7OigR5Ac,12743
 ontolearner/text2onto/synthesizer.py,sha256=tSJgPTFWVKBQi2RqLQfMhX_noXeNLh2Wq2Ezbqyv-OA,5486
 ontolearner/tools/__init__.py,sha256=IB5ycAW5vUDKeq-NAMMbwjSFzwSzC-5j0UobIzO3ZmI,623
-ontolearner/tools/analyzer.py,sha256=8iL9wY1ESh4RumSW-s28EtXjtjPj71IKp0MBK0ograg,9925
+ontolearner/tools/analyzer.py,sha256=1SooAT7qYqDIrHyvHXnrBRmuPwZhLK1uj26OiKRECc0,12989
 ontolearner/tools/visualizer.py,sha256=cwijl4yYaS1SCLM5wbvRTEcbQj9Bjo4fHzZR6q6o8qo,6267
 ontolearner/utils/__init__.py,sha256=pSEyU3dlPMADBqygqaaid44RdWf0Lo3Fvz-K_rQ7_Bw,733
 ontolearner/utils/io.py,sha256=3DqGK2p7c0onKi0Xxs16WB08uHfHUId3bW0dDKwyS0g,2110
 ontolearner/utils/train_test_split.py,sha256=Zlm42eT6QGWwlySyomCPIiTGmGqeN_h4z4xBY2EAOR8,11530
-ontolearner-1.4.9.dist-info/METADATA,sha256=c_V_1mUkxAhzJz04u1wRYU7xodpZQdiJXBVFzUCIMK8,11444
-ontolearner-1.4.9.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
-ontolearner-1.4.9.dist-info/licenses/LICENSE,sha256=krXMLuMKgzX-UgaufgfJdm9ojIloZot7ZdvJUnNxl4I,1067
-ontolearner-1.4.9.dist-info/RECORD,,
+ontolearner-1.4.11.dist-info/METADATA,sha256=YDJySz7VAXa80XACaj-WDyuHtFhticcNqEmQVaR8Jsg,11473
+ontolearner-1.4.11.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ontolearner-1.4.11.dist-info/licenses/LICENSE,sha256=krXMLuMKgzX-UgaufgfJdm9ojIloZot7ZdvJUnNxl4I,1067
+ontolearner-1.4.11.dist-info/RECORD,,