OntoLearner 1.4.10__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. ontolearner/VERSION +1 -1
  2. ontolearner/base/learner.py +41 -18
  3. ontolearner/evaluation/metrics.py +72 -32
  4. ontolearner/learner/__init__.py +3 -2
  5. ontolearner/learner/label_mapper.py +5 -4
  6. ontolearner/learner/llm.py +257 -0
  7. ontolearner/learner/prompt.py +40 -5
  8. ontolearner/learner/rag/__init__.py +14 -0
  9. ontolearner/learner/{rag.py → rag/rag.py} +7 -2
  10. ontolearner/learner/retriever/__init__.py +1 -1
  11. ontolearner/learner/retriever/{llm_retriever.py → augmented_retriever.py} +48 -39
  12. ontolearner/learner/retriever/learner.py +3 -4
  13. ontolearner/learner/taxonomy_discovery/alexbek.py +632 -310
  14. ontolearner/learner/taxonomy_discovery/skhnlp.py +216 -156
  15. ontolearner/learner/text2onto/__init__.py +1 -1
  16. ontolearner/learner/text2onto/alexbek.py +484 -1105
  17. ontolearner/learner/text2onto/sbunlp.py +498 -493
  18. ontolearner/ontology/biology.py +2 -3
  19. ontolearner/ontology/chemistry.py +16 -18
  20. ontolearner/ontology/ecology_environment.py +2 -3
  21. ontolearner/ontology/general.py +4 -6
  22. ontolearner/ontology/material_science_engineering.py +64 -45
  23. ontolearner/ontology/medicine.py +2 -3
  24. ontolearner/ontology/scholarly_knowledge.py +6 -9
  25. ontolearner/processor.py +3 -3
  26. ontolearner/text2onto/splitter.py +69 -6
  27. {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/METADATA +2 -2
  28. {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/RECORD +30 -29
  29. {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/WHEEL +1 -1
  30. {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -70,15 +70,14 @@ class GO(BaseOntology):
70
70
  format = "OWL"
71
71
  download_url = "https://geneontology.org/docs/download-ontology/"
72
72
 
73
- @staticmethod
74
- def _is_anonymous_id(label: str) -> bool:
73
+ def _is_anonymous_id(self, label: str) -> bool:
75
74
  """Override to handle GO-specific blank nodes."""
76
75
  # Check the general patterns from the parent class
77
76
  # GO-specific patterns
78
77
  if label.startswith('GO_'):
79
78
  return True
80
79
 
81
- if BaseOntology._is_anonymous_id(label):
80
+ if super()._is_anonymous_id(label):
82
81
  return True
83
82
 
84
83
  return False
@@ -57,15 +57,15 @@ class ChEBI(BaseOntology):
57
57
  format = "OWL"
58
58
  download_url = "https://www.ebi.ac.uk/chebi/"
59
59
 
60
- @staticmethod
61
- def _is_anonymous_id(label: str) -> bool:
60
+
61
+ def _is_anonymous_id(self, label: str) -> bool:
62
62
  """Override to handle ChEBI-specific blank nodes."""
63
63
  # ChEBI-specific patterns
64
64
  if re.match(r'^CHEBI_[0-9]+$', label):
65
65
  return True
66
66
 
67
67
  # Check the general patterns from the parent class
68
- if BaseOntology._is_anonymous_id(label):
68
+ if super()._is_anonymous_id(label):
69
69
  return True
70
70
 
71
71
  return False
@@ -89,15 +89,15 @@ class CHEMINF(BaseOntology):
89
89
  format = "OWL"
90
90
  download_url = "https://terminology.tib.eu/ts/ontologies/CHEMINF"
91
91
 
92
- @staticmethod
93
- def _is_anonymous_id(label: str) -> bool:
92
+
93
+ def _is_anonymous_id(self, label: str) -> bool:
94
94
  """Override to handle CHEMINF-specific blank nodes."""
95
95
  # CHEMINF-specific patterns
96
96
  if re.match(r'^CHEMINF_[0-9]+$', label):
97
97
  return True
98
98
 
99
99
  # Check the general patterns from the parent class
100
- if BaseOntology._is_anonymous_id(label):
100
+ if super()._is_anonymous_id(label):
101
101
  return True
102
102
 
103
103
  return False
@@ -145,15 +145,15 @@ class ChMO(BaseOntology):
145
145
  format = "OWL"
146
146
  download_url = "https://github.com/rsc-ontologies/rsc-cmo"
147
147
 
148
- @staticmethod
149
- def _is_anonymous_id(label: str) -> bool:
148
+
149
+ def _is_anonymous_id(self, label: str) -> bool:
150
150
  """Override to handle ChMO-specific blank nodes."""
151
151
  # ChMO-specific patterns
152
152
  if re.match(r'^CHMO_[0-9]+$', label):
153
153
  return True
154
154
 
155
155
  # Check the general patterns from the parent class
156
- if BaseOntology._is_anonymous_id(label):
156
+ if super()._is_anonymous_id(label):
157
157
  return True
158
158
 
159
159
  return False
@@ -190,15 +190,15 @@ class MassSpectrometry(BaseOntology):
190
190
  format = "OWL"
191
191
  download_url = "https://terminology.tib.eu/ts/ontologies/MS"
192
192
 
193
- @staticmethod
194
- def _is_anonymous_id(label: str) -> bool:
193
+
194
+ def _is_anonymous_id(self, label: str) -> bool:
195
195
  """Override to handle MassSpectrometry-specific blank nodes."""
196
196
  # MassSpectrometry-specific patterns
197
197
  if re.match(r'^PEFF_[0-9]+$', label):
198
198
  return True
199
199
 
200
200
  # Check the general patterns from the parent class
201
- if BaseOntology._is_anonymous_id(label):
201
+ if super()._is_anonymous_id(label):
202
202
  return True
203
203
 
204
204
  return False
@@ -220,8 +220,7 @@ class MOP(BaseOntology):
220
220
  format = "OWL"
221
221
  download_url = "https://terminology.tib.eu/ts/ontologies/MOP"
222
222
 
223
- @staticmethod
224
- def _is_anonymous_id(label: str) -> bool:
223
+ def _is_anonymous_id(self, label: str) -> bool:
225
224
  """Override to handle MOP-specific blank nodes."""
226
225
  # MOP-specific patterns
227
226
  if re.match(r'^MOP_[0-9]+$', label):
@@ -234,7 +233,7 @@ class MOP(BaseOntology):
234
233
  return True
235
234
 
236
235
  # Check the general patterns from the parent class
237
- if BaseOntology._is_anonymous_id(label):
236
+ if super()._is_anonymous_id(label):
238
237
  return True
239
238
 
240
239
  return False
@@ -357,8 +356,7 @@ class RXNO(BaseOntology):
357
356
  format = "OWL"
358
357
  download_url = "https://github.com/rsc-ontologies/rxno"
359
358
 
360
- @staticmethod
361
- def _is_anonymous_id(label: str) -> bool:
359
+ def _is_anonymous_id(self, label: str) -> bool:
362
360
  """Override to handle RXNO-specific blank nodes."""
363
361
  # RXNO-specific patterns
364
362
  if re.match(r'^RXNO_[0-9]+$', label):
@@ -368,7 +366,7 @@ class RXNO(BaseOntology):
368
366
  return True
369
367
 
370
368
  # Check the general patterns from the parent class
371
- if BaseOntology._is_anonymous_id(label):
369
+ if super()._is_anonymous_id(label):
372
370
  return True
373
371
 
374
372
  return False
@@ -35,15 +35,14 @@ class ENVO(BaseOntology):
35
35
  format = "OWL"
36
36
  download_url = "https://obofoundry.org/ontology/envo.html"
37
37
 
38
- @staticmethod
39
- def _is_anonymous_id(label: str) -> bool:
38
+ def _is_anonymous_id(self, label: str) -> bool:
40
39
  """Override to handle ENVO-specific blank nodes."""
41
40
  # ENVO-specific patterns
42
41
  if re.match(r'^PATO_[0-9]+$', label):
43
42
  return True
44
43
 
45
44
  # Check the general patterns from the parent class
46
- if BaseOntology._is_anonymous_id(label):
45
+ if super()._is_anonymous_id(label):
47
46
  return True
48
47
 
49
48
  return False
@@ -53,15 +53,14 @@ class DBpedia(BaseOntology):
53
53
  format = "OWL"
54
54
  download_url = "https://wiki.dbpedia.org/"
55
55
 
56
- @staticmethod
57
- def _is_anonymous_id(label: str) -> bool:
56
+ def _is_anonymous_id(self, label: str) -> bool:
58
57
  """Override to handle DBpedia/Wikidata-specific blank nodes."""
59
58
  # DBpedia/Wikidata-specific patterns
60
59
  if re.match(r'^Q[0-9]+$', label):
61
60
  return True
62
61
 
63
62
  # Check the general patterns from the parent class
64
- if BaseOntology._is_anonymous_id(label):
63
+ if super()._is_anonymous_id(label):
65
64
  return True
66
65
 
67
66
  return False
@@ -226,15 +225,14 @@ class UMBEL(BaseOntology):
226
225
  format = "n3"
227
226
  download_url = "https://github.com/structureddynamics/UMBEL/tree/master/Ontology"
228
227
 
229
- @staticmethod
230
- def _is_anonymous_id(label: str) -> bool:
228
+ def _is_anonymous_id(self, label: str) -> bool:
231
229
  """Override to handle UMBEL-specific blank nodes."""
232
230
  # UMBEL-specific patterns
233
231
  if re.match(r'^f5295f96ac3e649dcb1740b0d93d3e6c2b[0-9a-f]+$', label): # Long hexadecimal identifiers
234
232
  return True
235
233
 
236
234
  # Check the general patterns from the parent class
237
- if BaseOntology._is_anonymous_id(label):
235
+ if super()._is_anonymous_id(label):
238
236
  return True
239
237
 
240
238
  return False
@@ -74,15 +74,14 @@ class Atomistic(BaseOntology):
74
74
  format = "TTL"
75
75
  download_url = "https://github.com/emmo-repo/domain-atomistic"
76
76
 
77
- @staticmethod
78
- def _is_anonymous_id(label: str) -> bool:
77
+ def _is_anonymous_id(self, label: str) -> bool:
79
78
  """Override to handle Atomistic-specific blank nodes."""
80
79
  # EMMO-specific patterns (UUID format) in Atomistic
81
80
  if re.match(r'^EMMO_[0-9a-f]{8}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{12}$', label):
82
81
  return True
83
82
 
84
83
  # Check the general patterns from the parent class
85
- if BaseOntology._is_anonymous_id(label):
84
+ if super()._is_anonymous_id(label):
86
85
  return True
87
86
 
88
87
  return False
@@ -106,8 +105,8 @@ class BattINFO(BaseOntology):
106
105
  format = "TTL"
107
106
  download_url = "https://github.com/BIG-MAP/BattINFO"
108
107
 
109
- @staticmethod
110
- def _is_anonymous_id(label: str) -> bool:
108
+
109
+ def _is_anonymous_id(self, label: str) -> bool:
111
110
  """Override to handle BattINFO-specific blank nodes."""
112
111
  # UUID pattern for various prefixes
113
112
  uuid_pattern = r'[0-9a-f]{8}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{12}$'
@@ -123,7 +122,7 @@ class BattINFO(BaseOntology):
123
122
  return True
124
123
 
125
124
  # Check the general patterns from the parent class
126
- if BaseOntology._is_anonymous_id(label):
125
+ if super()._is_anonymous_id(label):
127
126
  return True
128
127
 
129
128
  return False
@@ -168,8 +167,7 @@ class BVCO(BaseOntology):
168
167
  format = "TTL"
169
168
  download_url = "https://github.com/Battery-Value-Chain-Ontology/ontology"
170
169
 
171
- @staticmethod
172
- def _is_anonymous_id(label: str) -> bool:
170
+ def _is_anonymous_id(self, label: str) -> bool:
173
171
  """Override to handle BVCO-specific blank nodes."""
174
172
  # UUID pattern for various prefixes
175
173
  uuid_pattern = r'[0-9a-f]{8}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{12}$'
@@ -184,7 +182,7 @@ class BVCO(BaseOntology):
184
182
  return True
185
183
 
186
184
  # Check the general patterns from the parent class
187
- if BaseOntology._is_anonymous_id(label):
185
+ if super()._is_anonymous_id(label):
188
186
  return True
189
187
 
190
188
  return False
@@ -277,8 +275,7 @@ class CHAMEO(BaseOntology):
277
275
  format = "TTL"
278
276
  download_url = "https://github.com/emmo-repo/domain-characterisation-methodology"
279
277
 
280
- @staticmethod
281
- def _is_anonymous_id(label: str) -> bool:
278
+ def _is_anonymous_id(self, label: str) -> bool:
282
279
  """Override to handle CHAMEO-specific blank nodes."""
283
280
  # UUID pattern for various prefixes
284
281
  uuid_pattern = r'[0-9a-f]{8}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{12}$'
@@ -287,7 +284,7 @@ class CHAMEO(BaseOntology):
287
284
  return True
288
285
 
289
286
  # Check the general patterns from the parent class
290
- if BaseOntology._is_anonymous_id(label):
287
+ if super()._is_anonymous_id(label):
291
288
  return True
292
289
 
293
290
  return False
@@ -400,8 +397,7 @@ class EMMOCrystallography(BaseOntology):
400
397
  format = "TTL"
401
398
  download_url = "https://github.com/emmo-repo/domain-crystallography"
402
399
 
403
- @staticmethod
404
- def _is_anonymous_id(label: str) -> bool:
400
+ def _is_anonymous_id(self, label: str) -> bool:
405
401
  """Override to handle EMMOCrystallography-specific blank nodes."""
406
402
  # UUID pattern for various prefixes
407
403
  uuid_pattern = r'[0-9a-f]{8}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{12}$'
@@ -410,7 +406,7 @@ class EMMOCrystallography(BaseOntology):
410
406
  return True
411
407
 
412
408
  # Check the general patterns from the parent class
413
- if BaseOntology._is_anonymous_id(label):
409
+ if super()._is_anonymous_id(label):
414
410
  return True
415
411
 
416
412
  return False
@@ -451,8 +447,7 @@ class GPO(BaseOntology):
451
447
  format = "TTL"
452
448
  download_url = "https://github.com/General-Process-Ontology/ontology"
453
449
 
454
- @staticmethod
455
- def _is_anonymous_id(label: str) -> bool:
450
+ def _is_anonymous_id(self, label: str) -> bool:
456
451
  """Override to handle GPO-specific blank nodes."""
457
452
  # UUID pattern for various prefixes
458
453
  uuid_pattern = r'[0-9a-f]{8}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{12}$'
@@ -463,7 +458,7 @@ class GPO(BaseOntology):
463
458
  return True
464
459
 
465
460
  # Check the general patterns from the parent class
466
- if BaseOntology._is_anonymous_id(label):
461
+ if super()._is_anonymous_id(label):
467
462
  return True
468
463
 
469
464
  return False
@@ -697,8 +692,7 @@ class MechanicalTesting(BaseOntology):
697
692
  format = "OWL"
698
693
  download_url = "https://github.com/emmo-repo/domain-mechanical-testing"
699
694
 
700
- @staticmethod
701
- def _is_anonymous_id(label: str) -> bool:
695
+ def _is_anonymous_id(self, label: str) -> bool:
702
696
  """Override to handle MechanicalTesting-specific blank nodes."""
703
697
  # UUID pattern for various prefixes
704
698
  uuid_pattern = r'[0-9a-f]{8}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{12}$'
@@ -707,7 +701,7 @@ class MechanicalTesting(BaseOntology):
707
701
  return True
708
702
 
709
703
  # Check the general patterns from the parent class
710
- if BaseOntology._is_anonymous_id(label):
704
+ if super()._is_anonymous_id(label):
711
705
  return True
712
706
 
713
707
  return False
@@ -731,8 +725,7 @@ class MicroStructures(BaseOntology):
731
725
  format = "OWL"
732
726
  download_url = "https://github.com/jesper-friis/emmo-microstructure"
733
727
 
734
- @staticmethod
735
- def _is_anonymous_id(label: str) -> bool:
728
+ def _is_anonymous_id(self, label: str) -> bool:
736
729
  """Override to handle MicroStructures-specific blank nodes."""
737
730
  # UUID pattern for various prefixes
738
731
  uuid_pattern = r'[0-9a-f]{8}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{12}$'
@@ -741,7 +734,7 @@ class MicroStructures(BaseOntology):
741
734
  return True
742
735
 
743
736
  # Check the general patterns from the parent class
744
- if BaseOntology._is_anonymous_id(label):
737
+ if super()._is_anonymous_id(label):
745
738
  return True
746
739
 
747
740
  return False
@@ -868,8 +861,7 @@ class OIECharacterisation(BaseOntology):
868
861
  format = "TTL"
869
862
  download_url = "https://github.com/emmo-repo/OIE-Ontologies/"
870
863
 
871
- @staticmethod
872
- def _is_anonymous_id(label: str) -> bool:
864
+ def _is_anonymous_id(self, label: str) -> bool:
873
865
  """Override to handle OIECharacterisation-specific blank nodes."""
874
866
  # UUID pattern for various prefixes
875
867
  uuid_pattern = r'[0-9a-f]{8}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{12}$'
@@ -878,7 +870,7 @@ class OIECharacterisation(BaseOntology):
878
870
  return True
879
871
 
880
872
  # Check the general patterns from the parent class
881
- if BaseOntology._is_anonymous_id(label):
873
+ if super()._is_anonymous_id(label):
882
874
  return True
883
875
 
884
876
  return False
@@ -901,8 +893,7 @@ class OIEManufacturing(BaseOntology):
901
893
  format = "TTL"
902
894
  download_url = "https://github.com/emmo-repo/OIE-Ontologies/"
903
895
 
904
- @staticmethod
905
- def _is_anonymous_id(label: str) -> bool:
896
+ def _is_anonymous_id(self, label: str) -> bool:
906
897
  """Override to handle OIEManufacturing-specific blank nodes."""
907
898
  # UUID pattern for various prefixes
908
899
  uuid_pattern = r'[0-9a-f]{8}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{12}$'
@@ -911,7 +902,7 @@ class OIEManufacturing(BaseOntology):
911
902
  return True
912
903
 
913
904
  # Check the general patterns from the parent class
914
- if BaseOntology._is_anonymous_id(label):
905
+ if super()._is_anonymous_id(label):
915
906
  return True
916
907
 
917
908
  return False
@@ -934,8 +925,7 @@ class OIEMaterials(BaseOntology):
934
925
  format = "TTL"
935
926
  download_url = "https://github.com/emmo-repo/OIE-Ontologies/"
936
927
 
937
- @staticmethod
938
- def _is_anonymous_id(label: str) -> bool:
928
+ def _is_anonymous_id(self, label: str) -> bool:
939
929
  """Override to handle OIEMaterials-specific blank nodes."""
940
930
  # UUID pattern for various prefixes
941
931
  uuid_pattern = r'[0-9a-f]{8}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{12}$'
@@ -944,7 +934,7 @@ class OIEMaterials(BaseOntology):
944
934
  return True
945
935
 
946
936
  # Check the general patterns from the parent class
947
- if BaseOntology._is_anonymous_id(label):
937
+ if super()._is_anonymous_id(label):
948
938
  return True
949
939
 
950
940
  return False
@@ -967,8 +957,7 @@ class OIEModels(BaseOntology):
967
957
  format = "TTL"
968
958
  download_url = "https://github.com/emmo-repo/OIE-Ontologies/"
969
959
 
970
- @staticmethod
971
- def _is_anonymous_id(label: str) -> bool:
960
+ def _is_anonymous_id(self, label: str) -> bool:
972
961
  """Override to handle OIEModels-specific blank nodes."""
973
962
  # UUID pattern for various prefixes
974
963
  uuid_pattern = r'[0-9a-f]{8}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{12}$'
@@ -977,7 +966,7 @@ class OIEModels(BaseOntology):
977
966
  return True
978
967
 
979
968
  # Check the general patterns from the parent class
980
- if BaseOntology._is_anonymous_id(label):
969
+ if super()._is_anonymous_id(label):
981
970
  return True
982
971
 
983
972
  return False
@@ -999,8 +988,7 @@ class OIESoftware(BaseOntology):
999
988
  format = "TTL"
1000
989
  download_url = "https://github.com/emmo-repo/OIE-Ontologies/"
1001
990
 
1002
- @staticmethod
1003
- def _is_anonymous_id(label: str) -> bool:
991
+ def _is_anonymous_id(self, label: str) -> bool:
1004
992
  """Override to handle OIESoftware-specific blank nodes."""
1005
993
  # UUID pattern for various prefixes
1006
994
  uuid_pattern = r'[0-9a-f]{8}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{12}$'
@@ -1009,7 +997,7 @@ class OIESoftware(BaseOntology):
1009
997
  return True
1010
998
 
1011
999
  # Check the general patterns from the parent class
1012
- if BaseOntology._is_anonymous_id(label):
1000
+ if super()._is_anonymous_id(label):
1013
1001
  return True
1014
1002
 
1015
1003
  return False
@@ -1067,8 +1055,7 @@ class Photovoltaics(BaseOntology):
1067
1055
  format = "TTL"
1068
1056
  download_url = "https://github.com/emmo-repo/domain-photovoltaics"
1069
1057
 
1070
- @staticmethod
1071
- def _is_anonymous_id(label: str) -> bool:
1058
+ def _is_anonymous_id(self, label: str) -> bool:
1072
1059
  """Override to handle Photovoltaics-specific blank nodes."""
1073
1060
  # UUID pattern for various prefixes
1074
1061
  uuid_pattern = r'[0-9a-f]{8}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{12}$'
@@ -1077,7 +1064,7 @@ class Photovoltaics(BaseOntology):
1077
1064
  return True
1078
1065
 
1079
1066
  # Check the general patterns from the parent class
1080
- if BaseOntology._is_anonymous_id(label):
1067
+ if super()._is_anonymous_id(label):
1081
1068
  return True
1082
1069
 
1083
1070
  return False
@@ -1219,8 +1206,8 @@ class VIMMP(BaseOntology):
1219
1206
  format = "OWL"
1220
1207
  download_url = "https://matportal.org/ontologies/VIMMP_ONTOLOGIES"
1221
1208
 
1222
- @staticmethod
1223
- def _is_anonymous_id(label: str) -> bool:
1209
+
1210
+ def _is_anonymous_id(self, label: str) -> bool:
1224
1211
  """Override to handle VIMMP-specific blank nodes."""
1225
1212
  # UUID pattern for various prefixes
1226
1213
  uuid_pattern = r'[0-9a-f]{8}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{12}$'
@@ -1232,7 +1219,39 @@ class VIMMP(BaseOntology):
1232
1219
  return True
1233
1220
 
1234
1221
  # Check the general patterns from the parent class
1235
- if BaseOntology._is_anonymous_id(label):
1222
+ if super()._is_anonymous_id(label):
1236
1223
  return True
1237
1224
 
1238
1225
  return False
1226
+
1227
+
1228
+ class MDSOnto(BaseOntology):
1229
+ """
1230
+ MDS-Onto is a domain (low) level ontology that describes terms in Materials Data Science. It is divided into six
1231
+ large modules: BuiltEnv, Exposure, Chemistry, Manufacture, Characterization, and Geospatial. Under each module,
1232
+ there are multiple sub-modules such as FTIR, AFM, Chem-Rxn, PV-Module, Accelerated Exposure, etc.
1233
+ """
1234
+ ontology_id = "MDSOnto"
1235
+ ontology_full_name = "The Modular Ontology for Materials and Data Science (MDS-Onto)"
1236
+ domain = "Materials Science and Engineering"
1237
+ category = "Materials Data Science"
1238
+ version = "0.3.1.16"
1239
+ last_updated = "2026-02-03"
1240
+ creator = "SDLE Research Center"
1241
+ license = "CC BY-SA 4.0"
1242
+ format = "OWL"
1243
+ download_url = "https://cwrusdle.bitbucket.io/files/MDS_Onto/index-en.html"
1244
+
1245
+ def _is_anonymous_id(self, label: str) -> bool:
1246
+ """Override to handle MDSOnto-specific and ontology-generated blank nodes."""
1247
+ # Treat IDs starting with 'ont0' as anonymous (e.g. ont00000562)
1248
+ if re.match(r'^ont0', label):
1249
+ return True
1250
+ if super()._is_anonymous_id(label):
1251
+ return True
1252
+ return False
1253
+
1254
+
1255
+ # def contains_imports(self) -> bool:
1256
+ # """Hook: Check if the ontology contains imports."""
1257
+ # return True # Set to True if your ontology imports other ontologies
@@ -33,14 +33,13 @@ class BTO(BaseOntology):
33
33
  format = "OWL"
34
34
  download_url = "https://terminology.tib.eu/ts/ontologies/BTO"
35
35
 
36
- @staticmethod
37
- def _is_anonymous_id(label: str) -> bool:
36
+ def _is_anonymous_id(self, label: str) -> bool:
38
37
  """Override to handle BTO-specific blank nodes."""
39
38
  if re.match(r'^BTO_[0-9]+$', label):
40
39
  return True
41
40
 
42
41
  # Check the general patterns from the parent class
43
- if BaseOntology._is_anonymous_id(label):
42
+ if super()._is_anonymous_id(label):
44
43
  return True
45
44
 
46
45
  return False
@@ -181,8 +181,7 @@ class DUO(BaseOntology):
181
181
  format = "OWL"
182
182
  download_url = "https://terminology.tib.eu/ts/ontologies/DUO/"
183
183
 
184
- @staticmethod
185
- def _is_anonymous_id(label: str) -> bool:
184
+ def _is_anonymous_id(self, label: str) -> bool:
186
185
  """Override to handle DUO-specific blank nodes."""
187
186
  if re.match(r'^APOLLO_SV_[0-9]+$', label):
188
187
  return True
@@ -191,7 +190,7 @@ class DUO(BaseOntology):
191
190
  return True
192
191
 
193
192
  # Check the general patterns from the parent class
194
- if BaseOntology._is_anonymous_id(label):
193
+ if super()._is_anonymous_id(label):
195
194
  return True
196
195
 
197
196
  return False
@@ -327,14 +326,13 @@ class Metadata4Ing(BaseOntology):
327
326
  format = "TTL"
328
327
  download_url = "https://git.rwth-aachen.de/nfdi4ing/metadata4ing/metadata4ing"
329
328
 
330
- @staticmethod
331
- def _is_anonymous_id(label: str) -> bool:
329
+ def _is_anonymous_id(self, label: str) -> bool:
332
330
  """Override to handle Metadata4Ing-specific blank nodes."""
333
331
  if re.match(r'^\d{4}-\d{4}-\d{4}-\d{4}$', label):
334
332
  return True
335
333
 
336
334
  # Check the general patterns from the parent class
337
- if BaseOntology._is_anonymous_id(label):
335
+ if super()._is_anonymous_id(label):
338
336
  return True
339
337
 
340
338
  return False
@@ -517,14 +515,13 @@ class SWO(BaseOntology):
517
515
  format = "OWL"
518
516
  download_url = "https://terminology.tib.eu/ts/ontologies/SWO"
519
517
 
520
- @staticmethod
521
- def _is_anonymous_id(label: str) -> bool:
518
+ def _is_anonymous_id(self, label: str) -> bool:
522
519
  """Override to handle SWO-specific blank nodes."""
523
520
  if re.match(r'^SWO_[0-9]+$', label):
524
521
  return True
525
522
 
526
523
  # Check the general patterns from the parent class
527
- if BaseOntology._is_anonymous_id(label):
524
+ if super()._is_anonymous_id(label):
528
525
  return True
529
526
 
530
527
  return False
ontolearner/processor.py CHANGED
@@ -850,13 +850,13 @@ If you find our work helpful, feel free to give us a cite.
850
850
 
851
851
  # Commit and push ontology repository
852
852
  repo.git_add(auto_lfs_track=True)
853
- commit_message = f":sparkles: Added {self.ontology_id} ontology!"
853
+ commit_message = f" Added {self.processed_ontology['ontology_id']} ontology!"
854
854
  repo.git_commit(commit_message)
855
855
  repo.git_push()
856
856
 
857
857
  # Commit and push metrics repository
858
858
  metrics_repo.git_add(auto_lfs_track=True)
859
- metrics_commit_message = f":memo: Update metrics for {self.ontology_id}"
859
+ metrics_commit_message = f"📝 Update metrics for {self.processed_ontology['ontology_id']}"
860
860
  metrics_repo.git_commit(metrics_commit_message)
861
861
  metrics_repo.git_push()
862
862
 
@@ -864,7 +864,7 @@ If you find our work helpful, feel free to give us a cite.
864
864
  "status": "success",
865
865
  "repository": repo_id,
866
866
  "metrics_repository": metrics_repo_id,
867
- "ontology_id": self.ontology_id,
867
+ "ontology_id": self.processed_ontology['ontology_id'],
868
868
  "url": f"https://huggingface.co/datasets/{repo_id}",
869
869
  "metrics_url": f"https://huggingface.co/spaces/{metrics_repo_id}"
870
870
  }
@@ -200,10 +200,73 @@ class SyntheticDataSplitter:
200
200
 
201
201
  return terms_splits, types_splits, docs_split, types2docs_splits
202
202
 
203
- def split(self, train: float = 0.8, val: float = 0.1, test: float = 0.1):
204
- split_targets, split_docs_targets = self.set_train_val_test_sizes(train_percentage=train,
205
- val_percentage=val,
206
- test_percentage=test)
203
+ def split_fine_grained(self, doc_ids):
204
+ """
205
+ Build a single split bundle containing only:
206
+ - documents
207
+ - terms
208
+ - types
209
+ - terms2docs
210
+ - terms2types
211
+ """
212
+ # normalize to string ids (constructor uses str(row.id))
213
+ doc_ids = {str(d) for d in (doc_ids or [])}
214
+
215
+ # docs + collect terms/types from docs
216
+ docs = []
217
+ terms_set = set()
218
+ types_set = set()
219
+
220
+ for doc_id in doc_ids:
221
+ doc = self.doc_id_to_doc[doc_id]
222
+ docs.append({"id": str(doc.id), "title": doc.title, "text": doc.text})
223
+
224
+ terms_set.update(self.doc_id_to_terms[doc_id])
225
+ types_set.update(self.doc_id_to_types[doc_id])
226
+
227
+ terms = sorted(terms_set)
228
+ types = sorted(types_set)
229
+
230
+ # terms2docs: use the constructor-built mapping and restrict to this split's doc_ids
231
+ terms2docs = {
232
+ term: sorted(list(self.term_to_doc_id.get(term, set()) & doc_ids))
233
+ for term in terms
234
+ }
235
+
236
+ # terms2types: ontology lookup (term -> parent types)
237
+ terms2types = {term: self.child_to_parent.get(term, []) for term in terms}
238
+
239
+ return {
240
+ "documents": docs,
241
+ "terms": terms,
242
+ "types": types,
243
+ "terms2docs": terms2docs,
244
+ "terms2types": terms2types,
245
+ }
246
+
247
+ def train_test_val_split(self, train: float = 0.8, val: float = 0.1, test: float = 0.1):
248
+ """
249
+ Returns:
250
+ train_split, val_split, test_split
251
+
252
+ Each split is a dict with keys:
253
+ - "docs"
254
+ - "terms"
255
+ - "types"
256
+ - "terms2docs"
257
+ - "terms2types"
258
+ """
259
+ # compute which docs go to which split
260
+ split_targets, split_docs_targets = self.set_train_val_test_sizes(
261
+ train_percentage=train,
262
+ val_percentage=val,
263
+ test_percentage=test,
264
+ )
207
265
  split_docs = self.create_train_val_test_splits(split_targets, split_docs_targets)
208
- terms, types, docs, types2docs = self.generate_split_artefacts(split_docs)
209
- return terms, types, docs, types2docs
266
+ # split_docs: {"train": set(doc_ids), "val": set(doc_ids), "test": set(doc_ids)}
267
+
268
+ train_split = self.split_fine_grained(split_docs.get("train", set()))
269
+ val_split = self.split_fine_grained(split_docs.get("val", set()))
270
+ test_split = self.split_fine_grained(split_docs.get("test", set()))
271
+
272
+ return train_split, val_split, test_split