rdf-construct 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdf_construct/__init__.py +1 -1
- rdf_construct/cli.py +129 -2
- rdf_construct/describe/__init__.py +93 -0
- rdf_construct/describe/analyzer.py +176 -0
- rdf_construct/describe/documentation.py +146 -0
- rdf_construct/describe/formatters/__init__.py +47 -0
- rdf_construct/describe/formatters/json.py +65 -0
- rdf_construct/describe/formatters/markdown.py +275 -0
- rdf_construct/describe/formatters/text.py +315 -0
- rdf_construct/describe/hierarchy.py +232 -0
- rdf_construct/describe/imports.py +213 -0
- rdf_construct/describe/metadata.py +187 -0
- rdf_construct/describe/metrics.py +145 -0
- rdf_construct/describe/models.py +552 -0
- rdf_construct/describe/namespaces.py +180 -0
- rdf_construct/describe/profiles.py +415 -0
- {rdf_construct-0.3.0.dist-info → rdf_construct-0.4.1.dist-info}/METADATA +28 -6
- {rdf_construct-0.3.0.dist-info → rdf_construct-0.4.1.dist-info}/RECORD +21 -7
- {rdf_construct-0.3.0.dist-info → rdf_construct-0.4.1.dist-info}/WHEEL +0 -0
- {rdf_construct-0.3.0.dist-info → rdf_construct-0.4.1.dist-info}/entry_points.txt +0 -0
- {rdf_construct-0.3.0.dist-info → rdf_construct-0.4.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""Class hierarchy analysis for ontology description.
|
|
2
|
+
|
|
3
|
+
Analyses class hierarchy structure including roots, depth, orphans, and cycles.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
|
|
8
|
+
from rdflib import Graph, URIRef, RDF, RDFS
|
|
9
|
+
from rdflib.namespace import OWL
|
|
10
|
+
|
|
11
|
+
from rdf_construct.describe.models import HierarchyAnalysis
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Classes that should not count as "real" superclasses for root detection.
# A class whose only rdfs:subClassOf targets fall in this set is treated as a
# hierarchy root by analyse_hierarchy and _calculate_max_depth.
TOP_CLASSES = {
    OWL.Thing,
    RDFS.Resource,
    # Some ontologies use owl:Class as a type marker
    OWL.Class,
    RDFS.Class,
}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def analyse_hierarchy(graph: Graph, max_roots_display: int = 10) -> HierarchyAnalysis:
    """Analyse the class hierarchy structure.

    Computes hierarchy roots, orphans, maximum depth, and cycle membership
    over the rdfs:subClassOf relations between named classes.

    Args:
        graph: RDF graph to analyse.
        max_roots_display: Maximum number of root classes to list.

    Returns:
        HierarchyAnalysis with hierarchy metrics.
    """
    all_classes = _get_all_classes(graph)
    if not all_classes:
        return HierarchyAnalysis()

    # Parent/child maps restricted to named classes declared in this graph.
    parents: dict[URIRef, set[URIRef]] = defaultdict(set)
    children: dict[URIRef, set[URIRef]] = defaultdict(set)
    for cls in all_classes:
        for parent in graph.objects(cls, RDFS.subClassOf):
            if isinstance(parent, URIRef) and parent in all_classes:
                parents[cls].add(parent)
                children[parent].add(cls)

    # Roots: classes whose only superclasses (if any) are the top classes.
    root_classes = sorted(
        _curie(graph, cls)
        for cls in all_classes
        if not (parents[cls] - TOP_CLASSES)
    )

    # Truncate the displayed list, noting how many were cut off.
    display_roots = root_classes[:max_roots_display]
    hidden = len(root_classes) - max_roots_display
    if hidden > 0:
        display_roots.append(f"...and {hidden} more")

    # Orphans: neither a real parent nor any child.
    orphan_classes = sorted(
        _curie(graph, cls)
        for cls in all_classes
        if not (parents[cls] - TOP_CLASSES) and not children[cls]
    )

    has_cycles, cycle_members = _detect_cycles(all_classes, parents)

    return HierarchyAnalysis(
        root_classes=display_roots,
        max_depth=_calculate_max_depth(all_classes, parents),
        orphan_classes=orphan_classes,
        has_cycles=has_cycles,
        cycle_members=[_curie(graph, member) for member in cycle_members],
    )
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _get_all_classes(graph: Graph) -> set[URIRef]:
    """Get all classes from the graph.

    Considers both owl:Class and rdfs:Class declarations; blank-node
    classes are skipped.

    Args:
        graph: RDF graph to query.

    Returns:
        Set of class URIRefs.
    """
    return {
        subject
        for class_type in (OWL.Class, RDFS.Class)
        for subject in graph.subjects(RDF.type, class_type)
        if isinstance(subject, URIRef)
    }
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _calculate_max_depth(
    classes: set[URIRef],
    parents: dict[URIRef, set[URIRef]],
) -> int:
    """Calculate the maximum depth of the class hierarchy.

    Breadth-first traversal from the root classes; each class gets the depth
    of the level at which it is first reached.  Classes reachable only through
    a cycle never enter the traversal and are ignored, and a hard level limit
    guards against pathological graphs.

    Args:
        classes: Set of all classes.
        parents: Parent relationships (child -> set of parents).

    Returns:
        Maximum hierarchy depth (0 if no hierarchy).
    """
    if not classes:
        return 0

    # Invert the parent map once so each BFS level costs O(edges) rather than
    # rescanning every class per level (the previous approach was
    # O(levels * classes)).
    children: dict[URIRef, set[URIRef]] = defaultdict(set)
    for child, supers in parents.items():
        for parent in supers:
            children[parent].add(child)

    depths: dict[URIRef, int] = {}

    # Roots: classes with no superclass other than the well-known top classes.
    current_level = {
        cls for cls in classes
        if not (parents.get(cls, set()) - TOP_CLASSES)
    }
    depth = 0

    while current_level:
        for cls in current_level:
            depths.setdefault(cls, depth)

        # Advance to the not-yet-visited children of this level.
        current_level = {
            child
            for cls in current_level
            for child in children[cls]
            if child not in depths
        }
        depth += 1

        # Safety limit to prevent runaway traversal on malformed hierarchies.
        if depth > 1000:
            break

    return max(depths.values()) if depths else 0
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _detect_cycles(
|
|
166
|
+
classes: set[URIRef],
|
|
167
|
+
parents: dict[URIRef, set[URIRef]],
|
|
168
|
+
) -> tuple[bool, list[URIRef]]:
|
|
169
|
+
"""Detect cycles in the class hierarchy.
|
|
170
|
+
|
|
171
|
+
Uses DFS-based cycle detection.
|
|
172
|
+
|
|
173
|
+
Args:
|
|
174
|
+
classes: Set of all classes.
|
|
175
|
+
parents: Parent relationships.
|
|
176
|
+
|
|
177
|
+
Returns:
|
|
178
|
+
Tuple of (has_cycles, list of cycle member URIs).
|
|
179
|
+
"""
|
|
180
|
+
# Build reverse mapping for traversal
|
|
181
|
+
# Note: We traverse "up" via parents to detect cycles
|
|
182
|
+
|
|
183
|
+
visited: set[URIRef] = set()
|
|
184
|
+
rec_stack: set[URIRef] = set()
|
|
185
|
+
cycle_members: set[URIRef] = set()
|
|
186
|
+
|
|
187
|
+
def dfs(cls: URIRef) -> bool:
|
|
188
|
+
visited.add(cls)
|
|
189
|
+
rec_stack.add(cls)
|
|
190
|
+
|
|
191
|
+
for parent in parents[cls]:
|
|
192
|
+
if parent not in visited:
|
|
193
|
+
if dfs(parent):
|
|
194
|
+
cycle_members.add(cls)
|
|
195
|
+
return True
|
|
196
|
+
elif parent in rec_stack:
|
|
197
|
+
# Found a cycle
|
|
198
|
+
cycle_members.add(cls)
|
|
199
|
+
cycle_members.add(parent)
|
|
200
|
+
return True
|
|
201
|
+
|
|
202
|
+
rec_stack.remove(cls)
|
|
203
|
+
return False
|
|
204
|
+
|
|
205
|
+
for cls in classes:
|
|
206
|
+
if cls not in visited:
|
|
207
|
+
if dfs(cls):
|
|
208
|
+
# Continue to find all cycle members
|
|
209
|
+
pass
|
|
210
|
+
|
|
211
|
+
return bool(cycle_members), list(cycle_members)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _curie(graph: Graph, uri: URIRef) -> str:
    """Convert URI to CURIE or short form for display.

    Args:
        graph: Graph with namespace bindings.
        uri: URI to convert.

    Returns:
        CURIE or shortened URI string.
    """
    try:
        return graph.namespace_manager.normalizeUri(uri)
    except Exception:
        # No usable prefix binding: fall back to the URI's local name.
        text = str(uri)
        for separator in ("#", "/"):
            if separator in text:
                return text.rsplit(separator, 1)[-1]
        return text
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
"""Import analysis for ontology description.
|
|
2
|
+
|
|
3
|
+
Handles owl:imports declarations and optional resolvability checking.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import urllib.error
import urllib.request
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeout
from typing import Optional

from rdflib import Graph, RDF, URIRef
from rdflib.namespace import OWL

from rdf_construct.describe.models import ImportAnalysis, ImportInfo, ImportStatus
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Timeout for HTTP requests (seconds); default for analyse_imports' `timeout`
REQUEST_TIMEOUT = 10

# Maximum concurrent resolution checks run by the thread pool
MAX_WORKERS = 5
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def analyse_imports(
    graph: Graph,
    resolve: bool = True,
    timeout: int = REQUEST_TIMEOUT,
) -> ImportAnalysis:
    """Analyse owl:imports declarations in the ontology.

    Args:
        graph: RDF graph to analyse.
        resolve: Whether to check resolvability of imports.
        timeout: Timeout for resolution checks in seconds.

    Returns:
        ImportAnalysis with import information.
    """
    # Collect owl:imports targets from each declared ontology header.
    # Match the header on rdf:type explicitly — a `None` predicate would also
    # match unrelated triples whose object happens to be owl:Ontology (the
    # metadata extractor already matches this way).  De-duplicate while
    # preserving declaration order so each URI is checked once.
    import_uris: list[URIRef] = []
    seen: set[URIRef] = set()
    for ontology in graph.subjects(RDF.type, OWL.Ontology):
        for import_uri in graph.objects(ontology, OWL.imports):
            if isinstance(import_uri, URIRef) and import_uri not in seen:
                seen.add(import_uri)
                import_uris.append(import_uri)

    if not import_uris:
        return ImportAnalysis(imports=[], resolve_attempted=False)

    imports: list[ImportInfo] = []

    if resolve:
        # Resolve imports in parallel; each future gets its own slightly
        # padded timeout so one hung request cannot stall the analysis.
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {
                executor.submit(_check_resolvable, str(uri), timeout): uri
                for uri in import_uris
            }

            for future, uri in futures.items():
                try:
                    status, error = future.result(timeout=timeout + 5)
                except FuturesTimeout:
                    status, error = ImportStatus.UNRESOLVABLE, "Resolution timed out"
                except Exception as e:
                    status, error = ImportStatus.UNRESOLVABLE, str(e)
                imports.append(ImportInfo(
                    uri=str(uri),
                    status=status,
                    error=error,
                ))
    else:
        # Just list imports without resolution
        imports = [
            ImportInfo(uri=str(uri), status=ImportStatus.UNCHECKED)
            for uri in import_uris
        ]

    return ImportAnalysis(
        imports=imports,
        resolve_attempted=resolve,
    )
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _check_resolvable(uri: str, timeout: int) -> tuple[ImportStatus, Optional[str]]:
    """Check if a URI is resolvable via HTTP HEAD request.

    Falls back to a GET request (via _try_get_request) when the server
    rejects HEAD with 405 Method Not Allowed.

    Args:
        uri: URI to check.
        timeout: Request timeout in seconds.

    Returns:
        Tuple of (status, error_message).
    """
    request_headers = {
        "Accept": "application/rdf+xml, text/turtle, application/ld+json, */*",
        "User-Agent": "rdf-construct/describe",
    }

    try:
        # HEAD probes availability without downloading the ontology body.
        head_request = urllib.request.Request(
            uri,
            method="HEAD",
            headers=request_headers,
        )
        with urllib.request.urlopen(head_request, timeout=timeout) as response:
            if response.status == 200:
                return ImportStatus.RESOLVABLE, None
            return ImportStatus.UNRESOLVABLE, f"HTTP {response.status}"

    except urllib.error.HTTPError as e:
        if e.code == 405:  # Method Not Allowed — some servers only accept GET
            return _try_get_request(uri, timeout)
        return ImportStatus.UNRESOLVABLE, f"HTTP {e.code}: {e.reason}"

    except urllib.error.URLError as e:
        return ImportStatus.UNRESOLVABLE, f"Network error: {e.reason}"

    except TimeoutError:
        return ImportStatus.UNRESOLVABLE, "Request timed out"

    except Exception as e:
        return ImportStatus.UNRESOLVABLE, str(e)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _try_get_request(uri: str, timeout: int) -> tuple[ImportStatus, Optional[str]]:
    """Fall back to GET request if HEAD is not allowed.

    Args:
        uri: URI to check.
        timeout: Request timeout in seconds.

    Returns:
        Tuple of (status, error_message).
    """
    request_headers = {
        "Accept": "application/rdf+xml, text/turtle, application/ld+json, */*",
        "User-Agent": "rdf-construct/describe",
    }

    try:
        get_request = urllib.request.Request(uri, headers=request_headers)
        with urllib.request.urlopen(get_request, timeout=timeout) as response:
            # Read only a small prefix — enough to confirm the server is
            # actually serving content, without downloading everything.
            response.read(1024)
            return ImportStatus.RESOLVABLE, None

    except urllib.error.HTTPError as e:
        return ImportStatus.UNRESOLVABLE, f"HTTP {e.code}: {e.reason}"

    except urllib.error.URLError as e:
        return ImportStatus.UNRESOLVABLE, f"Network error: {e.reason}"

    except Exception as e:
        return ImportStatus.UNRESOLVABLE, str(e)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def get_imported_namespaces(graph: Graph) -> set[str]:
    """Get the namespace URIs of imported ontologies.

    Used for namespace categorisation.

    Args:
        graph: RDF graph to analyse.

    Returns:
        Set of namespace URI strings.
    """
    # Match ontology headers on rdf:type explicitly — a `None` predicate
    # would also pick up subjects of unrelated triples whose object happens
    # to be owl:Ontology.  The import URI is typically the namespace (or
    # close to it); _extract_namespace normalises the trailing separator.
    return {
        _extract_namespace(str(import_uri))
        for ontology in graph.subjects(RDF.type, OWL.Ontology)
        for import_uri in graph.objects(ontology, OWL.imports)
        if isinstance(import_uri, URIRef)
    }
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _extract_namespace(uri: str) -> str:
|
|
196
|
+
"""Extract namespace from an ontology URI.
|
|
197
|
+
|
|
198
|
+
Args:
|
|
199
|
+
uri: Ontology URI.
|
|
200
|
+
|
|
201
|
+
Returns:
|
|
202
|
+
Namespace string.
|
|
203
|
+
"""
|
|
204
|
+
# Common patterns:
|
|
205
|
+
# - http://example.org/ontology# -> http://example.org/ontology#
|
|
206
|
+
# - http://example.org/ontology/ -> http://example.org/ontology/
|
|
207
|
+
# - http://example.org/ontology -> http://example.org/ontology#
|
|
208
|
+
|
|
209
|
+
if uri.endswith("#") or uri.endswith("/"):
|
|
210
|
+
return uri
|
|
211
|
+
|
|
212
|
+
# Add hash if no separator at end
|
|
213
|
+
return uri + "#"
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
"""Metadata extraction for ontology description.
|
|
2
|
+
|
|
3
|
+
Extracts ontology-level metadata like IRI, title, description, license, creators.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from rdflib import Graph, URIRef, Literal, RDF, RDFS
|
|
7
|
+
from rdflib.namespace import OWL
|
|
8
|
+
|
|
9
|
+
from rdf_construct.describe.models import OntologyMetadata
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Dublin Core namespaces (held as plain URIRefs; the individual terms used
# by extract_metadata are built below by string concatenation)
DC = URIRef("http://purl.org/dc/elements/1.1/")
DCTERMS = URIRef("http://purl.org/dc/terms/")

# Common metadata predicates
DC_TITLE = URIRef(str(DC) + "title")
DC_DESCRIPTION = URIRef(str(DC) + "description")
DC_CREATOR = URIRef(str(DC) + "creator")
DC_RIGHTS = URIRef(str(DC) + "rights")

DCTERMS_TITLE = URIRef(str(DCTERMS) + "title")
DCTERMS_DESCRIPTION = URIRef(str(DCTERMS) + "description")
DCTERMS_CREATOR = URIRef(str(DCTERMS) + "creator")
DCTERMS_LICENSE = URIRef(str(DCTERMS) + "license")
DCTERMS_RIGHTS = URIRef(str(DCTERMS) + "rights")

# Creative Commons namespace
CC = URIRef("http://creativecommons.org/ns#")
CC_LICENSE = URIRef(str(CC) + "license")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def extract_metadata(graph: Graph) -> OntologyMetadata:
    """Extract ontology-level metadata.

    Looks for an owl:Ontology declaration and extracts common metadata
    properties like title, description, license, and creators, trying the
    usual predicates (RDFS, Dublin Core, Creative Commons) in priority order.

    Args:
        graph: RDF graph to analyse.

    Returns:
        OntologyMetadata with extracted values (fields are left at their
        defaults when no ontology header or no matching property is found).
    """
    metadata = OntologyMetadata()

    # Find ontology subject(s)
    ontology_subjects = list(graph.subjects(RDF.type, OWL.Ontology))

    if not ontology_subjects:
        return metadata

    # Use first ontology subject (typically there's only one)
    ontology = ontology_subjects[0]

    # Ontology IRI (skipped for blank-node ontology headers)
    if isinstance(ontology, URIRef):
        metadata.ontology_iri = str(ontology)

    # Version IRI
    version_iri = _get_single_value(graph, ontology, OWL.versionIRI)
    if version_iri:
        metadata.version_iri = str(version_iri)

    # Version info
    version_info = _get_single_literal(graph, ontology, OWL.versionInfo)
    if version_info:
        metadata.version_info = version_info

    # Title: prefer rdfs:label, then dcterms:title, then dc:title
    title = (
        _get_single_literal(graph, ontology, RDFS.label)
        or _get_single_literal(graph, ontology, DCTERMS_TITLE)
        or _get_single_literal(graph, ontology, DC_TITLE)
    )
    if title:
        metadata.title = title

    # Description: prefer rdfs:comment, then dcterms/dc description
    description = (
        _get_single_literal(graph, ontology, RDFS.comment)
        or _get_single_literal(graph, ontology, DCTERMS_DESCRIPTION)
        or _get_single_literal(graph, ontology, DC_DESCRIPTION)
    )
    if description:
        metadata.description = description

    # License: structured dcterms:license / cc:license first
    license_uri = (
        _get_single_value(graph, ontology, DCTERMS_LICENSE)
        or _get_single_value(graph, ontology, CC_LICENSE)
    )
    if license_uri:
        metadata.license_uri = str(license_uri)
        # Try to get a human-readable label for the license
        if isinstance(license_uri, URIRef):
            license_label = _get_single_literal(graph, license_uri, RDFS.label)
            if license_label:
                metadata.license_label = license_label

    # If no structured license, fall back to a plain rights statement
    if not metadata.license_uri:
        rights = (
            _get_single_literal(graph, ontology, DCTERMS_RIGHTS)
            or _get_single_literal(graph, ontology, DC_RIGHTS)
        )
        if rights:
            metadata.license_label = rights

    # Creators: literals as-is; URIs via rdfs:label or their local name
    creators: list[str] = []
    for pred in (DCTERMS_CREATOR, DC_CREATOR):
        for creator in graph.objects(ontology, pred):
            if isinstance(creator, Literal):
                name = str(creator)
            elif isinstance(creator, URIRef):
                name = (
                    _get_single_literal(graph, creator, RDFS.label)
                    or _local_name(str(creator))
                )
            else:
                # Blank-node creators carry no usable display value
                continue
            # The same creator is often declared under both dc: and dcterms:;
            # keep a single copy while preserving declaration order
            if name not in creators:
                creators.append(name)

    if creators:
        metadata.creators = creators

    return metadata
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _get_single_value(graph: Graph, subject: URIRef, predicate: URIRef):
    """Get a single value for a predicate (URI or literal).

    Args:
        graph: RDF graph to query.
        subject: Subject to query.
        predicate: Predicate to look for.

    Returns:
        First value found or None.
    """
    return next(iter(graph.objects(subject, predicate)), None)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _get_single_literal(graph: Graph, subject: URIRef, predicate: URIRef) -> str | None:
    """Get a single literal value for a predicate.

    Prefers English language literals if multiple exist.

    Args:
        graph: RDF graph to query.
        subject: Subject to query.
        predicate: Predicate to look for.

    Returns:
        Literal string value or None.
    """
    english = None
    fallback = None

    for obj in graph.objects(subject, predicate):
        if not isinstance(obj, Literal):
            continue
        if obj.language == "en":
            # Later English literals overwrite earlier ones
            english = str(obj)
        elif fallback is None:
            # Remember the first non-English literal as a fallback
            fallback = str(obj)

    return english or fallback
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _local_name(uri: str) -> str:
|
|
175
|
+
"""Extract local name from a URI.
|
|
176
|
+
|
|
177
|
+
Args:
|
|
178
|
+
uri: Full URI string.
|
|
179
|
+
|
|
180
|
+
Returns:
|
|
181
|
+
Local name portion.
|
|
182
|
+
"""
|
|
183
|
+
if "#" in uri:
|
|
184
|
+
return uri.split("#")[-1]
|
|
185
|
+
elif "/" in uri:
|
|
186
|
+
return uri.split("/")[-1]
|
|
187
|
+
return uri
|