rdf-construct 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110):
  1. rdf_construct/__init__.py +12 -0
  2. rdf_construct/__main__.py +0 -0
  3. rdf_construct/cli.py +3429 -0
  4. rdf_construct/core/__init__.py +33 -0
  5. rdf_construct/core/config.py +116 -0
  6. rdf_construct/core/ordering.py +219 -0
  7. rdf_construct/core/predicate_order.py +212 -0
  8. rdf_construct/core/profile.py +157 -0
  9. rdf_construct/core/selector.py +64 -0
  10. rdf_construct/core/serialiser.py +232 -0
  11. rdf_construct/core/utils.py +89 -0
  12. rdf_construct/cq/__init__.py +77 -0
  13. rdf_construct/cq/expectations.py +365 -0
  14. rdf_construct/cq/formatters/__init__.py +45 -0
  15. rdf_construct/cq/formatters/json.py +104 -0
  16. rdf_construct/cq/formatters/junit.py +104 -0
  17. rdf_construct/cq/formatters/text.py +146 -0
  18. rdf_construct/cq/loader.py +300 -0
  19. rdf_construct/cq/runner.py +321 -0
  20. rdf_construct/diff/__init__.py +59 -0
  21. rdf_construct/diff/change_types.py +214 -0
  22. rdf_construct/diff/comparator.py +338 -0
  23. rdf_construct/diff/filters.py +133 -0
  24. rdf_construct/diff/formatters/__init__.py +71 -0
  25. rdf_construct/diff/formatters/json.py +192 -0
  26. rdf_construct/diff/formatters/markdown.py +210 -0
  27. rdf_construct/diff/formatters/text.py +195 -0
  28. rdf_construct/docs/__init__.py +60 -0
  29. rdf_construct/docs/config.py +238 -0
  30. rdf_construct/docs/extractors.py +603 -0
  31. rdf_construct/docs/generator.py +360 -0
  32. rdf_construct/docs/renderers/__init__.py +7 -0
  33. rdf_construct/docs/renderers/html.py +803 -0
  34. rdf_construct/docs/renderers/json.py +390 -0
  35. rdf_construct/docs/renderers/markdown.py +628 -0
  36. rdf_construct/docs/search.py +278 -0
  37. rdf_construct/docs/templates/html/base.html.jinja +44 -0
  38. rdf_construct/docs/templates/html/class.html.jinja +152 -0
  39. rdf_construct/docs/templates/html/hierarchy.html.jinja +28 -0
  40. rdf_construct/docs/templates/html/index.html.jinja +110 -0
  41. rdf_construct/docs/templates/html/instance.html.jinja +90 -0
  42. rdf_construct/docs/templates/html/namespaces.html.jinja +37 -0
  43. rdf_construct/docs/templates/html/property.html.jinja +124 -0
  44. rdf_construct/docs/templates/html/single_page.html.jinja +169 -0
  45. rdf_construct/lint/__init__.py +75 -0
  46. rdf_construct/lint/config.py +214 -0
  47. rdf_construct/lint/engine.py +396 -0
  48. rdf_construct/lint/formatters.py +327 -0
  49. rdf_construct/lint/rules.py +692 -0
  50. rdf_construct/localise/__init__.py +114 -0
  51. rdf_construct/localise/config.py +508 -0
  52. rdf_construct/localise/extractor.py +427 -0
  53. rdf_construct/localise/formatters/__init__.py +36 -0
  54. rdf_construct/localise/formatters/markdown.py +229 -0
  55. rdf_construct/localise/formatters/text.py +224 -0
  56. rdf_construct/localise/merger.py +346 -0
  57. rdf_construct/localise/reporter.py +356 -0
  58. rdf_construct/main.py +6 -0
  59. rdf_construct/merge/__init__.py +165 -0
  60. rdf_construct/merge/config.py +354 -0
  61. rdf_construct/merge/conflicts.py +281 -0
  62. rdf_construct/merge/formatters.py +426 -0
  63. rdf_construct/merge/merger.py +425 -0
  64. rdf_construct/merge/migrator.py +339 -0
  65. rdf_construct/merge/rules.py +377 -0
  66. rdf_construct/merge/splitter.py +1102 -0
  67. rdf_construct/puml2rdf/__init__.py +103 -0
  68. rdf_construct/puml2rdf/config.py +230 -0
  69. rdf_construct/puml2rdf/converter.py +420 -0
  70. rdf_construct/puml2rdf/merger.py +200 -0
  71. rdf_construct/puml2rdf/model.py +202 -0
  72. rdf_construct/puml2rdf/parser.py +565 -0
  73. rdf_construct/puml2rdf/validators.py +451 -0
  74. rdf_construct/refactor/__init__.py +72 -0
  75. rdf_construct/refactor/config.py +362 -0
  76. rdf_construct/refactor/deprecator.py +328 -0
  77. rdf_construct/refactor/formatters/__init__.py +8 -0
  78. rdf_construct/refactor/formatters/text.py +311 -0
  79. rdf_construct/refactor/renamer.py +294 -0
  80. rdf_construct/shacl/__init__.py +56 -0
  81. rdf_construct/shacl/config.py +166 -0
  82. rdf_construct/shacl/converters.py +520 -0
  83. rdf_construct/shacl/generator.py +364 -0
  84. rdf_construct/shacl/namespaces.py +93 -0
  85. rdf_construct/stats/__init__.py +29 -0
  86. rdf_construct/stats/collector.py +178 -0
  87. rdf_construct/stats/comparator.py +298 -0
  88. rdf_construct/stats/formatters/__init__.py +83 -0
  89. rdf_construct/stats/formatters/json.py +38 -0
  90. rdf_construct/stats/formatters/markdown.py +153 -0
  91. rdf_construct/stats/formatters/text.py +186 -0
  92. rdf_construct/stats/metrics/__init__.py +26 -0
  93. rdf_construct/stats/metrics/basic.py +147 -0
  94. rdf_construct/stats/metrics/complexity.py +137 -0
  95. rdf_construct/stats/metrics/connectivity.py +130 -0
  96. rdf_construct/stats/metrics/documentation.py +128 -0
  97. rdf_construct/stats/metrics/hierarchy.py +207 -0
  98. rdf_construct/stats/metrics/properties.py +88 -0
  99. rdf_construct/uml/__init__.py +22 -0
  100. rdf_construct/uml/context.py +194 -0
  101. rdf_construct/uml/mapper.py +371 -0
  102. rdf_construct/uml/odm_renderer.py +789 -0
  103. rdf_construct/uml/renderer.py +684 -0
  104. rdf_construct/uml/uml_layout.py +393 -0
  105. rdf_construct/uml/uml_style.py +613 -0
  106. rdf_construct-0.3.0.dist-info/METADATA +496 -0
  107. rdf_construct-0.3.0.dist-info/RECORD +110 -0
  108. rdf_construct-0.3.0.dist-info/WHEEL +4 -0
  109. rdf_construct-0.3.0.dist-info/entry_points.txt +3 -0
  110. rdf_construct-0.3.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1102 @@
1
+ """Core split logic for modularising RDF ontologies.
2
+
3
+ This module provides the OntologySplitter class that:
4
+ - Splits a monolithic ontology into multiple modules
5
+ - Supports namespace-based and explicit entity-based splitting
6
+ - Tracks cross-module dependencies
7
+ - Generates owl:imports declarations
8
+ - Produces a manifest documenting the split
9
+ - Supports data migration by instance type
10
+ """
11
+
12
+ from dataclasses import dataclass, field
13
+ from pathlib import Path
14
+ from typing import Any
15
+
16
+ import yaml
17
+ from rdflib import Graph, URIRef, Namespace
18
+ from rdflib.namespace import RDF, RDFS, OWL
19
+
20
+
21
def select_classes(graph: Graph) -> set[URIRef]:
    """Collect every named class declared in ``graph``.

    A subject counts as a class when it has ``rdf:type owl:Class`` or
    ``rdf:type rdfs:Class``. Subjects that are not ``URIRef`` instances
    (for example blank nodes) are left out.

    Args:
        graph: RDF graph to scan.

    Returns:
        Set of class URIs; empty when the graph declares none.
    """
    return {
        subject
        for class_type in (OWL.Class, RDFS.Class)
        for subject in graph.subjects(RDF.type, class_type)
        if isinstance(subject, URIRef)
    }
38
+
39
+
40
def select_properties(graph: Graph) -> set[URIRef]:
    """Collect every named property declared in ``graph``.

    A subject counts as a property when it is typed as one of
    ``owl:ObjectProperty``, ``owl:DatatypeProperty``,
    ``owl:AnnotationProperty`` or ``rdf:Property``. Subjects that are not
    ``URIRef`` instances are left out.

    Args:
        graph: RDF graph to scan.

    Returns:
        Set of property URIs; empty when the graph declares none.
    """
    recognised_types = (
        OWL.ObjectProperty,
        OWL.DatatypeProperty,
        OWL.AnnotationProperty,
        RDF.Property,
    )
    found: set[URIRef] = set()
    for property_type in recognised_types:
        found.update(
            subject
            for subject in graph.subjects(RDF.type, property_type)
            if isinstance(subject, URIRef)
        )
    return found
61
+
62
+
63
+ @dataclass
64
+ class ModuleDefinition:
65
+ """Definition of a single module to extract.
66
+
67
+ Attributes:
68
+ name: Module identifier (used in manifest).
69
+ output: Output filename.
70
+ description: Human-readable description.
71
+ classes: Explicit list of class URIs to include.
72
+ properties: Explicit list of property URIs to include.
73
+ namespaces: Namespace prefixes to include (for auto-detection).
74
+ include_descendants: Whether to include rdfs:subClassOf/subPropertyOf descendants.
75
+ imports: Explicit owl:imports to add.
76
+ auto_imports: Whether to generate imports from detected dependencies.
77
+ """
78
+
79
+ name: str
80
+ output: str
81
+ description: str | None = None
82
+ classes: list[str] = field(default_factory=list)
83
+ properties: list[str] = field(default_factory=list)
84
+ namespaces: list[str] = field(default_factory=list)
85
+ include_descendants: bool = False
86
+ imports: list[str] = field(default_factory=list)
87
+ auto_imports: bool = True
88
+
89
+ @classmethod
90
+ def from_dict(cls, data: dict[str, Any]) -> "ModuleDefinition":
91
+ """Create from dictionary.
92
+
93
+ Args:
94
+ data: Dictionary with module configuration.
95
+
96
+ Returns:
97
+ ModuleDefinition instance.
98
+ """
99
+ include = data.get("include", {})
100
+ return cls(
101
+ name=data["name"],
102
+ output=data.get("output", f"{data['name']}.ttl"),
103
+ description=data.get("description"),
104
+ classes=include.get("classes", data.get("classes", [])),
105
+ properties=include.get("properties", data.get("properties", [])),
106
+ namespaces=data.get("namespaces", []),
107
+ include_descendants=data.get("include_descendants", False),
108
+ imports=data.get("imports", []),
109
+ auto_imports=data.get("auto_imports", True),
110
+ )
111
+
112
+
113
@dataclass
class UnmatchedStrategy:
    """How to treat entities that no configured module claims.

    Attributes:
        strategy: ``"common"`` to collect leftovers in a shared module, or
            ``"error"`` to make the split fail.
        common_module: Name given to the shared module when the strategy
            is ``"common"``.
        common_output: Filename the shared module is written to.
    """

    # Accepted values: "common" or "error".
    strategy: str = "common"
    common_module: str = "common"
    common_output: str = "common.ttl"
126
+
127
+
128
+ @dataclass
129
+ class SplitDataConfig:
130
+ """Configuration for splitting data files by instance type.
131
+
132
+ Attributes:
133
+ sources: Data files to split.
134
+ output_dir: Directory for split data files.
135
+ prefix: Prefix for output filenames (e.g., "data_").
136
+ """
137
+
138
+ sources: list[Path] = field(default_factory=list)
139
+ output_dir: Path | None = None
140
+ prefix: str = "data_"
141
+
142
+
143
@dataclass
class SplitConfig:
    """Complete configuration for a split operation.

    Attributes:
        source: Path to the source ontology file.
        output_dir: Directory for output module files.
        modules: List of module definitions.
        unmatched: Strategy for unmatched entities.
        split_data: Optional data splitting configuration.
        generate_manifest: Whether to generate manifest.yml.
        dry_run: If True, report what would happen without writing.
    """

    source: Path
    output_dir: Path
    modules: list[ModuleDefinition] = field(default_factory=list)
    unmatched: UnmatchedStrategy = field(default_factory=UnmatchedStrategy)
    split_data: SplitDataConfig | None = None
    generate_manifest: bool = True
    dry_run: bool = False

    @classmethod
    def from_yaml(cls, path: Path) -> "SplitConfig":
        """Load configuration from a YAML file.

        Relative paths inside the file are resolved against the directory
        that contains the file.

        Args:
            path: Path to YAML configuration file.

        Returns:
            SplitConfig instance.

        Raises:
            FileNotFoundError: If config file doesn't exist.
            ValueError: If the file does not contain a YAML mapping
                (for example, it is empty).
        """
        if not path.exists():
            raise FileNotFoundError(f"Config file not found: {path}")

        # Read as UTF-8 explicitly so parsing does not depend on the
        # platform's default locale encoding.
        with open(path, encoding="utf-8") as f:
            data = yaml.safe_load(f)

        # safe_load returns None for an empty document and may return a
        # list or scalar; only a mapping is a usable configuration.
        if not isinstance(data, dict):
            raise ValueError(f"Config file must contain a YAML mapping: {path}")

        return cls.from_dict(data, config_dir=path.parent)

    @classmethod
    def from_dict(cls, data: dict[str, Any], config_dir: Path | None = None) -> "SplitConfig":
        """Create from dictionary.

        Settings are read from the ``split`` sub-mapping when present,
        otherwise from ``data`` itself. Keys that are present but empty
        (``None``) are treated as absent.

        Args:
            data: Dictionary with configuration.
            config_dir: Directory containing config file (for relative paths).
                Defaults to the current directory.

        Returns:
            SplitConfig instance.
        """
        config_dir = config_dir or Path(".")
        section = data.get("split", data)

        def resolve(raw: Any) -> Path:
            # Absolute paths are kept; relative ones hang off config_dir.
            candidate = Path(raw)
            return candidate if candidate.is_absolute() else config_dir / candidate

        source = resolve(section.get("source", ""))
        output_dir = resolve(section.get("output_dir", "modules"))

        modules = [
            ModuleDefinition.from_dict(m)
            for m in (section.get("modules") or [])
        ]

        unmatched_data = section.get("unmatched") or {}
        unmatched = UnmatchedStrategy(
            strategy=unmatched_data.get("strategy", "common"),
            common_module=unmatched_data.get("module", "common"),
            common_output=unmatched_data.get("output", "common.ttl"),
        )

        # Data splitting is optional; the key must be present to enable it.
        split_data_config = None
        if "split_data" in section:
            sd = section["split_data"] or {}
            data_output = sd.get("output_dir")
            split_data_config = SplitDataConfig(
                sources=[resolve(p) for p in (sd.get("sources") or [])],
                output_dir=resolve(data_output) if data_output else None,
                prefix=sd.get("prefix", "data_"),
            )

        return cls(
            source=source,
            output_dir=output_dir,
            modules=modules,
            unmatched=unmatched,
            split_data=split_data_config,
            generate_manifest=section.get("generate_manifest", True),
            dry_run=section.get("dry_run", False),
        )
249
+
250
+
251
@dataclass
class ModuleStats:
    """Summary figures for one output module.

    Attributes:
        name: Module name.
        file: Filename the module is written to.
        classes: Count of classes in the module.
        properties: Count of properties in the module.
        triples: Count of all triples in the module.
        imports: owl:imports targets declared by the module.
        dependencies: Names of the modules this module depends on.
    """

    name: str
    file: str
    classes: int = 0
    properties: int = 0
    triples: int = 0
    imports: list[str] = field(default_factory=list)
    dependencies: list[str] = field(default_factory=list)
272
+
273
+
274
@dataclass
class SplitResult:
    """Outcome of a split operation.

    Attributes:
        modules: Module name -> module graph.
        module_stats: One statistics record per module.
        entity_assignments: Entity URI -> name of the module that owns it.
        unmatched_entities: URIs of entities no module claimed.
        dependencies: Module name -> names of modules it depends on.
        success: False when the split ended with an error.
        error: Error message when ``success`` is False, otherwise None.
        data_modules: Module name -> split data graph (filled only when
            data splitting is enabled).
    """

    modules: dict[str, Graph] = field(default_factory=dict)
    module_stats: list[ModuleStats] = field(default_factory=list)
    entity_assignments: dict[str, str] = field(default_factory=dict)
    unmatched_entities: set[str] = field(default_factory=set)
    dependencies: dict[str, set[str]] = field(default_factory=dict)
    success: bool = True
    error: str | None = None
    data_modules: dict[str, Graph] = field(default_factory=dict)

    @property
    def total_modules(self) -> int:
        """Number of module graphs held in ``modules``."""
        module_count = len(self.modules)
        return module_count

    @property
    def total_triples(self) -> int:
        """Sum of the triple counts of every graph in ``modules``."""
        return sum(map(len, self.modules.values()))
307
+
308
+
309
+ class OntologySplitter:
310
+ """Splits a monolithic ontology into multiple modules.
311
+
312
+ The splitter:
313
+ 1. Loads the source ontology
314
+ 2. Assigns entities to modules based on configuration
315
+ 3. Handles unmatched entities per strategy
316
+ 4. Detects cross-module dependencies
317
+ 5. Generates owl:imports declarations
318
+ 6. Writes module files
319
+ 7. Produces a manifest documenting the split
320
+
321
+ Example:
322
+ config = SplitConfig.from_yaml(Path("split.yml"))
323
+ splitter = OntologySplitter(config)
324
+ result = splitter.split()
325
+
326
+ if result.success:
327
+ splitter.write_modules(result)
328
+ """
329
+
330
+ def __init__(self, config: SplitConfig):
331
+ """Initialize the splitter.
332
+
333
+ Args:
334
+ config: Split configuration.
335
+ """
336
+ self.config = config
337
+ self.source_graph: Graph | None = None
338
+ self.namespace_map: dict[str, str] = {} # namespace -> module name
339
+
340
def split(self) -> SplitResult:
    """Execute the split operation.

    Runs the pipeline in order: load the source, build the namespace map,
    assign entities, build module graphs, handle unmatched entities,
    detect dependencies, add owl:imports, compute statistics and, when
    configured, split data files. The pipeline stops at the first step
    that marks the result as failed.

    Returns:
        SplitResult with module graphs and statistics. On failure,
        ``success`` is False and ``error`` describes the problem; steps
        after the failing one have not run.
    """
    result = SplitResult()

    # Any load problem is reported through the result rather than raised.
    try:
        self.source_graph = self._load_source()
    except Exception as e:
        result.success = False
        result.error = f"Failed to load source: {e}"
        return result

    # Build namespace -> module mapping (for namespace-based splitting)
    self._build_namespace_map()

    assignments = self._assign_entities(result)
    if not result.success:
        return result

    self._create_module_graphs(assignments, result)

    if result.unmatched_entities:
        self._handle_unmatched(result)
        # With the "error" strategy, unmatched entities fail the split.
        # Stop here, as after entity assignment, instead of computing
        # imports and stats or parsing data files for a failed result.
        if not result.success:
            return result

    # Dependencies must be known before imports can be generated.
    self._detect_dependencies(result)
    self._add_imports(result)

    self._calculate_stats(result)

    if self.config.split_data and self.config.split_data.sources:
        self._split_data(result)

    return result
384
+
385
def _load_source(self) -> Graph:
    """Parse the configured source ontology into a new graph.

    The serialisation format is chosen from the file extension
    (case-insensitive); unknown extensions are parsed as Turtle.

    Returns:
        Loaded RDF graph.

    Raises:
        FileNotFoundError: If the source path does not exist.

    Errors raised by the parser itself are not caught here.
    """
    source_path = self.config.source
    if not source_path.exists():
        raise FileNotFoundError(f"Source not found: {source_path}")

    suffix_to_format = {
        ".ttl": "turtle",
        ".turtle": "turtle",
        ".rdf": "xml",
        ".xml": "xml",
        ".owl": "xml",
        ".n3": "n3",
        ".nt": "nt",
        ".jsonld": "json-ld",
    }
    chosen_format = suffix_to_format.get(source_path.suffix.lower(), "turtle")

    loaded = Graph()
    loaded.parse(source_path.as_posix(), format=chosen_format)
    return loaded
417
+
418
+ def _build_namespace_map(self) -> None:
419
+ """Build mapping from namespaces to module names."""
420
+ self.namespace_map = {}
421
+
422
+ for module in self.config.modules:
423
+ for ns in module.namespaces:
424
+ self.namespace_map[ns] = module.name
425
+
426
def _assign_entities(self, result: SplitResult) -> dict[str, set[URIRef]]:
    """Decide which module owns each class and property of the source.

    Modules are processed in configuration order. Within a module,
    explicitly listed classes are handled first, then explicitly listed
    properties, then namespace matches. An explicit listing always
    records the module as owner, replacing any earlier owner in
    ``result.entity_assignments``. Descendants and namespace matches are
    claimed only when the entity has no owner yet. Entities left without
    an owner are added to ``result.unmatched_entities``.

    Args:
        result: SplitResult to populate with assignments. If the source
            graph is not loaded, it is marked as failed instead.

    Returns:
        Dictionary of module name -> set of entity URIs (empty dict when
        the source graph is not loaded).
    """
    graph = self.source_graph
    if graph is None:
        result.success = False
        result.error = "Source graph not loaded"
        return {}

    per_module: dict[str, set[URIRef]] = {
        definition.name: set() for definition in self.config.modules
    }

    class_pool = select_classes(graph)
    property_pool = select_properties(graph)
    known = class_pool | property_pool
    owners = result.entity_assignments

    def take(entity: URIRef, module_name: str) -> None:
        # Unconditional claim: used for explicitly listed entities.
        per_module[module_name].add(entity)
        owners[str(entity)] = module_name

    def take_if_free(entity: URIRef, module_name: str) -> None:
        # Claim only entities nobody owns yet (descendants, namespaces).
        if str(entity) not in owners:
            take(entity, module_name)

    for definition in self.config.modules:
        explicit_groups = (
            (definition.classes, class_pool, False),
            (definition.properties, property_pool, True),
        )
        for listed, pool, is_property in explicit_groups:
            for raw in listed:
                uri = self._expand_curie(raw)
                # Listed names absent from the source are ignored.
                if uri not in known:
                    continue
                take(uri, definition.name)
                if definition.include_descendants:
                    for descendant in self._get_descendants(
                        uri, pool, is_property=is_property
                    ):
                        take_if_free(descendant, definition.name)

        for namespace in definition.namespaces:
            for entity in known:
                if str(entity).startswith(namespace):
                    take_if_free(entity, definition.name)

    result.unmatched_entities.update(
        str(entity) for entity in known if str(entity) not in owners
    )

    return per_module
493
+
494
def _expand_curie(self, curie: str) -> URIRef:
    """Turn a CURIE into a full URI using the source graph's prefixes.

    The input is wrapped unchanged when no source graph is loaded, when
    it starts with ``http://`` or ``https://``, when it contains no
    colon, or when its prefix is not bound in the source graph.

    Args:
        curie: CURIE or full URI string.

    Returns:
        URIRef of the expanded URI.
    """
    graph = self.source_graph
    looks_absolute = curie.startswith(("http://", "https://"))
    if graph is None or looks_absolute or ":" not in curie:
        return URIRef(curie)

    # Split on the first colon only; the local part may contain more.
    wanted_prefix, _, local_part = curie.partition(":")
    for bound_prefix, namespace in graph.namespace_manager.namespaces():
        if bound_prefix == wanted_prefix:
            return URIRef(str(namespace) + local_part)

    return URIRef(curie)
518
+
519
def _get_descendants(
    self,
    uri: URIRef,
    entity_set: set[URIRef],
    is_property: bool = False,
) -> set[URIRef]:
    """Collect the transitive subclasses or subproperties of an entity.

    Walks ``rdfs:subClassOf`` (or ``rdfs:subPropertyOf`` when
    ``is_property`` is True) edges that point at ``uri``, then at each
    descendant found. Only ``URIRef`` subjects that are members of
    ``entity_set`` are collected and followed. Each descendant is
    visited once, so cyclic hierarchies terminate.

    Args:
        uri: Parent entity URI.
        entity_set: Set of all entities to consider.
        is_property: Whether to look for subPropertyOf instead of subClassOf.

    Returns:
        Set of descendant URIs; empty when no source graph is loaded.
    """
    graph = self.source_graph
    if graph is None:
        return set()

    link = RDFS.subPropertyOf if is_property else RDFS.subClassOf
    found: set[URIRef] = set()
    pending = [uri]

    while pending:
        current = pending.pop()
        for child in graph.subjects(link, current):
            if not isinstance(child, URIRef) or child not in entity_set:
                continue
            if child in found:
                continue
            found.add(child)
            pending.append(child)

    return found
551
+
552
def _create_module_graphs(
    self,
    assignments: dict[str, set[URIRef]],
    result: SplitResult,
) -> None:
    """Build one graph per configured module and store it on the result.

    Each module graph receives the source graph's namespace bindings and
    every source triple whose subject is an entity assigned to that
    module. Triples are selected by subject only; nodes that appear
    solely as objects are not followed. Does nothing when no source
    graph is loaded.

    Args:
        assignments: Entity assignments per module.
        result: SplitResult to populate with graphs.
    """
    source = self.source_graph
    if source is None:
        return

    bindings = list(source.namespace_manager.namespaces())

    for definition in self.config.modules:
        fragment = Graph()
        for prefix, namespace in bindings:
            fragment.bind(prefix, namespace)

        for entity in assignments.get(definition.name, set()):
            for triple in source.triples((entity, None, None)):
                fragment.add(triple)

        result.modules[definition.name] = fragment
581
+
582
+ def _handle_unmatched(self, result: SplitResult) -> None:
583
+ """Handle entities that weren't assigned to any module.
584
+
585
+ Args:
586
+ result: SplitResult with unmatched entities.
587
+ """
588
+ if not result.unmatched_entities:
589
+ return
590
+
591
+ if self.config.unmatched.strategy == "error":
592
+ result.success = False
593
+ result.error = (
594
+ f"Unmatched entities ({len(result.unmatched_entities)}): "
595
+ + ", ".join(list(result.unmatched_entities)[:5])
596
+ + ("..." if len(result.unmatched_entities) > 5 else "")
597
+ )
598
+ return
599
+
600
+ # Create common module
601
+ common_graph = Graph()
602
+
603
+ if self.source_graph is not None:
604
+ # Copy namespace bindings
605
+ for prefix, ns in self.source_graph.namespace_manager.namespaces():
606
+ common_graph.bind(prefix, ns)
607
+
608
+ # Add triples for unmatched entities
609
+ for entity_str in result.unmatched_entities:
610
+ entity = URIRef(entity_str)
611
+ for s, p, o in self.source_graph.triples((entity, None, None)):
612
+ common_graph.add((s, p, o))
613
+
614
+ # Record assignment
615
+ result.entity_assignments[entity_str] = self.config.unmatched.common_module
616
+
617
+ result.modules[self.config.unmatched.common_module] = common_graph
618
+
619
def _detect_dependencies(self, result: SplitResult) -> None:
    """Work out which modules each module refers to.

    A module depends on another module when one of its triples has a
    ``URIRef`` object that ``result.entity_assignments`` places in that
    other module. Only triple objects are inspected. Every module in
    ``result.modules`` gets an entry, possibly an empty set. Does
    nothing when no source graph is loaded.

    Args:
        result: SplitResult to populate with dependencies.
    """
    if self.source_graph is None:
        return

    owners = result.entity_assignments

    for module_name, graph in result.modules.items():
        referenced: set[str] = set()
        for _subject, _predicate, obj in graph:
            if not isinstance(obj, URIRef):
                continue
            key = str(obj)
            if key in owners and owners[key] != module_name:
                referenced.add(owners[key])
        result.dependencies[module_name] = referenced
643
+
644
def _add_imports(self, result: SplitResult) -> None:
    """Attach owl:imports triples to each configured module's graph.

    For every configured module that has a graph, the ontology node is
    looked up or created, then one ``owl:imports`` triple is added per
    explicit import. When ``auto_imports`` is on, one more is added per
    detected dependency, using the output filename of the module
    depended on as the import target. Dependencies with no known output
    filename are skipped.

    Args:
        result: SplitResult with module graphs.
    """
    for definition in self.config.modules:
        if definition.name not in result.modules:
            continue

        graph = result.modules[definition.name]
        ontology_node = self._get_or_create_ontology_uri(graph, definition)

        # Explicit imports first, then those derived from dependencies.
        targets = list(definition.imports)
        if definition.auto_imports:
            for dependency in result.dependencies.get(definition.name, set()):
                dependency_file = self._get_module_file(dependency, result)
                if dependency_file:
                    targets.append(dependency_file)

        for target in targets:
            graph.add((ontology_node, OWL.imports, URIRef(target)))
671
+
672
def _get_or_create_ontology_uri(self, graph: Graph, module: ModuleDefinition) -> URIRef:
    """Return the graph's ontology node, declaring one if there is none.

    If the graph already has a subject typed ``owl:Ontology``, the first
    one found is returned. Otherwise a new URI is declared as an
    ``owl:Ontology`` in the graph: the default (empty-prefix) namespace
    with trailing ``#`` and ``/`` characters removed when such a
    namespace is bound and non-empty, else
    ``http://example.org/<module name>``.

    Args:
        graph: Module graph; may gain an ``owl:Ontology`` declaration.
        module: Module definition (its name is used for the fallback URI).

    Returns:
        Ontology URI.
    """
    declared = next(iter(graph.subjects(RDF.type, OWL.Ontology)), None)
    if declared is not None:
        return declared

    default_namespace = next(
        (
            str(namespace)
            for prefix, namespace in graph.namespace_manager.namespaces()
            if prefix == ""
        ),
        None,
    )

    if default_namespace:
        ontology_node = URIRef(default_namespace.rstrip("#/"))
    else:
        ontology_node = URIRef(f"http://example.org/{module.name}")

    graph.add((ontology_node, RDF.type, OWL.Ontology))
    return ontology_node
700
+
701
+ def _get_module_file(self, module_name: str, result: SplitResult) -> str | None:
702
+ """Get the output filename for a module.
703
+
704
+ Args:
705
+ module_name: Name of the module.
706
+ result: SplitResult.
707
+
708
+ Returns:
709
+ Output filename or None.
710
+ """
711
+ # Check defined modules
712
+ for module in self.config.modules:
713
+ if module.name == module_name:
714
+ return module.output
715
+
716
+ # Check common module
717
+ if module_name == self.config.unmatched.common_module:
718
+ return self.config.unmatched.common_output
719
+
720
+ return None
721
+
722
+ def _calculate_stats(self, result: SplitResult) -> None:
723
+ """Calculate statistics for each module.
724
+
725
+ Args:
726
+ result: SplitResult to populate with stats.
727
+ """
728
+ for module in self.config.modules:
729
+ if module.name not in result.modules:
730
+ continue
731
+
732
+ graph = result.modules[module.name]
733
+ stats = self._calculate_module_stats(module.name, module.output, graph, result)
734
+ result.module_stats.append(stats)
735
+
736
+ # Stats for common module
737
+ if self.config.unmatched.common_module in result.modules:
738
+ graph = result.modules[self.config.unmatched.common_module]
739
+ stats = self._calculate_module_stats(
740
+ self.config.unmatched.common_module,
741
+ self.config.unmatched.common_output,
742
+ graph,
743
+ result,
744
+ )
745
+ result.module_stats.append(stats)
746
+
747
def _calculate_module_stats(
    self,
    name: str,
    output: str,
    graph: Graph,
    result: SplitResult,
) -> ModuleStats:
    """Build the statistics record for one module graph.

    Classes are distinct subjects typed ``owl:Class`` or ``rdfs:Class``.
    Properties are distinct subjects typed as an OWL object, datatype or
    annotation property, or as ``rdf:Property``. Imports are the objects
    of the graph's ``owl:imports`` triples.

    Args:
        name: Module name.
        output: Output filename.
        graph: Module graph.
        result: SplitResult (source of the module's dependency set).

    Returns:
        ModuleStats instance.
    """
    class_nodes = set()
    for class_type in (OWL.Class, RDFS.Class):
        class_nodes.update(graph.subjects(RDF.type, class_type))

    property_nodes = set()
    for property_type in (
        OWL.ObjectProperty,
        OWL.DatatypeProperty,
        OWL.AnnotationProperty,
        RDF.Property,
    ):
        property_nodes.update(graph.subjects(RDF.type, property_type))

    import_targets = [
        str(target) for _, _, target in graph.triples((None, OWL.imports, None))
    ]
    depends_on = list(result.dependencies.get(name, set()))

    return ModuleStats(
        name=name,
        file=output,
        classes=len(class_nodes),
        properties=len(property_nodes),
        triples=len(graph),
        imports=import_targets,
        dependencies=depends_on,
    )
791
+
792
def _split_data(self, result: SplitResult) -> None:
    """Distribute instance data over modules according to rdf:type.

    All existing data source files are parsed into one graph; paths that
    do not exist are skipped. For every ``rdf:type`` triple whose object
    is a ``URIRef`` owned by a module (per ``result.entity_assignments``),
    all triples with the same subject are copied into that module's data
    graph. A subject whose types live in several modules is copied to
    each of them. Does nothing when data splitting is not configured.

    Args:
        result: SplitResult to populate with data modules.
    """
    settings = self.config.split_data
    if settings is None:
        return

    combined = Graph()
    for source_path in settings.sources:
        if source_path.exists():
            combined.parse(source_path.as_posix())

    # One data graph per configured module, plus the common module if the
    # ontology split produced one.
    buckets: dict[str, Graph] = {
        definition.name: Graph() for definition in self.config.modules
    }
    common_name = self.config.unmatched.common_module
    if common_name in result.modules:
        buckets[common_name] = Graph()

    bindings = list(combined.namespace_manager.namespaces())
    for bucket in buckets.values():
        for prefix, namespace in bindings:
            bucket.bind(prefix, namespace)

    owners = result.entity_assignments
    for subject, _predicate, type_node in combined.triples((None, RDF.type, None)):
        if not isinstance(type_node, URIRef):
            continue
        type_key = str(type_node)
        if type_key not in owners:
            continue
        owner = owners[type_key]
        if owner not in buckets:
            continue
        for triple in combined.triples((subject, None, None)):
            buckets[owner].add(triple)

    result.data_modules = buckets
831
+
832
def write_modules(self, result: SplitResult) -> None:
    """Serialise the split graphs to Turtle files.

    Does nothing in dry-run mode. Otherwise the configured modules are
    written first, then the common module, then any non-empty data modules.

    Args:
        result: SplitResult holding the module (and optional data) graphs.
    """
    config = self.config
    if config.dry_run:
        return

    target_dir = config.output_dir
    target_dir.mkdir(parents=True, exist_ok=True)

    def emit(graph, path):
        # All outputs share the same serialisation settings
        graph.serialize(destination=path.as_posix(), format="turtle")

    # Configured modules that actually received content
    for module in config.modules:
        if module.name not in result.modules:
            continue
        emit(result.modules[module.name], target_dir / module.output)

    # Common module for unmatched entities
    common_name = config.unmatched.common_module
    if common_name in result.modules:
        emit(result.modules[common_name], target_dir / config.unmatched.common_output)

    # Data modules are optional
    if not (result.data_modules and config.split_data):
        return

    data_dir = config.split_data.output_dir or target_dir
    data_dir.mkdir(parents=True, exist_ok=True)

    for module_name, graph in result.data_modules.items():
        if len(graph) == 0:
            continue
        file_prefix = config.split_data.prefix
        emit(graph, data_dir / f"{file_prefix}{module_name}.ttl")
869
+
870
def write_manifest(self, result: SplitResult) -> None:
    """Write ``manifest.yml`` documenting the split.

    The manifest records the source, the output directory, per-module
    statistics, a summary and, when dependencies exist, a text rendering of
    the dependency tree. Nothing is written in dry-run mode or when manifest
    generation is disabled.

    Args:
        result: SplitResult with statistics.
    """
    if self.config.dry_run or not self.config.generate_manifest:
        return

    module_entries = [
        {
            "name": stats.name,
            "file": stats.file,
            "classes": stats.classes,
            "properties": stats.properties,
            "triples": stats.triples,
            "imports": stats.imports,
            "dependencies": stats.dependencies,
        }
        for stats in result.module_stats
    ]

    # Key order is the order written to the file (sort_keys=False below)
    manifest = {
        "source": str(self.config.source),
        "output_dir": str(self.config.output_dir),
        "modules": module_entries,
        "summary": {
            "total_modules": result.total_modules,
            "total_triples": result.total_triples,
            "unmatched_entities": len(result.unmatched_entities),
        },
    }

    # Text rendering of the dependency tree; omitted when there is none
    dep_lines = self._format_dependency_graph(result)
    if dep_lines:
        manifest["dependency_graph"] = dep_lines

    # Like write_modules, do not assume the output directory already exists
    self.config.output_dir.mkdir(parents=True, exist_ok=True)
    manifest_path = self.config.output_dir / "manifest.yml"

    # The dependency tree uses box-drawing characters: write UTF-8 explicitly
    # (not the locale encoding) and let PyYAML emit them verbatim rather than
    # as \uXXXX escapes. Parsers load the same data either way.
    with open(manifest_path, "w", encoding="utf-8") as f:
        yaml.safe_dump(
            manifest,
            f,
            default_flow_style=False,
            sort_keys=False,
            allow_unicode=True,
        )
909
+
910
+ def _format_dependency_graph(self, result: SplitResult) -> str:
911
+ """Format dependency graph as ASCII tree.
912
+
913
+ Args:
914
+ result: SplitResult with dependencies.
915
+
916
+ Returns:
917
+ ASCII representation of dependency graph.
918
+ """
919
+ if not result.dependencies:
920
+ return ""
921
+
922
+ # Find root modules (those with no dependents)
923
+ all_deps: set[str] = set()
924
+ for deps in result.dependencies.values():
925
+ all_deps.update(deps)
926
+
927
+ roots = [m for m in result.modules if m not in all_deps]
928
+
929
+ if not roots:
930
+ roots = list(result.modules.keys())[:1]
931
+
932
+ lines = []
933
+ for root in roots:
934
+ self._format_tree(root, result.dependencies, lines, "")
935
+
936
+ return "\n".join(lines)
937
+
938
def _format_tree(
    self,
    node: str,
    deps: dict[str, set[str]],
    lines: list[str],
    prefix: str,
    visited: set[str] | None = None,
    child_indent: str | None = None,
) -> None:
    """Recursively format a dependency tree.

    One line is appended for ``node``, followed by the subtree of every
    module that depends on it. A child's own line is drawn with a connector
    ("├── " or "└── "), while the lines below that child are indented with
    the matching continuation ("│   " or four spaces). The two are tracked
    separately so that connectors do not accumulate at deeper levels.

    Args:
        node: Current node.
        deps: Dependency graph (module name -> modules it depends on).
        lines: Output lines; appended to in place.
        prefix: Text placed before this node's label.
        visited: Nodes already expanded on the current path (to stop at
            cycles).
        child_indent: Indentation shared by the lines of this node's
            children. Defaults to ``prefix``, which suits a root call.
    """
    if visited is None:
        visited = set()
    if child_indent is None:
        child_indent = prefix

    # Get the output file for this node, falling back to the module name
    file_name = self._get_module_file(node, SplitResult(modules={node: Graph()})) or node
    lines.append(f"{prefix}{file_name}")

    # A repeated node is still printed once more, but not expanded again
    if node in visited:
        return

    visited.add(node)

    # Find modules that depend on this one
    dependents = [m for m, d in deps.items() if node in d]
    last_index = len(dependents) - 1

    for i, dep in enumerate(dependents):
        is_last = i == last_index
        connector = "└── " if is_last else "├── "
        continuation = "    " if is_last else "│   "
        # Each branch gets its own copy so siblings do not see each other's
        # visited nodes.
        self._format_tree(
            dep,
            deps,
            lines,
            child_indent + connector,
            visited.copy(),
            child_indent + continuation,
        )
975
+
976
+
977
def split_by_namespace(
    source: Path,
    output_dir: Path,
    dry_run: bool = False,
) -> SplitResult:
    """Convenience function to split an ontology by namespace.

    One module is defined for each bound namespace that is not a well-known
    vocabulary and that contains at least one subject of the source graph.
    The second condition matters because a prefix can be bound without
    anything being defined under it, and such a namespace would otherwise
    produce a module with no content.

    Args:
        source: Path to source ontology.
        output_dir: Directory for output modules.
        dry_run: If True, don't write files.

    Returns:
        SplitResult with split information.
    """
    # Load source to detect namespaces
    graph = Graph()
    graph.parse(source.as_posix())

    # URIs of everything the ontology makes statements about
    subject_uris = {str(s) for s in graph.subjects() if isinstance(s, URIRef)}

    # Substrings identifying common vocabularies that never become modules
    skipped = ("w3.org", "purl.org", "xmlns.com")

    # Find distinct namespaces used in the ontology
    namespaces: dict[str, str] = {}  # namespace -> prefix
    for prefix, ns in graph.namespace_manager.namespaces():
        ns_str = str(ns)
        if any(skip in ns_str for skip in skipped):
            continue
        # Bound but unused: nothing could be assigned to such a module
        if not any(uri.startswith(ns_str) for uri in subject_uris):
            continue
        namespaces[ns_str] = prefix or "default"

    # Create module definitions
    modules = [
        ModuleDefinition(
            name=prefix,
            output=f"{prefix}.ttl",
            namespaces=[ns],
        )
        for ns, prefix in namespaces.items()
    ]

    config = SplitConfig(
        source=source,
        output_dir=output_dir,
        modules=modules,
        dry_run=dry_run,
    )

    splitter = OntologySplitter(config)
    result = splitter.split()

    if result.success and not dry_run:
        splitter.write_modules(result)
        splitter.write_manifest(result)

    return result
1036
+
1037
+
1038
def create_default_split_config() -> str:
    """Generate default split configuration as YAML string.

    The template is a fixed literal: it is returned as written and is not
    derived from any runtime state.

    Returns:
        YAML configuration template.
    """
    return '''# rdf-construct split configuration
# See MERGE_GUIDE.md for full documentation

split:
  # Source ontology to split
  source: ontology/split_monolith.ttl

  # Output directory for modules
  output_dir: modules/

  # Module definitions
  modules:
    # Split by explicit class list
    - name: core
      description: "Core upper ontology concepts"
      output: core.ttl
      include:
        classes:
          - ex:Entity
          - ex:Event
          - ex:State
        properties:
          - ex:identifier
          - ex:name
      include_descendants: true

    # Split by namespace
    - name: organisation
      description: "Organisation domain module"
      output: organisation.ttl
      namespaces:
        - "http://example.org/ontology/org#"

    # Module with explicit imports
    - name: building
      description: "Building domain module"
      output: building.ttl
      namespaces:
        - "http://example.org/ontology/building#"
      imports:
        - core.ttl
      auto_imports: true

  # Handling for entities that don't match any module
  unmatched:
    strategy: common  # "common" or "error"
    module: common
    output: common.ttl

  # Generate manifest file
  generate_manifest: true

  # Optional: Split data files by instance type
  # split_data:
  #   sources:
  #     - data/split_instances.ttl
  #   output_dir: data/
  #   prefix: data_
'''