rdfsolve 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rdfsolve/__init__.py ADDED
@@ -0,0 +1,26 @@
1
+ """RDFSolve: A library for RDF schema analysis and VoID generation.
2
+
3
+ Main modules:
4
+ - parser: VoidParser class for parsing VoID descriptions and schemas
5
+ - config: RDF configuration tools for YAML model processing (separate module)
6
+ - utils: Common utility functions for RDF processing
7
+ """
8
+
9
+ # Import parser and models
10
+ from . import utils
11
+ from .models import LinkMLSchema, SchemaTriple, VoidSchema
12
+ from .parser import VoidParser
13
+ from .parser import parse_void_file as parse_void_simple
14
+
15
+ # Import version information
16
+ from .version import VERSION
17
+
18
# Names re-exported at package level; each one is imported above in this module.
__all__ = [
    "VERSION",
    "LinkMLSchema",
    "SchemaTriple",
    "VoidParser",
    "VoidSchema",
    "parse_void_simple",
    "utils",
]
rdfsolve/__main__.py ADDED
@@ -0,0 +1,12 @@
1
+ """Entrypoint module, in case you use `python -m rdfsolve`.
2
+
3
+ Why does this file exist, and why ``__main__``? For more info, read:
4
+
5
+ - https://www.python.org/dev/peps/pep-0338/
6
+ - https://docs.python.org/3/using/cmdline.html#cmdoption-m
7
+ """
8
+
9
+ from .cli import main
10
+
11
# Hand control to the CLI only when this module is executed directly
# (e.g. ``python -m rdfsolve``), not when it is merely imported.
if __name__ == "__main__":
    main()
rdfsolve/api.py ADDED
@@ -0,0 +1,515 @@
1
+ """Main RDFSolve functionalities for VoID extraction and conversion."""
2
+
3
+ from typing import Any, Dict, List, Optional, Union
4
+
5
+ import pandas as pd
6
+ from rdflib import Graph
7
+
8
+ from .parser import VoidParser
9
+
10
# Public API of this module: every function defined below is listed here.
__all__ = [
    "count_instances",
    "count_instances_per_class",
    "discover_void_graphs",
    "extract_partitions_from_void",
    "generate_void_from_endpoint",
    "graph_to_jsonld",
    "graph_to_linkml",
    "graph_to_schema",
    "graph_to_shacl",
    "load_parser_from_file",
    "load_parser_from_graph",
    "retrieve_void_from_graphs",
    "to_jsonld_from_file",
    "to_linkml_from_file",
    "to_rdfconfig_from_file",
    "to_shacl_from_file",
]
28
+
29
+
30
def load_parser_from_file(
    void_file_path: str,
    graph_uris: Optional[Union[str, List[str]]] = None,
    exclude_graphs: bool = True,
) -> VoidParser:
    """Build a ``VoidParser`` backed by a VoID file on disk.

    Args:
        void_file_path: Location of the VoID Turtle file.
        graph_uris: Single graph URI or list of graph URIs used to filter queries.
        exclude_graphs: Whether system graphs are excluded.

    Returns:
        A ``VoidParser`` whose ``void_source`` is the given file path.
    """
    parser = VoidParser(
        void_source=void_file_path,
        graph_uris=graph_uris,
        exclude_graphs=exclude_graphs,
    )
    return parser
48
+
49
+
50
def load_parser_from_graph(
    graph: Graph,
    graph_uris: Optional[Union[str, List[str]]] = None,
    exclude_graphs: bool = True,
) -> VoidParser:
    """Build a ``VoidParser`` backed by an in-memory VoID graph.

    Args:
        graph: RDFLib ``Graph`` holding the VoID data.
        graph_uris: Single graph URI or list of graph URIs used to filter queries.
        exclude_graphs: Whether system graphs are excluded.

    Returns:
        A ``VoidParser`` whose ``void_source`` is the given graph.
    """
    parser = VoidParser(
        void_source=graph,
        graph_uris=graph_uris,
        exclude_graphs=exclude_graphs,
    )
    return parser
66
+
67
+
68
def to_linkml_from_file(
    void_file_path: str,
    filter_void_nodes: bool = True,
    schema_name: Optional[str] = None,
    schema_description: Optional[str] = None,
    schema_base_uri: Optional[str] = None,
) -> str:
    """Produce a LinkML YAML schema from a VoID file.

    Args:
        void_file_path: Location of the VoID file.
        filter_void_nodes: Drop VoID-specific nodes from the result.
        schema_name: Name to give the schema.
        schema_description: Description to give the schema.
        schema_base_uri: Base URI to give the schema.

    Returns:
        The LinkML schema serialised as a YAML string.
    """
    # Collect the export options first so the parser call stays a one-liner.
    linkml_options = {
        "filter_void_nodes": filter_void_nodes,
        "schema_name": schema_name,
        "schema_description": schema_description,
        "schema_base_uri": schema_base_uri,
    }
    return load_parser_from_file(void_file_path).to_linkml_yaml(**linkml_options)
94
+
95
+
96
def to_shacl_from_file(
    void_file_path: str,
    filter_void_nodes: bool = True,
    schema_name: Optional[str] = None,
    schema_description: Optional[str] = None,
    schema_base_uri: Optional[str] = None,
    closed: bool = True,
    suffix: Optional[str] = None,
    include_annotations: bool = False,
) -> str:
    """Produce SHACL shapes from a VoID file.

    SHACL (Shapes Constraint Language) shapes express constraints on RDF
    data and can be used to validate it. The shapes are derived from the
    VoID description stored in ``void_file_path``.

    Args:
        void_file_path: Location of the VoID file.
        filter_void_nodes: Drop VoID-specific nodes from the result.
        schema_name: Name to give the schema.
        schema_description: Description to give the schema.
        schema_base_uri: Base URI to give the schema.
        closed: Emit closed shapes (only the defined properties are allowed).
        suffix: Optional suffix appended to shape names (e.g., "Shape").
        include_annotations: Carry class/slot annotations into the shapes.

    Returns:
        The SHACL shapes as a Turtle/RDF string.

    Example:
        >>> from rdfsolve.api import to_shacl_from_file
        >>> shacl_ttl = to_shacl_from_file(
        ...     "dataset_void.ttl", schema_name="my_dataset", closed=True
        ... )
        >>> with open("schema.shacl.ttl", "w") as f:
        ...     f.write(shacl_ttl)
    """
    # Collect the export options first so the parser call stays a one-liner.
    shacl_options = {
        "filter_void_nodes": filter_void_nodes,
        "schema_name": schema_name,
        "schema_description": schema_description,
        "schema_base_uri": schema_base_uri,
        "closed": closed,
        "suffix": suffix,
        "include_annotations": include_annotations,
    }
    return load_parser_from_file(void_file_path).to_shacl(**shacl_options)
143
+
144
+
145
def to_rdfconfig_from_file(
    void_file_path: str,
    filter_void_nodes: bool = True,
    endpoint_url: Optional[str] = None,
    endpoint_name: Optional[str] = None,
    graph_uri: Optional[str] = None,
) -> Dict[str, str]:
    """Produce RDF-config YAML documents from a VoID file.

    RDF-config is a schema standard that describes RDF data models with
    YAML configuration files. Three documents are generated:

    - model.yml: Class and property structure
    - prefix.yml: Namespace prefix definitions
    - endpoint.yml: SPARQL endpoint configuration

    Note: The rdf-config tool requires these files to be named exactly
    model.yml, prefix.yml, and endpoint.yml, and placed in a directory
    named {dataset}_config. The CLI automatically creates this structure.

    Args:
        void_file_path: Location of the VoID file.
        filter_void_nodes: Drop VoID-specific nodes from the result.
        endpoint_url: SPARQL endpoint URL (optional).
        endpoint_name: Name for the endpoint (default: "endpoint").
        graph_uri: Named graph URI (optional).

    Returns:
        Dictionary with 'model', 'prefix', 'endpoint' keys containing
        YAML strings.

    Example:
        >>> from rdfsolve.api import to_rdfconfig_from_file
        >>> rdfconfig = to_rdfconfig_from_file(
        ...     "dataset_void.ttl",
        ...     endpoint_url="https://example.org/sparql",
        ...     graph_uri="http://example.org/graph",
        ... )
        >>> # Save files
        >>> with open("model.yml", "w") as f:
        ...     f.write(rdfconfig["model"])
        >>> with open("prefix.yml", "w") as f:
        ...     f.write(rdfconfig["prefix"])
        >>> with open("endpoint.yml", "w") as f:
        ...     f.write(rdfconfig["endpoint"])
    """
    # Collect the export options first so the parser call stays a one-liner.
    rdfconfig_options = {
        "filter_void_nodes": filter_void_nodes,
        "endpoint_url": endpoint_url,
        "endpoint_name": endpoint_name,
        "graph_uri": graph_uri,
    }
    return load_parser_from_file(void_file_path).to_rdfconfig(**rdfconfig_options)
197
+
198
+
199
def to_jsonld_from_file(
    void_file_path: str, filter_void_admin_nodes: bool = True
) -> Dict[str, Any]:
    """Produce a JSON-LD document from a VoID file.

    Args:
        void_file_path: Location of the VoID file.
        filter_void_admin_nodes: Drop VoID and administrative nodes.

    Returns:
        JSON-LD with @context and @graph.
    """
    file_parser = load_parser_from_file(void_file_path)
    jsonld_document = file_parser.to_jsonld(
        filter_void_admin_nodes=filter_void_admin_nodes
    )
    return jsonld_document
213
+
214
+
215
def graph_to_jsonld(
    graph: Graph,
    graph_uris: Optional[Union[str, List[str]]] = None,
    filter_void_admin_nodes: bool = True,
) -> Dict[str, Any]:
    """Produce a JSON-LD document from an in-memory VoID graph.

    Args:
        graph: RDFLib ``Graph`` holding the VoID data.
        graph_uris: Graph URIs used to filter the extraction.
        filter_void_admin_nodes: Drop VoID and administrative nodes.

    Returns:
        JSON-LD with @context and @graph.
    """
    graph_parser = load_parser_from_graph(graph, graph_uris=graph_uris)
    jsonld_document = graph_parser.to_jsonld(
        filter_void_admin_nodes=filter_void_admin_nodes
    )
    return jsonld_document
232
+
233
+
234
def graph_to_linkml(
    graph: Graph,
    graph_uris: Optional[Union[str, List[str]]] = None,
    filter_void_nodes: bool = True,
    schema_name: Optional[str] = None,
    schema_description: Optional[str] = None,
    schema_base_uri: Optional[str] = None,
) -> str:
    """Produce a LinkML YAML schema from an in-memory VoID graph.

    Args:
        graph: RDFLib ``Graph`` holding the VoID data.
        graph_uris: Graph URIs used to filter the extraction.
        filter_void_nodes: Drop VoID-specific nodes from the result.
        schema_name: Name to give the schema.
        schema_description: Description to give the schema.
        schema_base_uri: Base URI to give the schema.

    Returns:
        The LinkML schema serialised as a YAML string.
    """
    # Collect the export options first so the parser call stays a one-liner.
    linkml_options = {
        "filter_void_nodes": filter_void_nodes,
        "schema_name": schema_name,
        "schema_description": schema_description,
        "schema_base_uri": schema_base_uri,
    }
    graph_parser = load_parser_from_graph(graph, graph_uris=graph_uris)
    return graph_parser.to_linkml_yaml(**linkml_options)
262
+
263
+
264
def graph_to_shacl(
    graph: Graph,
    graph_uris: Optional[Union[str, List[str]]] = None,
    filter_void_nodes: bool = True,
    schema_name: Optional[str] = None,
    schema_description: Optional[str] = None,
    schema_base_uri: Optional[str] = None,
    closed: bool = True,
    suffix: Optional[str] = None,
    include_annotations: bool = False,
) -> str:
    """Produce SHACL shapes from an in-memory VoID graph.

    SHACL (Shapes Constraint Language) shapes express constraints on RDF
    data and can be used to validate it. The shapes are derived from the
    VoID description held in ``graph``.

    Args:
        graph: RDFLib ``Graph`` holding the VoID data.
        graph_uris: Graph URIs used to filter the extraction.
        filter_void_nodes: Drop VoID-specific nodes from the result.
        schema_name: Name to give the schema.
        schema_description: Description to give the schema.
        schema_base_uri: Base URI to give the schema.
        closed: Emit closed shapes (only the defined properties are allowed).
        suffix: Optional suffix appended to shape names (e.g., "Shape").
        include_annotations: Carry class/slot annotations into the shapes.

    Returns:
        The SHACL shapes as a Turtle/RDF string.

    Example:
        >>> from rdflib import Graph
        >>> from rdfsolve.api import graph_to_shacl
        >>> void_graph = Graph()
        >>> void_graph.parse("dataset_void.ttl", format="turtle")
        >>> shacl_ttl = graph_to_shacl(void_graph, schema_name="my_dataset")
    """
    # Collect the export options first so the parser call stays a one-liner.
    shacl_options = {
        "filter_void_nodes": filter_void_nodes,
        "schema_name": schema_name,
        "schema_description": schema_description,
        "schema_base_uri": schema_base_uri,
        "closed": closed,
        "suffix": suffix,
        "include_annotations": include_annotations,
    }
    graph_parser = load_parser_from_graph(graph, graph_uris=graph_uris)
    return graph_parser.to_shacl(**shacl_options)
312
+
313
+
314
def discover_void_graphs(
    endpoint_url: str,
    graph_uris: Optional[Union[str, List[str]]] = None,
    exclude_graphs: bool = False,
) -> Dict[str, Any]:
    """Find VoID graphs in a SPARQL endpoint.

    Discovery is meant to include well-known URIs and VoID graphs, as these
    commonly contain metadata descriptions. For that reason ``exclude_graphs``
    defaults to ``False`` here (unlike the other helpers in this module,
    which default it to ``True``): no graph exclusion is requested unless the
    caller opts in.

    Args:
        endpoint_url: SPARQL endpoint URL
        graph_uris: Graph URIs to search
        exclude_graphs: Set to ``True`` to exclude Virtuoso system graphs
            (default: ``False`` for discovery)

    Returns:
        Discovery metadata per graph URI
    """
    # No void_source is given: the parser is only used as a query helper
    # against the remote endpoint.
    parser = VoidParser(graph_uris=graph_uris, exclude_graphs=exclude_graphs)
    return parser.discover_void_graphs(endpoint_url)
335
+
336
+
337
def count_instances(
    endpoint_url: str,
    sample_limit: Optional[int] = None,
    sample_offset: Optional[int] = None,
    chunk_size: Optional[int] = None,
    offset_limit_steps: Optional[int] = None,
    delay_between_chunks: float = 20.0,
    streaming: bool = False,
) -> Union[Dict[str, int], Any]:
    """Count the instances of each class exposed by a SPARQL endpoint.

    Args:
        endpoint_url: SPARQL endpoint URL.
        sample_limit: Maximum number of results to return.
        sample_offset: Offset to start from.
        chunk_size: Chunk size used for pagination.
        offset_limit_steps: Combined LIMIT/OFFSET step.
        delay_between_chunks: Seconds to wait between chunks.
        streaming: Return a generator when ``True``.

    Returns:
        Dict mapping class URI to count, or a generator.
    """
    # Collect the paging options first so the parser call stays a one-liner.
    paging_options = {
        "sample_limit": sample_limit,
        "sample_offset": sample_offset,
        "chunk_size": chunk_size,
        "offset_limit_steps": offset_limit_steps,
        "delay_between_chunks": delay_between_chunks,
        "streaming": streaming,
    }
    return VoidParser().count_instances_per_class(endpoint_url, **paging_options)
370
+
371
+
372
def extract_partitions_from_void(
    endpoint_url: str, void_graph_uris: List[str]
) -> List[Dict[str, str]]:
    """Fetch partition data from already-discovered VoID graphs.

    Args:
        endpoint_url: SPARQL endpoint URL.
        void_graph_uris: URIs of the VoID graphs that hold partitions.

    Returns:
        List of partition records (class-property-object).
    """
    endpoint_parser = VoidParser()
    partition_records = endpoint_parser.retrieve_partitions_from_void(
        endpoint_url, void_graph_uris
    )
    return partition_records
386
+
387
+
388
def retrieve_void_from_graphs(
    endpoint_url: str,
    void_graph_uris: List[str],
    graph_uris: Optional[Union[str, List[str]]] = None,
    partitions: Optional[List[Dict[str, str]]] = None,
) -> Graph:
    """Retrieve VoID descriptions from specific graphs at endpoint.

    If partition data is provided (from discover_void_graphs), builds the
    graph directly from that data. Otherwise, runs a new discovery query
    against ``endpoint_url`` and uses the partitions it reports.

    Args:
        endpoint_url: SPARQL endpoint URL
        void_graph_uris: List of graph URIs containing VoID. Only the first
            entry is used here, as the base URI of the built graph; the
            list is not passed to the discovery query.
        graph_uris: Graph URIs to filter queries
        partitions: Optional partition data from discover_void_graphs result

    Returns:
        RDF Graph with VoID descriptions built from partition data, or an
        empty Graph when no partitions are available.
    """
    parser = VoidParser(graph_uris=graph_uris)

    # Only hit the endpoint when the caller did not supply (non-empty)
    # partition data; otherwise no query is needed.
    if not partitions:
        discovery_result = parser.discover_void_graphs(endpoint_url)
        partitions = discovery_result.get("partitions", [])

    # Fallback: nothing supplied and nothing discovered.
    if not partitions:
        return Graph()

    base_uri = void_graph_uris[0] if void_graph_uris else None
    return parser.build_void_graph_from_partitions(partitions, base_uri=base_uri)
427
+
428
+
429
def generate_void_from_endpoint(
    endpoint_url: str,
    graph_uris: Optional[Union[str, List[str]]] = None,
    output_file: Optional[str] = None,
    counts: bool = True,
    offset_limit_steps: Optional[int] = None,
    exclude_graphs: bool = True,
    dataset_uri: Optional[str] = None,
    void_base_uri: Optional[str] = None,
) -> Graph:
    """Generate VoID description from a SPARQL endpoint.

    Args:
        endpoint_url: SPARQL endpoint URL
        graph_uris: Graph URI(s) to analyze
        output_file: Path to save Turtle output
        counts: Include instance counts
        offset_limit_steps: Chunk size for pagination
        exclude_graphs: Exclude system graphs
        dataset_uri: Custom URI for the VoID dataset. Accepted for interface
            compatibility but currently has no effect: it is not forwarded
            to ``VoidParser.generate_void_from_sparql``. The intended
            default was the first ``graph_uris`` entry, falling back to the
            endpoint URL without its trailing slash.
        void_base_uri: Custom base URI for VoID partition IRIs

    Returns:
        RDF graph with VoID description
    """
    # NOTE(review): an earlier revision resolved a default ``dataset_uri``
    # here but never used the result, so that dead computation was removed.
    # If generate_void_from_sparql gains (or already has) a ``dataset_uri``
    # parameter, resolve the default and pass it through - confirm against
    # the parser before wiring it up.
    return VoidParser.generate_void_from_sparql(
        endpoint_url=endpoint_url,
        graph_uris=graph_uris,
        output_file=output_file,
        counts=counts,
        offset_limit_steps=offset_limit_steps,
        exclude_graphs=exclude_graphs,
        void_base_uri=void_base_uri,
    )
473
+
474
+
475
def graph_to_schema(
    void_graph: Graph,
    graph_uris: Optional[Union[str, List[str]]] = None,
    filter_void_admin_nodes: bool = True,
) -> pd.DataFrame:
    """Turn a VoID graph into a schema DataFrame.

    Args:
        void_graph: RDFLib graph holding the VoID data.
        graph_uris: Graph URIs to extract.
        filter_void_admin_nodes: Filter out VoID or administrative nodes.

    Returns:
        DataFrame with schema patterns (subject/property/object URIs).
    """
    schema_parser = VoidParser(void_source=void_graph, graph_uris=graph_uris)
    schema_frame = schema_parser.to_schema(
        filter_void_admin_nodes=filter_void_admin_nodes
    )
    return schema_frame
492
+
493
+
494
def count_instances_per_class(
    endpoint_url: str,
    graph_uris: Optional[Union[str, List[str]]] = None,
    sample_limit: Optional[int] = None,
    exclude_graphs: bool = True,
) -> Dict[str, int]:
    """Count the instances of each class exposed by a SPARQL endpoint.

    Args:
        endpoint_url: SPARQL endpoint URL.
        graph_uris: Graph URI(s) to query.
        sample_limit: Maximum number of results to sample.
        exclude_graphs: Exclude service/system graphs from counting.

    Returns:
        Class URI to instance count mapping.
    """
    counting_parser = VoidParser(graph_uris=graph_uris, exclude_graphs=exclude_graphs)
    counts = counting_parser.count_instances_per_class(
        endpoint_url, sample_limit=sample_limit
    )
    # The parser may hand back a generator; always give callers a real dict.
    return counts if isinstance(counts, dict) else dict(counts)