rdf-starbase 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,434 @@
1
+ """
2
+ RDF/XML parser and serializer for RDF-StarBase.
3
+
4
+ RDF/XML is the original W3C standard serialization format for RDF.
5
+ While less human-readable than Turtle, it's still widely used in
6
+ legacy systems and some enterprise applications.
7
+
8
+ Key features:
9
+ - XML namespaces for prefixes
10
+ - rdf:Description for resource descriptions
11
+ - rdf:about for subject identification
12
+ - rdf:resource for object references
13
+ - Typed literals with rdf:datatype
14
+ - Language tags with xml:lang
15
+
16
+ Note: RDF-Star embedded triples are NOT supported in RDF/XML as there
17
+ is no standard syntax for them.
18
+ """
19
+
20
import re
import xml.etree.ElementTree as ET
from dataclasses import dataclass, field
from typing import Optional, Union, TextIO
from io import StringIO
from urllib.parse import urljoin
25
+
26
+
27
@dataclass
class Triple:
    """Lightweight subject/predicate/object record used by the RDF/XML code.

    All three terms are plain strings. The parser in this module stores
    blank nodes with a ``_:`` prefix and literals in a quoted form
    (``"text"``, ``"text"@lang`` or ``"text"^^<datatype>``).
    """

    # The three positional terms of the statement.
    subject: str
    predicate: str
    object: str
    # Optional nested triples; nothing in the RDF/XML parser or serializer
    # sets or reads these, since RDF/XML has no syntax for embedded triples.
    subject_triple: Optional["Triple"] = None
    object_triple: Optional["Triple"] = None
35
+
36
+
37
@dataclass
class RDFXMLDocument:
    """Result of parsing one RDF/XML document.

    Every field has an independent default, so ``RDFXMLDocument()`` yields
    an empty document whose containers are not shared with other instances.
    """

    # Mapping of namespace prefix -> namespace URI.
    prefixes: dict[str, str] = field(default_factory=dict)
    # Triples in the order they were produced.
    triples: list[Triple] = field(default_factory=list)
    # Base URI of the document, or None when none was given.
    base: Optional[str] = None
43
+
44
+
45
# Namespace URIs referenced throughout this module.
RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
RDFS_NS = "http://www.w3.org/2000/01/rdf-schema#"
XSD_NS = "http://www.w3.org/2001/XMLSchema#"
# Namespace of the reserved ``xml:`` prefix (xml:lang, xml:base).
XML_NS = "http://www.w3.org/XML/1998/namespace"

# Well-known prefix -> namespace URI table. It is the default prefix set
# for both the parser and the serializer; insertion order is preserved.
STANDARD_PREFIXES = {
    "rdf": RDF_NS,
    "rdfs": RDFS_NS,
    "xsd": XSD_NS,
    "owl": "http://www.w3.org/2002/07/owl#",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "dc": "http://purl.org/dc/elements/1.1/",
    "dcterms": "http://purl.org/dc/terms/",
}
61
+
62
+
63
class RDFXMLParser:
    """
    Parser for RDF/XML documents.

    Supports:
    - rdf:RDF root element
    - rdf:Description with rdf:about (including rdf:about="")
    - Property elements with rdf:resource
    - Property attributes on node elements (e.g. dc:title="...")
    - Typed nodes (abbreviated syntax)
    - Literals with rdf:datatype and xml:lang (including inherited xml:lang)
    - rdf:parseType="Resource" for blank nodes
    - rdf:parseType="Collection" for lists
    - rdf:nodeID for blank nodes

    Terms are returned as strings: IRIs as-is, blank nodes as ``_:label``
    and literals as ``"text"``, ``"text"@lang`` or ``"text"^^<datatype>``.

    Note: parsing is delegated to xml.etree.ElementTree, which the Python
    documentation describes as not secure against maliciously constructed
    data. Only feed this parser documents from trusted sources.
    """

    # Clark-notation names of the XML/RDF attributes consulted repeatedly.
    _XML_LANG = f"{{{XML_NS}}}lang"
    _XML_BASE = f"{{{XML_NS}}}base"
    _XML_ATTR_PREFIX = f"{{{XML_NS}}}"
    _RDF_TYPE_ATTR = f"{{{RDF_NS}}}type"

    # rdf: names that are RDF/XML syntax (or withdrawn syntax) and therefore
    # must never be read as property attributes on a node element.
    _NON_PROPERTY_ATTRS = frozenset(
        f"{{{RDF_NS}}}{name}"
        for name in (
            "RDF", "ID", "about", "parseType", "resource", "nodeID",
            "datatype", "Description", "li",
            "aboutEach", "aboutEachPrefix", "bagID",
        )
    )

    # Matches a leading URI scheme ("http:", "mailto:", "urn:", ...).
    _SCHEME = re.compile(r"[A-Za-z][A-Za-z0-9+.\-]*:")

    def __init__(self):
        self.prefixes = {}
        self.base = None
        self.triples = []
        self.blank_counter = 0

    def parse(self, source: Union[str, TextIO]) -> RDFXMLDocument:
        """Parse RDF/XML from a string, a file path or a file-like object.

        A string whose first non-blank character is ``<`` is treated as XML
        content; any other string is treated as a file path. Parser state
        (prefixes, base, triples, blank-node counter) is reset on each call.

        Raises:
            xml.etree.ElementTree.ParseError: if the XML is not well formed.
            OSError: if ``source`` is a path that cannot be opened.
        """
        if isinstance(source, str) and source.strip().startswith("<"):
            root = ET.fromstring(source)
        else:
            # A path string or an open file-like object.
            root = ET.parse(source).getroot()

        self.prefixes = {}
        self.base = None
        self.triples = []
        self.blank_counter = 0

        self._extract_namespaces(root)

        base_attr = root.get(self._XML_BASE)
        if base_attr:
            self.base = base_attr

        if root.tag == f"{{{RDF_NS}}}RDF":
            # Standard rdf:RDF wrapper: an xml:lang here is in scope for
            # every node element below it.
            root_lang = root.get(self._XML_LANG)
            for child in root:
                self._process_element(child, lang=root_lang)
        else:
            # Document element is itself a node element.
            self._process_element(root)

        return RDFXMLDocument(
            prefixes=self.prefixes.copy(),
            triples=self.triples.copy(),
            base=self.base,
        )

    def _extract_namespaces(self, element: ET.Element):
        """Populate ``self.prefixes``.

        ElementTree expands prefixes into ``{uri}localname`` and does not
        keep the document's own declarations on the element, so only the
        module's standard prefixes are recorded; ``element`` is not read.
        """
        for prefix, uri in STANDARD_PREFIXES.items():
            self.prefixes[prefix] = uri

    def _gen_blank_node(self) -> str:
        """Return a fresh blank node label of the form ``_:bN``."""
        self.blank_counter += 1
        return f"_:b{self.blank_counter}"

    def _base_without_fragment(self) -> str:
        """Return the base URI with any ``#fragment`` removed ('' if unset)."""
        return (self.base or "").partition("#")[0]

    def _get_subject(self, element: ET.Element) -> str:
        """Return the subject term identified by a node element.

        Checked in order: rdf:about, rdf:ID, rdf:nodeID; when none is
        present a new blank node is generated.
        """
        about = element.get(f"{{{RDF_NS}}}about")
        # ``is not None`` rather than truthiness: rdf:about="" is a valid
        # reference to the base document and must not become a blank node.
        if about is not None:
            return self._resolve_uri(about)

        rdf_id = element.get(f"{{{RDF_NS}}}ID")
        if rdf_id:
            return f"{self._base_without_fragment()}#{rdf_id}"

        node_id = element.get(f"{{{RDF_NS}}}nodeID")
        if node_id:
            return f"_:{node_id}"

        return self._gen_blank_node()

    def _resolve_uri(self, uri: str) -> str:
        """Resolve a URI reference against the document base.

        References that already carry a scheme, blank node labels (``_:``)
        and any reference seen while no base is known are returned
        unchanged. Empty and fragment-only references are appended to the
        base (minus its own fragment); everything else is resolved with
        ``urllib.parse.urljoin``.
        """
        if not self.base or uri.startswith("_:") or self._SCHEME.match(uri):
            return uri
        if uri == "" or uri.startswith("#"):
            # Handled by hand so this also works for bases whose scheme
            # urljoin does not treat as hierarchical.
            return self._base_without_fragment() + uri
        return urljoin(self.base, uri)

    @staticmethod
    def _tag_to_iri(tag: str) -> str:
        """Turn an ElementTree ``{namespace}local`` name into a plain IRI."""
        return tag.replace("{", "").replace("}", "")

    def _make_literal(
        self,
        text: str,
        lang: Optional[str] = None,
        datatype: Optional[str] = None,
    ) -> str:
        """Build the quoted literal form; a datatype takes priority over lang."""
        quoted = f'"{self._escape_string(text)}"'
        if datatype:
            return f"{quoted}^^<{datatype}>"
        if lang:
            return f"{quoted}@{lang}"
        return quoted

    def _process_element(
        self,
        element: ET.Element,
        subject: Optional[str] = None,
        lang: Optional[str] = None,
    ) -> str:
        """Process a node element and record its triples.

        Args:
            element: rdf:Description or a typed node element.
            subject: Accepted for backward compatibility; the subject is
                always derived from ``element`` itself.
            lang: Language tag inherited from enclosing elements, if any.

        Returns:
            The subject term of the node.
        """
        # An xml:lang on this element (even "") overrides the inherited one.
        lang = element.get(self._XML_LANG, lang)
        subject = self._get_subject(element)

        if element.tag != f"{{{RDF_NS}}}Description":
            # Typed node: the element name is the rdf:type of the subject.
            self.triples.append(Triple(
                subject=subject,
                predicate=f"{RDF_NS}type",
                object=self._tag_to_iri(element.tag),
            ))

        self._process_property_attributes(subject, element, lang)

        for prop_elem in element:
            self._process_property(subject, prop_elem, lang)

        return subject

    def _process_property_attributes(
        self, subject: str, element: ET.Element, lang: Optional[str]
    ):
        """Record triples for property attributes on a node element."""
        for name, value in element.attrib.items():
            if (
                not name.startswith("{")  # un-namespaced attribute
                or name.startswith(self._XML_ATTR_PREFIX)  # xml:lang, xml:base
                or name in self._NON_PROPERTY_ATTRS
            ):
                continue
            if name == self._RDF_TYPE_ATTR:
                # rdf:type as an attribute names a resource, not a literal.
                obj = self._resolve_uri(value)
            else:
                obj = self._make_literal(value, lang=lang)
            self.triples.append(Triple(subject, self._tag_to_iri(name), obj))

    def _process_property(
        self, subject: str, prop_elem: ET.Element, lang: Optional[str] = None
    ):
        """Process one property element of ``subject`` and record its triple(s).

        ``lang`` is the language tag inherited from enclosing elements; an
        xml:lang on the property element itself overrides it.
        """
        predicate = self._tag_to_iri(prop_elem.tag)
        lang = prop_elem.get(self._XML_LANG, lang)

        # rdf:resource: object is an IRI reference.
        resource = prop_elem.get(f"{{{RDF_NS}}}resource")
        if resource is not None:
            self.triples.append(
                Triple(subject, predicate, self._resolve_uri(resource))
            )
            return

        # rdf:nodeID: object is a labelled blank node.
        node_id = prop_elem.get(f"{{{RDF_NS}}}nodeID")
        if node_id is not None:
            self.triples.append(Triple(subject, predicate, f"_:{node_id}"))
            return

        parse_type = prop_elem.get(f"{{{RDF_NS}}}parseType")
        if parse_type == "Resource":
            # Anonymous node whose properties are the child elements.
            obj = self._gen_blank_node()
            self.triples.append(Triple(subject, predicate, obj))
            for child in prop_elem:
                self._process_property(obj, child, lang)
            return
        if parse_type == "Collection":
            obj = self._process_collection(prop_elem, lang)
            self.triples.append(Triple(subject, predicate, obj))
            return
        if parse_type == "Literal":
            # XML literal: keep the serialized inner markup as a string.
            xml_content = ET.tostring(prop_elem, encoding='unicode', method='xml')
            # Inner content lies between the end of the start tag and the
            # start of the closing tag.
            start = xml_content.find(">") + 1
            end = xml_content.rfind("<")
            obj = self._make_literal(
                xml_content[start:end], datatype=f"{RDF_NS}XMLLiteral"
            )
            self.triples.append(Triple(subject, predicate, obj))
            return

        # A child element means the object is a nested node element.
        children = list(prop_elem)
        if children:
            obj = self._process_element(children[0], lang=lang)
            self.triples.append(Triple(subject, predicate, obj))
            return

        # Otherwise the text content is a literal.
        obj = self._make_literal(
            prop_elem.text or "",
            lang=lang,
            datatype=prop_elem.get(f"{{{RDF_NS}}}datatype"),
        )
        self.triples.append(Triple(subject, predicate, obj))

    def _process_collection(
        self, prop_elem: ET.Element, lang: Optional[str] = None
    ) -> str:
        """Process an rdf:parseType="Collection" property element.

        Builds the rdf:first / rdf:rest chain for the child node elements.

        Returns:
            The blank node heading the list, or rdf:nil for an empty list.
        """
        rdf_first = f"{RDF_NS}first"
        rdf_rest = f"{RDF_NS}rest"
        rdf_nil = f"{RDF_NS}nil"

        children = list(prop_elem)
        if not children:
            return rdf_nil

        head = self._gen_blank_node()
        current = head
        last_index = len(children) - 1

        for i, child in enumerate(children):
            item = self._process_element(child, lang=lang)
            self.triples.append(Triple(current, rdf_first, item))

            if i < last_index:
                next_node = self._gen_blank_node()
                self.triples.append(Triple(current, rdf_rest, next_node))
                current = next_node
            else:
                # Last cell terminates the list.
                self.triples.append(Triple(current, rdf_rest, rdf_nil))

        return head

    def _escape_string(self, s: str) -> str:
        """Escape backslash, double quote, newline, CR and tab for a literal."""
        # Backslash first, so the escapes added below are not re-escaped.
        s = s.replace("\\", "\\\\")
        s = s.replace('"', '\\"')
        s = s.replace("\n", "\\n")
        s = s.replace("\r", "\\r")
        s = s.replace("\t", "\\t")
        return s
291
+
292
+
293
class RDFXMLSerializer:
    """
    Serializer for RDF/XML output.

    Produces standard RDF/XML with one rdf:Description element per subject.
    Triple terms are expected as strings: blank nodes as ``_:label``,
    literals as ``"text"``, ``"text"@lang`` or ``"text"^^<datatype>``, and
    anything else is written as an IRI.
    """

    # "value", optionally followed by @lang or ^^<datatype>. The language
    # tag allows digits in subtags (e.g. es-419, de-CH-1996).
    _LITERAL_RE = re.compile(
        r'"((?:[^"\\]|\\.)*)"\s*'
        r'(?:@([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)|(?:\^\^<([^>]+)>))?'
    )
    # One backslash escape; decoded in a single left-to-right pass.
    _ESCAPE_RE = re.compile(r"\\(.)")
    _UNESCAPES = {"n": "\n", "t": "\t", "r": "\r", '"': '"', "\\": "\\"}

    def __init__(self, prefixes: Optional[dict[str, str]] = None):
        """Create a serializer.

        Args:
            prefixes: prefix -> namespace URI mapping. When omitted or
                empty, a copy of STANDARD_PREFIXES is used.
        """
        self.prefixes = prefixes or STANDARD_PREFIXES.copy()
        # Reverse lookup (namespace URI -> prefix); not read by this class.
        self.inverse_prefixes = {v: k for k, v in self.prefixes.items()}

    def serialize(self, triples: list[Triple], pretty: bool = True) -> str:
        """Serialize triples to an RDF/XML string.

        Args:
            triples: Triples to write; they are grouped by subject in
                first-seen order.
            pretty: When true, indent the output.

        Returns:
            The document as text, starting with an XML declaration.

        Note: prefixes are registered through ET.register_namespace, which
        updates ElementTree's process-wide prefix registry.
        """
        for prefix, uri in self.prefixes.items():
            try:
                ET.register_namespace(prefix, uri)
            except ValueError:
                # ElementTree reserves "nsN" prefixes for its own use; skip
                # them and let it pick a prefix for that namespace.
                continue
        # Registered last so the rdf namespace always gets the "rdf" prefix.
        ET.register_namespace("rdf", RDF_NS)

        # xmlns declarations are emitted by ElementTree itself for every
        # namespace actually used, so none are added by hand.
        root = ET.Element(f"{{{RDF_NS}}}RDF")

        subjects: dict[str, list[Triple]] = {}
        for triple in triples:
            subjects.setdefault(triple.subject, []).append(triple)

        for subject, subject_triples in subjects.items():
            self._add_description(root, subject, subject_triples)

        if pretty:
            self._indent(root)

        return ET.tostring(root, encoding='unicode', xml_declaration=True)

    def _add_description(self, parent: ET.Element, subject: str, triples: list[Triple]):
        """Append an rdf:Description for ``subject`` holding ``triples``."""
        desc = ET.SubElement(parent, f"{{{RDF_NS}}}Description")

        if subject.startswith("_:"):
            # Blank node: written as rdf:nodeID without the "_:" prefix.
            desc.set(f"{{{RDF_NS}}}nodeID", subject[2:])
        else:
            desc.set(f"{{{RDF_NS}}}about", subject)

        for triple in triples:
            self._add_property(desc, triple)

    def _add_property(self, parent: ET.Element, triple: Triple):
        """Append the property element for one triple to ``parent``."""
        obj = triple.object
        # ElementTree needs the predicate as {namespace}localname.
        prop = ET.SubElement(parent, self._uri_to_clark(triple.predicate))

        if obj.startswith('"'):
            self._set_literal(prop, obj)
        elif obj.startswith("_:"):
            prop.set(f"{{{RDF_NS}}}nodeID", obj[2:])
        else:
            prop.set(f"{{{RDF_NS}}}resource", obj)

    def _uri_to_clark(self, uri: str) -> str:
        """Convert a URI to Clark notation ``{namespace}localname``.

        The split point is the last ``#`` if there is one, otherwise the
        last ``/``. A URI containing neither is returned unchanged, which
        only yields valid XML if it happens to be a valid element name.
        """
        if "#" in uri:
            namespace, localname = uri.rsplit("#", 1)
            return f"{{{namespace}#}}{localname}"
        if "/" in uri:
            namespace, localname = uri.rsplit("/", 1)
            return f"{{{namespace}/}}{localname}"
        return uri

    def _set_literal(self, element: ET.Element, literal: str):
        """Set text and xml:lang / rdf:datatype on ``element`` from a literal.

        ``literal`` is expected as ``"value"``, ``"value"@lang`` or
        ``"value"^^<datatype>``. If it does not match that shape it is
        written verbatim as the element text.
        """
        match = self._LITERAL_RE.match(literal)
        if not match:
            element.text = literal
            return

        # Decode escapes in one pass. Chained str.replace calls would
        # mis-read an escaped backslash followed by n, t, r or a quote
        # (e.g. the two-character text "\n" escaped as "\\n").
        # Unknown escapes are left untouched.
        element.text = self._ESCAPE_RE.sub(
            lambda m: self._UNESCAPES.get(m.group(1), m.group(0)),
            match.group(1),
        )

        lang = match.group(2)
        datatype = match.group(3)

        if lang:
            element.set(f"{{{XML_NS}}}lang", lang)
        elif datatype:
            element.set(f"{{{RDF_NS}}}datatype", datatype)

    def _indent(self, elem: ET.Element, level: int = 0):
        """Recursively add newline/indent whitespace to ``elem`` in place."""
        i = "\n" + " " * level
        if len(elem):
            # Whitespace before the first child.
            if not elem.text or not elem.text.strip():
                elem.text = i + " "
            if not elem.tail or not elem.tail.strip():
                elem.tail = i
            for child in elem:
                self._indent(child, level + 1)
                # Each pass resets the child's tail so that, after the
                # loop, the closing tag lines up with this element.
                if not child.tail or not child.tail.strip():
                    child.tail = i
        else:
            if level and (not elem.tail or not elem.tail.strip()):
                elem.tail = i
419
+
420
+
421
def parse_rdfxml(source: Union[str, TextIO]) -> RDFXMLDocument:
    """Parse RDF/XML with a fresh RDFXMLParser and return the document.

    ``source`` is handed unchanged to ``RDFXMLParser.parse``.
    """
    return RDFXMLParser().parse(source)
425
+
426
+
427
def serialize_rdfxml(
    triples: list[Triple],
    prefixes: Optional[dict[str, str]] = None,
    pretty: bool = True
) -> str:
    """Serialize triples to RDF/XML using a one-off RDFXMLSerializer.

    ``prefixes`` is passed to the serializer's constructor; ``triples`` and
    ``pretty`` are passed to its ``serialize`` method.
    """
    return RDFXMLSerializer(prefixes).serialize(triples, pretty)