rdf-starbase 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,419 @@
1
+ """
2
+ N-Triples and N-Quads Parser and Serializer.
3
+
4
+ N-Triples is the simplest RDF format: one triple per line.
5
+ N-Quads extends N-Triples with an optional graph name.
6
+
7
+ Grammar:
8
+ ntriplesDoc ::= triple? (EOL triple)* EOL?
9
+ triple ::= subject predicate object '.'
10
+ subject ::= IRIREF | BLANK_NODE_LABEL | quotedTriple
11
+ predicate ::= IRIREF
12
+ object ::= IRIREF | BLANK_NODE_LABEL | literal | quotedTriple
13
+ quotedTriple ::= '<<' subject predicate object '>>'
14
+
15
+ Reference: https://www.w3.org/TR/n-triples/
16
+ """
17
+
18
+ from typing import Iterator, Optional, Tuple, List, Union
19
+ from dataclasses import dataclass
20
+ from pathlib import Path
21
+ import re
22
+ from io import StringIO
23
+
24
+ from rdf_starbase.formats.turtle import Triple, ParsedDocument
25
+
26
+
27
class NTriplesParser:
    """
    Parser for N-Triples and N-Triples-Star format.

    N-Triples is line-oriented: each line is one triple.
    This makes it efficient for streaming large files.

    Format:
        <subject> <predicate> <object> .
        <subject> <predicate> "literal" .
        <subject> <predicate> "literal"@lang .
        <subject> <predicate> "literal"^^<datatype> .

    N-Triples-Star adds quoted triples:
        << <s> <p> <o> >> <p2> <o2> .

    Terms are returned as strings: IRIs without their angle brackets,
    blank nodes as ``_:label`` and literals as ``"value"`` (escape
    sequences decoded) followed by ``@lang`` or ``^^<datatype>`` when
    present. Quoted triples are returned as nested ``Triple`` objects.

    Attributes:
        line_number: 1-based number of the line most recently read by
            ``parse_lines`` / ``parse_stream`` (0 before any line is read).
    """

    # Regex patterns for N-Triples tokens
    IRI_PATTERN = re.compile(r'<([^>]*)>')
    # A label may start with a letter, digit or underscore and may contain
    # dots and hyphens, but never ends with a dot: otherwise the statement
    # terminator in "_:b1." would be swallowed into the label.
    BLANK_NODE_PATTERN = re.compile(
        r'_:([A-Za-z0-9_](?:[A-Za-z0-9_.-]*[A-Za-z0-9_-])?)'
    )
    LITERAL_PATTERN = re.compile(
        r'"((?:[^"\\]|\\.)*)"|'  # Double-quoted string
        r"'((?:[^'\\]|\\.)*)'"   # Single-quoted string
    )
    LANG_TAG_PATTERN = re.compile(r'@([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)')
    DATATYPE_PATTERN = re.compile(r'\^\^<([^>]*)>')
    QUOTED_TRIPLE_START = re.compile(r'<<')
    QUOTED_TRIPLE_END = re.compile(r'>>')

    # Only CR and LF end an N-Triples line. str.splitlines() would also
    # break on characters such as U+2028 that may appear raw in a literal.
    _LINE_BREAK = re.compile(r'\r\n|\r|\n')
    # Single-character escapes and the character each one stands for.
    _SIMPLE_ESCAPES = {
        't': '\t',
        'b': '\b',
        'n': '\n',
        'r': '\r',
        'f': '\f',
        '"': '"',
        "'": "'",
        '\\': '\\',
    }
    _HEX_DIGITS = frozenset('0123456789abcdefABCDEF')
    _MAX_CODE_POINT = 0x10FFFF

    def __init__(self):
        self.line_number = 0

    def parse(self, source: Union[str, Path, StringIO]) -> ParsedDocument:
        """
        Parse N-Triples content.

        Args:
            source: N-Triples content as a string, a ``Path`` to read as
                UTF-8, or a text file-like object exposing ``read()``
                (for example ``StringIO``).

        Returns:
            ParsedDocument with triples (no prefixes in N-Triples)

        Raises:
            ValueError: If a line cannot be parsed; the message carries
                the line number and the offending line.
        """
        if isinstance(source, Path):
            text = source.read_text(encoding="utf-8")
        elif hasattr(source, "read"):
            text = source.read()
        else:
            text = source

        triples = list(self.parse_lines(self._split_lines(text)))
        return ParsedDocument(triples=triples)

    def parse_file(self, path: Union[str, Path]) -> ParsedDocument:
        """Parse an N-Triples file."""
        return self.parse(Path(path))

    def parse_lines(self, lines: List[str]) -> Iterator[Triple]:
        """
        Parse lines of N-Triples.

        Blank lines and lines starting with ``#`` are skipped.

        Args:
            lines: List of N-Triples lines

        Yields:
            Triple objects

        Raises:
            ValueError: If a line cannot be parsed; the message carries
                the line number and the offending line.
        """
        yield from self._iter_triples(lines)

    def parse_stream(self, stream) -> Iterator[Triple]:
        """
        Parse N-Triples from a stream (file-like object).

        Useful for processing large files without loading into memory.

        Args:
            stream: Iterable of text lines, such as an open text file

        Yields:
            Triple objects

        Raises:
            ValueError: If a line cannot be parsed; the message carries
                the line number and the offending line.
        """
        yield from self._iter_triples(stream)

    @classmethod
    def _split_lines(cls, text: str) -> List[str]:
        """Split text into lines on CR, LF or CRLF only."""
        return cls._LINE_BREAK.split(text)

    def _iter_triples(
        self, lines: Union[List[str], Iterator[str]]
    ) -> Iterator[Triple]:
        """Yield one triple per non-blank, non-comment line, tracking line numbers."""
        self.line_number = 0
        for number, raw in enumerate(lines, start=1):
            self.line_number = number

            # Strip whitespace and skip empty lines/comments
            line = raw.strip()
            if not line or line.startswith('#'):
                continue

            try:
                triple = self._parse_line(line)
            except Exception as e:
                # Re-raise with the line context, keeping the original
                # error available as the cause.
                raise ValueError(
                    f"Error parsing line {number}: {e}\nLine: {line}"
                ) from e

            if triple:
                yield triple

    def _parse_line(self, line: str) -> Optional[Triple]:
        """
        Parse a single N-Triples line.

        Only subject, predicate and object are read. The terminating
        ``.`` is not required and any text after the object is ignored.
        """
        subject, pos = self._parse_subject(line, 0)
        pos = self._skip_ws(line, pos)

        predicate, pos = self._parse_iri(line, pos)
        pos = self._skip_ws(line, pos)

        obj, pos = self._parse_object(line, pos)

        return self._build_triple(subject, predicate, obj)

    @staticmethod
    def _build_triple(
        subject: Union[str, Triple],
        predicate: str,
        obj: Union[str, Triple],
    ) -> Triple:
        """
        Assemble a Triple from parsed terms.

        A quoted triple in subject or object position is stored in
        ``subject_triple`` / ``object_triple`` and the matching string
        field is set to the empty string.
        """
        if isinstance(subject, Triple):
            return Triple(
                subject="",
                predicate=predicate,
                object=obj if isinstance(obj, str) else "",
                subject_triple=subject,
                object_triple=obj if isinstance(obj, Triple) else None,
            )
        if isinstance(obj, Triple):
            return Triple(
                subject=subject,
                predicate=predicate,
                object="",
                object_triple=obj,
            )
        return Triple(subject=subject, predicate=predicate, object=obj)

    def _skip_ws(self, line: str, pos: int) -> int:
        """Return the index of the first character at or after pos that is not a space or tab."""
        while pos < len(line) and line[pos] in ' \t':
            pos += 1
        return pos

    def _parse_subject(self, line: str, pos: int) -> Tuple[Union[str, Triple], int]:
        """
        Parse a subject (IRI, blank node, or quoted triple).

        Returns:
            The parsed term and the index just past it.

        Raises:
            ValueError: If no valid subject starts at pos.
        """
        pos = self._skip_ws(line, pos)
        lead = line[pos:pos + 2]

        if lead == '<<':
            return self._parse_quoted_triple(line, pos)
        if lead == '_:':
            return self._parse_blank_node(line, pos)
        return self._parse_iri(line, pos)

    def _parse_object(self, line: str, pos: int) -> Tuple[Union[str, Triple], int]:
        """
        Parse an object (IRI, blank node, literal, or quoted triple).

        Returns:
            The parsed term and the index just past it.

        Raises:
            ValueError: If no valid object starts at pos, including when
                the line ends before the object.
        """
        pos = self._skip_ws(line, pos)
        lead = line[pos:pos + 2]

        if lead == '<<':
            return self._parse_quoted_triple(line, pos)
        # startswith() is safe at end of line, where indexing would raise
        # IndexError; a missing object then fails in _parse_iri instead.
        if line.startswith('"', pos):
            return self._parse_literal(line, pos)
        if lead == '_:':
            return self._parse_blank_node(line, pos)
        return self._parse_iri(line, pos)

    def _parse_iri(self, line: str, pos: int) -> Tuple[str, int]:
        """
        Parse an IRI <...>.

        Returns:
            The IRI without its angle brackets and the index just past ``>``.

        Raises:
            ValueError: If pos is not at ``<`` or the IRI is never closed.
        """
        if pos >= len(line) or line[pos] != '<':
            raise ValueError(f"Expected '<' at position {pos}")

        end = line.find('>', pos + 1)
        if end == -1:
            raise ValueError(f"Unclosed IRI at position {pos}")

        return line[pos + 1:end], end + 1

    def _parse_blank_node(self, line: str, pos: int) -> Tuple[str, int]:
        """
        Parse a blank node _:label.

        Returns:
            The blank node as ``_:label`` and the index just past it.

        Raises:
            ValueError: If no blank node label starts at pos.
        """
        match = self.BLANK_NODE_PATTERN.match(line, pos)
        if not match:
            raise ValueError(f"Invalid blank node at position {pos}")

        return f"_:{match.group(1)}", match.end()

    def _decode_unicode_escape(self, line: str, pos: int, width: int) -> str:
        """
        Decode the hex digits of a ``\\u`` / ``\\U`` escape.

        Args:
            line: Line being parsed.
            pos: Index of the ``u`` or ``U`` that follows the backslash.
            width: Number of hex digits expected (4 or 8).

        Raises:
            ValueError: If the digits are missing, are not hexadecimal, or
                name a value above U+10FFFF.
        """
        digits = line[pos + 1:pos + 1 + width]
        if len(digits) != width or not all(d in self._HEX_DIGITS for d in digits):
            raise ValueError(f"Invalid unicode escape at position {pos - 1}")

        code_point = int(digits, 16)
        if code_point > self._MAX_CODE_POINT:
            raise ValueError(f"Unicode escape out of range at position {pos - 1}")
        return chr(code_point)

    def _parse_literal(self, line: str, pos: int) -> Tuple[str, int]:
        """
        Parse a literal "..." with an optional language tag or datatype.

        Escape sequences in the lexical form are decoded. An escape that
        is not recognised yields the escaped character itself.

        Returns:
            The literal as ``"value"``, ``"value"@lang`` or
            ``"value"^^<datatype>`` and the index just past it.

        Raises:
            ValueError: If pos is not at ``"``, the literal is never
                closed, or a unicode escape is malformed.
        """
        if not line.startswith('"', pos):
            raise ValueError(f"Expected '\"' at position {pos}")

        start = pos
        end = len(line)
        pos += 1
        value = []

        while pos < end:
            c = line[pos]
            if c == '"':
                pos += 1
                break
            if c == '\\':
                pos += 1
                if pos >= end:
                    raise ValueError(
                        f"Unterminated escape sequence at position {pos - 1}"
                    )
                escaped = line[pos]
                if escaped == 'u':
                    # Unicode escape \uXXXX
                    value.append(self._decode_unicode_escape(line, pos, 4))
                    pos += 4
                elif escaped == 'U':
                    # Long unicode escape \UXXXXXXXX
                    value.append(self._decode_unicode_escape(line, pos, 8))
                    pos += 8
                else:
                    value.append(self._SIMPLE_ESCAPES.get(escaped, escaped))
                pos += 1
            else:
                value.append(c)
                pos += 1
        else:
            # The loop ran off the end of the line without a closing quote.
            raise ValueError(f"Unterminated literal starting at position {start}")

        string_value = ''.join(value)

        # Check for language tag
        if pos < end and line[pos] == '@':
            lang_match = self.LANG_TAG_PATTERN.match(line, pos)
            if lang_match:
                lang = lang_match.group(1)
                return f'"{string_value}"@{lang}', lang_match.end()

        # Check for datatype
        if line[pos:pos + 2] == '^^':
            datatype, pos = self._parse_iri(line, pos + 2)
            return f'"{string_value}"^^<{datatype}>', pos

        return f'"{string_value}"', pos

    def _parse_quoted_triple(self, line: str, pos: int) -> Tuple[Triple, int]:
        """
        Parse an RDF-Star quoted triple << s p o >>.

        Returns:
            The quoted triple and the index just past ``>>``.

        Raises:
            ValueError: If ``<<`` or ``>>`` is missing or an inner term is
                invalid.
        """
        if line[pos:pos + 2] != '<<':
            raise ValueError(f"Expected '<<' at position {pos}")
        pos += 2

        # _parse_subject and _parse_object skip leading whitespace themselves.
        subject, pos = self._parse_subject(line, pos)
        pos = self._skip_ws(line, pos)

        predicate, pos = self._parse_iri(line, pos)
        pos = self._skip_ws(line, pos)

        obj, pos = self._parse_object(line, pos)
        pos = self._skip_ws(line, pos)

        # Expect >>
        if line[pos:pos + 2] != '>>':
            raise ValueError(f"Expected '>>' at position {pos}")
        pos += 2

        return self._build_triple(subject, predicate, obj), pos
325
+
326
+
327
class NTriplesSerializer:
    """
    Serializer for N-Triples format.

    Output is one triple per line, fully expanded (no prefixes).

    Literal terms are expected in the form ``"value"``, ``"value"@lang``
    or ``"value"^^<datatype>``. The value between the quotes is assumed to
    be stored unescaped (as NTriplesParser produces it) and is escaped on
    output so that each triple stays on one line.
    """

    # Characters escaped inside a literal's lexical form. str.translate
    # applies this in one pass, so inserted backslashes are not re-escaped.
    _LITERAL_ESCAPES = str.maketrans({
        '\\': '\\\\',
        '"': '\\"',
        '\n': '\\n',
        '\r': '\\r',
        '\t': '\\t',
    })

    def serialize(self, triples: List[Triple]) -> str:
        """
        Serialize triples to N-Triples format.

        Args:
            triples: List of Triple objects

        Returns:
            N-Triples formatted string with one newline-terminated line
            per triple, or the empty string when there are no triples.
        """
        lines = [self._format_triple(triple) for triple in triples]
        return '\n'.join(lines) + '\n' if lines else ''

    def serialize_to_file(self, triples: List[Triple], path: Union[str, Path]):
        """Serialize triples to an N-Triples file, written as UTF-8."""
        content = self.serialize(triples)
        Path(path).write_text(content, encoding="utf-8")

    def _format_triple(self, triple: Triple) -> str:
        """Format a single triple as N-Triples line."""
        s = self._format_subject(triple)
        p = self._format_iri(triple.predicate)
        o = self._format_object(triple)
        return f"{s} {p} {o} ."

    def _format_subject(self, triple: Triple) -> str:
        """Format the subject, preferring a quoted triple when one is set."""
        if triple.subject_triple:
            return self._format_quoted_triple(triple.subject_triple)
        return self._format_term(triple.subject)

    def _format_object(self, triple: Triple) -> str:
        """Format the object, preferring a quoted triple when one is set."""
        if triple.object_triple:
            return self._format_quoted_triple(triple.object_triple)
        return self._format_term(triple.object)

    def _format_quoted_triple(self, triple: Triple) -> str:
        """Format a quoted triple as ``<< s p o >>``."""
        s = self._format_subject(triple)
        p = self._format_iri(triple.predicate)
        o = self._format_object(triple)
        return f"<< {s} {p} {o} >>"

    def _format_term(self, term: str) -> str:
        """Format a term (IRI, blank node, or literal)."""
        if term.startswith('_:'):
            return term
        if term.startswith('"'):
            return self._format_literal(term)
        return self._format_iri(term)

    def _format_iri(self, iri: str) -> str:
        """Format an IRI, adding angle brackets unless already present."""
        if iri.startswith('<') and iri.endswith('>'):
            return iri
        return f"<{iri}>"

    def _format_literal(self, literal: str) -> str:
        """
        Format a literal, escaping special characters.

        Args:
            literal: Either a quoted literal (``"value"`` with an optional
                ``@lang`` or ``^^<datatype>`` suffix) or a bare lexical
                value without surrounding quotes.

        Returns:
            The literal with backslash, double quote, newline, carriage
            return and tab escaped inside the quotes. A quoted literal
            whose closing quote or suffix cannot be identified is returned
            unchanged.
        """
        if not literal.startswith('"'):
            return f'"{literal.translate(self._LITERAL_ESCAPES)}"'

        # Neither a language tag nor a datatype IRI contains a double
        # quote, so the last quote is the one that closes the value.
        closing = literal.rfind('"')
        suffix = literal[closing + 1:]
        if closing == 0 or (suffix and not suffix.startswith(('@', '^^'))):
            return literal

        lexical = literal[1:closing]
        return f'"{lexical.translate(self._LITERAL_ESCAPES)}"{suffix}'
405
+
406
+
407
+ # Convenience functions
408
def parse_ntriples(source: Union[str, Path]) -> ParsedDocument:
    """
    Parse N-Triples content or file.

    Args:
        source: A ``Path`` to an N-Triples file, or a string. A string
            shorter than 500 characters, without a newline, that names an
            existing regular file is read as a file; any other string is
            parsed as N-Triples content.

    Returns:
        ParsedDocument with the parsed triples.
    """
    parser = NTriplesParser()
    if isinstance(source, Path):
        return parser.parse_file(source)

    if isinstance(source, str) and len(source) < 500 and '\n' not in source:
        try:
            # is_file() rather than exists(): "" resolves to the current
            # directory, which exists but cannot be read as a file.
            names_file = Path(source).is_file()
        except (OSError, ValueError):
            # Content that is not a usable file name (for example a
            # component longer than the filesystem allows) is not a path.
            names_file = False
        if names_file:
            return parser.parse_file(source)

    return parser.parse(source)
414
+
415
+
416
def serialize_ntriples(triples: List[Triple]) -> str:
    """Return the given triples as N-Triples text, using a fresh NTriplesSerializer."""
    return NTriplesSerializer().serialize(triples)