flow-toon-format 0.9.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flow_toon_format-0.9.0b2.dist-info/METADATA +200 -0
- flow_toon_format-0.9.0b2.dist-info/RECORD +24 -0
- flow_toon_format-0.9.0b2.dist-info/WHEEL +4 -0
- flow_toon_format-0.9.0b2.dist-info/entry_points.txt +2 -0
- flow_toon_format-0.9.0b2.dist-info/licenses/LICENSE +24 -0
- toon_format/__init__.py +40 -0
- toon_format/__main__.py +13 -0
- toon_format/_literal_utils.py +70 -0
- toon_format/_parsing_utils.py +167 -0
- toon_format/_scanner.py +289 -0
- toon_format/_string_utils.py +169 -0
- toon_format/_validation.py +150 -0
- toon_format/cli.py +217 -0
- toon_format/constants.py +84 -0
- toon_format/decoder.py +788 -0
- toon_format/encoder.py +56 -0
- toon_format/encoders.py +456 -0
- toon_format/logging_config.py +92 -0
- toon_format/normalize.py +237 -0
- toon_format/primitives.py +171 -0
- toon_format/py.typed +0 -0
- toon_format/types.py +64 -0
- toon_format/utils.py +187 -0
- toon_format/writer.py +53 -0
toon_format/decoder.py
ADDED
|
@@ -0,0 +1,788 @@
|
|
|
1
|
+
# Copyright (c) 2025 TOON Format Organization
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
"""TOON decoder implementation following v1.3 spec.
|
|
4
|
+
|
|
5
|
+
This module provides the main `decode()` function and ToonDecodeError exception
|
|
6
|
+
for converting TOON format strings back to Python values. Supports strict and
|
|
7
|
+
lenient parsing modes, handles all TOON syntax forms (objects, arrays, primitives),
|
|
8
|
+
and validates array lengths and delimiters.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
12
|
+
|
|
13
|
+
from ._literal_utils import is_boolean_or_null_literal, is_numeric_literal
|
|
14
|
+
from ._parsing_utils import (
|
|
15
|
+
find_first_unquoted,
|
|
16
|
+
find_unquoted_char,
|
|
17
|
+
parse_delimited_values,
|
|
18
|
+
)
|
|
19
|
+
from ._scanner import ParsedLine, to_parsed_lines
|
|
20
|
+
from ._string_utils import unescape_string as _unescape_string
|
|
21
|
+
from .constants import (
|
|
22
|
+
CLOSE_BRACE,
|
|
23
|
+
CLOSE_BRACKET,
|
|
24
|
+
COLON,
|
|
25
|
+
COMMA,
|
|
26
|
+
DOUBLE_QUOTE,
|
|
27
|
+
FALSE_LITERAL,
|
|
28
|
+
LIST_ITEM_MARKER,
|
|
29
|
+
OPEN_BRACE,
|
|
30
|
+
OPEN_BRACKET,
|
|
31
|
+
PIPE,
|
|
32
|
+
TAB,
|
|
33
|
+
TRUE_LITERAL,
|
|
34
|
+
)
|
|
35
|
+
from .types import DecodeOptions, JsonValue
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ToonDecodeError(Exception):
    """Error raised by the decoder when TOON input cannot be decoded."""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def unescape_string(value: str) -> str:
    """Resolve the escape sequences in the body of a quoted string.

    Delegates to the string utility imported as ``_unescape_string`` and
    converts its ``ValueError`` into the decoder's own exception type.

    Args:
        value: Escaped text, with the surrounding quotes already removed

    Returns:
        The text with escape sequences resolved

    Raises:
        ToonDecodeError: If the utility rejects the input with ValueError
    """
    try:
        decoded = _unescape_string(value)
    except ValueError as exc:
        # Keep the original error chained so the root cause stays visible.
        raise ToonDecodeError(str(exc)) from exc
    return decoded
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def parse_primitive(token: str) -> JsonValue:
    """Convert a single token into a Python primitive.

    The token is stripped and then classified, in order, as a quoted string,
    a boolean/null literal, a number, or (fallback) a bare string.

    Args:
        token: Raw token text

    Returns:
        str, bool, None, int or float depending on the token's form

    Raises:
        ToonDecodeError: If a quoted string lacks its closing quote or
            contains an invalid escape sequence
    """
    text = token.strip()

    if text.startswith(DOUBLE_QUOTE):
        # A lone quote character both starts and ends with a quote, so the
        # length guard is needed to reject it.
        properly_closed = len(text) >= 2 and text.endswith(DOUBLE_QUOTE)
        if not properly_closed:
            raise ToonDecodeError("Unterminated string: missing closing quote")
        return unescape_string(text[1:-1])

    if is_boolean_or_null_literal(text):
        if text == TRUE_LITERAL:
            return True
        # Anything that is neither true nor false here is the null literal.
        return False if text == FALSE_LITERAL else None

    if text and is_numeric_literal(text):
        # No decimal point and no exponent means the token is integer-shaped.
        integer_shaped = "." not in text and "e" not in text.lower()
        try:
            return int(text) if integer_shaped else float(text)
        except ValueError:
            # Conversion failed: fall through and keep the token as a string.
            pass

    # Unquoted string (this includes octal-like tokens such as "0123").
    return text
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def parse_header(
    line: str,
) -> Optional[Tuple[Optional[str], int, str, Optional[List[str]]]]:
    """Parse an array header of the form ``key[#?N<delim?>]{fields}:``.

    The key, the ``#`` marker, the delimiter suffix and the ``{fields}``
    segment are all optional. The delimiter suffix inside the brackets (tab,
    pipe or comma) selects the active delimiter; comma is the default.

    Args:
        line: Line content

    Returns:
        Tuple of (key, length, delimiter, fields), or None if the line is not
        an array header. ``key`` is None for a keyless header and ``fields``
        is None when there is no fields segment.

    Raises:
        ToonDecodeError: If a quoted key is malformed or the fields segment
            is not terminated
    """
    line = line.strip()

    # Find the bracket segment (respecting quoted strings)
    bracket_start = find_unquoted_char(line, OPEN_BRACKET)
    if bracket_start == -1:
        return None

    # Extract key (if any)
    key = None
    if bracket_start > 0:
        key_part = line[:bracket_start].strip()
        key = parse_key(key_part) if key_part else None

    # Find closing bracket
    bracket_end = find_unquoted_char(line, CLOSE_BRACKET, bracket_start)
    if bracket_end == -1:
        return None

    # Parse bracket content: [#?N<delim?>]
    bracket_content = line[bracket_start + 1 : bracket_end]

    # Remove optional # marker
    if bracket_content.startswith("#"):
        bracket_content = bracket_content[1:]

    # Determine delimiter from the bracket suffix; comma is the default and
    # may also be given explicitly (as used for tabular arrays).
    delimiter = COMMA
    length_str = bracket_content
    for candidate in (TAB, PIPE, COMMA):
        if bracket_content.endswith(candidate):
            delimiter = candidate
            length_str = bracket_content[:-1]
            break

    # Parse length. int() alone is too permissive: it accepts signs,
    # underscores and non-ASCII digits, so "[-1]" or "[1_0]" would be taken
    # as headers. Only plain ASCII digits are a valid length; surrounding
    # whitespace is still tolerated, as int() tolerated it before.
    length_str = length_str.strip()
    if not length_str or any(ch not in "0123456789" for ch in length_str):
        return None
    length = int(length_str)

    # Check for fields segment
    fields = None
    after_bracket = line[bracket_end + 1 :].strip()

    if after_bracket.startswith(OPEN_BRACE):
        brace_end = find_unquoted_char(after_bracket, CLOSE_BRACE)
        if brace_end == -1:
            raise ToonDecodeError("Unterminated fields segment")

        fields_content = after_bracket[1:brace_end]
        # Field names are separated by the same delimiter as the rows
        field_tokens = parse_delimited_values(fields_content, delimiter)
        fields = [parse_key(f.strip()) for f in field_tokens]

        after_bracket = after_bracket[brace_end + 1 :].strip()

    # A colon must follow the bracket/fields segment; anything after the
    # colon (inline values) is left for the caller to handle.
    if not after_bracket.startswith(COLON):
        return None

    return (key, length, delimiter, fields)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def parse_key(key_str: str) -> str:
    """Return the decoded form of an object key.

    Unquoted keys are returned stripped; quoted keys have their quotes
    removed and their escape sequences resolved.

    Args:
        key_str: Key text as it appears in the source

    Returns:
        The key as a plain string

    Raises:
        ToonDecodeError: If a quoted key has no closing quote or contains
            an invalid escape sequence
    """
    candidate = key_str.strip()

    if not candidate.startswith(DOUBLE_QUOTE):
        return candidate

    # A single quote character on its own counts as unterminated.
    if len(candidate) < 2 or not candidate.endswith(DOUBLE_QUOTE):
        raise ToonDecodeError("Unterminated quoted key")

    return unescape_string(candidate[1:-1])
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def split_key_value(line: str) -> Tuple[str, str]:
    """Separate a line into its key and value parts.

    The split happens at the first colon that is not inside a quoted string;
    both parts are returned with surrounding whitespace removed.

    Args:
        line: Line content

    Returns:
        Tuple of (key, value)

    Raises:
        ToonDecodeError: If the line contains no unquoted colon
    """
    split_at = find_unquoted_char(line, COLON)
    if split_at == -1:
        raise ToonDecodeError("Missing colon after key")

    head, tail = line[:split_at], line[split_at + 1 :]
    return (head.strip(), tail.strip())
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def decode(input_str: str, options: Optional[DecodeOptions] = None) -> JsonValue:
    """Decode a TOON-formatted string to a Python value.

    The root form is chosen from the first non-blank line: a keyless array
    header yields a list, a single line without a key-value colon yields a
    primitive, and everything else is decoded as an object. Input with no
    non-blank lines decodes to an empty object.

    Args:
        input_str: TOON-formatted string
        options: Optional decoding options (defaults to ``DecodeOptions()``)

    Returns:
        Decoded Python value

    Raises:
        ToonDecodeError: If input is malformed
    """
    if options is None:
        options = DecodeOptions()

    indent_size = options.indent
    strict = options.strict

    # Parse lines using scanner module; the blank-line info is not used here.
    try:
        parsed_lines, _blank_lines_info = to_parsed_lines(input_str, indent_size, strict)
    except SyntaxError as e:
        # Convert scanner's SyntaxError to ToonDecodeError
        raise ToonDecodeError(str(e)) from e

    # ParsedLine.content keeps whitespace after indent removal, but the
    # decoder functions expect fully stripped content.
    lines: List[ParsedLine] = [
        ParsedLine(
            raw=line.raw,
            depth=line.depth,
            indent=line.indent,
            content=line.content.strip(),
            line_num=line.line_num,
        )
        for line in parsed_lines
    ]

    # Empty or whitespace-only input decodes to an empty object.
    non_blank_indices = [idx for idx, ln in enumerate(lines) if not ln.is_blank]
    if not non_blank_indices:
        return {}

    # Determine root form from the first non-blank line.
    first_idx = non_blank_indices[0]
    first_line = lines[first_idx]

    header_info = parse_header(first_line.content)
    if header_info is not None and header_info[0] is None:  # No key = root array
        # Pass the header's real index: blank lines may precede it in
        # `lines`, in which case index 0 is not the header line.
        return decode_array(lines, first_idx, 0, header_info, strict)

    # A lone line that is neither key-value nor a header is a root primitive.
    if len(non_blank_indices) == 1:
        line_content = first_line.content
        try:
            split_key_value(line_content)
        except ToonDecodeError:
            if header_info is None:
                return parse_primitive(line_content)

    # Otherwise, root object
    return decode_object(lines, 0, 0, strict)
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def decode_object(
    lines: List[ParsedLine], start_idx: int, parent_depth: int, strict: bool
) -> Dict[str, Any]:
    """Build a dict from the key-value lines of one object.

    Scanning starts at ``start_idx`` and stops at the first non-blank line
    that is shallower than the object's own depth. Deeper lines are skipped
    because the nested decoder calls consume them.

    Args:
        lines: All parsed lines of the document
        start_idx: Index of the first line to examine
        parent_depth: Depth of the line that owns this object
        strict: When True, a line without a colon raises instead of being skipped

    Returns:
        The decoded object

    Raises:
        ToonDecodeError: In strict mode, if a line at object depth has no colon
    """
    decoded: Dict[str, Any] = {}
    # A scan starting at index 0 is treated as the root object, whose fields
    # sit at the parent depth itself rather than one level below it.
    own_depth = parent_depth + 1 if start_idx != 0 else parent_depth
    total = len(lines)
    idx = start_idx

    while idx < total:
        current = lines[idx]

        # Blank lines are allowed between object fields.
        if current.is_blank:
            idx += 1
            continue

        # Dedent: this object is finished.
        if current.depth < own_depth:
            break

        # Deeper lines belong to a nested structure handled elsewhere.
        if current.depth > own_depth:
            idx += 1
            continue

        text = current.content

        # Keyed array header: the array decoder reports where to resume.
        header = parse_header(text)
        if header is not None and header[0] is not None:
            array_value, idx = decode_array_from_header(
                lines, idx, current.depth, header, strict
            )
            decoded[header[0]] = array_value
            continue

        try:
            raw_key, raw_value = split_key_value(text)
        except ToonDecodeError:
            # Lenient mode drops lines that are not key-value pairs.
            if strict:
                raise
            idx += 1
            continue

        name = parse_key(raw_key)

        if raw_value:
            decoded[name] = parse_primitive(raw_value)
            idx += 1
            continue

        # Empty value: the indented lines that follow form a nested object.
        decoded[name] = decode_object(lines, idx + 1, current.depth, strict)
        idx += 1
        while idx < total and lines[idx].depth > current.depth:
            idx += 1

    return decoded
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def decode_array_from_header(
    lines: List[ParsedLine],
    header_idx: int,
    header_depth: int,
    header_info: Tuple[Optional[str], int, str, Optional[List[str]]],
    strict: bool,
) -> Tuple[List[Any], int]:
    """Decode the array introduced by an already-parsed header line.

    Dispatches to the inline, tabular or list decoder depending on whether
    the header line carries inline values and whether it declares fields.

    Args:
        lines: All parsed lines of the document
        header_idx: Index of the header line in ``lines``
        header_depth: Depth of the header line
        header_info: (key, length, delimiter, fields) as returned by parse_header
        strict: Strict mode flag, forwarded to the chosen decoder

    Returns:
        Tuple of (decoded array, index of the next line to process)
    """
    _, length, delimiter, fields = header_info
    body_idx = header_idx + 1

    # Whatever follows the first unquoted colon on the header line is the
    # inline payload. A header without a colon is treated as having none.
    try:
        inline_content = split_key_value(lines[header_idx].content)[1]
    except ToonDecodeError:
        inline_content = ""

    # An empty array without fields (e.g. "[0]:") is also handled inline.
    empty_plain_array = not fields and length == 0
    if inline_content or empty_plain_array:
        values = decode_inline_array(inline_content, delimiter, length, strict)
        return (values, body_idx)

    if fields is None:
        # No fields segment: list format (mixed/non-uniform items).
        return decode_list_array(lines, body_idx, header_depth, delimiter, length, strict)

    # Fields declared: rows follow in tabular format.
    return decode_tabular_array(
        lines, body_idx, header_depth, fields, delimiter, length, strict
    )
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
def decode_array(
    lines: List[ParsedLine],
    start_idx: int,
    parent_depth: int,
    header_info: Tuple[Optional[str], int, str, Optional[List[str]]],
    strict: bool,
) -> List[Any]:
    """Decode an array and return only its values.

    Convenience wrapper around decode_array_from_header that discards the
    next-line index.

    Args:
        lines: All parsed lines of the document
        start_idx: Index of the header line
        parent_depth: Depth passed through as the header depth
        header_info: Parsed header tuple
        strict: Strict mode flag

    Returns:
        Decoded array
    """
    decoded, _next_idx = decode_array_from_header(
        lines, start_idx, parent_depth, header_info, strict
    )
    return decoded
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def decode_inline_array(
    content: str, delimiter: str, expected_length: int, strict: bool
) -> List[Any]:
    """Decode the values written on the same line as an array header.

    Args:
        content: Text following the header's colon
        delimiter: Delimiter separating the values
        expected_length: Length declared in the header
        strict: When True, a count mismatch raises

    Returns:
        List of decoded primitive values

    Raises:
        ToonDecodeError: In strict mode, if the number of values differs from
            ``expected_length``
    """
    # A declared-empty array with no payload needs no parsing.
    if expected_length == 0 and not content:
        return []

    values: List[Any] = []
    for token in parse_delimited_values(content, delimiter):
        values.append(parse_primitive(token))

    if strict:
        actual = len(values)
        if actual != expected_length:
            raise ToonDecodeError(f"Expected {expected_length} values, but got {actual}")

    return values
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def decode_tabular_array(
    lines: List[ParsedLine],
    start_idx: int,
    header_depth: int,
    fields: List[str],
    delimiter: str,
    expected_length: int,
    strict: bool,
) -> Tuple[List[Dict[str, Any]], int]:
    """Decode the rows of a tabular array into a list of dicts.

    Rows are read one level below the header until a line at a different
    depth, or a line that does not look like a row, is reached.

    Args:
        lines: All parsed lines of the document
        start_idx: Index of the first line after the header
        header_depth: Depth of the header line
        fields: Column names from the header
        delimiter: Delimiter separating the cells
        expected_length: Number of rows declared in the header
        strict: Strict mode flag

    Returns:
        Tuple of (decoded rows, index of the next line to process)

    Raises:
        ToonDecodeError: In strict mode, on a blank line at row depth or
            deeper, a row whose width differs from the field count, or a
            row count that differs from ``expected_length``
    """
    rows: List[Dict[str, Any]] = []
    row_depth = header_depth + 1
    field_count = len(fields)
    idx = start_idx

    while idx < len(lines):
        current = lines[idx]

        if current.is_blank:
            # Lenient mode ignores every blank line.
            if not strict:
                idx += 1
                continue
            # Strict mode: a blank line at row depth or deeper is an error;
            # a shallower one marks the end of the array.
            if current.depth >= row_depth:
                raise ToonDecodeError("Blank lines not allowed inside arrays")
            break

        # Rows live at exactly one depth; anything shallower or deeper ends them.
        if current.depth != row_depth:
            break

        text = current.content

        # A key-value line at row depth also terminates the table.
        if not is_row_line(text, delimiter):
            break

        cells = [parse_primitive(tok) for tok in parse_delimited_values(text, delimiter)]

        if strict and len(cells) != field_count:
            raise ToonDecodeError(
                f"Expected {field_count} values in row, but got {len(cells)}"
            )

        # zip() stops at the shorter sequence, so lenient mode tolerates
        # rows that are too short or too long.
        rows.append(dict(zip(fields, cells)))
        idx += 1

    if strict and len(rows) != expected_length:
        raise ToonDecodeError(f"Expected {expected_length} rows, but got {len(rows)}")

    return rows, idx
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
def is_row_line(line: str, delimiter: str) -> bool:
    """Tell a tabular row apart from a key-value line.

    The decision rests on which unquoted character comes first, the active
    delimiter or a colon. Finding neither also counts as a row.

    Args:
        line: Line content
        delimiter: Active delimiter

    Returns:
        True if the line should be read as a row, False if a colon comes first
    """
    # One scan for both characters.
    position, found = find_first_unquoted(line, [delimiter, COLON])
    return position == -1 or found == delimiter
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
def _decode_list_item_fields(
    lines: List[ParsedLine],
    start_idx: int,
    field_depth: int,
    target: Dict[str, Any],
    strict: bool,
) -> int:
    """Read the follow-up fields of an object list item into ``target``.

    Consumes consecutive lines at exactly ``field_depth`` and stops at the
    first line at another depth or the first non-blank line that is neither
    a keyed array header nor a key-value pair.

    Args:
        lines: All parsed lines of the document
        start_idx: Index of the first candidate field line
        field_depth: Depth at which the item's remaining fields sit
        target: Dict of the list item being built (mutated in place)
        strict: Strict mode flag, forwarded to nested decoders

    Returns:
        Index of the first line that was not consumed

    Raises:
        ToonDecodeError: If a field's key or value is malformed
    """
    i = start_idx
    while i < len(lines) and lines[i].depth == field_depth:
        field_line = lines[i]
        if field_line.is_blank:
            i += 1
            continue

        field_content = field_line.content

        # Keyed array header: the array decoder reports where to resume.
        field_header = parse_header(field_content)
        if field_header is not None and field_header[0] is not None:
            field_val, i = decode_array_from_header(
                lines, i, field_line.depth, field_header, strict
            )
            target[field_header[0]] = field_val
            continue

        # Only the colon lookup decides whether this is still a field line.
        # Errors raised while decoding the key or value are real errors and
        # must propagate instead of silently ending the item.
        try:
            field_key_str, field_value_str = split_key_value(field_content)
        except ToonDecodeError:
            break

        field_key = parse_key(field_key_str)
        if not field_value_str:
            # Nested object: its lines are deeper than the field line.
            target[field_key] = decode_object(lines, i + 1, field_line.depth, strict)
            i += 1
            while i < len(lines) and lines[i].depth > field_line.depth:
                i += 1
        else:
            target[field_key] = parse_primitive(field_value_str)
            i += 1

    return i


def decode_list_array(
    lines: List[ParsedLine],
    start_idx: int,
    header_depth: int,
    delimiter: str,
    expected_length: int,
    strict: bool,
) -> Tuple[List[Any], int]:
    """Decode a list-format array (mixed/non-uniform).

    Each item is a line one level below the header that starts with the list
    item marker. An item can be a nested array (keyless header), an object
    (whose first field shares the marker line) or a primitive.

    Args:
        lines: All parsed lines of the document
        start_idx: Index of the first line after the header
        header_depth: Depth of the header line
        delimiter: Active delimiter of the enclosing array (unused here; a
            nested header carries its own delimiter)
        expected_length: Number of items declared in the header
        strict: Strict mode flag

    Returns:
        Tuple of (decoded array, index of the next line to process)

    Raises:
        ToonDecodeError: In strict mode, on a blank line inside the array or
            an item count mismatch. In any mode, if an item's key or value
            is malformed; such errors are no longer swallowed by re-reading
            the item as a primitive.
    """
    result: List[Any] = []
    i = start_idx
    item_depth = header_depth + 1

    while i < len(lines):
        line = lines[i]

        if line.is_blank:
            # Lenient mode ignores every blank line.
            if not strict:
                i += 1
                continue
            # Strict mode: a blank line at item depth or deeper is an error;
            # a shallower one marks the end of the array.
            if line.depth >= item_depth:
                raise ToonDecodeError("Blank lines not allowed inside arrays")
            break

        # Stop if dedented
        if line.depth < item_depth:
            break

        # A line without the list item marker ends the array.
        content = line.content
        if not content.startswith(LIST_ITEM_MARKER):
            break

        item_content = content[len(LIST_ITEM_MARKER) :].strip()

        item_header = parse_header(item_content)
        if item_header is not None:
            key = item_header[0]

            if key is None:
                # "- [N]..." is a nested array. Delegating covers inline,
                # tabular and list forms and finds the header colon with the
                # quote-aware splitter rather than a plain str.find.
                item_val, i = decode_array_from_header(
                    lines, i, line.depth, item_header, strict
                )
                result.append(item_val)
                continue

            # "- key[N]...": an object whose first field is an array.
            item_obj: Dict[str, Any] = {}
            array_val, next_i = decode_array_from_header(
                lines, i, line.depth, item_header, strict
            )
            item_obj[key] = array_val
            i = _decode_list_item_fields(lines, next_i, line.depth + 1, item_obj, strict)
            result.append(item_obj)
            continue

        # The presence of an unquoted colon separates objects from primitives.
        try:
            key_str, value_str = split_key_value(item_content)
        except ToonDecodeError:
            # No colon: a primitive, or an empty object when nothing follows
            # the marker.
            result.append(parse_primitive(item_content) if item_content else {})
            i += 1
            continue

        obj_item: Dict[str, Any] = {}
        first_key = parse_key(key_str)
        if not value_str:
            # First field is a nested object: its fields sit at depth +2.
            obj_item[first_key] = decode_object(lines, i + 1, line.depth + 1, strict)
            i += 1
            while i < len(lines) and lines[i].depth > line.depth + 1:
                i += 1
        else:
            obj_item[first_key] = parse_primitive(value_str)
            i += 1

        # Remaining fields of the same item sit at depth +1.
        i = _decode_list_item_fields(lines, i, line.depth + 1, obj_item, strict)
        result.append(obj_item)

    if strict and len(result) != expected_length:
        raise ToonDecodeError(f"Expected {expected_length} items, but got {len(result)}")

    return result, i
|