flow-toon-format 0.9.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,289 @@
1
+ # Copyright (c) 2025 TOON Format Organization
2
+ # SPDX-License-Identifier: MIT
3
+ """Scanner for parsing TOON input into lines with depth information.
4
+
5
+ This module implements the first stage of the TOON decoding pipeline:
6
+ scanning the input text and converting it into structured line objects
7
+ with depth and indentation metadata. Handles strict and lenient parsing modes.
8
+ """
9
+
10
+ from dataclasses import dataclass
11
+ from typing import List, Optional, Tuple
12
+
13
+ from .constants import SPACE, TAB
14
+
15
+
16
@dataclass
class ParsedLine:
    """One scanned source line together with its indentation metadata.

    Attributes:
        raw: The line exactly as it appeared in the source
        depth: The indentation depth (number of indent levels)
        indent: The count of leading space characters
        content: The text left over once the leading spaces are removed
        line_num: The 1-based position of the line in the source
    """

    raw: str
    depth: int
    indent: int
    content: str
    line_num: int

    @property
    def is_blank(self) -> bool:
        """Report whether the line holds nothing but whitespace.

        Returns:
            True when ``content`` is empty after stripping whitespace
        """
        stripped = self.content.strip()
        return len(stripped) == 0
42
+
43
+
44
@dataclass
class BlankLineInfo:
    """Position and indentation record for a whitespace-only line.

    Attributes:
        line_num: The 1-based position of the blank line in the source
        indent: The count of leading space characters on that line
        depth: The indentation depth computed from ``indent``
    """

    line_num: int
    indent: int
    depth: int
57
+
58
+
59
class LineCursor:
    """Forward-only cursor over a list of parsed lines.

    The cursor keeps an index into the list it was given. It offers
    non-consuming lookups (``peek``, ``peek_at_depth``), consuming reads
    (``next``, ``advance``) and a helper for skipping nested content. The
    index never moves past the end of the list, so ``current`` stays valid
    even if ``advance`` is called again after the input is exhausted.
    """

    def __init__(
        self,
        lines: List["ParsedLine"],
        blank_lines: Optional[List["BlankLineInfo"]] = None,
    ) -> None:
        """Initialize a line cursor positioned at the first line.

        Args:
            lines: The parsed lines to traverse
            blank_lines: Optional list of blank line information; a missing
                or empty value is stored as an empty list
        """
        self._lines = lines
        self._index = 0
        self._blank_lines = blank_lines or []

    def get_blank_lines(self) -> List["BlankLineInfo"]:
        """Get the list of blank lines supplied at construction."""
        return self._blank_lines

    def peek(self) -> Optional["ParsedLine"]:
        """Peek at the current line without advancing.

        Returns:
            The current line, or None if at end
        """
        if self.at_end():
            return None
        return self._lines[self._index]

    def next(self) -> Optional["ParsedLine"]:
        """Get the current line and advance past it.

        Returns:
            The current line, or None if at end
        """
        line = self.peek()
        if line is not None:
            self._index += 1
        return line

    def current(self) -> Optional["ParsedLine"]:
        """Get the most recently consumed line.

        Returns:
            The previous line, or None if no line has been consumed
        """
        if self._index > 0:
            return self._lines[self._index - 1]
        return None

    def advance(self) -> None:
        """Advance to the next line.

        Calling this at the end of input is a no-op. Letting the index run
        past ``len(lines)`` would make ``current`` index out of range.
        """
        if self._index < len(self._lines):
            self._index += 1

    def at_end(self) -> bool:
        """Check if cursor is at the end of lines.

        Returns:
            True if at end
        """
        return self._index >= len(self._lines)

    @property
    def length(self) -> int:
        """Get the total number of lines."""
        return len(self._lines)

    def peek_at_depth(self, target_depth: int) -> Optional["ParsedLine"]:
        """Peek at the current line if it sits exactly at a given depth.

        Args:
            target_depth: The target depth

        Returns:
            The line if its depth equals ``target_depth``; None when the
            cursor is at end or the line is shallower or deeper
        """
        line = self.peek()
        if line and line.depth == target_depth:
            return line
        return None

    def has_more_at_depth(self, target_depth: int) -> bool:
        """Check if the current line is at a specific depth.

        Args:
            target_depth: The target depth

        Returns:
            True if ``peek_at_depth(target_depth)`` yields a line
        """
        return self.peek_at_depth(target_depth) is not None

    def skip_deeper_than(self, depth: int) -> None:
        """Skip all consecutive lines that are deeper than the given depth.

        This is useful for skipping over nested structures after processing
        them. Skipping stops at the first line whose depth is not greater
        than ``depth``, or at the end of input.

        Args:
            depth: The reference depth. Lines with depth > this are skipped.

        Example:
            >>> cursor.skip_deeper_than(1)  # Skip lines at depth 2, 3, 4, etc.
        """
        line = self.peek()
        while line and line.depth > depth:
            self.advance()
            line = self.peek()
177
+
178
+
179
def to_parsed_lines(
    source: str,
    indent_size: int,
    strict: bool,
) -> Tuple[List[ParsedLine], List[BlankLineInfo]]:
    """Split source text into parsed lines carrying depth information.

    Per Section 12 of the TOON specification for indentation handling.
    This is the entry point for the scanning stage of the decoder pipeline.

    The text is split on ``"\\n"``. Every resulting line, blank or not, is
    returned in the parsed list; blank lines are additionally recorded in
    the second list. Only leading space characters count towards ``indent``.

    Args:
        source: The source string to parse
        indent_size: The number of spaces per indentation level
        strict: Whether to enforce strict indentation validation

    Returns:
        A tuple of (parsed_lines, blank_lines); both are empty when the
        source contains only whitespace

    Raises:
        SyntaxError: In strict mode, when a non-blank line has a tab in its
            leading whitespace or an indent that is not a multiple of
            ``indent_size``

    Examples:
        >>> lines, blanks = to_parsed_lines("name: Alice\\n  age: 30", 2, True)
        >>> lines[0].content
        'name: Alice'
        >>> lines[1].depth
        1
    """
    if not source.strip():
        return [], []

    parsed_lines: List[ParsedLine] = []
    blanks: List[BlankLineInfo] = []

    for line_num, raw in enumerate(source.split("\n"), start=1):
        # Indent counts leading spaces only; a tab ends the run.
        indent = 0
        for ch in raw:
            if ch != SPACE:
                break
            indent += 1

        content = raw[indent:]

        # Depth is computed for blank and non-blank lines alike.
        depth = _compute_depth_from_indent(indent, indent_size)

        if not content.strip():
            # Blank lines are recorded but never validated for indentation.
            blanks.append(
                BlankLineInfo(line_num=line_num, indent=indent, depth=depth)
            )
        elif strict:
            # Measure the whole leading whitespace run (spaces and tabs).
            ws_len = 0
            for ch in raw:
                if ch != SPACE and ch != TAB:
                    break
                ws_len += 1

            # Tabs anywhere before the first content character are rejected.
            if TAB in raw[:ws_len]:
                raise SyntaxError(
                    f"Line {line_num}: Tabs not allowed in indentation in strict mode"
                )

            # The indent must land exactly on a level boundary.
            if indent > 0 and indent % indent_size != 0:
                raise SyntaxError(
                    f"Line {line_num}: Indent must be exact multiple of {indent_size}, "
                    f"but found {indent} spaces"
                )

        # Blank lines stay in the parsed list too (array blank-line detection).
        parsed_lines.append(
            ParsedLine(
                raw=raw,
                indent=indent,
                content=content,
                depth=depth,
                line_num=line_num,
            )
        )

    return parsed_lines, blanks
269
+
270
+
271
+ def _compute_depth_from_indent(indent_spaces: int, indent_size: int) -> int:
272
+ """Compute depth from indentation spaces.
273
+
274
+ Args:
275
+ indent_spaces: Number of leading spaces
276
+ indent_size: Number of spaces per indentation level
277
+
278
+ Returns:
279
+ The computed depth
280
+
281
+ Examples:
282
+ >>> _compute_depth_from_indent(0, 2)
283
+ 0
284
+ >>> _compute_depth_from_indent(4, 2)
285
+ 2
286
+ >>> _compute_depth_from_indent(3, 2) # Lenient mode
287
+ 1
288
+ """
289
+ return indent_spaces // indent_size
@@ -0,0 +1,169 @@
1
+ # Copyright (c) 2025 TOON Format Organization
2
+ # SPDX-License-Identifier: MIT
3
+ """String utilities for TOON encoding and decoding.
4
+
5
+ This module provides shared string processing functions used by both
6
+ the encoder and decoder, following the TOON specification Section 7.1
7
+ for escape sequences and quoted string handling.
8
+ """
9
+
10
+ from .constants import (
11
+ BACKSLASH,
12
+ CARRIAGE_RETURN,
13
+ DOUBLE_QUOTE,
14
+ NEWLINE,
15
+ TAB,
16
+ )
17
+
18
+
19
def escape_string(value: str) -> str:
    """Escape special characters in a string for encoding.

    Replaces backslashes, double quotes, newlines, carriage returns and
    tabs with their backslash escape sequences.
    Per Section 7.1 of the TOON specification.

    Args:
        value: The string to escape

    Returns:
        The escaped string

    Examples:
        >>> escape_string('hello\\nworld')
        'hello\\\\nworld'
        >>> escape_string('say "hello"')
        'say \\\\"hello\\\\"'
    """
    # Order matters: backslashes go first so that the backslashes added by
    # the later replacements are not escaped a second time.
    substitutions = (
        (BACKSLASH, BACKSLASH + BACKSLASH),
        (DOUBLE_QUOTE, BACKSLASH + DOUBLE_QUOTE),
        (NEWLINE, BACKSLASH + "n"),
        (CARRIAGE_RETURN, BACKSLASH + "r"),
        (TAB, BACKSLASH + "t"),
    )
    escaped = value
    for plain, replacement in substitutions:
        escaped = escaped.replace(plain, replacement)
    return escaped
44
+
45
+
46
def unescape_string(value: str) -> str:
    """Unescape a string by processing escape sequences.

    Handles `\\n`, `\\t`, `\\r`, `\\\\`, and `\\"` escape sequences.
    Per Section 7.1 of the TOON specification.

    Args:
        value: The string to unescape (without surrounding quotes)

    Returns:
        The unescaped string

    Raises:
        ValueError: If a backslash ends the string or is followed by a
            character that is not one of the supported escapes

    Examples:
        >>> unescape_string('hello\\\\nworld')
        'hello\\nworld'
        >>> unescape_string('say \\\\"hello\\\\"')
        'say "hello"'
    """
    # Character following the backslash -> character it stands for.
    escapes = {
        "n": NEWLINE,
        "t": TAB,
        "r": CARRIAGE_RETURN,
        BACKSLASH: BACKSLASH,
        DOUBLE_QUOTE: DOUBLE_QUOTE,
    }

    # Collect pieces and join once instead of growing a string with +=.
    pieces = []
    length = len(value)
    i = 0

    while i < length:
        ch = value[i]
        if ch != BACKSLASH:
            pieces.append(ch)
            i += 1
            continue

        if i + 1 >= length:
            raise ValueError("Invalid escape sequence: backslash at end of string")

        next_char = value[i + 1]
        if next_char not in escapes:
            raise ValueError(f"Invalid escape sequence: \\{next_char}")

        pieces.append(escapes[next_char])
        i += 2  # consume the backslash and the escaped character

    return "".join(pieces)
103
+
104
+
105
def find_closing_quote(content: str, start: int) -> int:
    """Locate the double quote that closes the one at ``start``.

    The scan begins at ``start + 1``. A backslash that has a character
    after it causes both characters to be skipped, so an escaped quote is
    never taken as the closing one.

    Args:
        content: The string to search in
        start: The index of the opening quote

    Returns:
        The index of the closing quote, or -1 if not found

    Examples:
        >>> find_closing_quote('"hello"', 0)
        6
        >>> find_closing_quote('say "hi" now', 4)
        7
    """
    end = len(content)
    pos = start + 1
    while pos < end:
        ch = content[pos]
        if ch == BACKSLASH and pos + 1 < end:
            # Escape pair: step over the backslash and the escaped character.
            pos += 2
        elif ch == DOUBLE_QUOTE:
            return pos
        else:
            pos += 1
    return -1  # No closing quote
131
+
132
+
133
def find_unquoted_char(content: str, char: str, start: int = 0) -> int:
    """Find the first occurrence of a character outside quoted sections.

    Each double quote toggles the quoted state. While inside quotes, a
    backslash that has a character after it causes both to be skipped, so
    an escaped quote does not end the quoted section.

    Args:
        content: The string to search in
        char: The character to look for
        start: Optional starting index (defaults to 0)

    Returns:
        The index of the character, or -1 if not found outside quotes

    Examples:
        >>> find_unquoted_char('key: "value: nested"', ':', 0)
        3
        >>> find_unquoted_char('"key: nested": value', ':', 0)
        13
    """
    inside_quotes = False
    end = len(content)
    pos = start

    while pos < end:
        ch = content[pos]

        if inside_quotes and ch == BACKSLASH and pos + 1 < end:
            # Escape pair inside a quoted section: skip both characters.
            pos += 2
            continue

        if ch == DOUBLE_QUOTE:
            inside_quotes = not inside_quotes
        elif not inside_quotes and ch == char:
            return pos

        pos += 1

    return -1
@@ -0,0 +1,150 @@
1
+ # Copyright (c) 2025 TOON Format Organization
2
+ # SPDX-License-Identifier: MIT
3
+ """Validation utilities for TOON encoding.
4
+
5
+ This module provides validation functions to determine whether strings,
6
+ keys, and values can be safely encoded without quotes or need quoting
7
+ according to TOON specification rules.
8
+ """
9
+
10
+ import re
11
+
12
+ from ._literal_utils import is_boolean_or_null_literal
13
+ from .constants import (
14
+ COMMA,
15
+ LIST_ITEM_MARKER,
16
+ NUMERIC_REGEX,
17
+ OCTAL_REGEX,
18
+ VALID_KEY_REGEX,
19
+ )
20
+
21
+
22
def is_valid_unquoted_key(key: str) -> bool:
    """Check if a key can be used without quotes.

    An empty key is never valid. Otherwise the key is matched from its
    start against ``VALID_KEY_REGEX``, ignoring case. As documented for
    this function, valid unquoted keys start with a letter or underscore,
    followed by letters, digits, underscores, or dots; the pattern itself
    is defined in the constants module.
    Per Section 8.2 of the TOON specification.

    Args:
        key: The key to validate

    Returns:
        True if the key can be used without quotes

    Examples:
        >>> is_valid_unquoted_key("name")
        True
        >>> is_valid_unquoted_key("user_id")
        True
        >>> is_valid_unquoted_key("config.value")
        True
        >>> is_valid_unquoted_key("123")  # Starts with digit
        False
        >>> is_valid_unquoted_key("my-key")  # Contains hyphen
        False
    """
    if key:
        return re.match(VALID_KEY_REGEX, key, re.IGNORECASE) is not None
    return False
50
+
51
+
52
def is_safe_unquoted(value: str, delimiter: str = COMMA) -> bool:
    """Determine if a string value can be safely encoded without quotes.

    A string needs quoting if it:
    - Is empty
    - Has leading or trailing whitespace
    - Could be confused with a literal (boolean, null, number)
    - Contains a colon, bracket or brace
    - Contains a double quote or backslash
    - Contains a newline, carriage return or tab
    - Contains the active delimiter
    - Starts with the list item marker

    Per Section 7.2 of the TOON specification.

    Args:
        value: The string value to check
        delimiter: The active delimiter (default: comma)

    Returns:
        True if the string can be safely encoded without quotes

    Examples:
        >>> is_safe_unquoted("hello")
        True
        >>> is_safe_unquoted("")  # Empty
        False
        >>> is_safe_unquoted("true")  # Reserved literal
        False
        >>> is_safe_unquoted("123")  # Looks like number
        False
        >>> is_safe_unquoted("hello world")  # Inner whitespace is fine
        True
    """
    # Empty strings and strings with surrounding whitespace always need quotes.
    if not value or value.strip() != value:
        return False

    # Anything that would read back as a boolean, null or number.
    if is_boolean_or_null_literal(value):
        return False
    if is_numeric_like(value):
        return False

    # Colon, quote, backslash, brackets, braces, newline, carriage return, tab.
    always_quoted = ':"\\[]{}\n\r\t'
    if any(ch in always_quoted for ch in value):
        return False

    # The delimiter currently in effect.
    if delimiter in value:
        return False

    # A leading list marker would be read as a list item.
    return not value.startswith(LIST_ITEM_MARKER)
121
+
122
+
123
def is_numeric_like(value: str) -> bool:
    """Check if a string looks like a number.

    The value is matched from its start against ``NUMERIC_REGEX``
    (ignoring case) and, failing that, against ``OCTAL_REGEX``. As
    documented for this function, this covers numbers like `42`, `-3.14`,
    `1e-6`, and octal-like values with a leading zero such as `05`, which
    must be quoted.

    Args:
        value: The string to check

    Returns:
        True if the string looks like a number

    Examples:
        >>> is_numeric_like("42")
        True
        >>> is_numeric_like("-3.14")
        True
        >>> is_numeric_like("1e-6")
        True
        >>> is_numeric_like("0123")  # Octal-like
        True
        >>> is_numeric_like("hello")
        False
    """
    if re.match(NUMERIC_REGEX, value, re.IGNORECASE) is not None:
        return True
    # Leading-zero (octal-like) pattern
    return re.match(OCTAL_REGEX, value) is not None