PyPI - flow-toon-format - Versions diffs - 0.9.0b2__py3-none-any.whl - Mend

flow-toon-format 0.9.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

flow_toon_format-0.9.0b2.dist-info/METADATA +200 -0
flow_toon_format-0.9.0b2.dist-info/RECORD +24 -0
flow_toon_format-0.9.0b2.dist-info/WHEEL +4 -0
flow_toon_format-0.9.0b2.dist-info/entry_points.txt +2 -0
flow_toon_format-0.9.0b2.dist-info/licenses/LICENSE +24 -0
toon_format/__init__.py +40 -0
toon_format/__main__.py +13 -0
toon_format/_literal_utils.py +70 -0
toon_format/_parsing_utils.py +167 -0
toon_format/_scanner.py +289 -0
toon_format/_string_utils.py +169 -0
toon_format/_validation.py +150 -0
toon_format/cli.py +217 -0
toon_format/constants.py +84 -0
toon_format/decoder.py +788 -0
toon_format/encoder.py +56 -0
toon_format/encoders.py +456 -0
toon_format/logging_config.py +92 -0
toon_format/normalize.py +237 -0
toon_format/primitives.py +171 -0
toon_format/py.typed +0 -0
toon_format/types.py +64 -0
toon_format/utils.py +187 -0
toon_format/writer.py +53 -0

toon_format/_scanner.py ADDED Viewed

@@ -0,0 +1,289 @@
+# Copyright (c) 2025 TOON Format Organization
+# SPDX-License-Identifier: MIT
+"""Scanner for parsing TOON input into lines with depth information.
+This module implements the first stage of the TOON decoding pipeline:
+scanning the input text and converting it into structured line objects
+with depth and indentation metadata. Handles strict and lenient parsing modes.
+"""
+from dataclasses import dataclass
+from typing import List, Optional, Tuple
+from .constants import SPACE, TAB
+@dataclass
+class ParsedLine:
+    """A parsed line with metadata.
+    Attributes:
+        raw: The original raw line content
+        depth: The indentation depth (number of indent levels)
+        indent: The number of leading spaces
+        content: The line content after removing indentation
+        line_num: The 1-based line number in the source
+    """
+    raw: str
+    depth: int
+    indent: int
+    content: str
+    line_num: int
+    @property
+    def is_blank(self) -> bool:
+        """Check if this line is blank (only whitespace).
+        Returns:
+            True if the line contains only whitespace
+        """
+        return not self.content.strip()
+@dataclass
+class BlankLineInfo:
+    """Information about a blank line.
+    Attributes:
+        line_num: The 1-based line number
+        indent: The number of leading spaces
+        depth: The computed indentation depth
+    """
+    line_num: int
+    indent: int
+    depth: int
+class LineCursor:
+    """Iterator-like class for traversing parsed lines.
+    Provides methods to peek at the current line, advance to the next line,
+    and check for lines at specific depths. This abstraction makes the decoder
+    logic cleaner and easier to test.
+    """
+    def __init__(
+        self,
+        lines: List[ParsedLine],
+        blank_lines: Optional[List[BlankLineInfo]] = None,
+    ) -> None:
+        """Initialize a line cursor.
+        Args:
+            lines: The parsed lines to traverse
+            blank_lines: Optional list of blank line information
+        """
+        self._lines = lines
+        self._index = 0
+        self._blank_lines = blank_lines or []
+    def get_blank_lines(self) -> List[BlankLineInfo]:
+        """Get the list of blank lines."""
+        return self._blank_lines
+    def peek(self) -> Optional[ParsedLine]:
+        """Peek at the current line without advancing.
+        Returns:
+            The current line, or None if at end
+        """
+        if self._index >= len(self._lines):
+            return None
+        return self._lines[self._index]
+    def next(self) -> Optional[ParsedLine]:
+        """Get the current line and advance.
+        Returns:
+            The current line, or None if at end
+        """
+        if self._index >= len(self._lines):
+            return None
+        line = self._lines[self._index]
+        self._index += 1
+        return line
+    def current(self) -> Optional[ParsedLine]:
+        """Get the most recently consumed line.
+        Returns:
+            The previous line, or None if no line has been consumed
+        """
+        if self._index > 0:
+            return self._lines[self._index - 1]
+        return None
+    def advance(self) -> None:
+        """Advance to the next line."""
+        self._index += 1
+    def at_end(self) -> bool:
+        """Check if cursor is at the end of lines.
+        Returns:
+            True if at end
+        """
+        return self._index >= len(self._lines)
+    @property
+    def length(self) -> int:
+        """Get the total number of lines."""
+        return len(self._lines)
+    def peek_at_depth(self, target_depth: int) -> Optional[ParsedLine]:
+        """Peek at the next line at a specific depth.
+        Args:
+            target_depth: The target depth
+        Returns:
+            The line if it matches the depth, None otherwise
+        """
+        line = self.peek()
+        if not line or line.depth < target_depth:
+            return None
+        if line.depth == target_depth:
+            return line
+        return None
+    def has_more_at_depth(self, target_depth: int) -> bool:
+        """Check if there are more lines at a specific depth.
+        Args:
+            target_depth: The target depth
+        Returns:
+            True if there are more lines at the target depth
+        """
+        return self.peek_at_depth(target_depth) is not None
+    def skip_deeper_than(self, depth: int) -> None:
+        """Skip all lines that are deeper than the given depth.
+        This is useful for skipping over nested structures after processing them.
+        Args:
+            depth: The reference depth. All lines with depth > this will be skipped.
+        Example:
+            >>> cursor.skip_deeper_than(1)  # Skip all lines at depth 2, 3, 4, etc.
+        """
+        line = self.peek()
+        while line and line.depth > depth:
+            self.advance()
+            line = self.peek()
+def to_parsed_lines(
+    source: str,
+    indent_size: int,
+    strict: bool,
+) -> Tuple[List[ParsedLine], List[BlankLineInfo]]:
+    """Convert source string to parsed lines with depth information.
+    Per Section 12 of the TOON specification for indentation handling.
+    This is the entry point for the scanning stage of the decoder pipeline.
+    Args:
+        source: The source string to parse
+        indent_size: The number of spaces per indentation level
+        strict: Whether to enforce strict indentation validation
+    Returns:
+        A tuple of (parsed_lines, blank_lines)
+    Raises:
+        SyntaxError: If strict mode validation fails (tabs in indentation, invalid spacing)
+    Examples:
+        >>> lines, blanks = to_parsed_lines("name: Alice\\n  age: 30", 2, True)
+        >>> lines[0].content
+        'name: Alice'
+        >>> lines[1].depth
+        1
+    """
+    if not source.strip():
+        return [], []
+    lines = source.split("\n")
+    parsed: List[ParsedLine] = []
+    blank_lines: List[BlankLineInfo] = []
+    for i, raw in enumerate(lines):
+        line_num = i + 1
+        indent = 0
+        while indent < len(raw) and raw[indent] == SPACE:
+            indent += 1
+        content = raw[indent:]
+        # Compute depth for both blank and non-blank lines
+        depth = _compute_depth_from_indent(indent, indent_size)
+        # Track blank lines (but still include them in parsed list for validation)
+        is_blank = not content.strip()
+        if is_blank:
+            blank_lines.append(
+                BlankLineInfo(
+                    line_num=line_num,
+                    indent=indent,
+                    depth=depth,
+                )
+            )
+            # Blank lines are not validated for indentation
+            # But we still add them to parsed list for array blank line detection
+        # Strict mode validation (skip for blank lines)
+        if strict and not is_blank:
+            # Find the full leading whitespace region (spaces and tabs)
+            ws_end = 0
+            while ws_end < len(raw) and (raw[ws_end] == SPACE or raw[ws_end] == TAB):
+                ws_end += 1
+            # Check for tabs in leading whitespace (before actual content)
+            if TAB in raw[:ws_end]:
+                raise SyntaxError(
+                    f"Line {line_num}: Tabs not allowed in indentation in strict mode"
+                )
+            # Check for exact multiples of indent_size
+            if indent > 0 and indent % indent_size != 0:
+                raise SyntaxError(
+                    f"Line {line_num}: Indent must be exact multiple of {indent_size}, "
+                    f"but found {indent} spaces"
+                )
+        parsed.append(
+            ParsedLine(
+                raw=raw,
+                indent=indent,
+                content=content,
+                depth=depth,
+                line_num=line_num,
+            )
+        )
+    return parsed, blank_lines
+def _compute_depth_from_indent(indent_spaces: int, indent_size: int) -> int:
+    """Compute depth from indentation spaces.
+    Args:
+        indent_spaces: Number of leading spaces
+        indent_size: Number of spaces per indentation level
+    Returns:
+        The computed depth
+    Examples:
+        >>> _compute_depth_from_indent(0, 2)
+        0
+        >>> _compute_depth_from_indent(4, 2)
+        2
+        >>> _compute_depth_from_indent(3, 2)  # Lenient mode
+        1
+    """
+    return indent_spaces // indent_size

toon_format/_string_utils.py ADDED Viewed

@@ -0,0 +1,169 @@
+# Copyright (c) 2025 TOON Format Organization
+# SPDX-License-Identifier: MIT
+"""String utilities for TOON encoding and decoding.
+This module provides shared string processing functions used by both
+the encoder and decoder, following the TOON specification Section 7.1
+for escape sequences and quoted string handling.
+"""
+from .constants import (
+    BACKSLASH,
+    CARRIAGE_RETURN,
+    DOUBLE_QUOTE,
+    NEWLINE,
+    TAB,
+)
+def escape_string(value: str) -> str:
+    """Escape special characters in a string for encoding.
+    Handles backslashes, quotes, newlines, carriage returns, and tabs.
+    Per Section 7.1 of the TOON specification.
+    Args:
+        value: The string to escape
+    Returns:
+        The escaped string
+    Examples:
+        >>> escape_string('hello\\nworld')
+        'hello\\\\nworld'
+        >>> escape_string('say "hello"')
+        'say \\\\"hello\\\\"'
+    """
+    return (
+        value.replace(BACKSLASH, BACKSLASH + BACKSLASH)
+        .replace(DOUBLE_QUOTE, BACKSLASH + DOUBLE_QUOTE)
+        .replace(NEWLINE, BACKSLASH + "n")
+        .replace(CARRIAGE_RETURN, BACKSLASH + "r")
+        .replace(TAB, BACKSLASH + "t")
+    )
+def unescape_string(value: str) -> str:
+    """Unescape a string by processing escape sequences.
+    Handles `\\n`, `\\t`, `\\r`, `\\\\`, and `\\"` escape sequences.
+    Per Section 7.1 of the TOON specification.
+    Args:
+        value: The string to unescape (without surrounding quotes)
+    Returns:
+        The unescaped string
+    Raises:
+        ValueError: If an invalid escape sequence is encountered
+    Examples:
+        >>> unescape_string('hello\\\\nworld')
+        'hello\\nworld'
+        >>> unescape_string('say \\\\"hello\\\\"')
+        'say "hello"'
+    """
+    result = ""
+    i = 0
+    while i < len(value):
+        if value[i] == BACKSLASH:
+            if i + 1 >= len(value):
+                raise ValueError("Invalid escape sequence: backslash at end of string")
+            next_char = value[i + 1]
+            if next_char == "n":
+                result += NEWLINE
+                i += 2
+                continue
+            if next_char == "t":
+                result += TAB
+                i += 2
+                continue
+            if next_char == "r":
+                result += CARRIAGE_RETURN
+                i += 2
+                continue
+            if next_char == BACKSLASH:
+                result += BACKSLASH
+                i += 2
+                continue
+            if next_char == DOUBLE_QUOTE:
+                result += DOUBLE_QUOTE
+                i += 2
+                continue
+            raise ValueError(f"Invalid escape sequence: \\{next_char}")
+        result += value[i]
+        i += 1
+    return result
+def find_closing_quote(content: str, start: int) -> int:
+    """Find the index of the closing double quote, accounting for escape sequences.
+    Args:
+        content: The string to search in
+        start: The index of the opening quote
+    Returns:
+        The index of the closing quote, or -1 if not found
+    Examples:
+        >>> find_closing_quote('"hello"', 0)
+        6
+        >>> find_closing_quote('"hello \\\\"world\\\\""', 0)
+        17
+    """
+    i = start + 1
+    while i < len(content):
+        if content[i] == BACKSLASH and i + 1 < len(content):
+            # Skip escaped character
+            i += 2
+            continue
+        if content[i] == DOUBLE_QUOTE:
+            return i
+        i += 1
+    return -1  # Not found
+def find_unquoted_char(content: str, char: str, start: int = 0) -> int:
+    """Find the index of a specific character outside of quoted sections.
+    Args:
+        content: The string to search in
+        char: The character to look for
+        start: Optional starting index (defaults to 0)
+    Returns:
+        The index of the character, or -1 if not found outside quotes
+    Examples:
+        >>> find_unquoted_char('key: "value: nested"', ':', 0)
+        3
+        >>> find_unquoted_char('"key: nested": value', ':', 0)
+        13
+    """
+    in_quotes = False
+    i = start
+    while i < len(content):
+        if content[i] == BACKSLASH and i + 1 < len(content) and in_quotes:
+            # Skip escaped character
+            i += 2
+            continue
+        if content[i] == DOUBLE_QUOTE:
+            in_quotes = not in_quotes
+            i += 1
+            continue
+        if content[i] == char and not in_quotes:
+            return i
+        i += 1
+    return -1

toon_format/_validation.py ADDED Viewed

@@ -0,0 +1,150 @@
+# Copyright (c) 2025 TOON Format Organization
+# SPDX-License-Identifier: MIT
+"""Validation utilities for TOON encoding.
+This module provides validation functions to determine whether strings,
+keys, and values can be safely encoded without quotes or need quoting
+according to TOON specification rules.
+"""
+import re
+from ._literal_utils import is_boolean_or_null_literal
+from .constants import (
+    COMMA,
+    LIST_ITEM_MARKER,
+    NUMERIC_REGEX,
+    OCTAL_REGEX,
+    VALID_KEY_REGEX,
+)
+def is_valid_unquoted_key(key: str) -> bool:
+    """Check if a key can be used without quotes.
+    Valid unquoted keys must start with a letter or underscore,
+    followed by letters, digits, underscores, or dots.
+    Per Section 8.2 of the TOON specification.
+    Args:
+        key: The key to validate
+    Returns:
+        True if the key can be used without quotes
+    Examples:
+        >>> is_valid_unquoted_key("name")
+        True
+        >>> is_valid_unquoted_key("user_id")
+        True
+        >>> is_valid_unquoted_key("config.value")
+        True
+        >>> is_valid_unquoted_key("123")  # Starts with digit
+        False
+        >>> is_valid_unquoted_key("my-key")  # Contains hyphen
+        False
+    """
+    if not key:
+        return False
+    return bool(re.match(VALID_KEY_REGEX, key, re.IGNORECASE))
+def is_safe_unquoted(value: str, delimiter: str = COMMA) -> bool:
+    """Determine if a string value can be safely encoded without quotes.
+    A string needs quoting if it:
+    - Is empty
+    - Has leading or trailing whitespace
+    - Could be confused with a literal (boolean, null, number)
+    - Contains structural characters (colons, brackets, braces)
+    - Contains quotes or backslashes (need escaping)
+    - Contains control characters (newlines, tabs, etc.)
+    - Contains the active delimiter
+    - Starts with a list marker (hyphen)
+    Per Section 7.2 of the TOON specification.
+    Args:
+        value: The string value to check
+        delimiter: The active delimiter (default: comma)
+    Returns:
+        True if the string can be safely encoded without quotes
+    Examples:
+        >>> is_safe_unquoted("hello")
+        True
+        >>> is_safe_unquoted("")  # Empty
+        False
+        >>> is_safe_unquoted("true")  # Reserved literal
+        False
+        >>> is_safe_unquoted("123")  # Looks like number
+        False
+        >>> is_safe_unquoted("hello world")  # Has whitespace (but not leading/trailing)
+        True
+    """
+    if not value:
+        return False
+    if value != value.strip():
+        return False
+    # Check if it looks like any literal value (boolean, null, or numeric)
+    if is_boolean_or_null_literal(value) or is_numeric_like(value):
+        return False
+    # Check for colon (always structural)
+    if ":" in value:
+        return False
+    # Check for quotes and backslash (always need escaping)
+    if '"' in value or "\\" in value:
+        return False
+    # Check for brackets and braces (always structural)
+    if re.search(r"[\[\]{}]", value):
+        return False
+    # Check for control characters (newline, carriage return, tab)
+    if re.search(r"[\n\r\t]", value):
+        return False
+    # Check for the active delimiter
+    if delimiter in value:
+        return False
+    # Check for hyphen at start (list marker)
+    if value.startswith(LIST_ITEM_MARKER):
+        return False
+    return True
+def is_numeric_like(value: str) -> bool:
+    """Check if a string looks like a number.
+    Match numbers like `42`, `-3.14`, `1e-6`, `05`, etc.
+    Includes octal-like numbers (leading zero) which must be quoted.
+    Args:
+        value: The string to check
+    Returns:
+        True if the string looks like a number
+    Examples:
+        >>> is_numeric_like("42")
+        True
+        >>> is_numeric_like("-3.14")
+        True
+        >>> is_numeric_like("1e-6")
+        True
+        >>> is_numeric_like("0123")  # Octal-like
+        True
+        >>> is_numeric_like("hello")
+        False
+    """
+    return bool(
+        re.match(NUMERIC_REGEX, value, re.IGNORECASE)
+        or re.match(OCTAL_REGEX, value)  # Octal pattern
+    )