PyPI - toonify - Versions diffs - 0.0.1__py3-none-any.whl - Mend

toonify 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

toon/__init__.py +22 -0
toon/cli.py +276 -0
toon/constants.py +42 -0
toon/decoder.py +370 -0
toon/encoder.py +317 -0
toon/utils.py +243 -0
toonify-0.0.1.dist-info/METADATA +383 -0
toonify-0.0.1.dist-info/RECORD +11 -0
toonify-0.0.1.dist-info/WHEEL +4 -0
toonify-0.0.1.dist-info/entry_points.txt +2 -0
toonify-0.0.1.dist-info/licenses/LICENSE +21 -0

toon/__init__.py ADDED Viewed

@@ -0,0 +1,22 @@
+"""TOON (Token-Oriented Object Notation) - A compact serialization format for LLMs."""
+from .encoder import encode
+from .decoder import decode
+from .constants import (
+    COMMA, TAB, PIPE,
+    KEY_FOLDING_OFF, KEY_FOLDING_SAFE,
+    EXPAND_PATHS_OFF, EXPAND_PATHS_SAFE
+)
+# Keep in sync with the distribution metadata: this wheel is published as
+# toonify 0.0.1, so reporting '1.0.0' here would contradict the installed version.
+__version__ = '0.0.1'
+# Public API re-exported by `from toon import *`.
+__all__ = [
+    'encode',
+    'decode',
+    'COMMA',
+    'TAB',
+    'PIPE',
+    'KEY_FOLDING_OFF',
+    'KEY_FOLDING_SAFE',
+    'EXPAND_PATHS_OFF',
+    'EXPAND_PATHS_SAFE',
+]

toon/cli.py ADDED Viewed

@@ -0,0 +1,276 @@
+"""Command-line interface for TOON format conversion."""
+import sys
+import json
+import argparse
+from pathlib import Path
+from typing import Optional
+try:
+    import tiktoken
+    TIKTOKEN_AVAILABLE = True
+except ImportError:
+    TIKTOKEN_AVAILABLE = False
+from . import encode, decode
+def count_tokens(text: str) -> Optional[int]:
+    """
+    Return how many tokens tiktoken's "o200k_base" encoding produces for text.
+    Args:
+        text: Text to tokenize
+    Returns:
+        The token count, or None when tiktoken could not be imported or the
+        tokenizer raised an exception
+    """
+    if TIKTOKEN_AVAILABLE:
+        try:
+            tokenizer = tiktoken.get_encoding("o200k_base")
+            token_ids = tokenizer.encode(text)
+            return len(token_ids)
+        except Exception:
+            # Best effort: any tokenizer failure is reported as "no count".
+            return None
+    return None
+def detect_mode(input_path: Optional[str], force_encode: bool, force_decode: bool) -> str:
+    """
+    Choose the conversion direction from the flags, then from the file suffix.
+    Args:
+        input_path: Input file path; None, empty or '-' means stdin
+        force_encode: Return 'encode' regardless of the path (checked first)
+        force_decode: Return 'decode' regardless of the path (checked second)
+    Returns:
+        'decode' for a path ending in '.toon' (case-insensitive) when no flag
+        is set, otherwise 'encode' ('.json', unknown suffixes and stdin included)
+    """
+    if force_encode:
+        return 'encode'
+    if force_decode:
+        return 'decode'
+    # Only a real path can carry a suffix; stdin falls through to the default.
+    names_a_file = bool(input_path) and input_path != '-'
+    if names_a_file and Path(input_path).suffix.lower() == '.toon':
+        return 'decode'
+    return 'encode'
+def read_input(input_path: Optional[str]) -> str:
+    """
+    Return the full text of the input source.
+    Args:
+        input_path: Path of a UTF-8 text file; None, empty or '-' selects stdin
+    Returns:
+        Everything read from the file or from stdin
+    """
+    if input_path and input_path != '-':
+        with open(input_path, 'r', encoding='utf-8') as source:
+            return source.read()
+    return sys.stdin.read()
+def write_output(content: str, output_path: Optional[str]) -> None:
+    """
+    Send content to a file, or to stdout when no path is given.
+    Args:
+        content: Text to emit
+        output_path: Destination file (written as UTF-8); None or empty
+            selects stdout
+    """
+    if output_path:
+        with open(output_path, 'w', encoding='utf-8') as sink:
+            sink.write(content)
+        return
+    # print() appends a trailing newline; the file branch writes content as-is.
+    print(content)
+def _utf8_size(text: str) -> int:
+    """Return the UTF-8 encoded size of text in bytes (never raises on lone surrogates)."""
+    return len(text.encode('utf-8', errors='replace'))
+def _percent_change(before: int, after: int, *, reduction: bool) -> float:
+    """Return the relative change from before to after in percent; 0.0 when before is 0."""
+    if not before:
+        # Nothing to compare against (e.g. empty input): avoid ZeroDivisionError.
+        return 0.0
+    ratio = after / before
+    return (1 - ratio) * 100 if reduction else (ratio - 1) * 100
+def _print_stats(input_prefix: str, output_prefix: str, input_content: str,
+                 output_content: str, *, reduction: bool) -> None:
+    """Write size and (when tiktoken is usable) token statistics to stderr."""
+    change = 'reduction' if reduction else 'increase'
+    input_size = _utf8_size(input_content)
+    output_size = _utf8_size(output_content)
+    print(f'{input_prefix}{input_size} bytes', file=sys.stderr)
+    print(f'{output_prefix}{output_size} bytes', file=sys.stderr)
+    size_change = _percent_change(input_size, output_size, reduction=reduction)
+    print(f'Size {change}: {size_change:.1f}%', file=sys.stderr)
+    input_tokens = count_tokens(input_content)
+    output_tokens = count_tokens(output_content)
+    if input_tokens is not None and output_tokens is not None:
+        print(f'Input tokens:  {input_tokens}', file=sys.stderr)
+        print(f'Output tokens: {output_tokens}', file=sys.stderr)
+        token_change = _percent_change(input_tokens, output_tokens, reduction=reduction)
+        print(f'Token {change}: {token_change:.1f}%', file=sys.stderr)
+    else:
+        print('(Install tiktoken for token statistics)', file=sys.stderr)
+    print('---', file=sys.stderr)
+def main() -> int:
+    """
+    Main CLI entry point.
+    Parses the command line, reads the input, converts JSON to TOON or TOON to
+    JSON (direction chosen by detect_mode), optionally prints statistics to
+    stderr and writes the result.
+    Returns:
+        0 on success, 1 when reading, parsing, converting or writing failed
+        (the error message is printed to stderr)
+    """
+    parser = argparse.ArgumentParser(
+        description='TOON (Token-Oriented Object Notation) - Convert between JSON and TOON formats',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Encode JSON file to TOON
+  toon input.json -o output.toon
+  # Decode TOON file to JSON
+  toon input.toon -o output.json
+  # Pipe JSON and encode to TOON
+  echo '{"key": "value"}' | toon -e
+  # Force decode mode with custom delimiter
+  toon input.txt -d --delimiter tab
+  # Show token statistics
+  toon input.json --stats
+        """
+    )
+    parser.add_argument(
+        'input',
+        nargs='?',
+        help='Input file path (or "-" for stdin, default: stdin)'
+    )
+    parser.add_argument(
+        '-o', '--output',
+        help='Output file path (default: stdout)'
+    )
+    parser.add_argument(
+        '-e', '--encode',
+        action='store_true',
+        help='Force encode mode (JSON to TOON)'
+    )
+    parser.add_argument(
+        '-d', '--decode',
+        action='store_true',
+        help='Force decode mode (TOON to JSON)'
+    )
+    parser.add_argument(
+        '--delimiter',
+        choices=['comma', 'tab', 'pipe'],
+        default='comma',
+        help='Array delimiter (default: comma)'
+    )
+    parser.add_argument(
+        '--indent',
+        type=int,
+        default=2,
+        help='Indentation size (default: 2)'
+    )
+    parser.add_argument(
+        '--stats',
+        action='store_true',
+        help='Show token statistics'
+    )
+    parser.add_argument(
+        '--no-strict',
+        action='store_true',
+        help='Disable strict validation (decode only)'
+    )
+    parser.add_argument(
+        '--key-folding',
+        choices=['off', 'safe'],
+        default='off',
+        help='Key folding mode (encode only, default: off)'
+    )
+    parser.add_argument(
+        '--flatten-depth',
+        type=int,
+        help='Maximum key folding depth (encode only)'
+    )
+    parser.add_argument(
+        '--expand-paths',
+        choices=['off', 'safe'],
+        default='off',
+        help='Path expansion mode (decode only, default: off)'
+    )
+    args = parser.parse_args()
+    # Validate arguments
+    if args.encode and args.decode:
+        parser.error('Cannot specify both --encode and --decode')
+    try:
+        input_content = read_input(args.input)
+        mode = detect_mode(args.input, args.encode, args.decode)
+        if mode == 'encode':
+            data = json.loads(input_content)
+            # The delimiter is handed to encode() by name, exactly as before.
+            # NOTE(review): encoder.py is not visible here - confirm it accepts names.
+            options = {
+                'delimiter': args.delimiter,
+                'indent': args.indent,
+                'key_folding': args.key_folding,
+            }
+            if args.flatten_depth is not None:
+                options['flatten_depth'] = args.flatten_depth
+            output_content = encode(data, options)
+            if args.stats:
+                _print_stats('Input (JSON):  ', 'Output (TOON): ',
+                             input_content, output_content, reduction=True)
+        else:  # decode
+            from .constants import COMMA, PIPE, TAB
+            # The decoder compares the delimiter against single characters, so
+            # the CLI name must be translated; passing 'comma' verbatim would
+            # leave comma-separated rows unsplit.
+            delimiter_chars = {'comma': COMMA, 'tab': TAB, 'pipe': PIPE}
+            options = {
+                'strict': not args.no_strict,
+                'expand_paths': args.expand_paths,
+                'default_delimiter': delimiter_chars[args.delimiter],
+            }
+            data = decode(input_content, options)
+            output_content = json.dumps(data, indent=2, ensure_ascii=False)
+            if args.stats:
+                _print_stats('Input (TOON): ', 'Output (JSON): ',
+                             input_content, output_content, reduction=False)
+        write_output(output_content, args.output)
+        return 0
+    except json.JSONDecodeError as e:
+        print(f'Error parsing JSON: {e}', file=sys.stderr)
+        return 1
+    except FileNotFoundError as e:
+        print(f'Error: {e}', file=sys.stderr)
+        return 1
+    except Exception as e:
+        # Top-level boundary: report any other failure and signal it via the exit code.
+        print(f'Error: {e}', file=sys.stderr)
+        return 1
+# When run as a script, execute the CLI and use main()'s return value as the exit status.
+if __name__ == '__main__':
+    sys.exit(main())

toon/constants.py ADDED Viewed

@@ -0,0 +1,42 @@
+"""Constant values shared across the TOON library."""
+# Array delimiter characters
+COMMA = ','
+TAB = '\t'
+PIPE = '|'
+# Spelled-out names for the delimiters
+DELIMITER_COMMA = 'comma'
+DELIMITER_TAB = 'tab'
+DELIMITER_PIPE = 'pipe'
+# Structural characters
+COLON = ':'
+QUOTE = '"'
+BACKSLASH = '\\'
+NEWLINE = '\n'
+SPACE = ' '
+LEFT_BRACKET = '['
+RIGHT_BRACKET = ']'
+LEFT_BRACE = '{'
+RIGHT_BRACE = '}'
+# Spellings of the boolean and null literals
+TRUE_LITERAL = 'true'
+FALSE_LITERAL = 'false'
+NULL_LITERAL = 'null'
+# Key folding modes
+KEY_FOLDING_OFF = 'off'
+KEY_FOLDING_SAFE = 'safe'
+# Path expansion modes
+EXPAND_PATHS_OFF = 'off'
+EXPAND_PATHS_SAFE = 'safe'
+# Defaults, expressed through the named values above
+DEFAULT_DELIMITER = COMMA
+DEFAULT_INDENT = 2
+DEFAULT_KEY_FOLDING = KEY_FOLDING_OFF
+DEFAULT_EXPAND_PATHS = EXPAND_PATHS_OFF
+DEFAULT_STRICT = True

toon/decoder.py ADDED Viewed

@@ -0,0 +1,370 @@
+"""TOON decoder - convert TOON format to Python objects."""
+import re
+from typing import Any, Dict, List, Optional, Tuple
+from .constants import (
+    COMMA, TAB, PIPE, COLON, QUOTE, NEWLINE, SPACE,
+    DEFAULT_DELIMITER, DEFAULT_STRICT,
+    EXPAND_PATHS_OFF, EXPAND_PATHS_SAFE,
+    LEFT_BRACKET, RIGHT_BRACKET, LEFT_BRACE, RIGHT_BRACE
+)
+from .utils import unescape_string, parse_literal
+class DecoderOptions:
+    """Options for TOON decoding."""
+    # Spelled-out delimiter names (the CLI's --delimiter choices) mapped to the
+    # single characters the row splitter compares against.
+    _DELIMITER_NAMES = {'comma': COMMA, 'tab': TAB, 'pipe': PIPE}
+    def __init__(
+        self,
+        strict: bool = DEFAULT_STRICT,
+        expand_paths: str = EXPAND_PATHS_OFF,
+        default_delimiter: str = DEFAULT_DELIMITER
+    ):
+        """
+        Initialize decoder options.
+        Args:
+            strict: Validate structure strictly
+            expand_paths: Path expansion mode ('off' | 'safe')
+            default_delimiter: Default delimiter for arrays, given either as
+                the character itself (',', tab, '|') or by name ('comma',
+                'tab', 'pipe'); names are stored as their character
+        """
+        self.strict = strict
+        self.expand_paths = expand_paths
+        # Any value that is not a known name is kept unchanged.
+        self.default_delimiter = self._DELIMITER_NAMES.get(default_delimiter, default_delimiter)
+def decode(toon_string: str, options: Optional[Dict[str, Any]] = None) -> Any:
+    """
+    Turn a TOON document into the Python data it describes.
+    Args:
+        toon_string: TOON formatted string
+        options: Optional mapping of decoding options
+            - strict: bool (default True) - validate structure
+            - expand_paths: 'off' (default) or 'safe'
+            - default_delimiter: ',' (default)
+    Returns:
+        The decoded value: a dict for line-based documents and for '{}', or
+        whatever the inline value parser yields for a document wrapped in
+        '[' ... ']'
+    Example:
+        >>> toon = '''users[1]{id,name}:
+        ...   1,Alice'''
+        >>> decode(toon)
+        {'users': [{'id': 1, 'name': 'Alice'}]}
+    """
+    settings = {} if options is None else options
+    opts = DecoderOptions(
+        strict=settings.get('strict', DEFAULT_STRICT),
+        expand_paths=settings.get('expand_paths', EXPAND_PATHS_OFF),
+        default_delimiter=settings.get('default_delimiter', DEFAULT_DELIMITER)
+    )
+    # Whole-document inline forms are answered without the line parser.
+    text = toon_string.strip()
+    if text == '{}':
+        return {}
+    if text.startswith(LEFT_BRACKET) and text.endswith(RIGHT_BRACKET):
+        return _parse_value(text, opts)
+    parsed, _next_idx = _parse_lines(toon_string.split(NEWLINE), 0, 0, opts)
+    # Dotted keys are expanded only on request and only for dict results.
+    if isinstance(parsed, dict) and opts.expand_paths == EXPAND_PATHS_SAFE:
+        return _expand_paths(parsed)
+    return parsed
+# Array header: name[N]{fields}: (tabular) or name[N]: (list). Compiled once
+# instead of on every parsed line.
+_ARRAY_HEADER_RE = re.compile(r'^([^:\[\]]+)\[(\d+)\](?:\{([^}]+)\})?' + COLON + r'\s*$')
+def _indent_of(line: str) -> int:
+    """Return the number of leading whitespace characters of line."""
+    return len(line) - len(line.lstrip())
+def _child_indent(lines: List[str], start_idx: int, parent_indent: int) -> int:
+    """
+    Return the indentation of the block nested under a line at parent_indent.
+    The first non-blank line at or after start_idx decides: when it is indented
+    deeper than the parent, its indentation is used, so documents written with
+    any indent width decode. Otherwise (no nested content) the historical
+    parent_indent + 2 is returned.
+    """
+    for idx in range(start_idx, len(lines)):
+        if lines[idx].strip():
+            indent = _indent_of(lines[idx])
+            if indent > parent_indent:
+                return indent
+            break
+    return parent_indent + 2
+def _parse_lines(lines: List[str], start_idx: int, base_indent: int, opts: DecoderOptions) -> Tuple[Any, int]:
+    """
+    Parse lines starting from start_idx with given base indentation.
+    Blank lines are skipped, parsing stops at the first line indented less than
+    base_indent, and lines indented deeper than base_indent are skipped.
+    Args:
+        lines: All lines of the document
+        start_idx: Index of the first line to look at
+        base_indent: Indentation of the keys belonging to this block
+        opts: Decoder options, passed on to the value and array parsers
+    Returns:
+        (parsed_value, next_line_index) where parsed_value is a dict
+    """
+    if start_idx >= len(lines):
+        return {}, start_idx
+    result = {}
+    i = start_idx
+    while i < len(lines):
+        line = lines[i]
+        stripped = line.strip()
+        # Skip empty lines
+        if not stripped:
+            i += 1
+            continue
+        indent = _indent_of(line)
+        # If indentation is less than base, we're done with this block
+        if indent < base_indent:
+            break
+        # If indentation is greater than expected, skip (part of previous value)
+        if indent > base_indent:
+            i += 1
+            continue
+        array_match = _ARRAY_HEADER_RE.match(stripped)
+        if array_match:
+            key, count_str, fields_str = array_match.groups()
+            count = int(count_str)
+            if fields_str:
+                # Tabular array
+                fields = [f.strip() for f in fields_str.split(COMMA)]
+                array_value, i = _parse_tabular_array(lines, i + 1, indent, count, fields, opts)
+            else:
+                # List array
+                array_value, i = _parse_list_array(lines, i + 1, indent, count, opts)
+            result[key] = array_value
+            continue
+        if COLON in stripped:
+            key, value_str = stripped.split(COLON, 1)
+            key = key.strip()
+            value_str = value_str.strip()
+            if value_str:
+                # Inline value
+                result[key] = _parse_value(value_str, opts)
+                i += 1
+            else:
+                # Nested value on the following lines; follow the document's
+                # own indent width rather than assuming two spaces.
+                nested_indent = _child_indent(lines, i + 1, indent)
+                nested_value, i = _parse_lines(lines, i + 1, nested_indent, opts)
+                result[key] = nested_value
+        else:
+            # No colon - might be a continuation or error
+            i += 1
+    return result, i
+def _parse_tabular_array(
+    lines: List[str],
+    start_idx: int,
+    base_indent: int,
+    count: int,
+    fields: List[str],
+    opts: DecoderOptions
+) -> Tuple[List[Dict], int]:
+    """
+    Parse the rows of a tabular array into a list of dicts.
+    Args:
+        lines: All lines of the document
+        start_idx: Index of the line following the array header
+        base_indent: Indentation of the array header
+        count: Row count declared in the header (at most this many lines are visited)
+        fields: Field names declared in the header
+        opts: Decoder options (default delimiter, strictness)
+    Returns:
+        (rows, next_line_index); a row with fewer values than fields gets None
+        for the missing fields
+    """
+    result = []
+    i = start_idx
+    # Rows sit one level below the header, whatever indent width the document uses.
+    expected_indent = _child_indent(lines, start_idx, base_indent)
+    # Detect delimiter from first row
+    delimiter = opts.default_delimiter
+    if i < len(lines):
+        first_row = lines[i].strip()
+        if TAB in first_row:
+            delimiter = TAB
+        elif PIPE in first_row:
+            delimiter = PIPE
+    for _ in range(count):
+        if i >= len(lines):
+            break
+        line = lines[i]
+        if _indent_of(line) != expected_indent:
+            # Strict mode stops at the first misplaced line; otherwise the
+            # line is skipped (and still counts towards count).
+            if opts.strict:
+                break
+            i += 1
+            continue
+        values = _split_row(line.strip(), delimiter)
+        obj = {}
+        for j, field in enumerate(fields):
+            obj[field] = _parse_value(values[j], opts) if j < len(values) else None
+        result.append(obj)
+        i += 1
+    return result, i
+def _parse_list_array(
+    lines: List[str],
+    start_idx: int,
+    base_indent: int,
+    count: int,
+    opts: DecoderOptions
+) -> Tuple[List[Any], int]:
+    """
+    Parse the items of a list array.
+    Args:
+        lines: All lines of the document
+        start_idx: Index of the line following the array header
+        base_indent: Indentation of the array header
+        count: Item count declared in the header (at most this many steps are taken)
+        opts: Decoder options, passed on to the value and object parsers
+    Returns:
+        (items, next_line_index)
+    """
+    result = []
+    i = start_idx
+    # Items sit one level below the header, whatever indent width the document uses.
+    expected_indent = _child_indent(lines, start_idx, base_indent)
+    for _ in range(count):
+        if i >= len(lines):
+            break
+        line = lines[i]
+        indent = _indent_of(line)
+        if indent < expected_indent:
+            break
+        if indent == expected_indent:
+            stripped = line.strip()
+            if COLON in stripped and not stripped.startswith(LEFT_BRACKET):
+                # Nested object
+                nested_obj, i = _parse_lines(lines, i, indent, opts)
+                result.append(nested_obj)
+            else:
+                # Simple value
+                result.append(_parse_value(stripped, opts))
+                i += 1
+        else:
+            # Deeper than an item line: skipped.
+            i += 1
+    return result, i
+def _split_row(row_str: str, delimiter: str) -> List[str]:
+    """
+    Split a row by delimiter, respecting quoted strings.
+    Quoted cells are returned with their surrounding quotes and escape
+    sequences intact, so that _parse_value can tell a quoted string such as
+    "123", "true" or "" from the bare literal. A delimiter inside quotes does
+    not split.
+    Args:
+        row_str: Row string to split
+        delimiter: Delimiter character
+    Returns:
+        List of raw cell strings; empty list for an empty row
+    """
+    values = []
+    current = []
+    in_quote = False
+    i = 0
+    length = len(row_str)
+    while i < length:
+        char = row_str[i]
+        if in_quote and char == '\\' and i + 1 < length:
+            # Backslash escape: keep both characters so an escaped quote
+            # cannot end the quoted section.
+            current.append(char)
+            current.append(row_str[i + 1])
+            i += 2
+        elif char == QUOTE:
+            if in_quote and i + 1 < length and row_str[i + 1] == QUOTE:
+                # Doubled quote inside quotes still means one literal quote; it is
+                # rewritten as a backslash escape for the unescaping step.
+                # NOTE(review): assumes unescape_string understands \" - confirm in utils.
+                current.append('\\' + QUOTE)
+                i += 2
+            else:
+                current.append(char)
+                in_quote = not in_quote
+                i += 1
+        elif char == delimiter and not in_quote:
+            values.append(''.join(current))
+            current = []
+            i += 1
+        else:
+            current.append(char)
+            i += 1
+    # Add last value (also keeps a trailing empty cell after a final delimiter)
+    if current or values:
+        values.append(''.join(current))
+    return values
+def _parse_value(value_str: str, opts: DecoderOptions) -> Any:
+    """
+    Convert one inline value to its Python form.
+    After stripping whitespace: empty text gives None, text wrapped in double
+    quotes gives the unescaped string, text wrapped in '[' ... ']' gives a
+    list of recursively parsed items, '{}' gives an empty dict, and anything
+    else is handed to parse_literal.
+    """
+    token = value_str.strip()
+    if token == '':
+        return None
+    if len(token) >= 2 and token.startswith(QUOTE) and token.endswith(QUOTE):
+        return unescape_string(token[1:-1])
+    if token.startswith(LEFT_BRACKET) and token.endswith(RIGHT_BRACKET):
+        body = token[1:-1]
+        if not body:
+            return []
+        # A tab anywhere in the body wins, then a pipe; comma is the fallback.
+        separator = next((mark for mark in (TAB, PIPE) if mark in body), COMMA)
+        return [_parse_value(cell.strip(), opts) for cell in _split_row(body, separator)]
+    if token == '{}':
+        return {}
+    return parse_literal(token)
+def _expand_paths(obj: dict) -> dict:
+    """
+    Rebuild obj with dotted keys turned into nested dicts.
+    A dotted key whose path runs through an existing non-dict value is kept
+    under its original dotted name. Dict values, and dicts found directly
+    inside list values, are expanded recursively.
+    Args:
+        obj: Object with potentially dotted keys
+    Returns:
+        New top-level dict holding the expanded structure
+    """
+    expanded = {}
+    for dotted_key, payload in obj.items():
+        if '.' not in dotted_key:
+            expanded[dotted_key] = payload
+            continue
+        *parents, leaf = dotted_key.split('.')
+        node = expanded
+        blocked = False
+        for segment in parents:
+            # Create the intermediate dict on first use, reuse it afterwards.
+            child = node.setdefault(segment, {})
+            if not isinstance(child, dict):
+                blocked = True
+                break
+            node = child
+        if blocked:
+            # Conflict with an existing non-dict value: keep the dotted key.
+            expanded[dotted_key] = payload
+        else:
+            node[leaf] = payload
+    # Second pass: expand whatever is nested inside the values.
+    for name in list(expanded):
+        item = expanded[name]
+        if isinstance(item, dict):
+            expanded[name] = _expand_paths(item)
+        elif isinstance(item, list):
+            expanded[name] = [
+                _expand_paths(entry) if isinstance(entry, dict) else entry
+                for entry in item
+            ]
+    return expanded