toonify 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
toon/__init__.py ADDED
@@ -0,0 +1,22 @@
1
+ """TOON (Token-Oriented Object Notation) - A compact serialization format for LLMs."""
2
+
3
+ from .encoder import encode
4
+ from .decoder import decode
5
+ from .constants import (
6
+ COMMA, TAB, PIPE,
7
+ KEY_FOLDING_OFF, KEY_FOLDING_SAFE,
8
+ EXPAND_PATHS_OFF, EXPAND_PATHS_SAFE
9
+ )
10
+
11
# Runtime version string. This module is distributed in the toonify 0.0.1
# wheel, so it reports that release instead of the previous, mismatching
# '1.0.0'.
__version__ = '0.0.1'

# Names exported by ``from toon import *``: the two conversion entry points
# plus the option constants callers pass to them.
__all__ = [
    'encode',
    'decode',
    'COMMA',
    'TAB',
    'PIPE',
    'KEY_FOLDING_OFF',
    'KEY_FOLDING_SAFE',
    'EXPAND_PATHS_OFF',
    'EXPAND_PATHS_SAFE',
]
toon/cli.py ADDED
@@ -0,0 +1,276 @@
1
+ """Command-line interface for TOON format conversion."""
2
+ import sys
3
+ import json
4
+ import argparse
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ try:
9
+ import tiktoken
10
+ TIKTOKEN_AVAILABLE = True
11
+ except ImportError:
12
+ TIKTOKEN_AVAILABLE = False
13
+
14
+ from . import encode, decode
15
+
16
+
17
def count_tokens(text: str) -> Optional[int]:
    """
    Return how many o200k_base tokens ``text`` encodes to.

    Args:
        text: Text to tokenize

    Returns:
        The token count, or None when tiktoken could not be imported or
        the tokenizer raised while loading or encoding
    """
    token_count = None

    if TIKTOKEN_AVAILABLE:
        try:
            token_count = len(tiktoken.get_encoding("o200k_base").encode(text))
        except Exception:
            # Token statistics are best-effort: any tokenizer failure is
            # reported to the caller as "no count available".
            token_count = None

    return token_count
35
+
36
+
37
def detect_mode(input_path: Optional[str], force_encode: bool, force_decode: bool) -> str:
    """
    Choose the conversion direction.

    Explicit flags win (encode is checked before decode); otherwise the
    input file's extension decides, and anything unrecognised encodes.

    Args:
        input_path: Input file path, '-' or None for stdin
        force_encode: Force encode mode
        force_decode: Force decode mode

    Returns:
        'encode' or 'decode'
    """
    if force_encode:
        return 'encode'
    if force_decode:
        return 'decode'

    reads_file = bool(input_path) and input_path != '-'
    if not reads_file:
        return 'encode'

    mode_by_suffix = {'.json': 'encode', '.toon': 'decode'}
    suffix = Path(input_path).suffix.lower()
    # Unknown extensions fall back to encoding.
    return mode_by_suffix.get(suffix, 'encode')
64
+
65
+
66
def read_input(input_path: Optional[str]) -> str:
    """
    Return the full input text.

    Args:
        input_path: Path of a UTF-8 file to read; None, an empty string
            or '-' selects stdin

    Returns:
        Everything read from the file or from stdin
    """
    from_file = bool(input_path) and input_path != '-'

    if from_file:
        with open(input_path, mode='r', encoding='utf-8') as source:
            text = source.read()
        return text

    return sys.stdin.read()
81
+
82
+
83
def write_output(content: str, output_path: Optional[str]) -> None:
    """
    Emit the converted text.

    Args:
        content: Text to emit
        output_path: Destination file (written as UTF-8); None or an
            empty string prints to stdout instead
    """
    if output_path:
        with open(output_path, mode='w', encoding='utf-8') as sink:
            sink.write(content)
        return

    # print() terminates the text with a newline; the file branch above
    # writes the content exactly as given.
    print(content)
96
+
97
+
98
def _format_change(before: int, after: int, *, reduction: bool) -> str:
    """Format the relative change from ``before`` to ``after`` as a percentage.

    Returns 'n/a' when ``before`` is zero, because a relative change is
    undefined for an empty input.
    """
    if before == 0:
        return 'n/a'
    ratio = after / before
    change = (1 - ratio) if reduction else (ratio - 1)
    return f'{change * 100:.1f}%'


def _print_stats(
    input_label: str,
    output_label: str,
    input_content: str,
    output_content: str,
    *,
    reduction: bool
) -> None:
    """Write size and token statistics for one conversion to stderr.

    Args:
        input_label: Format name of the input ('JSON' or 'TOON')
        output_label: Format name of the output
        input_content: Text that was read
        output_content: Text that was produced
        reduction: True to report the change as a reduction (encoding),
            False to report it as an increase (decoding)
    """
    trend = 'reduction' if reduction else 'increase'

    # Sizes are labelled "bytes", so measure the UTF-8 encoding rather than
    # the number of characters.
    input_size = len(input_content.encode('utf-8', errors='replace'))
    output_size = len(output_content.encode('utf-8', errors='replace'))
    size_change = _format_change(input_size, output_size, reduction=reduction)

    print(f'Input ({input_label}): {input_size} bytes', file=sys.stderr)
    print(f'Output ({output_label}): {output_size} bytes', file=sys.stderr)
    print(f'Size {trend}: {size_change}', file=sys.stderr)

    input_tokens = count_tokens(input_content)
    output_tokens = count_tokens(output_content)
    if input_tokens is not None and output_tokens is not None:
        token_change = _format_change(input_tokens, output_tokens, reduction=reduction)
        print(f'Input tokens: {input_tokens}', file=sys.stderr)
        print(f'Output tokens: {output_tokens}', file=sys.stderr)
        print(f'Token {trend}: {token_change}', file=sys.stderr)
    else:
        print('(Install tiktoken for token statistics)', file=sys.stderr)

    print('---', file=sys.stderr)


def main() -> int:
    """Main CLI entry point.

    Parses the command line, converts the input in the detected direction
    and writes the result.

    Returns:
        0 on success, 1 when reading, parsing or converting fails (the
        error is printed to stderr)
    """
    parser = argparse.ArgumentParser(
        description='TOON (Token-Oriented Object Notation) - Convert between JSON and TOON formats',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Encode JSON file to TOON
  toon input.json -o output.toon

  # Decode TOON file to JSON
  toon input.toon -o output.json

  # Pipe JSON and encode to TOON
  echo '{"key": "value"}' | toon -e

  # Force decode mode with custom delimiter
  toon input.txt -d --delimiter tab

  # Show token statistics
  toon input.json --stats
"""
    )

    parser.add_argument(
        'input',
        nargs='?',
        help='Input file path (or "-" for stdin, default: stdin)'
    )
    parser.add_argument(
        '-o', '--output',
        help='Output file path (default: stdout)'
    )
    parser.add_argument(
        '-e', '--encode',
        action='store_true',
        help='Force encode mode (JSON to TOON)'
    )
    parser.add_argument(
        '-d', '--decode',
        action='store_true',
        help='Force decode mode (TOON to JSON)'
    )
    parser.add_argument(
        '--delimiter',
        choices=['comma', 'tab', 'pipe'],
        default='comma',
        help='Array delimiter (default: comma)'
    )
    parser.add_argument(
        '--indent',
        type=int,
        default=2,
        help='Indentation size (default: 2)'
    )
    parser.add_argument(
        '--stats',
        action='store_true',
        help='Show token statistics'
    )
    parser.add_argument(
        '--no-strict',
        action='store_true',
        help='Disable strict validation (decode only)'
    )
    parser.add_argument(
        '--key-folding',
        choices=['off', 'safe'],
        default='off',
        help='Key folding mode (encode only, default: off)'
    )
    parser.add_argument(
        '--flatten-depth',
        type=int,
        help='Maximum key folding depth (encode only)'
    )
    parser.add_argument(
        '--expand-paths',
        choices=['off', 'safe'],
        default='off',
        help='Path expansion mode (decode only, default: off)'
    )

    args = parser.parse_args()

    # Validate arguments
    if args.encode and args.decode:
        parser.error('Cannot specify both --encode and --decode')

    # The decoder compares its default delimiter against single characters
    # while splitting rows, so the CLI's symbolic names must be translated
    # before they reach it.
    delimiter_chars = {'comma': ',', 'tab': '\t', 'pipe': '|'}

    try:
        input_content = read_input(args.input)
        mode = detect_mode(args.input, args.encode, args.decode)

        if mode == 'encode':
            data = json.loads(input_content)

            # NOTE(review): the encoder is handed the symbolic delimiter
            # name unchanged, as before; confirm it accepts 'comma'/'tab'/
            # 'pipe' rather than the delimiter characters.
            options = {
                'delimiter': args.delimiter,
                'indent': args.indent,
                'key_folding': args.key_folding,
            }
            if args.flatten_depth is not None:
                options['flatten_depth'] = args.flatten_depth

            output_content = encode(data, options)

            if args.stats:
                _print_stats('JSON', 'TOON', input_content, output_content, reduction=True)

        else:  # decode
            options = {
                'strict': not args.no_strict,
                'expand_paths': args.expand_paths,
                'default_delimiter': delimiter_chars[args.delimiter],
            }

            data = decode(input_content, options)
            output_content = json.dumps(data, indent=2, ensure_ascii=False)

            if args.stats:
                _print_stats('TOON', 'JSON', input_content, output_content, reduction=False)

        write_output(output_content, args.output)

        return 0

    except json.JSONDecodeError as e:
        print(f'Error parsing JSON: {e}', file=sys.stderr)
        return 1
    except Exception as e:
        # Top-level CLI boundary: report any other failure (missing file,
        # decode error, ...) and exit non-zero instead of dumping a traceback.
        print(f'Error: {e}', file=sys.stderr)
        return 1


if __name__ == '__main__':
    sys.exit(main())
toon/constants.py ADDED
@@ -0,0 +1,42 @@
1
+ """Constants used throughout the TOON library."""
2
+
3
+ # Delimiters
4
COMMA, TAB, PIPE = ',', '\t', '|'
DEFAULT_DELIMITER = COMMA

# Symbolic names for the delimiters above
DELIMITER_COMMA, DELIMITER_TAB, DELIMITER_PIPE = 'comma', 'tab', 'pipe'

# Structural characters
COLON = ':'
QUOTE = '"'
BACKSLASH = '\\'
NEWLINE = '\n'
SPACE = ' '
LEFT_BRACKET, RIGHT_BRACKET = '[', ']'
LEFT_BRACE, RIGHT_BRACE = '{', '}'

# Literal spellings
TRUE_LITERAL, FALSE_LITERAL, NULL_LITERAL = 'true', 'false', 'null'

# Key folding modes
KEY_FOLDING_OFF, KEY_FOLDING_SAFE = 'off', 'safe'

# Path expansion modes
EXPAND_PATHS_OFF, EXPAND_PATHS_SAFE = 'off', 'safe'

# Default options (both modes start switched off)
DEFAULT_INDENT = 2
DEFAULT_KEY_FOLDING = KEY_FOLDING_OFF
DEFAULT_EXPAND_PATHS = EXPAND_PATHS_OFF
DEFAULT_STRICT = True
toon/decoder.py ADDED
@@ -0,0 +1,370 @@
1
+ """TOON decoder - convert TOON format to Python objects."""
2
+ import re
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+ from .constants import (
5
+ COMMA, TAB, PIPE, COLON, QUOTE, NEWLINE, SPACE,
6
+ DEFAULT_DELIMITER, DEFAULT_STRICT,
7
+ EXPAND_PATHS_OFF, EXPAND_PATHS_SAFE,
8
+ LEFT_BRACKET, RIGHT_BRACKET, LEFT_BRACE, RIGHT_BRACE
9
+ )
10
+ from .utils import unescape_string, parse_literal
11
+
12
+
13
class DecoderOptions:
    """Settings bundle handed to every decoder helper.

    Attributes:
        strict: When true, the tabular-array parser stops at the first
            row whose indentation is unexpected instead of skipping it
        expand_paths: 'safe' makes decode() expand dotted keys of a
            decoded dict into nested dicts; 'off' leaves keys untouched
        default_delimiter: Delimiter the tabular-array parser starts
            from before inspecting the first row
    """

    def __init__(
        self,
        strict: bool = DEFAULT_STRICT,
        expand_paths: str = EXPAND_PATHS_OFF,
        default_delimiter: str = DEFAULT_DELIMITER
    ):
        """
        Store the decoding settings unchanged.

        Args:
            strict: Validate structure strictly
            expand_paths: Path expansion mode ('off' | 'safe')
            default_delimiter: Default delimiter for arrays
        """
        self.strict, self.expand_paths, self.default_delimiter = (
            strict,
            expand_paths,
            default_delimiter,
        )
33
+
34
+
35
def decode(toon_string: str, options: Optional[Dict[str, Any]] = None) -> Any:
    """
    Convert a TOON document into Python data.

    Args:
        toon_string: TOON formatted string
        options: Optional mapping of decoding options
            - strict: bool (default True) - validate structure
            - expand_paths: 'off' (default) or 'safe'
            - default_delimiter: ',' (default)

    Returns:
        A list when the whole document is a bracketed inline array,
        otherwise a dict

    Example:
        >>> toon = '''users[1]{id,name}:
        ...   1,Alice'''
        >>> decode(toon)
        {'users': [{'id': 1, 'name': 'Alice'}]}
    """
    settings = {} if options is None else options
    opts = DecoderOptions(
        strict=settings.get('strict', DEFAULT_STRICT),
        expand_paths=settings.get('expand_paths', EXPAND_PATHS_OFF),
        default_delimiter=settings.get('default_delimiter', DEFAULT_DELIMITER)
    )

    # Whole-document shortcuts: an empty object or a single inline array.
    document = toon_string.strip()
    if document == '{}':
        return {}
    if document.startswith(LEFT_BRACKET) and document.endswith(RIGHT_BRACKET):
        return _parse_value(document, opts)

    parsed, _ = _parse_lines(toon_string.split(NEWLINE), 0, 0, opts)

    # Dotted keys are only expanded on request, and only for dict results.
    if isinstance(parsed, dict) and opts.expand_paths == EXPAND_PATHS_SAFE:
        return _expand_paths(parsed)
    return parsed
82
+
83
+
84
def _next_content_indent(lines: List[str], start_idx: int) -> Optional[int]:
    """Return the indentation of the first non-blank line at or after start_idx.

    Returns None when only blank lines (or nothing) remain.
    """
    for idx in range(start_idx, len(lines)):
        candidate = lines[idx]
        if candidate.strip():
            return len(candidate) - len(candidate.lstrip())
    return None


def _parse_lines(lines: List[str], start_idx: int, base_indent: int, opts: DecoderOptions) -> Tuple[Any, int]:
    """
    Parse the object block that starts at start_idx and sits at base_indent.

    Lines at exactly base_indent are read as array headers
    (``name[N]{fields}:`` / ``name[N]:``) or ``key: value`` pairs. A key
    with no inline value opens a nested block whose indentation is taken
    from the next non-blank line, so documents written with any indent
    width decode (previously a step of exactly 2 was assumed and other
    widths silently lost their nested content).

    Args:
        lines: All lines of the document
        start_idx: Index of the first line to examine
        base_indent: Indentation of the lines belonging to this block
        opts: Decoder options, passed through to the value/array parsers

    Returns:
        (parsed_dict, next_line_index) where next_line_index is the first
        line not consumed by this block
    """
    if start_idx >= len(lines):
        return {}, start_idx

    # Array header: name[N]{fields}: or name[N]:
    header_re = re.compile(r'^([^:\[\]]+)\[(\d+)\](?:\{([^}]+)\})?' + COLON + r'\s*$')

    result = {}
    i = start_idx

    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        # Skip empty lines
        if not stripped:
            i += 1
            continue

        indent = len(line) - len(line.lstrip())

        # A shallower line belongs to an enclosing block: this one is done.
        if indent < base_indent:
            break

        # A deeper line was not claimed by any key above it; skip it.
        if indent > base_indent:
            i += 1
            continue

        array_match = header_re.match(stripped)
        if array_match:
            key = array_match.group(1)
            count = int(array_match.group(2))
            fields_str = array_match.group(3)

            if fields_str:
                # Tabular array: one object per row, keyed by the header fields
                fields = [f.strip() for f in fields_str.split(COMMA)]
                array_value, i = _parse_tabular_array(lines, i + 1, indent, count, fields, opts)
            else:
                # List array: one item per line
                array_value, i = _parse_list_array(lines, i + 1, indent, count, opts)

            result[key] = array_value
            continue

        if COLON not in stripped:
            # No colon - not a key/value line; ignore it
            i += 1
            continue

        key, value_str = stripped.split(COLON, 1)
        key = key.strip()
        value_str = value_str.strip()

        if value_str:
            # Inline value
            result[key] = _parse_value(value_str, opts)
            i += 1
            continue

        # Nested value on the following lines: use the indentation the
        # document actually uses for the child block. When nothing deeper
        # follows, fall back to the conventional step of 2, which yields an
        # empty dict for this key.
        child_indent = _next_content_indent(lines, i + 1)
        if child_indent is None or child_indent <= indent:
            child_indent = indent + 2

        nested_value, i = _parse_lines(lines, i + 1, child_indent, opts)
        result[key] = nested_value

    return result, i
157
+
158
+
159
def _parse_tabular_array(
    lines: List[str],
    start_idx: int,
    base_indent: int,
    count: int,
    fields: List[str],
    opts: DecoderOptions
) -> Tuple[List[Dict], int]:
    """
    Parse the rows of a tabular array into a list of dicts.

    Args:
        lines: All lines of the document
        start_idx: Index of the first row (the line after the header)
        base_indent: Indentation of the array header line
        count: Number of rows announced by the header
        fields: Field names from the header, in column order
        opts: Decoder options (default delimiter and strictness)

    Returns:
        (rows, next_line_index); each row maps every field to its parsed
        cell, or to None when the row has fewer cells than fields
    """
    result = []
    i = start_idx
    expected_indent = base_indent + 2
    delimiter = opts.default_delimiter

    if i < len(lines):
        first_line = lines[i]
        first_row = first_line.strip()

        # Rows sit at whatever indentation the first row uses, provided it
        # is deeper than the header; otherwise keep the conventional +2.
        first_indent = len(first_line) - len(first_line.lstrip())
        if first_row and first_indent > base_indent:
            expected_indent = first_indent

        # Detect the delimiter from the first row. Only occurrences outside
        # quoted cells count, so a quoted value such as "a|b" in a
        # comma-delimited row no longer switches the delimiter.
        for candidate in (TAB, PIPE):
            if len(_split_row(first_row, candidate)) > 1:
                delimiter = candidate
                break

    for _ in range(count):
        if i >= len(lines):
            break

        line = lines[i]
        indent = len(line) - len(line.lstrip())

        if indent != expected_indent:
            # Strict mode stops at the first mis-indented line; lenient
            # mode skips it (it still uses up one of the announced rows).
            if opts.strict:
                break
            i += 1
            continue

        values = _split_row(line.strip(), delimiter)

        # Pair fields with cells by position; missing cells become None.
        result.append({
            field: _parse_value(values[j], opts) if j < len(values) else None
            for j, field in enumerate(fields)
        })
        i += 1

    return result, i
210
+
211
+
212
def _parse_list_array(
    lines: List[str],
    start_idx: int,
    base_indent: int,
    count: int,
    opts: DecoderOptions
) -> Tuple[List[Any], int]:
    """
    Parse the items of a list array (one item per line or nested block).

    Args:
        lines: All lines of the document
        start_idx: Index of the first item (the line after the header)
        base_indent: Indentation of the array header line
        count: Number of items announced by the header
        opts: Decoder options, passed through to the value parsers

    Returns:
        (items, next_line_index)
    """
    result = []
    i = start_idx
    expected_indent = base_indent + 2

    # Items sit at whatever indentation the first item uses, provided it is
    # deeper than the header; otherwise keep the conventional +2.
    if i < len(lines) and lines[i].strip():
        first_indent = len(lines[i]) - len(lines[i].lstrip())
        if first_indent > base_indent:
            expected_indent = first_indent

    for _ in range(count):
        if i >= len(lines):
            break

        line = lines[i]
        indent = len(line) - len(line.lstrip())

        # A shallower (or blank) line ends the array.
        if indent < expected_indent:
            break

        if indent > expected_indent:
            # Deeper than an item line: skip it (it still uses up one of
            # the announced items).
            i += 1
            continue

        stripped = line.strip()

        # An item is a nested object only when it has a colon OUTSIDE of
        # quotes; a quoted string such as "http://host" is a plain value.
        # _split_row ignores separators inside quotes, so more than one
        # piece means an unquoted colon exists.
        is_object = (
            not stripped.startswith(LEFT_BRACKET)
            and len(_split_row(stripped, COLON)) > 1
        )

        if is_object:
            nested_obj, i = _parse_lines(lines, i, indent, opts)
            result.append(nested_obj)
        else:
            result.append(_parse_value(stripped, opts))
            i += 1

    return result, i
250
+
251
+
252
def _split_row(row_str: str, delimiter: str) -> List[str]:
    """
    Split a row by delimiter, respecting quoted strings.

    Cells are returned verbatim, including their surrounding quotes and
    any escape sequences, so the value parser can still tell the quoted
    string "123" from the number 123 and can do the unescaping itself.
    Delimiters inside a quoted section do not split, and a
    backslash-escaped character inside quotes (for example an escaped
    quote) never ends the quoted section.

    Args:
        row_str: Row string to split
        delimiter: Delimiter character

    Returns:
        List of raw cell strings; empty for an empty row
    """
    values = []
    current = []
    in_quote = False
    length = len(row_str)
    i = 0

    while i < length:
        char = row_str[i]

        if in_quote and char == '\\' and i + 1 < length:
            # Keep the escape pair together so an escaped quote is not
            # mistaken for the closing quote.
            current.append(char)
            current.append(row_str[i + 1])
            i += 2
            continue

        if char == delimiter and not in_quote:
            values.append(''.join(current))
            current = []
            i += 1
            continue

        if char == QUOTE:
            in_quote = not in_quote

        current.append(char)
        i += 1

    # Add the last cell; a trailing delimiter yields a final empty cell.
    if current or values:
        values.append(''.join(current))

    return values
292
+
293
+
294
def _parse_value(value_str: str, opts: DecoderOptions) -> Any:
    """
    Parse a single value string.

    Args:
        value_str: Raw text of the value; surrounding whitespace is ignored
        opts: Decoder options, passed on when parsing inline array items

    Returns:
        None for empty text, the unescaped content of a quoted string, a
        list for an inline ``[a,b,...]`` array, {} for ``{}``, otherwise
        whatever parse_literal makes of the text
    """
    text = value_str.strip()

    if not text:
        return None

    # Quoted string: drop the quotes and unescape the content.
    if len(text) >= 2 and text.startswith(QUOTE) and text.endswith(QUOTE):
        return unescape_string(text[1:-1])

    # Inline array [val1,val2,...]
    if text.startswith(LEFT_BRACKET) and text.endswith(RIGHT_BRACKET):
        inner = text[1:-1]
        if not inner:
            return []

        # Pick the delimiter from occurrences OUTSIDE quoted items only, so
        # ["a|b","c"] stays comma-delimited. _split_row ignores separators
        # inside quotes, so more than one piece means an unquoted one exists.
        delimiter = COMMA
        for candidate in (TAB, PIPE):
            if len(_split_row(inner, candidate)) > 1:
                delimiter = candidate
                break

        return [_parse_value(item.strip(), opts) for item in _split_row(inner, delimiter)]

    # Empty object
    if text == '{}':
        return {}

    # Anything else is a bare literal (bool, null, number, or string)
    return parse_literal(text)
329
+
330
+
331
+ def _expand_paths(obj: dict) -> dict:
332
+ """
333
+ Expand dotted paths into nested objects.
334
+
335
+ Args:
336
+ obj: Object with potentially dotted keys
337
+
338
+ Returns:
339
+ Expanded object
340
+ """
341
+ result = {}
342
+
343
+ for key, value in obj.items():
344
+ if '.' in key:
345
+ # Split path and create nested structure
346
+ parts = key.split('.')
347
+ current = result
348
+
349
+ for i, part in enumerate(parts[:-1]):
350
+ if part not in current:
351
+ current[part] = {}
352
+ elif not isinstance(current[part], dict):
353
+ # Conflict - keep original
354
+ result[key] = value
355
+ break
356
+ current = current[part]
357
+ else:
358
+ # Set final value
359
+ current[parts[-1]] = value
360
+ else:
361
+ result[key] = value
362
+
363
+ # Recursively expand nested objects
364
+ for key, value in result.items():
365
+ if isinstance(value, dict):
366
+ result[key] = _expand_paths(value)
367
+ elif isinstance(value, list):
368
+ result[key] = [_expand_paths(item) if isinstance(item, dict) else item for item in value]
369
+
370
+ return result