cton 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: d011537d6c7b854d8ffffb3dc1e7848534a7919c12f4c1cc4ba84db338cb1669
-   data.tar.gz: c96a91aae6acc9dd37e3df0b8e0fe28acfbdb6c20ea1f22a946676962f7bf7ac
+   metadata.gz: '0295922a011dd898278f9f57de2f48f2acbe0ce3363f263f4e2b2753993ebdea'
+   data.tar.gz: 33bb13ee23584ff6cd51bf6bc6c1a869d02f206ff0792a172c9aa7cdc5547977
  SHA512:
-   metadata.gz: 77be2aa2db0a728eaa6835be6a66456fefcd12ca27f32d3fbb3a6d77d91f4b109eca64eb958f24bb63b7ae2e193f389029fc6f1986cd590b708da3e3820b288e
-   data.tar.gz: b09740b530363e012c7f00ba6b33aaec366503310756e328a00d9def849fcb13158e911265cf57dcf6a8b613be6cd3e0e6bf164e1cfa480983b73c29a53e8cbb
+   metadata.gz: 9dff47df67680eabf6fb7ac05dac606e969df0a0d31d575318fa4a72c51c8fe85d38b671f4e2b8b8caa0e969184c044e06547b135b9a5b5b0baa4c3e28232322
+   data.tar.gz: be363392d2305b6940e46060310a908922bbf822ae487e9d4b20441e4cc51e5e8161332cd5d8c0846743663d1251ff7b1b5480f21f36f9b52c0aafb0404f4b74
data/.rubocop.yml CHANGED
@@ -1,4 +1,5 @@
  AllCops:
+   NewCops: enable
    TargetRubyVersion: 3.1
 
  Style/StringLiterals:
@@ -6,3 +7,15 @@ Style/StringLiterals:
 
  Style/StringLiteralsInInterpolation:
    EnforcedStyle: double_quotes
+
+ Style/FrozenStringLiteralComment:
+   Enabled: true
+
+ Metrics/MethodLength:
+   Max: 25
+
+ Metrics/ClassLength:
+   Max: 200
+
+ Layout/LineLength:
+   Max: 120
data/CHANGELOG.md CHANGED
@@ -5,6 +5,23 @@ All notable changes to this project will be documented in this file.
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+ ## [0.1.1] - 2025-11-18
+
+ ### Changed
+
+ - **Performance**: Refactored `Encoder` to use `StringIO` and `Decoder` to use `StringScanner`, significantly improving speed and memory usage.
+ - **Architecture**: Split the `Cton` module into dedicated `Cton::Encoder` and `Cton::Decoder` classes for better maintainability.
+
+ ### Fixed
+
+ - **Parsing**: Unterminated strings are now detected and raise a parse error.
+ - **Whitespace**: Improved whitespace handling in the decoder, notably between keys and structure markers.
+
+ ### Added
+
+ - **Type Safety**: Added RBS signatures (`sig/cton.rbs`) for better IDE support and static analysis.
+ - **Tests**: Expanded test coverage for validation, complex tables, mixed arrays, Unicode values, and error cases.
+
  ## [0.1.0] - 2025-11-18
 
  ### Added
data/README.md CHANGED
@@ -90,12 +90,16 @@ Following the TOON specification's guardrails, the encoder now:
  - Canonicalizes float/BigDecimal output: no exponent notation, no trailing zeros, and `-0` collapses to `0`.
  - Converts `NaN` and `±Infinity` inputs to `null`, matching TOON's normalization guidance so downstream decoders don't explode on non-finite numbers.
 
+ ## Type Safety
+
+ CTON ships with RBS signatures (`sig/cton.rbs`) to support type checking and IDE autocompletion.
+
  ## Development
 
  ```bash
- bin/setup # install dependencies
- bundle exec rspec
- bin/console # interactive playground
+ bin/setup # install dependencies
+ bundle exec rake # run tests and rubocop
+ bin/console # interactive playground
  ```
 
  To release a new version, bump `Cton::VERSION` and run `bundle exec rake release`.
@@ -0,0 +1,491 @@
+ # frozen_string_literal: true
+
+ require "strscan"
+
+ module Cton
+   class Decoder
+     TERMINATORS = [",", ";", ")", "]", "}"].freeze
+
+     def initialize(symbolize_names: false)
+       @symbolize_names = symbolize_names
+     end
+
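+     # Illustrative example (not taken from the released docs):
+     #   Decoder.new.decode("user(name=Ada,age=36)")
+     #   #=> { "user" => { "name" => "Ada", "age" => 36 } }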
+     def decode(cton)
+       @scanner = StringScanner.new(cton.to_s)
+       skip_ws
+
+       value = if key_ahead?
+                 parse_document
+               else
+                 parse_value(allow_key_boundary: true)
+               end
+
+       skip_ws
+       raise ParseError, "Unexpected trailing data" unless @scanner.eos?
+
+       value
+     end
+
+     private
+
+     attr_reader :symbolize_names, :scanner
+
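+     # A top-level document is a run of key/value entries consumed until EOF.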
+     def parse_document
+       result = {}
+       until @scanner.eos?
+         key = parse_key_name
+         value = parse_value_for_key
+         result[key] = value
+         skip_ws
+       end
+       result
+     end
+
+     def parse_value_for_key
+       skip_ws
+       if @scanner.scan(/\(/)
+         parse_object
+       elsif @scanner.scan(/\[/)
+         parse_array
+       elsif @scanner.scan(/=/)
+         parse_scalar(allow_key_boundary: true)
+       else
+         raise ParseError, "Unexpected token at position #{@scanner.pos}"
+       end
+     end
+
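+     # Objects are parenthesised, comma-separated pairs, e.g. `(a=1,b=2)`.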
+     def parse_object
+       skip_ws
+       if @scanner.scan(/\)/)
+         return {}
+       end
+
+       pairs = {}
+       loop do
+         key = parse_key_name
+         expect!("=")
+         value = parse_value
+         pairs[key] = value
+         skip_ws
+         break if @scanner.scan(/\)/)
+         expect!(",")
+         skip_ws
+       end
+       pairs
+     end
+
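+     # Arrays declare their length up front (`key[2]=a,b`); an optional
+     # `{field,...}` header switches the body to tabular row parsing.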
+     def parse_array
+       length = parse_integer_literal
+       expect!("]")
+       skip_ws
+
+       header = parse_header if @scanner.peek(1) == "{"
+
+       expect!("=")
+       return [] if length.zero?
+
+       header ? parse_table_rows(length, header) : parse_array_elements(length)
+     end
+
+     def parse_header
+       expect!("{")
+       fields = []
+       loop do
+         fields << parse_key_name
+         break if @scanner.scan(/\}/)
+         expect!(",")
+       end
+       fields
+     end
+
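+     # Table rows are separated by `;` and columns by `,`, in header order.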
+     def parse_table_rows(length, header)
+       rows = []
+       length.times do |row_index|
+         row = {}
+         header.each_with_index do |field, column_index|
+           allow_boundary = row_index == length - 1 && column_index == header.length - 1
+           row[field] = parse_scalar(allow_key_boundary: allow_boundary)
+           expect!(",") if column_index < header.length - 1
+         end
+         rows << symbolize_keys(row)
+         expect!(";") if row_index < length - 1
+       end
+       rows
+     end
+
+     def parse_array_elements(length)
+       values = []
+       length.times do |index|
+         allow_boundary = index == length - 1
+         values << parse_value(allow_key_boundary: allow_boundary)
+         expect!(",") if index < length - 1
+       end
+       values
+     end
+
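+     # A value is a nested object, a length-prefixed array, a quoted string,
+     # or a bare scalar.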
+     def parse_value(allow_key_boundary: false)
+       skip_ws
+       if @scanner.scan(/\(/)
+         parse_object
+       elsif @scanner.scan(/\[/)
+         parse_array
+       elsif @scanner.peek(1) == '"'
+         parse_string
+       else
+         parse_scalar(allow_key_boundary: allow_key_boundary)
+       end
+     end
+
+     def parse_scalar(allow_key_boundary: false)
+       skip_ws
+       return parse_string if @scanner.peek(1) == '"'
+
+       start_pos = @scanner.pos
+
+       # In key-boundary mode a bare scalar may run straight into the next
+       # top-level key (the document separator can be omitted), so we must
+       # stop before that key instead of consuming it.
+       token = if allow_key_boundary
+                 scan_until_boundary_or_terminator
+               else
+                 scan_until_terminator
+               end
+
+       raise ParseError, "Empty value at #{start_pos}" if token.nil? || token.empty?
+
+       convert_scalar(token)
+     end
+
+     def scan_until_terminator
+       # Consume everything up to a terminator (, ; ) ] }), a structure
+       # opener (( [ {), or whitespace.
+       @scanner.scan(/[^,;\]\}\)\(\[\{\s]+/)
+     end
+
+     def scan_until_boundary_or_terminator
+       start_pos = @scanner.pos
+
+       # Bare scalars in boundary mode must start with a key-safe character.
+       return nil unless @scanner.match?(/[0-9A-Za-z_.:-]/)
+
+       full_scalar = scan_until_terminator
+       return nil unless full_scalar
+
+       # The scalar may have swallowed the start of the next key: in "a=1b=2"
+       # the run "1b" is really the scalar "1" followed by the key "b". If an
+       # embedded key boundary exists, cut the token there and rewind.
+       boundary_idx = find_key_boundary(start_pos)
+       return full_scalar unless boundary_idx
+
+       length = boundary_idx - start_pos
+       @scanner.pos = start_pos
+       token = @scanner.peek(length)
+       @scanner.pos += length
+       token
+     end
+
+     # Returns the index of the first position after +from_index+ where a run
+     # of key-safe characters is immediately followed by "(", "[" or "=",
+     # i.e. where the next key begins, or nil if the scalar ends at a
+     # terminator, whitespace, or structure opener first. Every offset has to
+     # be tried: in "1b=2" the full run "1b" starts the scalar and is not a
+     # boundary, but the "b" inside it is.
+     def find_key_boundary(from_index)
+       str = @scanner.string
+       len = str.length
+       idx = from_index
+
+       while idx < len
+         char = str[idx]
+
+         # A terminator, whitespace, or structure opener ends the scalar
+         # naturally, so no embedded key can follow within this token.
+         return nil if TERMINATORS.include?(char) || whitespace?(char) || "([{".include?(char)
+
+         if safe_key_char?(char)
+           # Maximal run of key-safe characters starting at idx.
+           key_end = idx
+           key_end += 1 while key_end < len && safe_key_char?(str[key_end])
+
+           # It is a boundary only if the run is followed by (, [ or = and it
+           # does not start at the very beginning of the scalar.
+           return idx if key_end < len && ["(", "[", "="].include?(str[key_end]) && idx > from_index
+         end
+
+         idx += 1
+       end
+       nil
+     end
+
+     def convert_scalar(token)
+       case token
+       when "true" then true
+       when "false" then false
+       when "null" then nil
+       else
+         if integer?(token)
+           token.to_i
+         elsif float?(token)
+           token.to_f
+         else
+           token
+         end
+       end
+     end
+
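+     # Strings are double-quoted; only \n, \r, \t, \" and \\ escapes are accepted.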
+     def parse_string
+       expect!("\"")
+       buffer = +""
+       loop do
+         if @scanner.eos?
+           raise ParseError, "Unterminated string"
+         end
+
+         char = @scanner.getch
+
+         if char == '\\'
+           escaped = @scanner.getch
+           raise ParseError, "Invalid escape sequence" if escaped.nil?
+           buffer << case escaped
+                     when 'n' then "\n"
+                     when 'r' then "\r"
+                     when 't' then "\t"
+                     when '"', '\\' then escaped
+                     else
+                       raise ParseError, "Unsupported escape sequence"
+                     end
+         elsif char == '"'
+           break
+         else
+           buffer << char
+         end
+       end
+       buffer
+     end
+
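+     # Keys may contain ASCII letters, digits, "_", ".", ":" and "-".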
+     def parse_key_name
+       skip_ws
+       token = @scanner.scan(/[0-9A-Za-z_.:-]+/)
+       raise ParseError, "Invalid key" if token.nil?
+       symbolize_names ? token.to_sym : token
+     end
+
+     def parse_integer_literal
+       token = @scanner.scan(/-?\d+/)
+       raise ParseError, "Expected digits" if token.nil?
+       Integer(token, 10)
+     rescue ArgumentError
+       raise ParseError, "Invalid length literal"
+     end
+
+     def symbolize_keys(row)
+       symbolize_names ? row.transform_keys(&:to_sym) : row
+     end
+
+     def expect!(char)
+       skip_ws
+       unless @scanner.scan(Regexp.new(Regexp.escape(char)))
+         raise ParseError, "Expected #{char.inspect}, got #{@scanner.peek(1).inspect}"
+       end
+     end
+
+     def skip_ws
+       @scanner.skip(/\s+/)
+     end
+
+     def whitespace?(char)
+       char == " " || char == "\t" || char == "\n" || char == "\r"
+     end
+
+     def key_ahead?
+       # Check whether the next token looks like a key followed by "(", "["
+       # or "=", restoring the scanner position either way.
+       pos = @scanner.pos
+       skip_ws
+
+       if @scanner.scan(/[0-9A-Za-z_.:-]+/)
+         skip_ws
+         next_char = @scanner.peek(1)
+         result = ["(", "[", "="].include?(next_char)
+         @scanner.pos = pos
+         result
+       else
+         @scanner.pos = pos
+         false
+       end
+     end
+
+     def safe_key_char?(char)
+       !char.nil? && char.match?(/[0-9A-Za-z_.:-]/)
+     end
+
+     def integer?(token)
+       token.match?(/\A-?(?:0|[1-9]\d*)\z/)
+     end
+
+     def float?(token)
+       token.match?(/\A-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?\z/)
+     end
+   end
+ end