cton 0.1.1 → 0.3.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
data/lib/cton/decoder.rb CHANGED
@@ -5,13 +5,15 @@ require "strscan"
5
5
  module Cton
6
6
  class Decoder
7
7
  TERMINATORS = [",", ";", ")", "]", "}"].freeze
8
+ KEY_VALUE_BOUNDARY_TOKENS = ["(", "[", "="].freeze
8
9
 
9
10
  def initialize(symbolize_names: false)
10
11
  @symbolize_names = symbolize_names
11
12
  end
12
13
 
13
14
  def decode(cton)
14
- @scanner = StringScanner.new(cton.to_s)
15
+ @raw_string = cton.to_s
16
+ @scanner = StringScanner.new(@raw_string)
15
17
  skip_ws
16
18
 
17
19
  value = if key_ahead?
@@ -21,14 +23,28 @@ module Cton
21
23
  end
22
24
 
23
25
  skip_ws
24
- raise ParseError, "Unexpected trailing data" unless @scanner.eos?
26
+ raise_error("Unexpected trailing data") unless @scanner.eos?
25
27
 
26
28
  value
27
29
  end
28
30
 
29
31
  private
30
32
 
31
- attr_reader :symbolize_names, :scanner
33
+ attr_reader :symbolize_names, :scanner, :raw_string
34
+
35
+ def raise_error(message)
36
+ line, col = calculate_location(@scanner.pos)
37
+ raise ParseError, "#{message} at line #{line}, column #{col}"
38
+ end
39
+
40
+ def calculate_location(pos)
41
+ string = raw_string
42
+ consumed = string[0...pos]
43
+ line = consumed.count("\n") + 1
44
+ last_newline = consumed.rindex("\n")
45
+ col = last_newline ? pos - last_newline : pos + 1
46
+ [line, col]
47
+ end
32
48
 
33
49
  def parse_document
34
50
  result = {}
@@ -43,22 +59,20 @@ module Cton
43
59
 
44
60
  def parse_value_for_key
45
61
  skip_ws
46
- if @scanner.scan(/\(/)
62
+ if @scanner.scan("(")
47
63
  parse_object
48
- elsif @scanner.scan(/\[/)
64
+ elsif @scanner.scan("[")
49
65
  parse_array
50
- elsif @scanner.scan(/=/)
66
+ elsif @scanner.scan("=")
51
67
  parse_scalar(allow_key_boundary: true)
52
68
  else
53
- raise ParseError, "Unexpected token at position #{@scanner.pos}"
69
+ raise_error("Unexpected token")
54
70
  end
55
71
  end
56
72
 
57
73
  def parse_object
58
74
  skip_ws
59
- if @scanner.scan(/\)/)
60
- return {}
61
- end
75
+ return {} if @scanner.scan(")")
62
76
 
63
77
  pairs = {}
64
78
  loop do
@@ -67,7 +81,8 @@ module Cton
67
81
  value = parse_value
68
82
  pairs[key] = value
69
83
  skip_ws
70
- break if @scanner.scan(/\)/)
84
+ break if @scanner.scan(")")
85
+
71
86
  expect!(",")
72
87
  skip_ws
73
88
  end
@@ -92,7 +107,8 @@ module Cton
92
107
  fields = []
93
108
  loop do
94
109
  fields << parse_key_name
95
- break if @scanner.scan(/\}/)
110
+ break if @scanner.scan("}")
111
+
96
112
  expect!(",")
97
113
  end
98
114
  fields
@@ -125,9 +141,9 @@ module Cton
125
141
 
126
142
  def parse_value(allow_key_boundary: false)
127
143
  skip_ws
128
- if @scanner.scan(/\(/)
144
+ if @scanner.scan("(")
129
145
  parse_object
130
- elsif @scanner.scan(/\[/)
146
+ elsif @scanner.scan("[")
131
147
  parse_array
132
148
  elsif @scanner.peek(1) == '"'
133
149
  parse_string
@@ -140,242 +156,88 @@ module Cton
140
156
  skip_ws
141
157
  return parse_string if @scanner.peek(1) == '"'
142
158
 
143
- start_pos = @scanner.pos
144
-
145
- # If we allow key boundary, we need to be careful not to consume the next key
146
- # This is the tricky part. The original implementation scanned ahead.
147
- # With StringScanner, we can scan until a terminator or whitespace.
148
-
159
+ @scanner.pos
160
+
149
161
  token = if allow_key_boundary
150
162
  scan_until_boundary_or_terminator
151
163
  else
152
164
  scan_until_terminator
153
165
  end
154
166
 
155
- raise ParseError, "Empty value at #{start_pos}" if token.nil? || token.empty?
167
+ raise_error("Empty value") if token.nil? || token.empty?
156
168
 
157
169
  convert_scalar(token)
158
170
  end
159
171
 
160
172
  def scan_until_terminator
161
- # Scan until we hit a terminator char, whitespace, or structure char
162
- # Terminators: , ; ) ] }
163
- # Structure: ( [ {
164
- # Whitespace
165
-
166
- @scanner.scan(/[^,;\]\}\)\(\[\{\s]+/)
173
+ start_pos = @scanner.pos
174
+ end_pos = find_terminator_position(start_pos)
175
+ consume_slice(start_pos, end_pos)
167
176
  end
168
177
 
169
178
  def scan_until_boundary_or_terminator
170
- # This is complex because "key=" looks like a scalar "key" followed by "="
171
- # But "value" followed by "key=" means "value" ends before "key".
172
- # The original logic used `next_key_index`.
173
-
174
- # Let's try to replicate the logic:
175
- # Scan characters that are safe for keys/values.
176
- # If we see something that looks like a key start, check if it is followed by [(=
177
-
178
179
  start_pos = @scanner.pos
179
-
180
- # Fast path: scan until something interesting happens
181
- chunk = @scanner.scan(/[0-9A-Za-z_.:-]+/)
182
- return nil unless chunk
183
-
184
- # Now we might have consumed too much if the chunk contains a key.
185
- # e.g. "valuekey=" -> chunk is "valuekey"
186
- # We need to check if there is a split point inside `chunk` or if `chunk` itself is followed by [(=
187
-
188
- # Actually, the original logic was:
189
- # Find the *first* position where a valid key starts AND is followed by [(=
190
-
191
- # Let's re-implement `next_key_index` logic but using the scanner's string
192
-
193
- rest_of_string = @scanner.string[@scanner.pos..-1]
194
- # But we also need to consider the chunk we just scanned?
195
- # No, `scan_until_boundary_or_terminator` is called when we are at the start of a scalar.
196
-
197
- # Let's reset and do it properly.
198
- @scanner.pos = start_pos
199
-
200
- full_scalar = scan_until_terminator
201
- return nil unless full_scalar
202
-
203
- # Now check if `full_scalar` contains a key boundary
204
- # A key boundary is a substring that matches SAFE_TOKEN and is followed by [(=
205
-
206
- # We need to look at `full_scalar` + whatever follows (whitespace?) + [(=
207
- # But `scan_until_terminator` stops at whitespace.
208
-
209
- # If `full_scalar` is "valuekey", and next char is "=", then "key" is the key.
210
- # But wait, "value" and "key" must be separated?
211
- # In CTON, "valuekey=..." is ambiguous if no separator.
212
- # The README says: "Removing every newline makes certain inputs ambiguous... The default separator avoids that... You may pass separator: ''... decoding such strings is only safe if you can guarantee extra quoting or whitespace".
213
-
214
- # So if we are in `allow_key_boundary` mode (top level), we must look for embedded keys.
215
-
216
- # Let's look for the pattern inside the text we just consumed + lookahead.
217
- # Actually, the original `next_key_index` scanned from the current position.
218
-
219
- # Let's implement a helper that searches for the boundary in the remaining string
220
- # starting from `start_pos`.
221
-
222
- boundary_idx = find_key_boundary(start_pos)
223
-
224
- if boundary_idx
225
- # We found a boundary at `boundary_idx`.
226
- # The scalar ends at `boundary_idx`.
227
- length = boundary_idx - start_pos
228
- @scanner.pos = start_pos
229
- token = @scanner.peek(length)
230
- @scanner.pos += length
231
- token
232
- else
233
- # No boundary found, so the whole thing we scanned is the token
234
- # We already scanned it into `full_scalar` but we need to put the scanner in the right place.
235
- # Wait, I reset the scanner.
236
- @scanner.pos = start_pos + full_scalar.length
237
- full_scalar
180
+ boundary_pos = find_key_boundary(start_pos)
181
+ end_pos = boundary_pos || find_terminator_position(start_pos)
182
+ consume_slice(start_pos, end_pos)
183
+ end
184
+
185
+ def consume_slice(start_pos, end_pos)
186
+ return nil if end_pos <= start_pos
187
+
188
+ token = raw_string.byteslice(start_pos, end_pos - start_pos)
189
+ @scanner.pos = end_pos
190
+ token
191
+ end
192
+
193
+ def find_terminator_position(start_pos)
194
+ str = raw_string
195
+ len = str.length
196
+ idx = start_pos
197
+
198
+ while idx < len
199
+ char = str[idx]
200
+ break if terminator?(char)
201
+
202
+ idx += 1
238
203
  end
204
+
205
+ idx
239
206
  end
240
207
 
241
208
  def find_key_boundary(from_index)
242
- str = @scanner.string
209
+ str = raw_string
243
210
  len = str.length
244
211
  idx = from_index
245
-
246
- # We are looking for a sequence that matches SAFE_KEY followed by [(=
247
- # But we are currently parsing a scalar.
248
-
249
- # Optimization: we only care about boundaries that appear *before* any terminator/whitespace.
250
- # Because if we hit a terminator/whitespace, the scalar ends anyway.
251
-
252
- # So we only need to check inside the `scan_until_terminator` range?
253
- # No, because "valuekey=" has no terminator/whitespace between value and key.
254
-
212
+
255
213
  while idx < len
256
214
  char = str[idx]
257
-
258
- # If we hit a terminator or whitespace, we stop looking for boundaries
259
- # because the scalar naturally ends here.
260
- if TERMINATORS.include?(char) || whitespace?(char) || "([{".include?(char)
261
- return nil
262
- end
263
-
264
- # Check if a key starts here
215
+
216
+ return nil if terminator?(char)
217
+
265
218
  if safe_key_char?(char)
266
- # Check if this potential key is followed by [(=
267
- # We need to scan this potential key
268
219
  key_end = idx
269
- while key_end < len && safe_key_char?(str[key_end])
270
- key_end += 1
271
- end
272
-
273
- # Check what follows
274
- next_char_idx = key_end
275
- # Skip whitespace after key? No, keys are immediately followed by [(= usually?
276
- # The original `next_key_index` did NOT skip whitespace after the key candidate.
277
- # "next_char = @source[idx]" (where idx is after key)
278
-
279
- if next_char_idx < len
280
- next_char = str[next_char_idx]
281
- if ["(", "[", "="].include?(next_char)
282
- # Found a boundary!
283
- # But wait, is this the *start* of the scalar?
284
- # If idx == from_index, then the scalar IS the key? No, that means we are at the start.
285
- # If we are at the start, and it looks like a key, then it IS a key, so we should have parsed it as a key?
286
- # No, `parse_scalar` is called when we expect a value.
287
- # If we are parsing a document "key=valuekey2=value2", we are parsing "valuekey2".
288
- # "key2" is the next key. So "value" is the scalar.
289
- # So if idx > from_index, we found a split.
290
-
291
- return idx if idx > from_index
292
- end
220
+ key_end += 1 while key_end < len && safe_key_char?(str[key_end])
221
+
222
+ if key_end < len && KEY_VALUE_BOUNDARY_TOKENS.include?(str[key_end]) && idx > from_index && boundary_start_allowed?(str[idx])
223
+ return idx
293
224
  end
294
-
295
- # If not a boundary, we continue scanning from inside the key?
296
- # "valuekey=" -> at 'k', key is "key", followed by '=', so split at 'k'.
297
- # "valukey=" -> at 'l', key is "lukey", followed by '=', so split at 'l'.
298
- # This seems to imply we should check every position?
299
- # The original code:
300
- # if safe_key_char?(char)
301
- # start = idx
302
- # idx += 1 while ...
303
- # if start > from_index && ... return start
304
- # idx = start + 1 <-- This is important! It backtracks to check nested keys.
305
- # next
306
-
307
- # Yes, we need to check every position.
308
-
309
- # Optimization: The key must end at `key_end`.
310
- # If `str[key_end]` is not [(=, then this `key_candidate` is not a key.
311
- # But maybe a suffix of it is?
312
- # e.g. "abc=" -> "abc" followed by "=". Split at start? No.
313
- # "a" followed by "bc="? No.
314
-
315
- # Actually, if we find a valid key char, we scan to the end of the valid key chars.
316
- # Let's say we have "abc=def".
317
- # At 'a': key is "abc". Next is "=". "abc" is a key.
318
- # If we are at start (from_index), then the whole thing is a key?
319
- # But we are parsing a scalar.
320
- # If `parse_scalar` sees "abc=", and `allow_key_boundary` is true.
321
- # Does it mean "abc" is the scalar? Or "abc" is the next key?
322
- # If "abc" is the next key, then the scalar before it is empty?
323
- # "key=abc=def" -> key="key", value="abc", next_key="def"? No.
324
- # "key=value next=val" -> value="value", next="next".
325
- # "key=valuenext=val" -> value="value", next="next".
326
-
327
- # So if we find a key boundary at `idx`, it means the scalar ends at `idx`.
328
-
329
- # Let's stick to the original logic:
330
- # Scan the maximal sequence of safe chars.
331
- # If it is followed by [(=, then it IS a key.
332
- # If it starts after `from_index`, then we found the boundary.
333
- # If it starts AT `from_index`, then... what?
334
- # If we are parsing a scalar, and we see "key=...", then the scalar is empty?
335
- # That shouldn't happen if we called `parse_scalar`.
336
- # Unless `parse_document` called `parse_value_for_key` -> `parse_scalar`.
337
- # But `parse_document` calls `parse_key_name` first.
338
- # So we are inside `parse_value`.
339
-
340
- # Example: "a=1b=2".
341
- # parse "a", expect "=", parse value.
342
- # value starts at "1".
343
- # "1" is safe char. "1b" is safe.
344
- # "b" is safe.
345
- # At "1": max key is "1b". Next is "=". "1b" is a key? Yes.
346
- # Is "1b" followed by "="? Yes.
347
- # Does it start > from_index? "1" is at from_index. No.
348
- # So "1b" is NOT a boundary.
349
- # Continue to next char "b".
350
- # At "b": max key is "b". Next is "=". "b" is a key.
351
- # Does it start > from_index? Yes ("b" index > "1" index).
352
- # So boundary is at "b".
353
- # Scalar is "1".
354
-
355
- # So the logic is:
356
- # For each char at `idx`:
357
- # If it can start a key:
358
- # Find end of key `end_key`.
359
- # If `str[end_key]` is [(= :
360
- # If `idx > from_index`: return `idx`.
361
- # idx += 1
362
-
363
- # But wait, "1b" was a key candidate.
364
- # If we advanced `idx` to `end_key`, we would skip "b".
365
- # So we must NOT advance `idx` to `end_key` blindly.
366
- # We must check `idx`, then `idx+1`, etc.
367
-
368
- # But `safe_key_char?` is true for all chars in "1b".
369
- # So we check "1...", then "b...".
370
-
371
- # Correct.
372
225
  end
373
-
226
+
374
227
  idx += 1
375
228
  end
229
+
376
230
  nil
377
231
  end
378
232
 
233
+ def terminator?(char)
234
+ TERMINATORS.include?(char) || whitespace?(char) || ["(", "[", "{"].include?(char)
235
+ end
236
+
237
+ def boundary_start_allowed?(char)
238
+ !char.nil? && char.match?(/[A-Za-z_.:-]/)
239
+ end
240
+
379
241
  def convert_scalar(token)
380
242
  case token
381
243
  when "true" then true
@@ -396,22 +258,20 @@ module Cton
396
258
  expect!("\"")
397
259
  buffer = +""
398
260
  loop do
399
- if @scanner.eos?
400
- raise ParseError, "Unterminated string"
401
- end
402
-
261
+ raise_error("Unterminated string") if @scanner.eos?
262
+
403
263
  char = @scanner.getch
404
-
405
- if char == '\\'
264
+
265
+ if char == "\\"
406
266
  escaped = @scanner.getch
407
- raise ParseError, "Invalid escape sequence" if escaped.nil?
267
+ raise_error("Invalid escape sequence") if escaped.nil?
408
268
  buffer << case escaped
409
- when 'n' then "\n"
410
- when 'r' then "\r"
411
- when 't' then "\t"
412
- when '"', '\\' then escaped
269
+ when "n" then "\n"
270
+ when "r" then "\r"
271
+ when "t" then "\t"
272
+ when '"', "\\" then escaped
413
273
  else
414
- raise ParseError, "Unsupported escape sequence"
274
+ raise_error("Unsupported escape sequence")
415
275
  end
416
276
  elsif char == '"'
417
277
  break
@@ -425,16 +285,16 @@ module Cton
425
285
  def parse_key_name
426
286
  skip_ws
427
287
  token = @scanner.scan(/[0-9A-Za-z_.:-]+/)
428
- raise ParseError, "Invalid key" if token.nil?
288
+ raise_error("Invalid key") if token.nil?
429
289
  symbolize_names ? token.to_sym : token
430
290
  end
431
291
 
432
292
  def parse_integer_literal
433
293
  token = @scanner.scan(/-?\d+/)
434
- raise ParseError, "Expected digits" if token.nil?
294
+ raise_error("Expected digits") if token.nil?
435
295
  Integer(token, 10)
436
296
  rescue ArgumentError
437
- raise ParseError, "Invalid length literal"
297
+ raise_error("Invalid length literal")
438
298
  end
439
299
 
440
300
  def symbolize_keys(row)
@@ -443,9 +303,9 @@ module Cton
443
303
 
444
304
  def expect!(char)
445
305
  skip_ws
446
- unless @scanner.scan(Regexp.new(Regexp.escape(char)))
447
- raise ParseError, "Expected #{char.inspect}, got #{@scanner.peek(1).inspect}"
448
- end
306
+ return if @scanner.scan(Regexp.new(Regexp.escape(char)))
307
+
308
+ raise_error("Expected #{char.inspect}, got #{@scanner.peek(1).inspect}")
449
309
  end
450
310
 
451
311
  def skip_ws
@@ -453,18 +313,14 @@ module Cton
453
313
  end
454
314
 
455
315
  def whitespace?(char)
456
- char == " " || char == "\t" || char == "\n" || char == "\r"
316
+ [" ", "\t", "\n", "\r"].include?(char)
457
317
  end
458
318
 
459
319
  def key_ahead?
460
- # Check if the next token looks like a key followed by [(=
461
- # We need to preserve position
462
320
  pos = @scanner.pos
463
321
  skip_ws
464
-
465
- # Scan a key
322
+
466
323
  if @scanner.scan(/[0-9A-Za-z_.:-]+/)
467
- # Check what follows
468
324
  skip_ws
469
325
  next_char = @scanner.peek(1)
470
326
  result = ["(", "[", "="].include?(next_char)