cton 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +33 -0
- data/README.md +258 -35
- data/bench/encode_decode_bench.rb +65 -0
- data/lib/cton/decoder.rb +98 -242
- data/lib/cton/encoder.rb +171 -41
- data/lib/cton/version.rb +1 -1
- data/lib/cton.rb +17 -3
- metadata +4 -2
data/lib/cton/decoder.rb
CHANGED
|
@@ -5,13 +5,15 @@ require "strscan"
|
|
|
5
5
|
module Cton
|
|
6
6
|
class Decoder
|
|
7
7
|
TERMINATORS = [",", ";", ")", "]", "}"].freeze
|
|
8
|
+
KEY_VALUE_BOUNDARY_TOKENS = ["(", "[", "="].freeze
|
|
8
9
|
|
|
9
10
|
# Creates a decoder.
#
# @param symbolize_names [Boolean] stored on the instance; key parsing
#   consults it to decide whether key names are returned as symbols
def initialize(symbolize_names: false)
  @symbolize_names = symbolize_names
end
|
|
12
13
|
|
|
13
14
|
def decode(cton)
|
|
14
|
-
@
|
|
15
|
+
@raw_string = cton.to_s
|
|
16
|
+
@scanner = StringScanner.new(@raw_string)
|
|
15
17
|
skip_ws
|
|
16
18
|
|
|
17
19
|
value = if key_ahead?
|
|
@@ -21,14 +23,28 @@ module Cton
|
|
|
21
23
|
end
|
|
22
24
|
|
|
23
25
|
skip_ws
|
|
24
|
-
|
|
26
|
+
raise_error("Unexpected trailing data") unless @scanner.eos?
|
|
25
27
|
|
|
26
28
|
value
|
|
27
29
|
end
|
|
28
30
|
|
|
29
31
|
private
|
|
30
32
|
|
|
31
|
-
attr_reader :symbolize_names, :scanner
|
|
33
|
+
attr_reader :symbolize_names, :scanner, :raw_string
|
|
34
|
+
|
|
35
|
+
# Aborts parsing by raising ParseError. The message is suffixed with the
# line/column that corresponds to the scanner's current position.
#
# @param message [String] description of what went wrong
# @raise [ParseError] always
def raise_error(message)
  row, column = calculate_location(@scanner.pos)
  raise ParseError, "#{message} at line #{row}, column #{column}"
end
|
|
39
|
+
|
|
40
|
+
# Converts a scanner position into a 1-based [line, column] pair.
#
# StringScanner positions are byte offsets, so the consumed prefix is taken
# with byteslice and the column is counted in characters of that prefix.
# Indexing the string by characters with a byte offset (as before) reports
# the wrong location as soon as a multibyte character precedes the position.
#
# @param pos [Integer] byte offset into raw_string
# @return [Array(Integer, Integer)] line and column, both starting at 1
def calculate_location(pos)
  # scrub guards against an offset that lands inside a multibyte character
  consumed = raw_string.byteslice(0, pos).to_s.scrub
  line = consumed.count("\n") + 1
  last_newline = consumed.rindex("\n")
  # column = number of characters after the last newline, plus one
  col = last_newline ? consumed.length - last_newline : consumed.length + 1
  [line, col]
end
|
|
32
48
|
|
|
33
49
|
def parse_document
|
|
34
50
|
result = {}
|
|
@@ -43,22 +59,20 @@ module Cton
|
|
|
43
59
|
|
|
44
60
|
# Parses the value attached to a key that has just been read. The next
# non-whitespace character selects the form:
#   "(" -> object, "[" -> array, "=" -> scalar (key boundaries allowed).
# Anything else is reported through raise_error.
def parse_value_for_key
  skip_ws
  return parse_object if @scanner.scan("(")
  return parse_array if @scanner.scan("[")
  return parse_scalar(allow_key_boundary: true) if @scanner.scan("=")

  raise_error("Unexpected token")
end
|
|
56
72
|
|
|
57
73
|
def parse_object
|
|
58
74
|
skip_ws
|
|
59
|
-
if @scanner.scan(
|
|
60
|
-
return {}
|
|
61
|
-
end
|
|
75
|
+
return {} if @scanner.scan(")")
|
|
62
76
|
|
|
63
77
|
pairs = {}
|
|
64
78
|
loop do
|
|
@@ -67,7 +81,8 @@ module Cton
|
|
|
67
81
|
value = parse_value
|
|
68
82
|
pairs[key] = value
|
|
69
83
|
skip_ws
|
|
70
|
-
break if @scanner.scan(
|
|
84
|
+
break if @scanner.scan(")")
|
|
85
|
+
|
|
71
86
|
expect!(",")
|
|
72
87
|
skip_ws
|
|
73
88
|
end
|
|
@@ -92,7 +107,8 @@ module Cton
|
|
|
92
107
|
fields = []
|
|
93
108
|
loop do
|
|
94
109
|
fields << parse_key_name
|
|
95
|
-
break if @scanner.scan(
|
|
110
|
+
break if @scanner.scan("}")
|
|
111
|
+
|
|
96
112
|
expect!(",")
|
|
97
113
|
end
|
|
98
114
|
fields
|
|
@@ -125,9 +141,9 @@ module Cton
|
|
|
125
141
|
|
|
126
142
|
def parse_value(allow_key_boundary: false)
|
|
127
143
|
skip_ws
|
|
128
|
-
if @scanner.scan(
|
|
144
|
+
if @scanner.scan("(")
|
|
129
145
|
parse_object
|
|
130
|
-
elsif @scanner.scan(
|
|
146
|
+
elsif @scanner.scan("[")
|
|
131
147
|
parse_array
|
|
132
148
|
elsif @scanner.peek(1) == '"'
|
|
133
149
|
parse_string
|
|
@@ -140,242 +156,88 @@ module Cton
|
|
|
140
156
|
skip_ws
|
|
141
157
|
return parse_string if @scanner.peek(1) == '"'
|
|
142
158
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
# If we allow key boundary, we need to be careful not to consume the next key
|
|
146
|
-
# This is the tricky part. The original implementation scanned ahead.
|
|
147
|
-
# With StringScanner, we can scan until a terminator or whitespace.
|
|
148
|
-
|
|
159
|
+
@scanner.pos
|
|
160
|
+
|
|
149
161
|
token = if allow_key_boundary
|
|
150
162
|
scan_until_boundary_or_terminator
|
|
151
163
|
else
|
|
152
164
|
scan_until_terminator
|
|
153
165
|
end
|
|
154
166
|
|
|
155
|
-
|
|
167
|
+
raise_error("Empty value") if token.nil? || token.empty?
|
|
156
168
|
|
|
157
169
|
convert_scalar(token)
|
|
158
170
|
end
|
|
159
171
|
|
|
160
172
|
# Consumes and returns the text between the current scanner position and
# the next terminator position, as computed by find_terminator_position.
# Returns whatever consume_slice returns for that range.
def scan_until_terminator
  from = @scanner.pos
  consume_slice(from, find_terminator_position(from))
end
|
|
168
177
|
|
|
169
178
|
# Consumes and returns the scalar text starting at the current scanner
# position. The token ends at the first key boundary reported by
# find_key_boundary; when there is none, it ends at the next terminator.
def scan_until_boundary_or_terminator
  from = @scanner.pos
  stop = find_key_boundary(from)
  stop = find_terminator_position(from) unless stop
  consume_slice(from, stop)
end
|
|
184
|
+
|
|
185
|
+
# Returns the bytes of raw_string in [start_pos, end_pos) and moves the
# scanner to end_pos. Both arguments are byte offsets.
#
# @return [String, nil] the slice, or nil (scanner untouched) when the
#   range is empty or inverted
def consume_slice(start_pos, end_pos)
  width = end_pos - start_pos
  return nil unless width.positive?

  raw_string.byteslice(start_pos, width).tap { @scanner.pos = end_pos }
end
|
|
192
|
+
|
|
193
|
+
# Finds where the scalar that begins at start_pos ends.
#
# start_pos comes from StringScanner#pos and the result is handed to
# consume_slice (which uses byteslice), so the walk must be done in bytes.
# The previous character-indexed walk (String#length / String#[]) produced
# offsets in the wrong unit whenever the input contained multibyte text.
#
# Bytes >= 0x80 are skipped without consulting terminator?: every terminator
# this decoder recognises is a single ASCII character.
#
# @param start_pos [Integer] byte offset to start from
# @return [Integer] byte offset of the first terminator at or after
#   start_pos, or the byte size of the input when none is found
def find_terminator_position(start_pos)
  str = raw_string
  len = str.bytesize
  idx = start_pos

  while idx < len
    byte = str.getbyte(idx)
    break if byte < 0x80 && terminator?(byte.chr)

    idx += 1
  end

  idx
end
|
|
240
207
|
|
|
241
208
|
# Looks for the start of an embedded "next key" inside the scalar that
# begins at from_index, e.g. in "a=1b=2" the scalar "1" ends where "b=" starts.
#
# A position qualifies when it lies after from_index, starts a run of key
# characters that is immediately followed by one of KEY_VALUE_BOUNDARY_TOKENS,
# and its first character passes boundary_start_allowed?. The search stops
# (returning nil) at the first terminator.
#
# from_index and the result are byte offsets, matching StringScanner#pos and
# consume_slice. The input is therefore walked byte by byte; the previous
# character-indexed walk mis-placed the boundary on multibyte input. Bytes
# >= 0x80 belong to multibyte characters and are never treated as key
# characters or terminators (keys are matched elsewhere with an ASCII-only
# pattern, and all terminators are ASCII).
#
# @param from_index [Integer] byte offset where the scalar starts
# @return [Integer, nil] byte offset of the key boundary, or nil if none
def find_key_boundary(from_index)
  str = raw_string
  len = str.bytesize
  # One-character ASCII string for the byte at +pos+, or nil for a
  # non-ASCII byte or an out-of-range position.
  ascii_at = ->(pos) { (byte = str.getbyte(pos)) && byte < 0x80 ? byte.chr : nil }
  idx = from_index

  while idx < len
    char = ascii_at.call(idx)

    if char
      return nil if terminator?(char)

      if safe_key_char?(char)
        # Extend to the end of the run of key characters starting here.
        key_end = idx
        key_end += 1 while key_end < len && (nxt = ascii_at.call(key_end)) && safe_key_char?(nxt)

        follower = ascii_at.call(key_end)
        if idx > from_index && KEY_VALUE_BOUNDARY_TOKENS.include?(follower) && boundary_start_allowed?(char)
          return idx
        end
      end
    end

    idx += 1
  end

  nil
end
|
|
378
232
|
|
|
233
|
+
# Opening delimiters that also end a bare scalar. Kept as a frozen constant
# so terminator?, which runs once per scanned character, does not allocate a
# new array on every call.
OPENING_BRACKETS = ["(", "[", "{"].freeze

# Tells whether +char+ ends a bare scalar: one of TERMINATORS, a whitespace
# character (per whitespace?), or an opening bracket.
#
# @param char [String, nil] a single character
# @return [Boolean]
def terminator?(char)
  TERMINATORS.include?(char) || whitespace?(char) || OPENING_BRACKETS.include?(char)
end
|
|
236
|
+
|
|
237
|
+
# Tells whether +char+ may be the first character of an embedded key:
# an ASCII letter, underscore, dot, colon or hyphen. Digits are excluded.
#
# @param char [String, nil]
# @return [Boolean] false for nil
def boundary_start_allowed?(char)
  return false if char.nil?

  char.match?(/[A-Za-z_.:-]/)
end
|
|
240
|
+
|
|
379
241
|
def convert_scalar(token)
|
|
380
242
|
case token
|
|
381
243
|
when "true" then true
|
|
@@ -396,22 +258,20 @@ module Cton
|
|
|
396
258
|
expect!("\"")
|
|
397
259
|
buffer = +""
|
|
398
260
|
loop do
|
|
399
|
-
if @scanner.eos?
|
|
400
|
-
|
|
401
|
-
end
|
|
402
|
-
|
|
261
|
+
raise_error("Unterminated string") if @scanner.eos?
|
|
262
|
+
|
|
403
263
|
char = @scanner.getch
|
|
404
|
-
|
|
405
|
-
if char ==
|
|
264
|
+
|
|
265
|
+
if char == "\\"
|
|
406
266
|
escaped = @scanner.getch
|
|
407
|
-
|
|
267
|
+
raise_error("Invalid escape sequence") if escaped.nil?
|
|
408
268
|
buffer << case escaped
|
|
409
|
-
when
|
|
410
|
-
when
|
|
411
|
-
when
|
|
412
|
-
when '"',
|
|
269
|
+
when "n" then "\n"
|
|
270
|
+
when "r" then "\r"
|
|
271
|
+
when "t" then "\t"
|
|
272
|
+
when '"', "\\" then escaped
|
|
413
273
|
else
|
|
414
|
-
|
|
274
|
+
raise_error("Unsupported escape sequence")
|
|
415
275
|
end
|
|
416
276
|
elsif char == '"'
|
|
417
277
|
break
|
|
@@ -425,16 +285,16 @@ module Cton
|
|
|
425
285
|
# Reads a key name (digits, ASCII letters, "_", ".", ":" and "-") after
# skipping leading whitespace.
#
# @return [String, Symbol] the key; a Symbol when symbolize_names is truthy
# @raise via raise_error ("Invalid key") when no key characters are present
def parse_key_name
  skip_ws
  name = @scanner.scan(/[0-9A-Za-z_.:-]+/)
  raise_error("Invalid key") unless name
  return name.to_sym if symbolize_names

  name
end
|
|
431
291
|
|
|
432
292
|
# Reads an optionally negative run of decimal digits at the scanner position
# and returns it as an Integer.
#
# Failures go through raise_error: "Expected digits" when nothing matches,
# "Invalid length literal" when an ArgumentError surfaces in the body.
#
# @return [Integer]
def parse_integer_literal
  digits = @scanner.scan(/-?\d+/)
  raise_error("Expected digits") unless digits

  Integer(digits, 10)
rescue ArgumentError
  raise_error("Invalid length literal")
end
|
|
439
299
|
|
|
440
300
|
def symbolize_keys(row)
|
|
@@ -443,9 +303,9 @@ module Cton
|
|
|
443
303
|
|
|
444
304
|
# Consumes the literal +char+ at the scanner position (after skipping
# whitespace) or reports a parse error naming what was found instead.
#
# The literal is passed to StringScanner#scan as a String, as the sibling
# parse methods do, instead of escaping and compiling a new Regexp on every
# call.
#
# @param char [String] the exact text that must come next
# @return [nil] when the text was consumed
# @raise via raise_error when the input does not continue with +char+
def expect!(char)
  skip_ws
  return if @scanner.scan(char)

  raise_error("Expected #{char.inspect}, got #{@scanner.peek(1).inspect}")
end
|
|
450
310
|
|
|
451
311
|
def skip_ws
|
|
@@ -453,18 +313,14 @@ module Cton
|
|
|
453
313
|
end
|
|
454
314
|
|
|
455
315
|
# Characters treated as whitespace by whitespace?. Kept as a frozen constant
# so the per-character check does not allocate a new array on every call.
WHITESPACE_CHARACTERS = [" ", "\t", "\n", "\r"].freeze

# Tells whether +char+ is a space, tab, line feed or carriage return.
#
# @param char [String, nil] a single character
# @return [Boolean]
def whitespace?(char)
  WHITESPACE_CHARACTERS.include?(char)
end
|
|
458
318
|
|
|
459
319
|
def key_ahead?
|
|
460
|
-
# Check if the next token looks like a key followed by [(=
|
|
461
|
-
# We need to preserve position
|
|
462
320
|
pos = @scanner.pos
|
|
463
321
|
skip_ws
|
|
464
|
-
|
|
465
|
-
# Scan a key
|
|
322
|
+
|
|
466
323
|
if @scanner.scan(/[0-9A-Za-z_.:-]+/)
|
|
467
|
-
# Check what follows
|
|
468
324
|
skip_ws
|
|
469
325
|
next_char = @scanner.peek(1)
|
|
470
326
|
result = ["(", "[", "="].include?(next_char)
|