odin-foundation 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/odin/diff/differ.rb +115 -0
- data/lib/odin/diff/patcher.rb +64 -0
- data/lib/odin/export.rb +330 -0
- data/lib/odin/parsing/parser.rb +1193 -0
- data/lib/odin/parsing/token.rb +26 -0
- data/lib/odin/parsing/token_type.rb +40 -0
- data/lib/odin/parsing/tokenizer.rb +825 -0
- data/lib/odin/parsing/value_parser.rb +322 -0
- data/lib/odin/resolver/import_resolver.rb +137 -0
- data/lib/odin/serialization/canonicalize.rb +112 -0
- data/lib/odin/serialization/stringify.rb +582 -0
- data/lib/odin/transform/format_exporters.rb +819 -0
- data/lib/odin/transform/source_parsers.rb +385 -0
- data/lib/odin/transform/transform_engine.rb +2837 -0
- data/lib/odin/transform/transform_parser.rb +979 -0
- data/lib/odin/transform/transform_types.rb +278 -0
- data/lib/odin/transform/verb_context.rb +87 -0
- data/lib/odin/transform/verbs/aggregation_verbs.rb +106 -0
- data/lib/odin/transform/verbs/collection_verbs.rb +640 -0
- data/lib/odin/transform/verbs/datetime_verbs.rb +602 -0
- data/lib/odin/transform/verbs/financial_verbs.rb +356 -0
- data/lib/odin/transform/verbs/geo_verbs.rb +125 -0
- data/lib/odin/transform/verbs/numeric_verbs.rb +434 -0
- data/lib/odin/transform/verbs/object_verbs.rb +123 -0
- data/lib/odin/types/array_item.rb +42 -0
- data/lib/odin/types/diff.rb +89 -0
- data/lib/odin/types/directive.rb +28 -0
- data/lib/odin/types/document.rb +92 -0
- data/lib/odin/types/document_builder.rb +67 -0
- data/lib/odin/types/dyn_value.rb +270 -0
- data/lib/odin/types/errors.rb +149 -0
- data/lib/odin/types/modifiers.rb +45 -0
- data/lib/odin/types/ordered_map.rb +79 -0
- data/lib/odin/types/schema.rb +262 -0
- data/lib/odin/types/value_type.rb +28 -0
- data/lib/odin/types/values.rb +618 -0
- data/lib/odin/types.rb +12 -0
- data/lib/odin/utils/format_utils.rb +186 -0
- data/lib/odin/utils/path_utils.rb +25 -0
- data/lib/odin/utils/security_limits.rb +17 -0
- data/lib/odin/validation/format_validators.rb +238 -0
- data/lib/odin/validation/redos_protection.rb +102 -0
- data/lib/odin/validation/schema_parser.rb +813 -0
- data/lib/odin/validation/schema_serializer.rb +262 -0
- data/lib/odin/validation/validator.rb +1061 -0
- data/lib/odin/version.rb +5 -0
- data/lib/odin.rb +90 -0
- metadata +160 -0
|
@@ -0,0 +1,825 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "strscan"
|
|
4
|
+
|
|
5
|
+
module Odin
|
|
6
|
+
module Parsing
|
|
7
|
+
class Tokenizer
|
|
8
|
+
MAX_DOCUMENT_SIZE = Utils::SecurityLimits::MAX_DOCUMENT_SIZE
|
|
9
|
+
|
|
10
|
+
# Patterns used by the scan_* methods, built once when the class body loads
# so that no Regexp is constructed per token.

# Run of spaces and tabs (horizontal whitespace only).
RE_WHITESPACE = /[ \t]+/
# A carriage return, optionally followed by a line feed.
RE_NEWLINE_CRLF = /\r\n?/
# Letter or underscore, then letters, digits, underscores or hyphens.
RE_IDENTIFIER = /[a-zA-Z_][a-zA-Z0-9_\-]*/
# Same as RE_IDENTIFIER but dots are also allowed after the first character.
RE_IDENT_PATH = /[a-zA-Z_][a-zA-Z0-9_\-.]*/
# Optional sign, then any run of digits, e/E, dots and signs. Deliberately
# loose: it only delimits the text, it does not prove the number is valid.
RE_NUMERIC = /[+\-]?[0-9eE.+\-]+/
# Signed decimal, optional exponent, optional ":CODE" suffix.
RE_CURRENCY_VAL = /[+\-]?[0-9.]+(?:[eE][+\-]?\d+)?(?::[a-zA-Z0-9_\-]+)?/
# Letters, digits, underscore, dot and hyphen.
RE_WORD = /[a-zA-Z0-9_.\-]+/
# Everything up to (not including) "}" or the end of the line.
RE_HEADER_CONTENT = /[^}\r\n]*/
# Everything up to the end of the line.
RE_COMMENT_CONTENT = /[^\r\n]*/
# Characters accepted in the path that follows "@".
RE_REF_PATH = /[a-zA-Z0-9_.\[\]()?\-@']*/
# Everything up to whitespace or ";".
RE_BINARY_DATA = /[^\s;\r\n]*/
# Non-empty run up to whitespace, ";" or ":".
RE_BARE_VALUE = /[^\s;:\r\n]+/
# Characters that can appear in a number, date or timestamp.
RE_DATE_OR_NUM = /[0-9eE.\-:+TZ]+/
# "YYYY-MM-DDT" at the very start of a string.
RE_DATE_PREFIX = /\A\d{4}-\d{2}-\d{2}T/
# Exactly "YYYY-MM-DD" and nothing else.
RE_DATE_EXACT = /\A\d{4}-\d{2}-\d{2}\z/
# "P" followed by digits, the letters Y M W D T H S, or dots.
RE_DURATION = /P[0-9YMWDTHS.]+/
# "T" followed by digits, ":", ".", "+", "-" or "Z".
RE_TIME_VAL = /T[0-9:.+\-Z]+/
# "[", any characters other than "]" (newlines included), then "]".
RE_ARRAY_INDEX = /\[[^\]]*\]/

# Character following a backslash => the text it stands for.
ESCAPE_MAP = {
  '"' => '"',
  '\\' => '\\',
  'n' => "\n",
  't' => "\t",
  'r' => "\r",
  '0' => "\0",
  '/' => '/'
}.freeze
|
|
39
|
+
|
|
40
|
+
# Sets up a tokenizer over +text+.
#
# The token buffer is pre-sized to one slot per ten characters of input plus
# sixteen; slots are filled by #emit and the unused tail is dropped by
# #tokenize.
#
# @param text [String] the document to tokenize
def initialize(text)
  initial_slots = text.length / 10 + 16

  @source = text
  @scanner = StringScanner.new(text)
  @line, @col = 1, 1
  @tokens = Array.new(initial_slots)
  @token_count = 0
end
|
|
48
|
+
|
|
49
|
+
# Tokenizes the whole source.
#
# Runs the size check, skips a leading byte-order mark, scans every token and
# finally appends an EOF token carrying the line/column reached.
#
# @return [Array] the emitted tokens, in order, without the unused
#   pre-allocated slots
def tokenize
  check_document_size!
  skip_bom
  scan_tokens
  emit(TokenType::EOF, "", @line, @col)

  @tokens.take(@token_count)
end
|
|
56
|
+
|
|
57
|
+
private
|
|
58
|
+
|
|
59
|
+
# Rejects sources whose byte size is above MAX_DOCUMENT_SIZE.
#
# @raise [Errors::ParseError] with code MAXIMUM_DOCUMENT_SIZE_EXCEEDED at
#   line 1, column 1 when the source is too large
# @return [nil] when the source is within the limit
def check_document_size!
  size = @source.bytesize
  return if size <= MAX_DOCUMENT_SIZE

  raise Errors::ParseError.new(
    Errors::ParseErrorCode::MAXIMUM_DOCUMENT_SIZE_EXCEEDED,
    1, 1, "Document size #{size} exceeds limit #{MAX_DOCUMENT_SIZE}"
  )
end
|
|
67
|
+
|
|
68
|
+
# Positions the scanner just past a leading UTF-8 byte-order mark
# (bytes EF BB BF), if the source starts with one, and resets the column to 1.
#
# The check is done on raw bytes only. Comparing with
# String#start_with?("\uFEFF") raises Encoding::CompatibilityError when the
# source is binary-encoded and contains the BOM bytes, so a byte-level
# fallback placed after such a comparison can never run for that input.
# Byte inspection gives the same answer for UTF-8 sources and also works for
# binary-encoded ones.
#
# @return [void]
def skip_bom
  src = @source
  return unless src.getbyte(0) == 0xEF && src.getbyte(1) == 0xBB && src.getbyte(2) == 0xBF

  @scanner.pos = 3
  @col = 1
end
|
|
80
|
+
|
|
81
|
+
# Appends one token to the token buffer.
#
# @param type  the token type constant
# @param value the token text
# @param line [Integer] line where the token starts
# @param col [Integer] column where the token starts
# @param raw optional extra marker forwarded to Token.new (nil by default)
# @return [Integer] the number of tokens stored so far
def emit(type, value, line, col, raw: nil)
  token = Token.new(type, value, line, col, raw: raw)
  @tokens[@token_count] = token
  @token_count += 1
end
|
|
85
|
+
|
|
86
|
+
# Updates @line/@col to account for +text+ having been consumed.
#
# Every "\n" starts a new line (column back to 1); any other character moves
# the column one step right. The walk is per character, not per byte, so the
# column stays consistent with the rest of the tokenizer (which adds
# String#length) and a newline following multi-byte characters is still seen.
#
# @param text [String] the text that was consumed
# @return [nil]
def track(text)
  text.each_char do |ch|
    if ch == "\n"
      @line += 1
      @col = 1
    else
      @col += 1
    end
  end
  nil
end
|
|
100
|
+
|
|
101
|
+
# Moves the scanner forward by +n+ bytes and feeds the skipped text to #track
# so that @line/@col follow along.
#
# @param n [Integer] number of bytes to skip
# @return whatever #track returns
def skip_bytes(n)
  skipped = @scanner.peek(n)
  @scanner.pos = @scanner.pos + n
  track(skipped)
end
|
|
107
|
+
|
|
108
|
+
# Main scanning loop: walks the source from the scanner's current position to
# the end and emits tokens.
#
# Each iteration first skips spaces/tabs, then looks at the next byte and
# either handles it inline (newlines, comments, "=", "|", "?", "~", ",", "!",
# "*") or hands over to the matching scan_* method. After "=" the rest of the
# assignment is handed to #scan_value_side.
#
# A character that starts no known token is consumed as one whole character
# and reported as an ERROR token carrying that character. Consuming a whole
# character (rather than a single byte) keeps a multi-byte character from
# being reported as several ERROR tokens holding binary fragments.
#
# @return [nil]
def scan_tokens
  s = @scanner
  src = s.string

  until s.eos?
    # Horizontal whitespace between tokens carries no meaning.
    if (ws = s.scan(RE_WHITESPACE))
      @col += ws.length
      next
    end

    line = @line
    col = @col
    byte = src.getbyte(s.pos)

    case byte
    when 10 # \n
      s.pos += 1
      emit(TokenType::NEWLINE, "\n", line, col)
      @line += 1
      @col = 1
    when 13 # \r, alone or as part of \r\n: one NEWLINE token either way
      s.pos += (src.getbyte(s.pos + 1) == 10 ? 2 : 1)
      emit(TokenType::NEWLINE, "\n", line, col)
      @line += 1
      @col = 1
    when 59 # ; comment to end of line
      s.pos += 1
      @col += 1
      text = s.scan(RE_COMMENT_CONTENT) || ""
      emit(TokenType::COMMENT, text.strip, line, col)
      @col += text.length
    when 123 # {
      scan_header(line, col)
    when 61 # =
      s.pos += 1
      @col += 1
      emit(TokenType::EQUALS, "=", line, col)
      if (ws = s.scan(RE_WHITESPACE))
        @col += ws.length
      end
      scan_value_side
    when 124 # |
      s.pos += 1
      @col += 1
      emit(TokenType::PIPE, "|", line, col)
    when 35 # #
      scan_number_prefix(line, col)
    when 34 # "
      scan_string(line, col)
    when 63 # ? boolean
      s.pos += 1
      @col += 1
      word = s.scan(RE_WORD) || ""
      @col += word.length
      if word == "true" || word == "false"
        emit(TokenType::BOOLEAN, word, line, col)
      else
        emit(TokenType::ERROR, "Invalid boolean: ?#{word}", line, col)
      end
    when 126 # ~
      s.pos += 1
      @col += 1
      emit(TokenType::NULL, "~", line, col)
    when 64 # @
      scan_reference(line, col)
    when 94 # ^
      scan_binary(line, col)
    when 37 # %
      scan_verb(line, col)
    when 44 # , is emitted as a PATH token
      s.pos += 1
      @col += 1
      emit(TokenType::PATH, ",", line, col)
    when 33, 42 # ! *
      s.pos += 1
      @col += 1
      emit(TokenType::MODIFIER, byte == 33 ? "!" : "*", line, col)
    when 45, 46, 38 # - . &
      scan_identifier(line, col)
    when 58 # :
      scan_directive(line, col)
    when 91 # [
      scan_array_indexed_path(line, col)
    else
      if ident_start_byte?(byte)
        scan_identifier(line, col)
      elsif digit_byte?(byte)
        scan_date_or_number(line, col)
      else
        # Unknown character: take the whole character, not just its first byte.
        bad = s.getch
        @col += 1
        emit(TokenType::ERROR, bad, line, col)
      end
    end
  end
end
|
|
214
|
+
|
|
215
|
+
# Scans what follows "=" on the current line: leading modifiers, one value,
# then an optional directive and an optional trailing comment.
#
# 1. Any sequence of "!", "*" and "-" is emitted as MODIFIER tokens (each may
#    be followed by spaces/tabs).
# 2. The value is chosen by its first byte and either handled inline
#    ("?" boolean, "~" null, ";" comment) or delegated to a scan_* method.
#    A ";" in value position, or a character that starts no value, ends the
#    method right after its token is emitted.
# 3. After the value, a ":" starts one directive, and a ";" starts a comment
#    running to the end of the line.
#
# A character that starts no value is consumed as one whole character and
# reported as an ERROR token, so a multi-byte character yields a single ERROR
# carrying that character instead of a binary fragment.
#
# @return [void]
def scan_value_side
  s = @scanner
  src = s.string

  # 1. Modifiers. Stops at end of input, at a newline, or at anything that is
  # not a modifier character.
  until s.eos?
    marker = case src.getbyte(s.pos)
             when 33 then "!"
             when 42 then "*"
             when 45 then "-"
             end
    break unless marker

    line = @line
    col = @col
    s.pos += 1
    @col += 1
    emit(TokenType::MODIFIER, marker, line, col)
    if (ws = s.scan(RE_WHITESPACE))
      @col += ws.length
    end
  end

  # 2. The value itself; an empty value (end of input or end of line) is fine.
  return if s.eos?

  byte = src.getbyte(s.pos)
  return if byte == 10 || byte == 13

  line = @line
  col = @col

  case byte
  when 35 # #
    scan_number_prefix(line, col)
  when 34 # "
    scan_string(line, col)
  when 63 # ?
    s.pos += 1
    @col += 1
    word = s.scan(RE_WORD) || ""
    @col += word.length
    if word == "true" || word == "false"
      emit(TokenType::BOOLEAN, word, line, col)
    else
      emit(TokenType::ERROR, "Invalid boolean: ?#{word}", line, col)
    end
  when 126 # ~
    s.pos += 1
    @col += 1
    emit(TokenType::NULL, "~", line, col)
  when 64 # @
    scan_reference(line, col)
  when 94 # ^
    scan_binary(line, col)
  when 37 # %
    scan_verb(line, col)
  when 59 # ; comment where the value would be
    s.pos += 1
    @col += 1
    text = s.scan(RE_COMMENT_CONTENT) || ""
    emit(TokenType::COMMENT, text.strip, line, col)
    @col += text.length
    return
  else
    if digit_byte?(byte)
      scan_date_or_number(line, col)
    elsif byte == 116 || byte == 102 # t, f
      scan_bare_boolean_or_identifier(line, col)
    elsif byte == 80 # P
      scan_possible_duration(line, col)
    elsif byte == 84 # T
      scan_possible_time(line, col)
    elsif ident_start_byte?(byte)
      scan_bare_string_value(line, col)
    else
      # Unknown character: take the whole character, not just its first byte.
      bad = s.getch
      @col += 1
      emit(TokenType::ERROR, bad, line, col)
      return
    end
  end

  # 3. Trailing directive and comment.
  if (ws = s.scan(RE_WHITESPACE))
    @col += ws.length
  end
  return if s.eos?

  if src.getbyte(s.pos) == 58 # :
    scan_directive(@line, @col)
    if (ws = s.scan(RE_WHITESPACE))
      @col += ws.length
    end
  end

  return if s.eos?

  if src.getbyte(s.pos) == 59 # ;
    comment_line = @line
    comment_col = @col
    s.pos += 1
    @col += 1
    text = s.scan(RE_COMMENT_CONTENT) || ""
    emit(TokenType::COMMENT, text.strip, comment_line, comment_col)
    @col += text.length
  end
end
|
|
320
|
+
|
|
321
|
+
# Scans a "{ ... }" header starting at the "{" under the scanner.
#
# Emits HEADER_OPEN, then (unless the braces are empty) a PATH token holding
# the stripped text between the braces, then HEADER_CLOSE. When the line or
# the input ends before "}", an ERROR "Unterminated header" is emitted at the
# position of the opening brace.
#
# @param line [Integer] line of the opening brace
# @param col [Integer] column of the opening brace
def scan_header(line, col)
  scanner = @scanner
  src = scanner.string

  at_closing_brace = -> { !scanner.eos? && src.getbyte(scanner.pos) == 125 }
  close_header = lambda do
    close_line = @line
    close_col = @col
    scanner.pos += 1
    @col += 1
    emit(TokenType::HEADER_CLOSE, "}", close_line, close_col)
  end

  scanner.pos += 1
  @col += 1
  emit(TokenType::HEADER_OPEN, "{", line, col)

  blanks = scanner.scan(RE_WHITESPACE)
  @col += blanks.length if blanks

  # "{}" or "{   }": nothing between the braces.
  if at_closing_brace.call
    close_header.call
    return
  end

  content_line = @line
  content_col = @col
  content = scanner.scan(RE_HEADER_CONTENT) || ""
  @col += content.length
  content = content.strip
  emit(TokenType::PATH, content, content_line, content_col) unless content.empty?

  if at_closing_brace.call
    close_header.call
  else
    emit(TokenType::ERROR, "Unterminated header", line, col)
  end
end
|
|
350
|
+
|
|
351
|
+
# Scans a "#"-prefixed numeric literal starting at the "#" under the scanner.
#
# The byte after the first "#" selects the token type:
#   "##" -> INTEGER, "#$" -> CURRENCY, "#%" -> PERCENT, anything else -> NUMBER.
# The digits are read with #scan_currency_value for currency and
# #scan_numeric_value otherwise. When no digits follow (or the input ends
# right after "#"), an ERROR "Invalid numeric format" is emitted instead.
#
# @param line [Integer] line of the leading "#"
# @param col [Integer] column of the leading "#"
def scan_number_prefix(line, col)
  scanner = @scanner
  scanner.pos += 1
  @col += 1

  if scanner.eos?
    emit(TokenType::ERROR, "Invalid numeric format", line, col)
    return
  end

  marker = scanner.string.getbyte(scanner.pos)
  if marker == 35 || marker == 36 || marker == 37
    # Second prefix character is part of the prefix, not of the digits.
    scanner.pos += 1
    @col += 1
  end

  digits = marker == 36 ? scan_currency_value : scan_numeric_value

  if digits.empty?
    emit(TokenType::ERROR, "Invalid numeric format", line, col)
  else
    type = case marker
           when 35 then TokenType::INTEGER
           when 36 then TokenType::CURRENCY
           when 37 then TokenType::PERCENT
           else TokenType::NUMBER
           end
    emit(type, digits, line, col)
  end
end
|
|
395
|
+
|
|
396
|
+
# Consumes the text matched by RE_NUMERIC at the scanner position and
# advances the column by its length.
#
# @return [String] the consumed text, or "" when nothing matched
def scan_numeric_value
  matched = @scanner.scan(RE_NUMERIC)
  return "" unless matched

  @col += matched.length
  matched
end
|
|
401
|
+
|
|
402
|
+
# Consumes the text matched by RE_CURRENCY_VAL at the scanner position and
# advances the column by its length.
#
# @return [String] the consumed text, or "" when nothing matched
def scan_currency_value
  matched = @scanner.scan(RE_CURRENCY_VAL)
  return "" unless matched

  @col += matched.length
  matched
end
|
|
407
|
+
|
|
408
|
+
def scan_string(line, col)
|
|
409
|
+
s = @scanner
|
|
410
|
+
s.pos += 1; @col += 1 # skip opening "
|
|
411
|
+
|
|
412
|
+
# Check for multi-line """
|
|
413
|
+
if !s.eos? && s.string.getbyte(s.pos) == 34 &&
|
|
414
|
+
s.pos + 1 < s.string.bytesize && s.string.getbyte(s.pos + 1) == 34
|
|
415
|
+
s.pos += 2; @col += 2
|
|
416
|
+
scan_multiline_string(line, col)
|
|
417
|
+
return
|
|
418
|
+
end
|
|
419
|
+
|
|
420
|
+
result = +""
|
|
421
|
+
until s.eos?
|
|
422
|
+
# Scan non-special characters in bulk
|
|
423
|
+
chunk = s.scan(/[^"\\\r\n]+/)
|
|
424
|
+
if chunk
|
|
425
|
+
result << chunk
|
|
426
|
+
@col += chunk.length
|
|
427
|
+
end
|
|
428
|
+
|
|
429
|
+
break if s.eos?
|
|
430
|
+
byte = s.string.getbyte(s.pos)
|
|
431
|
+
|
|
432
|
+
case byte
|
|
433
|
+
when 92 # backslash
|
|
434
|
+
s.pos += 1; @col += 1
|
|
435
|
+
if s.eos?
|
|
436
|
+
emit(TokenType::ERROR, "Unterminated escape sequence", line, col)
|
|
437
|
+
return
|
|
438
|
+
end
|
|
439
|
+
esc_byte = s.string.getbyte(s.pos)
|
|
440
|
+
if esc_byte == 110 then result << "\n"; s.pos += 1; @col += 1 # n
|
|
441
|
+
elsif esc_byte == 116 then result << "\t"; s.pos += 1; @col += 1 # t
|
|
442
|
+
elsif esc_byte == 114 then result << "\r"; s.pos += 1; @col += 1 # r
|
|
443
|
+
elsif esc_byte == 34 then result << '"'; s.pos += 1; @col += 1 # "
|
|
444
|
+
elsif esc_byte == 92 then result << '\\'; s.pos += 1; @col += 1 # \
|
|
445
|
+
elsif esc_byte == 48 then result << "\0"; s.pos += 1; @col += 1 # 0
|
|
446
|
+
elsif esc_byte == 47 then result << '/'; s.pos += 1; @col += 1 # /
|
|
447
|
+
elsif esc_byte == 117 # u
|
|
448
|
+
s.pos += 1; @col += 1
|
|
449
|
+
result << scan_unicode_escape(line, col, 4)
|
|
450
|
+
elsif esc_byte == 85 # U
|
|
451
|
+
s.pos += 1; @col += 1
|
|
452
|
+
result << scan_unicode_escape(line, col, 8)
|
|
453
|
+
else
|
|
454
|
+
# Read the actual character (may be multi-byte)
|
|
455
|
+
esc_char = s.scan(/./) || "?"
|
|
456
|
+
@col += 1
|
|
457
|
+
emit(TokenType::ERROR, "Invalid escape: \\#{esc_char}", line, col)
|
|
458
|
+
return
|
|
459
|
+
end
|
|
460
|
+
when 34 # closing "
|
|
461
|
+
s.pos += 1; @col += 1
|
|
462
|
+
emit(TokenType::STRING, result, line, col)
|
|
463
|
+
return
|
|
464
|
+
when 10, 13 # newline
|
|
465
|
+
emit(TokenType::ERROR, "Unterminated string", line, col)
|
|
466
|
+
return
|
|
467
|
+
end
|
|
468
|
+
end
|
|
469
|
+
|
|
470
|
+
emit(TokenType::ERROR, "Unterminated string", line, col)
|
|
471
|
+
end
|
|
472
|
+
|
|
473
|
+
# Scans the body of a triple-quoted string; the scanner is positioned just
# after the opening """.
#
# One line break directly after the opening quotes is dropped. Inside the
# body, "\r", "\r\n" and "\n" are all stored as "\n". Scanning ends at the
# next """, where one STRING token is emitted; if the input ends first, an
# ERROR "Unterminated multi-line string" is emitted. No escape sequences are
# decoded here.
#
# A quote that is not part of a closing """ is appended as a literal '"'.
# It must not be fetched with src[scanner.pos]: pos is a byte offset while
# String#[] takes a character index, so after any multi-byte text that lookup
# returns a different character (or nil).
#
# @param line [Integer] line of the opening quotes
# @param col [Integer] column of the opening quotes
def scan_multiline_string(line, col)
  s = @scanner
  src = s.string

  # Skip a single line break right after the opening """.
  unless s.eos?
    case src.getbyte(s.pos)
    when 10
      s.pos += 1
      @line += 1
      @col = 1
    when 13
      s.pos += 1
      @line += 1
      @col = 1
      s.pos += 1 if !s.eos? && src.getbyte(s.pos) == 10
    end
  end

  result = +""
  until s.eos?
    byte = src.getbyte(s.pos)

    # Closing """ (getbyte returns nil past the end, so no bounds check needed).
    if byte == 34 && src.getbyte(s.pos + 1) == 34 && src.getbyte(s.pos + 2) == 34
      s.pos += 3
      @col += 3
      emit(TokenType::STRING, result, line, col)
      return
    end

    if byte == 13 # \r or \r\n
      result << "\n"
      s.pos += 1
      @line += 1
      @col = 1
      s.pos += 1 if !s.eos? && src.getbyte(s.pos) == 10
    elsif byte == 10 # \n
      result << "\n"
      s.pos += 1
      @line += 1
      @col = 1
    else
      # Take everything up to the next quote or line break at once.
      chunk = s.scan(/[^"\r\n]+/)
      if chunk
        result << chunk
        @col += chunk.length
      else
        # The pattern only fails on a quote: one that is not part of """.
        result << '"'
        s.pos += 1
        @col += 1
      end
    end
  end

  emit(TokenType::ERROR, "Unterminated multi-line string", line, col)
end
|
|
526
|
+
|
|
527
|
+
# Decodes the hex digits of a \u / \U escape; the scanner is positioned just
# after the "u" or "U".
#
# A high surrogate (D800..DBFF) must be followed by a "\uXXXX" low surrogate
# (DC00..DFFF); the pair is combined into one code point.
#
# On any problem an ERROR token is emitted at (line, col) and "" is returned:
#   "Invalid unicode escape"  - fewer than num_digits hex digits follow, or
#                               the code point is above U+10FFFF
#   "Expected low surrogate"  - a high surrogate is not followed by "\u"
#   "Invalid surrogate pair"  - that "\u" is not followed by 4 hex digits
#   "Invalid low surrogate"   - the second value is outside DC00..DFFF
#
# The digits are matched with StringScanner#check instead of taken with
# #peek: peek counts bytes and can cut a multi-byte character in half, and
# matching a Regexp against such a broken string raises ArgumentError.
# Code points above U+10FFFF are rejected before Array#pack("U"), which would
# otherwise build an invalid byte sequence or raise RangeError.
#
# @param line [Integer] line reported on errors
# @param col [Integer] column reported on errors
# @param num_digits [Integer] number of hex digits expected (4 or 8)
# @return [String] the decoded character, or "" after an error
def scan_unicode_escape(line, col, num_digits)
  s = @scanner

  hex = s.check(/[0-9a-fA-F]{#{num_digits}}/)
  unless hex
    emit(TokenType::ERROR, "Invalid unicode escape", line, col)
    return ""
  end
  s.pos += num_digits
  @col += num_digits
  codepoint = hex.to_i(16)

  if codepoint >= 0xD800 && codepoint <= 0xDBFF
    # High surrogate: a "\uXXXX" low surrogate has to follow.
    src = s.string
    unless src.getbyte(s.pos) == 92 && src.getbyte(s.pos + 1) == 117 # \u
      emit(TokenType::ERROR, "Expected low surrogate", line, col)
      return ""
    end
    s.pos += 2
    @col += 2

    low_hex = s.check(/[0-9a-fA-F]{4}/)
    unless low_hex
      emit(TokenType::ERROR, "Invalid surrogate pair", line, col)
      return ""
    end
    s.pos += 4
    @col += 4

    low = low_hex.to_i(16)
    unless low >= 0xDC00 && low <= 0xDFFF
      emit(TokenType::ERROR, "Invalid low surrogate", line, col)
      return ""
    end
    codepoint = 0x10000 + ((codepoint - 0xD800) << 10) + (low - 0xDC00)
  elsif codepoint > 0x10FFFF
    emit(TokenType::ERROR, "Invalid unicode escape", line, col)
    return ""
  end

  [codepoint].pack("U")
end
|
|
563
|
+
|
|
564
|
+
def scan_reference(line, col)
|
|
565
|
+
s = @scanner
|
|
566
|
+
s.pos += 1; @col += 1 # skip @
|
|
567
|
+
|
|
568
|
+
if !s.eos? && s.string.getbyte(s.pos) == 35 # @#
|
|
569
|
+
s.pos += 1; @col += 1
|
|
570
|
+
emit(TokenType::ERROR, "@#", line, col)
|
|
571
|
+
return
|
|
572
|
+
end
|
|
573
|
+
|
|
574
|
+
path = s.scan(RE_REF_PATH) || ""
|
|
575
|
+
@col += path.length
|
|
576
|
+
# Normalize leading zeros in array indices: [007] -> [7]
|
|
577
|
+
path = path.gsub(/\[(\d+)\]/) { "[#{$1.to_i}]" }
|
|
578
|
+
emit(TokenType::REFERENCE, path, line, col)
|
|
579
|
+
end
|
|
580
|
+
|
|
581
|
+
# Scans a "^" binary literal starting at the "^" under the scanner.
#
# Everything matched by RE_BINARY_DATA after the "^" becomes the value of a
# BINARY token; the value may be empty.
#
# @param line [Integer] line of the "^"
# @param col [Integer] column of the "^"
def scan_binary(line, col)
  scanner = @scanner
  scanner.pos += 1
  @col += 1

  payload = scanner.scan(RE_BINARY_DATA) || ""
  @col += payload.length
  emit(TokenType::BINARY, payload, line, col)
end
|
|
588
|
+
|
|
589
|
+
# Scans a "%verb" expression starting at the "%" under the scanner: the verb
# name first, then its arguments up to the end of the expression.
#
# Nothing further is scanned when the verb name is invalid.
#
# @param line [Integer] line of the "%"
# @param col [Integer] column of the "%"
def scan_verb(line, col)
  scan_verb_arguments if scan_verb_name(line, col)
end

# Scans "%name" (or "%&name") at the scanner position and emits a VERB token.
#
# Emits ERROR "Empty verb name" when the "%" is followed by the end of input,
# a space, a tab or a line break, and ERROR "Invalid verb" when no name
# characters follow.
#
# @param line [Integer] line of the "%"
# @param col [Integer] column of the "%"
# @return [Boolean] true when a VERB token was emitted
def scan_verb_name(line, col)
  s = @scanner
  s.pos += 1
  @col += 1

  nxt = s.string.getbyte(s.pos) # nil at end of input
  if nxt.nil? || nxt == 32 || nxt == 9 || nxt == 10 || nxt == 13
    emit(TokenType::ERROR, "Empty verb name", line, col)
    return false
  end

  name = +""
  if nxt == 38 # & prefix is kept as part of the verb name
    name << "&"
    s.pos += 1
    @col += 1
  end

  word = s.scan(/[a-zA-Z0-9_.\-]+/) || ""
  name << word
  @col += word.length

  if name.empty?
    emit(TokenType::ERROR, "Invalid verb", line, col)
    return false
  end

  emit(TokenType::VERB, name, line, col)
  true
end

# Scans the arguments that follow a verb name, one token at a time, until the
# end of input, a line break, ";" , ":" or a character that starts no
# argument.
#
# A nested "%verb" only has its name scanned here; its arguments are picked
# up by the same loop. The resulting token stream is the same as scanning the
# nested verb recursively (the outer scan would stop exactly where the inner
# one stopped), but the call depth stays constant, so a long chain of nested
# verbs cannot exhaust the stack.
#
# @return [nil]
def scan_verb_arguments
  s = @scanner
  until s.eos?
    if (ws = s.scan(RE_WHITESPACE))
      @col += ws.length
    end
    break if s.eos?

    byte = s.string.getbyte(s.pos)
    break if byte == 10 || byte == 13 || byte == 59 || byte == 58 # \n \r ; :

    line = @line
    col = @col

    case byte
    when 34 then scan_string(line, col) # "
    when 35 then scan_number_prefix(line, col) # #
    when 63 # ?
      s.pos += 1
      @col += 1
      word = s.scan(RE_WORD) || ""
      @col += word.length
      if word == "true" || word == "false"
        emit(TokenType::BOOLEAN, word, line, col)
      else
        emit(TokenType::ERROR, "Invalid boolean: ?#{word}", line, col)
      end
    when 126 # ~
      s.pos += 1
      @col += 1
      emit(TokenType::NULL, "~", line, col)
    when 64 then scan_reference(line, col) # @
    when 94 then scan_binary(line, col) # ^
    when 37 then scan_verb_name(line, col) # % nested verb, arguments follow in this loop
    when 124 # |
      s.pos += 1
      @col += 1
      emit(TokenType::PIPE, "|", line, col)
    else
      if digit_byte?(byte)
        scan_date_or_number(line, col)
      elsif byte == 116 || byte == 102 # t, f
        scan_bare_boolean_or_identifier(line, col)
      elsif byte == 80 # P
        scan_possible_duration(line, col)
      elsif byte == 84 # T
        scan_possible_time(line, col)
      elsif ident_start_byte?(byte)
        scan_bare_string_value(line, col)
      else
        break
      end
    end
  end
end
|
|
668
|
+
|
|
669
|
+
# Scans a ":name" directive starting at the ":" under the scanner.
#
# Emits a DIRECTIVE token with the name, or ERROR "Empty directive" when no
# name follows the colon. After the name and any spaces/tabs, a double-quoted
# string (if one follows on the same line) is scanned with #scan_string as
# the directive's value.
#
# @param line [Integer] line of the ":"
# @param col [Integer] column of the ":"
def scan_directive(line, col)
  scanner = @scanner
  scanner.pos += 1
  @col += 1

  name = scanner.scan(RE_WORD) || ""
  @col += name.length

  if name.empty?
    emit(TokenType::ERROR, "Empty directive", line, col)
    return
  end
  emit(TokenType::DIRECTIVE, name, line, col)

  blanks = scanner.scan(RE_WHITESPACE)
  @col += blanks.length if blanks

  # Only a quoted string counts as the directive's value; end of input, a line
  # break, a comment or anything else is left for the caller.
  scan_string(@line, @col) if !scanner.eos? && scanner.string.getbyte(scanner.pos) == 34
end
|
|
691
|
+
|
|
692
|
+
# Scans a path that begins with an array index, e.g. "[0].name[1]", starting
# at the "[" under the scanner, and emits it as one PATH token.
#
# The path is built from alternating "[...]" groups (RE_ARRAY_INDEX) and runs
# of letters, digits, "_", "." and "-".
#
# When nothing can be consumed (a "[" that is never closed), one character is
# consumed and an ERROR "Unterminated array index" is emitted instead of an
# empty PATH. The scanner therefore always moves forward; an empty PATH with
# no progress would make the caller's scan loop spin on the same "[" forever.
#
# NOTE(review): RE_ARRAY_INDEX can match across line breaks, in which case
# @line is not advanced here - confirm whether multi-line indices should be
# rejected.
#
# @param line [Integer] line of the "["
# @param col [Integer] column of the "["
def scan_array_indexed_path(line, col)
  s = @scanner
  word = +""

  loop do
    piece = s.scan(/[a-zA-Z0-9_.\-]+/) || s.scan(RE_ARRAY_INDEX)
    break unless piece

    word << piece
    @col += piece.length
  end

  if word.empty?
    # Guarantee forward progress: drop the offending character.
    s.getch
    @col += 1
    emit(TokenType::ERROR, "Unterminated array index", line, col)
    return
  end

  emit(TokenType::PATH, word, line, col)
end
|
|
714
|
+
|
|
715
|
+
# Scans a path/identifier at the scanner position and emits it as one PATH
# token.
#
# An optional leading "." or "&" is kept as part of the path. The body is any
# sequence of: runs of letters, digits, "_", "." and "-"; "[...]" index
# groups (RE_ARRAY_INDEX); and "&" characters.
#
# The leading character is derived from the byte that was just inspected.
# It must not be fetched with src[scanner.pos]: pos is a byte offset while
# String#[] takes a character index, so once any multi-byte character occurs
# earlier in the document that lookup returns a different character.
#
# @param line [Integer] line where the path starts
# @param col [Integer] column where the path starts
def scan_identifier(line, col)
  s = @scanner
  src = s.string
  word = +""

  # Allow a leading dot or ampersand.
  lead = src.getbyte(s.pos)
  if lead == 46 || lead == 38 # . or &
    word << (lead == 46 ? "." : "&")
    s.pos += 1
    @col += 1
  end

  # Identifier body with dots, index groups and ampersands.
  loop do
    if (chunk = s.scan(/[a-zA-Z0-9_.\-]+/))
      word << chunk
      @col += chunk.length
    elsif (idx = s.scan(RE_ARRAY_INDEX))
      word << idx
      @col += idx.length
    elsif !s.eos? && src.getbyte(s.pos) == 38 # &
      word << "&"
      s.pos += 1
      @col += 1
    else
      break
    end
  end

  emit(TokenType::PATH, word, line, col)
end
|
|
744
|
+
|
|
745
|
+
# Scans one RE_WORD at the scanner position.
#
# The words "true" and "false" become BOOLEAN tokens; any other word becomes a
# STRING token marked raw: "bare" (a single word only, never several).
#
# @param line [Integer] line where the word starts
# @param col [Integer] column where the word starts
def scan_bare_boolean_or_identifier(line, col)
  word = @scanner.scan(RE_WORD) || ""
  @col += word.length

  case word
  when "true", "false"
    emit(TokenType::BOOLEAN, word, line, col)
  else
    emit(TokenType::STRING, word, line, col, raw: "bare")
  end
end
|
|
757
|
+
|
|
758
|
+
# Scans a value that starts with "P": a duration if it looks like one,
# otherwise a bare string.
#
# The text matched by RE_DURATION is accepted as a DURATION token only when it
# is longer than one character and contains at least one digit. In every other
# case nothing is consumed here and #scan_bare_string_value takes over from
# the same position.
#
# @param line [Integer] line where the value starts
# @param col [Integer] column where the value starts
def scan_possible_duration(line, col)
  scanner = @scanner

  # Look ahead without consuming, so a rejected match needs no rewind.
  candidate = scanner.check(RE_DURATION)

  if candidate && candidate.length > 1 && candidate.match?(/[0-9]/)
    scanner.pos += candidate.bytesize
    @col += candidate.length
    emit(TokenType::DURATION, candidate, line, col)
  else
    scan_bare_string_value(line, col)
  end
end
|
|
775
|
+
|
|
776
|
+
# Scans a value that starts with "T": a time if RE_TIME_VAL matches more than
# one character, otherwise a bare string.
#
# When the pattern does not match, nothing is consumed here and
# #scan_bare_string_value takes over from the same position.
#
# @param line [Integer] line where the value starts
# @param col [Integer] column where the value starts
def scan_possible_time(line, col)
  scanner = @scanner

  # Look ahead without consuming, so a rejected match needs no rewind.
  candidate = scanner.check(RE_TIME_VAL)

  if candidate && candidate.length > 1
    scanner.pos += candidate.bytesize
    @col += candidate.length
    emit(TokenType::TIME, candidate, line, col)
  else
    scan_bare_string_value(line, col)
  end
end
|
|
793
|
+
|
|
794
|
+
# Scans a value that starts with a digit and classifies it by shape.
#
# The text matched by RE_DATE_OR_NUM becomes a TIMESTAMP when it starts with
# "YYYY-MM-DDT", a DATE when it is exactly "YYYY-MM-DD", and a NUMBER
# otherwise.
#
# @param line [Integer] line where the value starts
# @param col [Integer] column where the value starts
def scan_date_or_number(line, col)
  text = @scanner.scan(RE_DATE_OR_NUM) || ""
  @col += text.length

  type =
    if text.match?(RE_DATE_PREFIX)
      TokenType::TIMESTAMP
    elsif text.match?(RE_DATE_EXACT)
      TokenType::DATE
    else
      TokenType::NUMBER
    end

  emit(type, text, line, col)
end
|
|
807
|
+
|
|
808
|
+
# Scans an unquoted value: everything matched by RE_BARE_VALUE at the scanner
# position becomes a STRING token marked raw: "bare". The value is "" when
# the pattern does not match.
#
# @param line [Integer] line where the value starts
# @param col [Integer] column where the value starts
def scan_bare_string_value(line, col)
  text = @scanner.scan(RE_BARE_VALUE) || ""
  @col += text.length
  emit(TokenType::STRING, text, line, col, raw: "bare")
end
|
|
814
|
+
|
|
815
|
+
# Byte classification helper: works on the integer byte value, so no string
# is allocated.
#
# @param b [Integer] a byte value
# @return [Boolean] true for ASCII "A".."Z", "a".."z" and "_"
def ident_start_byte?(b)
  b.between?(65, 90) || b.between?(97, 122) || b == 95
end
|
|
819
|
+
|
|
820
|
+
# Byte classification helper for decimal digits.
#
# @param b [Integer] a byte value
# @return [Boolean] true for ASCII "0".."9"
def digit_byte?(b)
  b.between?(48, 57)
end
|
|
823
|
+
end
|
|
824
|
+
end
|
|
825
|
+
end
|