@johntalton/json-tokenizer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +101 -0
  3. package/package.json +15 -0
  4. package/src/index.js +799 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 John
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,101 @@
1
+ # JSON Tokenizer
2
+
3
+ A generator function that tokenizes a string according to the JSON grammar.
4
+
5
+ - Uses Generator based API
6
+ - Produces tokens for all input text (including error tokens)
7
+ - Uses `Intl.Segmenter` with `'grapheme'` granularity
8
+ - The segmenter locale is configurable (default: `'en-US'`)
9
+ - Accepts an `AbortSignal` to stop tokenization early
10
+ - Makes a best effort to match the restrictions of `JSON.parse`
11
+
12
+ ## Example
13
+
14
+ Basic initialization and iteration:
15
+
16
+ ```js
17
+ import { JSONTokenizer } from '@johntalton/json-tokenizer'
18
+
19
+ const signal = AbortSignal.timeout(100)
20
+ const text = '{ }'
21
+
22
+ for(const token of JSONTokenizer.tokenize(text, { signal })) {
23
+ const { type, value } = token
24
+ // ...
25
+ }
26
+ ```
27
+
28
+ The following shows the token stream produced from a valid JSON text:
29
+
30
+ ```js
31
+ import { JSONTokenizer } from '@johntalton/json-tokenizer'
32
+
33
+ const text = JSON.stringify({
34
+ team: 'Mystery Inc',
35
+ members: [ 'Fred', 'Daphne', 'Velma', 'Shaggy', 'Scooby' ],
36
+ aired: 1969
37
+ }, undefined, '\t')
38
+
39
+ const stream = JSONTokenizer.tokenize(text)
40
+
41
+ for(const token of stream) {
42
+ console.log(token)
43
+ }
44
+ /*
45
+ { type: 'object-open', value: '{' }
46
+ { type: 'whitespace', value: '\n\t' }
47
+ { type: 'open-key-quote', value: '"' }
48
+ { type: 'key', value: 'team' }
49
+ { type: 'close-key-quote', value: '"' }
50
+ { type: 'colon', value: ':' }
51
+ { type: 'whitespace', value: ' ' }
52
+ { type: 'open-string-quote', value: '"' }
53
+ { type: 'string', value: 'Mystery Inc' }
54
+ { type: 'close-string-quote', value: '"' }
55
+ { type: 'object-member-comma', value: ',' }
56
+ { type: 'whitespace', value: '\n\t' }
57
+ { type: 'open-key-quote', value: '"' }
58
+ { type: 'key', value: 'members' }
59
+ { type: 'close-key-quote', value: '"' }
60
+ { type: 'colon', value: ':' }
61
+ { type: 'whitespace', value: ' ' }
62
+ { type: 'array-open', value: '[' }
63
+ { type: 'whitespace', value: '\n\t\t' }
64
+ { type: 'open-string-quote', value: '"' }
65
+ { type: 'string', value: 'Fred' }
66
+ { type: 'close-string-quote', value: '"' }
67
+ { type: 'array-element-comma', value: ',' }
68
+ { type: 'whitespace', value: '\n\t\t' }
69
+ { type: 'open-string-quote', value: '"' }
70
+ { type: 'string', value: 'Daphne' }
71
+ { type: 'close-string-quote', value: '"' }
72
+ { type: 'array-element-comma', value: ',' }
73
+ { type: 'whitespace', value: '\n\t\t' }
74
+ { type: 'open-string-quote', value: '"' }
75
+ { type: 'string', value: 'Velma' }
76
+ { type: 'close-string-quote', value: '"' }
77
+ { type: 'array-element-comma', value: ',' }
78
+ { type: 'whitespace', value: '\n\t\t' }
79
+ { type: 'open-string-quote', value: '"' }
80
+ { type: 'string', value: 'Shaggy' }
81
+ { type: 'close-string-quote', value: '"' }
82
+ { type: 'array-element-comma', value: ',' }
83
+ { type: 'whitespace', value: '\n\t\t' }
84
+ { type: 'open-string-quote', value: '"' }
85
+ { type: 'string', value: 'Scooby' }
86
+ { type: 'close-string-quote', value: '"' }
87
+ { type: 'whitespace', value: '\n\t' }
88
+ { type: 'array-close', value: ']' }
89
+ { type: 'object-member-comma', value: ',' }
90
+ { type: 'whitespace', value: '\n\t' }
91
+ { type: 'open-key-quote', value: '"' }
92
+ { type: 'key', value: 'aired' }
93
+ { type: 'close-key-quote', value: '"' }
94
+ { type: 'colon', value: ':' }
95
+ { type: 'whitespace', value: ' ' }
96
+ { type: 'number', value: '1969' }
97
+ { type: 'whitespace', value: '\n' }
98
+ { type: 'object-close', value: '}' }
99
+ { type: 'eof', value: '' }
100
+ */
101
+ ```
package/package.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "name": "@johntalton/json-tokenizer",
3
+ "type": "module",
4
+ "version": "1.0.0",
5
+ "license": "MIT",
6
+ "exports": {
7
+ ".": "./src/index.js"
8
+ },
9
+ "files": [
10
+ "src/*.js"
11
+ ],
12
+ "repository": {
13
+ "url": "git+https://github.com/johntalton/json-tokenizer.git"
14
+ }
15
+ }
package/src/index.js ADDED
@@ -0,0 +1,799 @@
1
/**
 * Options accepted by `JSONTokenizer.tokenize`.
 *
 * @typedef {Object} TokenizerOptions
 * @property {string|undefined} [locale = 'en-US'] locale handed to `Intl.Segmenter`
 * @property {AbortSignal|undefined} [signal] once aborted, tokenization stops before the next step
 * @property {boolean|undefined} [debug] when exactly `true`, every step is logged with `console.log`
 */

/**
 * A single token produced by `JSONTokenizer.tokenize`.
 *
 * @typedef {Object} Token
 * @property {string} type one of the `TOKEN` values
 * @property {string} value the source text covered by the token (may be empty)
 */

/** Token `type` values emitted by the tokenizer. */
export const TOKEN = {
	ERROR: 'error',
	EOF: 'eof',
	WHITESPACE: 'whitespace',

	// Array / Elements
	ARRAY_OPEN: 'array-open',
	ARRAY_CLOSE: 'array-close',
	ARRAY_CLOSE_IMMEDIATE: 'array-close-immediate',
	ARRAY_ELEMENT_COMMA: 'array-element-comma',

	// Object
	OBJECT_OPEN: 'object-open',
	OBJECT_CLOSE: 'object-close',
	OBJECT_CLOSE_IMMEDIATE: 'object-close-immediate',
	OBJECT_MEMBER_COMMA: 'object-member-comma',
	OBJECT_KEY_OPEN: 'open-key-quote',
	OBJECT_KEY: 'key',
	OBJECT_KEY_CLOSE: 'close-key-quote',
	OBJECT_COLON: 'colon',

	// String
	STRING_OPEN: 'open-string-quote',
	STRING: 'string',
	STRING_CLOSE: 'close-string-quote',

	// Primitives
	TRUE: 'true',
	FALSE: 'false',
	NULL: 'null',

	// Number
	NUMBER: 'number'
}

/** Value used for tokens that cover no source text and to reset the accumulator. */
export const EMPTY = ''

/** Names of the states of the tokenizer's state machine. */
export const STATE = {
	ELEMENTS: 'es',
	ELEMENT: 'e',
	ELEMENT_AFTER: 'e_',
	VALUE: 'v',
	WS: 'w',
	OBJ_OPEN: 'oo',
	OBJ_CLOSE: 'oc',
	ARY_OPEN: 'ao',
	ARY_CLOSE: 'ac',
	MEMBERS: 'ms',
	MEMBER: 'm',
	KEY: 'key',
	MEMBER_KEY_AFTER: 'mka',
	STR: 'str',
	COLON: ':',
	NULL: 'null',
	TRUE: 'true',
	FALSE: 'false',
	NUMBER: 'num',
	NUMBER_INT: 'numi',
	NUMBER_INT19: 'num19',
	NUMBER_INT_AFTER: 'num_',
	NUMBER_EXPONENT_SIGN: 'esign',
	NUMBER_EXPONENT: 'exp',
	NUMBER_EXPONENT_FIRST: 'expf',
	NUMBER_DECIMAL_FIRST: 'numdf',
	NUMBER_DECIMAL: 'numd',
	U_HEX4: 'hex'
}

/** Characters that may follow a backslash inside a string or key. */
export const ESCAPE_CHARS = [
	'"', '\\', '/', 'b', 'f', 'n', 'r', 't', 'u'
]

/** Locale used for segmentation when the caller does not supply one. */
export const DEFAULT_LOCALE = 'en-US'

// Whitespace segments. Grapheme segmentation never splits CR LF, so the pair
// arrives as the single segment '\r\n' and has to be listed explicitly.
const WHITESPACE_SEGMENTS = new Set([' ', '\t', '\n', '\r', '\r\n'])

const NON_ZERO_DIGITS = new Set('123456789')
const DIGITS = new Set('0123456789')
const HEX_DIGITS = new Set('0123456789abcdefABCDEF')

// Number of hex digits required after `\u`.
const HEX_DIGIT_COUNT = 4

// Keyword literals, indexed by their first character.
const LITERALS = new Map([
	['t', { text: 'true', type: TOKEN.TRUE }],
	['f', { text: 'false', type: TOKEN.FALSE }],
	['n', { text: 'null', type: TOKEN.NULL }]
])

// Number states in which the text accumulated so far is already a complete number.
const COMPLETE_NUMBER_STATES = new Set([
	STATE.NUMBER_INT,
	STATE.NUMBER_INT_AFTER,
	STATE.NUMBER_DECIMAL,
	STATE.NUMBER_EXPONENT
])

export class JSONTokenizer {
	/**
	 * Tells whether `str` is allowed directly after a backslash.
	 *
	 * @param {string} str
	 * @returns {boolean}
	 */
	static isValueEscapeChar(str) {
		return ESCAPE_CHARS.includes(str)
	}

	/**
	 * Tells whether the first code point of `str` lies in the range
	 * U+0020 - U+10FFFF. An empty string is not valid.
	 *
	 * @param {string} str
	 * @returns {boolean}
	 */
	static isValidChar(str) {
		const codePoint = str.codePointAt(0)
		return codePoint !== undefined && codePoint >= 0x0020 && codePoint <= 0x10FFFF
	}

	/**
	 * Lazily tokenizes `str` as JSON text.
	 *
	 * Text that does not fit the grammar is reported through `TOKEN.ERROR`
	 * tokens instead of exceptions. The stream ends with a `TOKEN.EOF` token,
	 * preceded by an empty `TOKEN.ERROR` when the input stops before the
	 * top-level value is complete. Two exceptions: an empty `str` yields a
	 * single empty `TOKEN.ERROR` and nothing else, and an aborted `signal`
	 * ends the stream without any further token.
	 *
	 * @param {string} str
	 * @param {TokenizerOptions} [options]
	 * @yields {Token}
	 * @returns {Generator<Token, void, unknown>}
	 */
	static *tokenize(str, options) {
		const debug = (options?.debug ?? false) === true
		const signal = options?.signal
		const locale = options?.locale ?? DEFAULT_LOCALE

		const segmenter = new Intl.Segmenter(locale, { granularity: 'grapheme' })
		// A plain `const` instead of `using`: the segment iterator defines no
		// `return()`, so there is nothing to dispose, and the code keeps
		// working on engines without explicit resource management.
		const segmentIter = segmenter.segment(str)[Symbol.iterator]()

		/** @type {Array<string>} states to return to once the current construct ends */
		const stack = []

		/** @type {string|undefined} */
		let state = STATE.ELEMENT

		// Text collected for the token currently being built.
		let accumulator = EMPTY

		// Hex digits still required by the `\u` escape being read.
		let hexRemaining = 0

		let next = segmentIter.next()

		// Empty input is not a JSON text.
		if(next.done) {
			yield { type: TOKEN.ERROR, value: EMPTY }
			return
		}

		while(true) {
			if(signal?.aborted) { break }

			// End of input: flush whatever is pending, then report completeness.
			if(next.done) {
				if(state === STATE.WS) {
					// Whitespace running up to the end of the input.
					yield { type: TOKEN.WHITESPACE, value: accumulator }
					accumulator = EMPTY
					state = stack.pop()
				}
				else if(state !== undefined && COMPLETE_NUMBER_STATES.has(state)) {
					// A number needs no terminator, so the end of input completes it.
					yield { type: TOKEN.NUMBER, value: accumulator }
					accumulator = EMPTY
					state = stack.pop()
				}

				// Anything still accumulated belongs to an unfinished token.
				if(accumulator !== EMPTY) { yield { type: TOKEN.ERROR, value: accumulator } }

				// The text is complete only when the top-level value has ended
				// and nothing is left waiting on the stack.
				if(state !== STATE.ELEMENT_AFTER || stack.length !== 0) {
					yield { type: TOKEN.ERROR, value: EMPTY }
				}

				yield { type: TOKEN.EOF, value: EMPTY }
				break
			}

			// The top-level value was closed one time too many: the rest of the
			// input cannot be interpreted and is reported as one error token.
			if(state === undefined) {
				if(accumulator !== EMPTY) { yield { type: TOKEN.ERROR, value: accumulator } }

				accumulator = EMPTY
				while(!next.done) {
					accumulator += next.value.segment
					next = segmentIter.next()
				}
				yield { type: TOKEN.ERROR, value: accumulator }

				yield { type: TOKEN.EOF, value: EMPTY }
				break
			}

			const segment = next.value.segment

			if(debug) {
				console.log({
					seg: segment,
					state,
					stack: stack.join(','),
					accumulator
				})
			}

			switch(state) {
				case STATE.ELEMENTS:
					if(segment === ']') {
						state = stack.pop()
					}
					else {
						if(segment === ',') {
							yield { type: TOKEN.ARRAY_ELEMENT_COMMA, value: segment }
							next = segmentIter.next()
						}
						stack.push(STATE.ELEMENTS)
						state = STATE.ELEMENT
					}
					break

				case STATE.ELEMENT:
					if(WHITESPACE_SEGMENTS.has(segment)) {
						stack.push(STATE.ELEMENT)
						state = STATE.WS
					}
					else {
						stack.push(STATE.ELEMENT_AFTER)
						state = STATE.VALUE
					}
					break

				case STATE.ELEMENT_AFTER:
					if(WHITESPACE_SEGMENTS.has(segment)) {
						stack.push(STATE.ELEMENT_AFTER)
						state = STATE.WS
					}
					else if(segment === '}' || segment === ']' || segment === ',') {
						// Left unconsumed: the enclosing construct decides what it means.
						state = stack.pop()
					}
					else {
						yield { type: TOKEN.ERROR, value: segment }
						next = segmentIter.next()
					}
					break

				case STATE.WS:
					if(WHITESPACE_SEGMENTS.has(segment)) {
						accumulator += segment
						next = segmentIter.next()
					}
					else {
						yield { type: TOKEN.WHITESPACE, value: accumulator }
						accumulator = EMPTY
						state = stack.pop()
					}
					break

				case STATE.VALUE: {
					const literal = LITERALS.get(segment)

					if(segment === '{') {
						yield { type: TOKEN.OBJECT_OPEN, value: segment }
						state = STATE.OBJ_OPEN
						next = segmentIter.next()
					}
					else if(segment === '[') {
						yield { type: TOKEN.ARRAY_OPEN, value: segment }
						state = STATE.ARY_OPEN
						next = segmentIter.next()
					}
					else if(segment === '"') {
						yield { type: TOKEN.STRING_OPEN, value: segment }
						accumulator = EMPTY
						state = STATE.STR
						next = segmentIter.next()
					}
					else if(literal !== undefined) {
						// true / false / null: match the remaining characters one by
						// one. A mismatching segment is left unconsumed and the part
						// matched so far is reported as an error.
						accumulator = segment
						next = segmentIter.next()

						let matched = true
						for(const expected of literal.text.slice(1)) {
							if(next.done || next.value.segment !== expected) {
								matched = false
								break
							}
							accumulator += next.value.segment
							next = segmentIter.next()
						}

						yield { type: matched ? literal.type : TOKEN.ERROR, value: accumulator }
						accumulator = EMPTY
						state = stack.pop()
					}
					else if(segment === '-') {
						accumulator = segment
						state = STATE.NUMBER
						next = segmentIter.next()
					}
					else if(segment === '0') {
						// A leading zero cannot be followed by further integer digits.
						accumulator = segment
						state = STATE.NUMBER_INT_AFTER
						next = segmentIter.next()
					}
					else if(NON_ZERO_DIGITS.has(segment)) {
						// The digit itself is consumed by NUMBER_INT.
						accumulator = EMPTY
						state = STATE.NUMBER_INT
					}
					else {
						yield { type: TOKEN.ERROR, value: segment }
						next = segmentIter.next()
					}
					break
				}

				case STATE.NUMBER:
					// Directly after the minus sign.
					if(segment === '0') {
						accumulator += segment
						state = STATE.NUMBER_INT_AFTER
						next = segmentIter.next()
					}
					else {
						state = STATE.NUMBER_INT19
					}
					break

				case STATE.NUMBER_INT19:
					accumulator += segment
					if(NON_ZERO_DIGITS.has(segment)) {
						state = STATE.NUMBER_INT
					}
					else {
						yield { type: TOKEN.ERROR, value: accumulator }
						accumulator = EMPTY
					}
					next = segmentIter.next()
					break

				case STATE.NUMBER_INT:
					if(DIGITS.has(segment)) {
						accumulator += segment
						next = segmentIter.next()
					}
					else {
						state = STATE.NUMBER_INT_AFTER
					}
					break

				case STATE.NUMBER_INT_AFTER:
					if(segment === '.') {
						accumulator += segment
						state = STATE.NUMBER_DECIMAL_FIRST
						next = segmentIter.next()
					}
					else if(segment === 'e' || segment === 'E') {
						accumulator += segment
						state = STATE.NUMBER_EXPONENT_SIGN
						next = segmentIter.next()
					}
					else {
						yield { type: TOKEN.NUMBER, value: accumulator }
						accumulator = EMPTY
						state = stack.pop()
					}
					break

				case STATE.NUMBER_DECIMAL_FIRST:
					// At least one digit is required after the decimal point.
					accumulator += segment
					if(DIGITS.has(segment)) {
						state = STATE.NUMBER_DECIMAL
					}
					else {
						yield { type: TOKEN.ERROR, value: accumulator }
						accumulator = EMPTY
					}
					next = segmentIter.next()
					break

				case STATE.NUMBER_DECIMAL:
					if(DIGITS.has(segment)) {
						accumulator += segment
						next = segmentIter.next()
					}
					else if(segment === 'e' || segment === 'E') {
						accumulator += segment
						state = STATE.NUMBER_EXPONENT_SIGN
						next = segmentIter.next()
					}
					else {
						yield { type: TOKEN.NUMBER, value: accumulator }
						accumulator = EMPTY
						state = stack.pop()
					}
					break

				case STATE.NUMBER_EXPONENT_SIGN:
					// The sign is optional.
					if(segment === '+' || segment === '-') {
						accumulator += segment
						next = segmentIter.next()
					}
					state = STATE.NUMBER_EXPONENT_FIRST
					break

				case STATE.NUMBER_EXPONENT_FIRST:
					// At least one digit is required in the exponent.
					accumulator += segment
					if(DIGITS.has(segment)) {
						state = STATE.NUMBER_EXPONENT
					}
					else {
						yield { type: TOKEN.ERROR, value: accumulator }
						accumulator = EMPTY
					}
					next = segmentIter.next()
					break

				case STATE.NUMBER_EXPONENT:
					if(DIGITS.has(segment)) {
						accumulator += segment
						next = segmentIter.next()
					}
					else {
						yield { type: TOKEN.NUMBER, value: accumulator }
						accumulator = EMPTY
						state = stack.pop()
					}
					break

				case STATE.OBJ_OPEN:
					if(WHITESPACE_SEGMENTS.has(segment)) {
						stack.push(STATE.OBJ_OPEN)
						state = STATE.WS
					}
					else if(segment === '}') {
						yield { type: TOKEN.OBJECT_CLOSE_IMMEDIATE, value: segment }
						next = segmentIter.next()
						state = stack.pop()
					}
					else if(segment === ',') {
						// A comma before the first member is invalid, as in ARY_OPEN.
						yield { type: TOKEN.ERROR, value: segment }
						next = segmentIter.next()
					}
					else {
						stack.push(STATE.OBJ_CLOSE)
						state = STATE.MEMBERS
					}
					break

				case STATE.OBJ_CLOSE:
					if(segment === '}') {
						yield { type: TOKEN.OBJECT_CLOSE, value: segment }
						next = segmentIter.next()
						state = stack.pop()
					}
					else {
						// Defensive: MEMBERS only hands over on '}', so this branch is
						// not expected to run; report a token rather than throw.
						yield { type: TOKEN.ERROR, value: segment }
						next = segmentIter.next()
					}
					break

				case STATE.MEMBERS:
					if(segment === '}') {
						state = stack.pop()
					}
					else {
						if(segment === ',') {
							yield { type: TOKEN.OBJECT_MEMBER_COMMA, value: segment }
							next = segmentIter.next()
						}
						stack.push(STATE.MEMBERS)
						state = STATE.MEMBER
					}
					break

				case STATE.MEMBER:
					if(WHITESPACE_SEGMENTS.has(segment)) {
						stack.push(STATE.MEMBER)
						state = STATE.WS
					}
					else if(segment === '"') {
						yield { type: TOKEN.OBJECT_KEY_OPEN, value: segment }
						stack.push(STATE.MEMBER_KEY_AFTER)
						state = STATE.KEY
						next = segmentIter.next()
					}
					else {
						yield { type: TOKEN.ERROR, value: segment }
						next = segmentIter.next()
					}
					break

				case STATE.MEMBER_KEY_AFTER:
					if(WHITESPACE_SEGMENTS.has(segment)) {
						stack.push(STATE.MEMBER_KEY_AFTER)
						state = STATE.WS
					}
					else if(segment === ':') {
						yield { type: TOKEN.OBJECT_COLON, value: segment }
						next = segmentIter.next()
						state = STATE.ELEMENT
					}
					else {
						yield { type: TOKEN.ERROR, value: segment }
						next = segmentIter.next()
					}
					break

				case STATE.ARY_OPEN:
					if(WHITESPACE_SEGMENTS.has(segment)) {
						stack.push(STATE.ARY_OPEN)
						state = STATE.WS
					}
					else if(segment === ']') {
						yield { type: TOKEN.ARRAY_CLOSE_IMMEDIATE, value: segment }
						next = segmentIter.next()
						state = stack.pop()
					}
					else if(segment === ',') {
						yield { type: TOKEN.ERROR, value: segment }
						next = segmentIter.next()
					}
					else {
						stack.push(STATE.ARY_CLOSE)
						state = STATE.ELEMENTS
					}
					break

				case STATE.ARY_CLOSE:
					if(segment === ']') {
						yield { type: TOKEN.ARRAY_CLOSE, value: segment }
						next = segmentIter.next()
						state = stack.pop()
					}
					else {
						// Defensive: ELEMENTS only hands over on ']', so this branch is
						// not expected to run; report a token rather than throw.
						yield { type: TOKEN.ERROR, value: segment }
						next = segmentIter.next()
					}
					break

				case STATE.KEY:
				case STATE.STR:
					if(segment === '"') {
						const isKey = state === STATE.KEY
						yield { type: isKey ? TOKEN.OBJECT_KEY : TOKEN.STRING, value: accumulator }
						yield { type: isKey ? TOKEN.OBJECT_KEY_CLOSE : TOKEN.STRING_CLOSE, value: segment }

						accumulator = EMPTY
						next = segmentIter.next()
						state = stack.pop()
					}
					else if(segment === '\\') {
						accumulator += segment

						next = segmentIter.next()
						if(next.done) {
							// The input ends in the middle of an escape sequence.
							yield { type: TOKEN.ERROR, value: accumulator }
							accumulator = EMPTY
							state = stack.pop()
							break
						}

						const escaped = next.value.segment
						accumulator += escaped

						if(!JSONTokenizer.isValueEscapeChar(escaped)) {
							yield { type: TOKEN.ERROR, value: accumulator }
							accumulator = EMPTY
						}

						if(escaped === 'u') {
							// Remember whether a key or a string is being read.
							stack.push(state)
							state = STATE.U_HEX4
							hexRemaining = HEX_DIGIT_COUNT
						}

						next = segmentIter.next()
					}
					else {
						accumulator += segment

						if(!JSONTokenizer.isValidChar(segment)) {
							yield { type: TOKEN.ERROR, value: accumulator }
							accumulator = EMPTY
						}

						next = segmentIter.next()
					}
					break

				case STATE.U_HEX4:
					// Each of the four segments after `\u` must be a hex digit.
					accumulator += segment
					if(HEX_DIGITS.has(segment)) {
						hexRemaining -= 1
						if(hexRemaining === 0) { state = stack.pop() }
					}
					else {
						// Like the other "digit required" states, report the bad
						// segment and keep waiting for a full group of hex digits.
						yield { type: TOKEN.ERROR, value: accumulator }
						accumulator = EMPTY
						hexRemaining = HEX_DIGIT_COUNT
					}
					next = segmentIter.next()
					break

				default:
					// Internal invariant: only STATE values are ever pushed or assigned.
					throw new Error(`unknown state ${state}`)
			}
		}
	}
}
799
+