@bcts/dcbor-parse 1.0.0-alpha.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/token.ts ADDED
@@ -0,0 +1,641 @@
1
+ /**
2
+ * @bcts/dcbor-parse - Token types and Lexer
3
+ *
4
+ * This is a 1:1 TypeScript port of bc-dcbor-parse-rust token.rs
5
+ *
6
+ * @module dcbor-parse/token
7
+ */
8
+
9
+ import { type CborDate, CborDate as DCborDate } from "@bcts/dcbor";
10
+ import { UR } from "@bcts/uniform-resources";
11
+ import { type Span, span, parseError as PE, type ParseResult, ok, err } from "./error";
12
+
13
/** Shape of a token that is fully described by its `type` tag alone. */
type MarkerToken<K extends string> = { readonly type: K };

/** Shape of a token whose `type` tag is accompanied by a `value` payload. */
type PayloadToken<K extends string, V> = { readonly type: K; readonly value: V };

/**
 * Token types produced by the lexer.
 *
 * Corresponds to the Rust `Token` enum in token.rs
 *
 * Every variant is discriminated by its `type` tag; variants that carry data
 * expose it through a read-only `value` property.
 */
export type Token =
  | PayloadToken<"Bool", boolean>
  | MarkerToken<"BraceOpen">
  | MarkerToken<"BraceClose">
  | MarkerToken<"BracketOpen">
  | MarkerToken<"BracketClose">
  | MarkerToken<"ParenthesisOpen">
  | MarkerToken<"ParenthesisClose">
  | MarkerToken<"Colon">
  | MarkerToken<"Comma">
  | MarkerToken<"Null">
  | MarkerToken<"NaN">
  | MarkerToken<"Infinity">
  | MarkerToken<"NegInfinity">
  | PayloadToken<"ByteStringHex", Uint8Array>
  | PayloadToken<"ByteStringBase64", Uint8Array>
  | PayloadToken<"DateLiteral", CborDate>
  | PayloadToken<"Number", number>
  | PayloadToken<"String", string>
  | PayloadToken<"TagValue", number>
  | PayloadToken<"TagName", string>
  | PayloadToken<"KnownValueNumber", number>
  | PayloadToken<"KnownValueName", string>
  | MarkerToken<"Unit">
  | PayloadToken<"UR", UR>;
43
+
44
/**
 * Factory functions for every `Token` variant.
 *
 * The table is named in lowercase so it does not clash with the `Token` type.
 * Each factory returns a new plain object tagged with the matching `type`.
 */
export const token = {
  bool: (value: boolean): Token => ({ type: "Bool", value }),
  braceOpen: (): Token => ({ type: "BraceOpen" }),
  braceClose: (): Token => ({ type: "BraceClose" }),
  bracketOpen: (): Token => ({ type: "BracketOpen" }),
  bracketClose: (): Token => ({ type: "BracketClose" }),
  parenthesisOpen: (): Token => ({ type: "ParenthesisOpen" }),
  parenthesisClose: (): Token => ({ type: "ParenthesisClose" }),
  colon: (): Token => ({ type: "Colon" }),
  comma: (): Token => ({ type: "Comma" }),
  null: (): Token => ({ type: "Null" }),
  nan: (): Token => ({ type: "NaN" }),
  infinity: (): Token => ({ type: "Infinity" }),
  negInfinity: (): Token => ({ type: "NegInfinity" }),
  byteStringHex: (value: Uint8Array): Token => ({ type: "ByteStringHex", value }),
  byteStringBase64: (value: Uint8Array): Token => ({ type: "ByteStringBase64", value }),
  dateLiteral: (value: CborDate): Token => ({ type: "DateLiteral", value }),
  number: (value: number): Token => ({ type: "Number", value }),
  string: (value: string): Token => ({ type: "String", value }),
  tagValue: (value: number): Token => ({ type: "TagValue", value }),
  tagName: (value: string): Token => ({ type: "TagName", value }),
  knownValueNumber: (value: number): Token => ({ type: "KnownValueNumber", value }),
  knownValueName: (value: string): Token => ({ type: "KnownValueName", value }),
  unit: (): Token => ({ type: "Unit" }),
  ur: (value: UR): Token => ({ type: "UR", value }),
};
119
+
120
/**
 * Lexer for dCBOR diagnostic notation.
 *
 * Corresponds to the Rust `logos::Lexer` used in parse.rs
 *
 * The lexer is a cursor over an immutable source string. Each call to
 * `next()` skips whitespace and comments and then tries the token matchers in
 * a fixed priority order. All positions and spans are JavaScript string
 * indices (UTF-16 code units).
 */
export class Lexer {
  // Sticky (`y`) patterns match exactly at `lastIndex`, so the source is
  // scanned in place instead of copying the unread tail for every attempt.
  // They are shared by all instances; `#matchAtCursor` resets `lastIndex`
  // before every use, so no state leaks between calls.
  static readonly #dateRegex =
    /\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})?)?/y;
  static readonly #numberRegex = /-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?/y;
  static readonly #tagNameRegex = /[a-zA-Z_][a-zA-Z0-9_-]*\(/y;
  // eslint-disable-next-line no-control-regex
  static readonly #stringRegex = /"(?:[^"\\\x00-\x1F]|\\(?:["\\bnfrt/]|u[a-fA-F0-9]{4}))*"/y;
  static readonly #hexDigitsRegex = /[0-9a-fA-F]*/y;
  static readonly #base64CharsRegex = /[A-Za-z0-9+/=]*/y;
  static readonly #knownValueNumberRegex = /'(0|[1-9][0-9]*)'/y;
  static readonly #knownValueNameRegex = /'([a-zA-Z_][a-zA-Z0-9_-]*)'/y;
  static readonly #urRegex = /ur:[a-zA-Z0-9][a-zA-Z0-9-]*\/[a-zA-Z]{8,}/y;
  static readonly #identifierCharRegex = /[a-zA-Z0-9_-]/;

  // Keyword spellings and the token each one produces, in match order.
  static readonly #keywords: readonly (readonly [string, Token])[] = [
    ["true", token.bool(true)],
    ["false", token.bool(false)],
    ["null", token.null()],
    ["NaN", token.nan()],
    ["Infinity", token.infinity()],
    ["-Infinity", token.negInfinity()],
    ["Unit", token.unit()],
  ];

  // Single-character punctuation and the token each one produces.
  static readonly #punctuation: ReadonlyMap<string, Token> = new Map<string, Token>([
    ["{", token.braceOpen()],
    ["}", token.braceClose()],
    ["[", token.bracketOpen()],
    ["]", token.bracketClose()],
    ["(", token.parenthesisOpen()],
    [")", token.parenthesisClose()],
    [":", token.colon()],
    [",", token.comma()],
  ]);

  readonly #source: string;
  #position: number;
  #tokenStart: number;
  #tokenEnd: number;

  /**
   * Creates a lexer positioned at the start of `source`.
   *
   * @param source - The diagnostic-notation text to tokenize.
   */
  constructor(source: string) {
    this.#source = source;
    this.#position = 0;
    this.#tokenStart = 0;
    this.#tokenEnd = 0;
  }

  /**
   * Gets the current span (position range of the last token).
   *
   * @returns The span from the start to the end of the most recent token.
   */
  span(): Span {
    return span(this.#tokenStart, this.#tokenEnd);
  }

  /**
   * Gets the slice of source corresponding to the last token.
   *
   * @returns The source text between the last token's start and end.
   */
  slice(): string {
    return this.#source.slice(this.#tokenStart, this.#tokenEnd);
  }

  /**
   * Gets the next token, or undefined if at end of input.
   * Returns a Result to handle lexing errors.
   *
   * Leading whitespace and comments are skipped first. When no matcher
   * recognizes the input, one whole code point is consumed and reported as an
   * unrecognized token, so the lexer always makes progress.
   *
   * @returns A result wrapping the token or a lexing error, or `undefined`
   * once the input is exhausted.
   */
  next(): ParseResult<Token> | undefined {
    this.#skipWhitespaceAndComments();

    if (this.#position >= this.#source.length) {
      return undefined;
    }

    this.#tokenStart = this.#position;

    // Try to match tokens in order of specificity
    const result =
      this.#tryMatchKeyword() ??
      this.#tryMatchDateLiteral() ??
      this.#tryMatchTagValueOrNumber() ??
      this.#tryMatchTagName() ??
      this.#tryMatchString() ??
      this.#tryMatchByteStringHex() ??
      this.#tryMatchByteStringBase64() ??
      this.#tryMatchKnownValue() ??
      this.#tryMatchUR() ??
      this.#tryMatchPunctuation();

    if (result !== undefined) {
      return result;
    }

    // Unrecognized token: consume the whole code point so a character outside
    // the BMP is reported once instead of being split into two lone surrogates.
    const codePoint = this.#source.codePointAt(this.#position);
    this.#position += codePoint !== undefined && codePoint > 0xffff ? 2 : 1;
    this.#tokenEnd = this.#position;
    return err(PE.unrecognizedToken(this.span()));
  }

  /**
   * Advances the cursor past whitespace, inline comments (`/.../`) and
   * end-of-line comments (`#...`).
   *
   * A `/` immediately followed by another `/`, or a `/` that is the last
   * character of the input, is not treated as a comment and is left for the
   * token matchers. An inline comment with no closing `/` runs to the end of
   * the input.
   */
  #skipWhitespaceAndComments(): void {
    while (this.#position < this.#source.length) {
      const ch = this.#source[this.#position];

      // Skip whitespace
      if (ch === " " || ch === "\t" || ch === "\r" || ch === "\n" || ch === "\f") {
        this.#position++;
        continue;
      }

      // Skip inline comments: /.../ (opening slash not followed by another /)
      if (
        ch === "/" &&
        this.#position + 1 < this.#source.length &&
        this.#source[this.#position + 1] !== "/"
      ) {
        this.#position++; // Skip opening /
        while (this.#position < this.#source.length && this.#source[this.#position] !== "/") {
          this.#position++;
        }
        if (this.#position < this.#source.length) {
          this.#position++; // Skip closing /
        }
        continue;
      }

      // Skip end-of-line comments: #... (the newline itself is whitespace)
      if (ch === "#") {
        while (this.#position < this.#source.length && this.#source[this.#position] !== "\n") {
          this.#position++;
        }
        continue;
      }

      break;
    }
  }

  /**
   * Matches one of the fixed keywords (`true`, `false`, `null`, `NaN`,
   * `Infinity`, `-Infinity`, `Unit`).
   *
   * A keyword only counts when it is not immediately followed by an
   * identifier character, so a longer identifier that merely starts with a
   * keyword is left for the other matchers.
   */
  #tryMatchKeyword(): ParseResult<Token> | undefined {
    for (const [keyword, tok] of Lexer.#keywords) {
      if (!this.#source.startsWith(keyword, this.#position)) {
        continue;
      }
      const end = this.#position + keyword.length;
      const following = this.#source[end];
      if (following === undefined || !this.#isIdentifierChar(following)) {
        this.#position = end;
        this.#tokenEnd = end;
        return ok(tok);
      }
    }

    return undefined;
  }

  /**
   * Matches an ISO-8601 date literal: `YYYY-MM-DD`, optionally followed by
   * `THH:MM:SS`, fractional seconds and a `Z` or `±HH:MM` offset.
   *
   * The text is consumed even when it turns out to be invalid; in that case
   * an invalid-date error covering the consumed text is returned.
   */
  #tryMatchDateLiteral(): ParseResult<Token> | undefined {
    const match = this.#matchAtCursor(Lexer.#dateRegex);
    if (match === null) {
      return undefined;
    }

    const dateStr = match[0];
    this.#position += dateStr.length;
    this.#tokenEnd = this.#position;

    // Validate date components before parsing to match Rust's strict behavior
    if (!isValidDateString(dateStr)) {
      return err(PE.invalidDateString(dateStr, this.span()));
    }

    try {
      return ok(token.dateLiteral(DCborDate.fromString(dateStr)));
    } catch {
      return err(PE.invalidDateString(dateStr, this.span()));
    }
  }

  /**
   * Matches a numeric literal, or a tag value when an unsigned integer is
   * immediately followed by `(`.
   *
   * For a tag value the opening parenthesis is consumed as part of the token.
   * A tag number that is not a safe integer yields an invalid-tag-value error
   * whose span covers only the digits.
   */
  #tryMatchTagValueOrNumber(): ParseResult<Token> | undefined {
    const match = this.#matchAtCursor(Lexer.#numberRegex);
    if (match === null) {
      return undefined;
    }

    const numStr = match[0];
    const nextChar = this.#source[this.#position + numStr.length];
    const isUnsignedInteger =
      !numStr.includes(".") &&
      !numStr.includes("e") &&
      !numStr.includes("E") &&
      !numStr.startsWith("-");

    // Check if this is a tag value (integer followed by parenthesis)
    if (nextChar === "(" && isUnsignedInteger) {
      this.#position += numStr.length + 1; // Include the (
      this.#tokenEnd = this.#position;

      // The digits carry no sign, so only the safe-integer range can fail.
      const tagValue = parseInt(numStr, 10);
      if (!Number.isSafeInteger(tagValue)) {
        return err(
          PE.invalidTagValue(numStr, span(this.#tokenStart, this.#tokenStart + numStr.length)),
        );
      }

      return ok(token.tagValue(tagValue));
    }

    // It's a regular number
    this.#position += numStr.length;
    this.#tokenEnd = this.#position;
    return ok(token.number(parseFloat(numStr)));
  }

  /**
   * Matches a tag name: an identifier immediately followed by `(`.
   *
   * The parenthesis is consumed as part of the token but is not included in
   * the token's value.
   */
  #tryMatchTagName(): ParseResult<Token> | undefined {
    const match = this.#matchAtCursor(Lexer.#tagNameRegex);
    if (match === null) {
      return undefined;
    }

    const fullMatch = match[0];
    this.#position += fullMatch.length;
    this.#tokenEnd = this.#position;

    return ok(token.tagName(fullMatch.slice(0, -1))); // Remove trailing (
  }

  /**
   * Matches a double-quoted string with JSON-style escape sequences.
   *
   * The token value is the raw source text including both quotes; escapes are
   * not decoded here. A malformed string is consumed up to its closing quote,
   * the next newline, or the end of input, and reported as unrecognized.
   */
  #tryMatchString(): ParseResult<Token> | undefined {
    if (this.#source[this.#position] !== '"') {
      return undefined;
    }

    const match = this.#matchAtCursor(Lexer.#stringRegex);
    if (match !== null) {
      const fullMatch = match[0];
      this.#position += fullMatch.length;
      this.#tokenEnd = this.#position;

      // Return the full string including quotes
      return ok(token.string(fullMatch));
    }

    // Invalid string - try to find where it ends for better error reporting
    this.#position++;
    while (this.#position < this.#source.length) {
      const ch = this.#source[this.#position];
      if (ch === '"' || ch === "\n") {
        if (ch === '"') this.#position++;
        break;
      }
      // A backslash escapes the following character, so skip both.
      this.#position += ch === "\\" ? 2 : 1;
    }
    // A backslash as the very last character would step past the end of the
    // input; clamp so the reported span stays inside the source.
    this.#position = Math.min(this.#position, this.#source.length);
    this.#tokenEnd = this.#position;
    return err(PE.unrecognizedToken(this.span()));
  }

  /**
   * Matches a hexadecimal byte string: `h'...'`.
   *
   * Once the `h'` prefix is seen the token is committed: a missing closing
   * quote or an odd number of hex digits yields an invalid-hex-string error.
   */
  #tryMatchByteStringHex(): ParseResult<Token> | undefined {
    if (!this.#matchLiteral("h'")) {
      return undefined;
    }

    const match = this.#matchAtCursor(Lexer.#hexDigitsRegex);
    const hexPart = match !== null ? match[0] : "";
    this.#position += hexPart.length;

    if (this.#source[this.#position] !== "'") {
      this.#tokenEnd = this.#position;
      return err(PE.invalidHexString(this.span()));
    }

    this.#position++; // Skip closing '
    this.#tokenEnd = this.#position;

    // Check that hex string has even length
    if (hexPart.length % 2 !== 0) {
      return err(PE.invalidHexString(this.span()));
    }

    return ok(token.byteStringHex(hexToBytes(hexPart)));
  }

  /**
   * Matches a base64 byte string: `b64'...'`.
   *
   * Once the `b64'` prefix is seen the token is committed: a missing closing
   * quote, fewer than two base64 characters, or a decoding failure yields an
   * invalid-base64-string error.
   */
  #tryMatchByteStringBase64(): ParseResult<Token> | undefined {
    if (!this.#matchLiteral("b64'")) {
      return undefined;
    }

    const match = this.#matchAtCursor(Lexer.#base64CharsRegex);
    const base64Part = match !== null ? match[0] : "";
    this.#position += base64Part.length;

    if (this.#source[this.#position] !== "'") {
      this.#tokenEnd = this.#position;
      return err(PE.invalidBase64String(this.span()));
    }

    this.#position++; // Skip closing '
    this.#tokenEnd = this.#position;

    // Check minimum length requirement (2 characters)
    if (base64Part.length < 2) {
      return err(PE.invalidBase64String(this.span()));
    }

    try {
      return ok(token.byteStringBase64(base64ToBytes(base64Part)));
    } catch {
      return err(PE.invalidBase64String(this.span()));
    }
  }

  /**
   * Matches a single-quoted known value: `''`, `'<number>'` or `'<name>'`.
   *
   * The empty form produces a known-value name token with an empty name.
   * Anything else between single quotes is consumed up to the closing quote
   * (or the end of input) and reported as unrecognized.
   */
  #tryMatchKnownValue(): ParseResult<Token> | undefined {
    if (this.#source[this.#position] !== "'") {
      return undefined;
    }

    // Check for empty string '' (Unit)
    if (this.#source[this.#position + 1] === "'") {
      this.#position += 2;
      this.#tokenEnd = this.#position;
      return ok(token.knownValueName(""));
    }

    // Check for numeric known value: '0' or '[1-9][0-9]*'
    const numeric = this.#matchAtCursor(Lexer.#knownValueNumberRegex);
    if (numeric !== null) {
      const numStr = numeric[1];
      this.#position += numeric[0].length;
      this.#tokenEnd = this.#position;

      // The digits carry no sign, so only the safe-integer range can fail.
      const value = parseInt(numStr, 10);
      if (!Number.isSafeInteger(value)) {
        // The error span excludes the surrounding quotes.
        return err(PE.invalidKnownValue(numStr, span(this.#tokenStart + 1, this.#tokenEnd - 1)));
      }

      return ok(token.knownValueNumber(value));
    }

    // Check for named known value: '[a-zA-Z_][a-zA-Z0-9_-]*'
    const named = this.#matchAtCursor(Lexer.#knownValueNameRegex);
    if (named !== null) {
      this.#position += named[0].length;
      this.#tokenEnd = this.#position;

      return ok(token.knownValueName(named[1]));
    }

    // Invalid known value: consume through the closing quote if there is one
    this.#position++;
    while (this.#position < this.#source.length && this.#source[this.#position] !== "'") {
      this.#position++;
    }
    if (this.#position < this.#source.length) {
      this.#position++;
    }
    this.#tokenEnd = this.#position;
    return err(PE.unrecognizedToken(this.span()));
  }

  /**
   * Matches a Uniform Resource: `ur:<type>/<data>`.
   *
   * The matched text is handed to `UR.fromURString`; if that throws, the
   * message is wrapped in an invalid-UR error covering the consumed text.
   */
  #tryMatchUR(): ParseResult<Token> | undefined {
    const match = this.#matchAtCursor(Lexer.#urRegex);
    if (match === null) {
      return undefined;
    }

    const fullMatch = match[0];
    this.#position += fullMatch.length;
    this.#tokenEnd = this.#position;

    try {
      return ok(token.ur(UR.fromURString(fullMatch)));
    } catch (e) {
      const errorMsg = e instanceof Error ? e.message : String(e);
      return err(PE.invalidUr(errorMsg, this.span()));
    }
  }

  /**
   * Matches a single punctuation character: `{ } [ ] ( ) : ,`.
   */
  #tryMatchPunctuation(): ParseResult<Token> | undefined {
    const matched = Lexer.#punctuation.get(this.#source[this.#position]);
    if (matched === undefined) {
      return undefined;
    }

    this.#position++;
    this.#tokenEnd = this.#position;
    return ok(matched);
  }

  /**
   * Runs a sticky pattern against the source at the current cursor position.
   *
   * @param pattern - A RegExp carrying the `y` flag.
   * @returns The match starting exactly at the cursor, or `null`.
   */
  #matchAtCursor(pattern: RegExp): RegExpExecArray | null {
    pattern.lastIndex = this.#position;
    return pattern.exec(this.#source);
  }

  /**
   * Consumes `literal` if the source continues with it at the cursor.
   *
   * @returns `true` when the literal was present and the cursor has advanced.
   */
  #matchLiteral(literal: string): boolean {
    if (!this.#source.startsWith(literal, this.#position)) {
      return false;
    }
    this.#position += literal.length;
    return true;
  }

  /**
   * Reports whether `ch` may appear inside an identifier
   * (ASCII letter, digit, `_` or `-`).
   */
  #isIdentifierChar(ch: string): boolean {
    return Lexer.#identifierCharRegex.test(ch);
  }
}
561
+
562
/**
 * Decodes a string of hexadecimal digit pairs into the bytes they spell.
 *
 * No validation happens here: callers are expected to pass an even number of
 * hex digits.
 *
 * @param hex - Hexadecimal text, two digits per byte, either letter case.
 * @returns One byte for each digit pair, in order.
 */
function hexToBytes(hex: string): Uint8Array {
  const decoded = new Uint8Array(hex.length / 2);
  let offset = 0;
  for (const slot of decoded.keys()) {
    decoded[slot] = Number.parseInt(hex.substring(offset, offset + 2), 16);
    offset += 2;
  }
  return decoded;
}
572
+
573
/**
 * Converts a base64 string to bytes with strict validation.
 *
 * Only canonical, padded, standard-alphabet base64 is accepted, which is the
 * strictness the Rust `base64` crate is expected to apply in its standard
 * configuration. The shape is checked here rather than left to `atob`, so the
 * result does not depend on how forgiving a particular runtime's `atob` is.
 *
 * @param base64 - Base64 text using the `A-Z a-z 0-9 + /` alphabet with `=`
 * padding.
 * @returns The decoded bytes (empty for an empty string).
 * @throws Error if a character is outside the alphabet, padding is missing,
 * excessive or not at the very end, the length is not a multiple of four, or
 * the bits left unused by the final symbol are not zero.
 */
function base64ToBytes(base64: string): Uint8Array {
  // Data symbols first, then at most two '=' and nothing after them; together
  // with the length check this pins padding to exactly what the data requires.
  if (!/^[A-Za-z0-9+/]*={0,2}$/.test(base64) || base64.length % 4 !== 0) {
    throw new Error("Invalid base64 padding");
  }

  const padding = base64.endsWith("==") ? 2 : base64.endsWith("=") ? 1 : 0;
  if (padding > 0) {
    // With padding, the final data symbol only contributes its high bits.
    // The leftover low bits (4 for "==", 2 for "=") must be zero, otherwise
    // several spellings would decode to the same bytes.
    const alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    const lastSymbol = alphabet.indexOf(base64.charAt(base64.length - padding - 1));
    const unusedBits = padding === 2 ? 0b1111 : 0b11;
    if ((lastSymbol & unusedBits) !== 0) {
      throw new Error("Invalid base64 trailing bits");
    }
  }

  // Use built-in atob for the actual decoding; it yields one char per byte.
  const binaryString = atob(base64);
  const bytes = new Uint8Array(binaryString.length);
  for (let i = 0; i < binaryString.length; i++) {
    bytes[i] = binaryString.charCodeAt(i);
  }
  return bytes;
}
596
+
597
/**
 * Checks that the calendar and clock fields of a date string are in range.
 *
 * JavaScript's Date is lenient and accepts impossible dates such as
 * 2023-02-30, whereas Rust's Date::from_string rejects them, so the fields
 * are range-checked by hand.
 *
 * @param dateStr - Text beginning with `YYYY-MM-DD`, optionally containing a
 * `THH:MM:SS` time.
 * @returns `true` when the month is 1-12, the day exists in that month
 * (leap years included) and, if a time is present, the hour is at most 23
 * and the minute and second are at most 59.
 */
function isValidDateString(dateStr: string): boolean {
  const calendar = /^(\d{4})-(\d{2})-(\d{2})/.exec(dateStr);
  if (calendar === null) {
    return false;
  }

  const year = Number(calendar[1]);
  const month = Number(calendar[2]);
  const day = Number(calendar[3]);

  if (month < 1 || month > 12) {
    return false;
  }

  // Gregorian rule: every 4th year, except centuries not divisible by 400.
  const leap = year % 400 === 0 || (year % 4 === 0 && year % 100 !== 0);
  let lastDay = 31;
  if (month === 2) {
    lastDay = leap ? 29 : 28;
  } else if ([4, 6, 9, 11].includes(month)) {
    lastDay = 30;
  }
  if (day < 1 || day > lastDay) {
    return false;
  }

  // The time part is optional; two-digit fields cannot be negative, so only
  // the upper bounds need checking.
  const clock = /T(\d{2}):(\d{2}):(\d{2})/.exec(dateStr);
  if (clock === null) {
    return true;
  }
  return Number(clock[1]) <= 23 && Number(clock[2]) <= 59 && Number(clock[3]) <= 59;
}