@bcts/dcbor-parse 1.0.0-alpha.22 → 1.0.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts CHANGED
@@ -46,20 +46,72 @@
46
46
  * @module dcbor-parse
47
47
  */
48
48
 
49
+ // =============================================================================
50
+ // Public surface that mirrors Rust `bc-dcbor-parse-rust/src/lib.rs:59-72`.
51
+ //
52
+ // Rust re-exports:
53
+ // - `parse_dcbor_item`, `parse_dcbor_item_partial`
54
+ // - `Token`
55
+ // - `Error as ParseError`, `Result as ParseResult`
56
+ // - `Error as ComposeError`, `Result as ComposeResult`,
57
+ // `compose_dcbor_array`, `compose_dcbor_map`
58
+ // =============================================================================
59
+
49
60
  // Parse functions
50
61
  export { parseDcborItem, parseDcborItemPartial } from "./parse";
51
62
 
52
- // Token types
53
- export { type Token, token, Lexer } from "./token";
63
+ // Token types — Rust exposes only the `Token` enum publicly.
64
+ export { type Token } from "./token";
65
+
66
+ // Error types — Rust exposes only `ParseError` (the error enum) and
67
+ // `ParseResult` (the result type alias).
68
+ export { type ParseError, type ParseResult } from "./error";
54
69
 
55
- // Error types
70
+ // Compose types and functions — Rust exposes `ComposeError`,
71
+ // `ComposeResult`, and the two `compose_*` functions.
72
+ export {
73
+ type ComposeError,
74
+ type ComposeResult,
75
+ composeDcborArray,
76
+ composeDcborMap,
77
+ } from "./compose";
78
+
79
+ // =============================================================================
80
+ // TypeScript-only conveniences.
81
+ //
82
+ // Rust expresses fallible results natively via its `Result<T, E>` enum
83
+ // and `?` operator; the Logos lexer is a private implementation detail.
84
+ // In TypeScript we model `ParseResult<T>` as a discriminated union, so
85
+ // helper constructors and discriminators (`ok`, `err`, `isOk`, `isErr`,
86
+ // `unwrap`, `unwrapErr`, `parseError`, `composeError`, `composeOk`,
87
+ // `composeErr`, `Span`, …) are mandatory ergonomics. They are exported
88
+ // here as TS-only helpers and are **not** part of the Rust↔TS parity
89
+ // surface — Rust callers don't see them, and TS callers writing
90
+ // strictly-portable code shouldn't depend on them.
91
+ //
92
+ // `Lexer` and the `token` constructor namespace are likewise TS-only;
93
+ // in Rust the lexer is created via `Token::lexer(src)` internally and
94
+ // consumers never instantiate it directly. These re-exports stay so
95
+ // existing test code keeps working, but production callers should
96
+ // prefer `parseDcborItem` / `parseDcborItemPartial`.
97
+ // =============================================================================
98
+
99
+ // Token — TS-only convenience namespace for constructing tokens from
100
+ // userland (rare; mostly used in tests). The `Lexer` class is also
101
+ // TS-only — Rust treats `Token::lexer(...)` as an internal API.
102
+ export { token, Lexer } from "./token";
103
+
104
+ // Error helpers — `Span`/`span`/`defaultSpan` are TS-only because Rust
105
+ // uses the `logos::Span` type alias directly. The `ok`/`err`/`isOk`/
106
+ // `isErr`/`unwrap`/`unwrapErr` helpers are TS-only `Result`-modeling
107
+ // utilities. `parseError`, `isDefaultError`, `errorMessage`,
108
+ // `errorSpan`, `fullErrorMessage`, and `defaultParseError` are
109
+ // likewise convenience helpers around the discriminated union.
56
110
  export {
57
111
  type Span,
58
112
  span,
59
113
  defaultSpan,
60
- type ParseError,
61
114
  parseError,
62
- type ParseResult,
63
115
  ok,
64
116
  err,
65
117
  isOk,
@@ -73,14 +125,7 @@ export {
73
125
  defaultParseError,
74
126
  } from "./error";
75
127
 
76
- // Compose functions
77
- export {
78
- type ComposeError,
79
- composeError,
80
- type ComposeResult,
81
- composeOk,
82
- composeErr,
83
- composeErrorMessage,
84
- composeDcborArray,
85
- composeDcborMap,
86
- } from "./compose";
128
+ // Compose helpers — `composeError`/`composeOk`/`composeErr`/
129
+ // `composeErrorMessage` are the TS-only counterparts of the
130
+ // `ComposeError`/`ComposeResult` discriminated union ergonomics.
131
+ export { composeError, composeOk, composeErr, composeErrorMessage } from "./compose";
package/src/parse.ts CHANGED
@@ -252,7 +252,7 @@ function parseUr(ur: UR, tokenSpan: Span): ParseResult<Cbor> {
252
252
  );
253
253
  }
254
254
 
255
- function parseNumberTag(tagValue: number, lexer: Lexer): ParseResult<Cbor> {
255
+ function parseNumberTag(tagValue: number | bigint, lexer: Lexer): ParseResult<Cbor> {
256
256
  const itemResult = parseItem(lexer);
257
257
  if (!itemResult.ok) {
258
258
  return itemResult;
@@ -267,6 +267,10 @@ function parseNumberTag(tagValue: number, lexer: Lexer): ParseResult<Cbor> {
267
267
  }
268
268
 
269
269
  if (closeResult.value.type === "ParenthesisClose") {
270
+ // Pass the tag value through as-is: when it's a `bigint` (i.e. a
271
+ // u64 outside the safe-integer range), dCBOR's `cbor({ tag, value })`
272
+ // builder serialises it as a `bigint` tag — matching Rust which
273
+ // accepts the full `0..=2^64-1` range natively.
270
274
  return ok(cbor({ tag: tagValue, value: itemResult.value }));
271
275
  }
272
276
 
@@ -386,23 +390,37 @@ function parseMap(lexer: Lexer): ParseResult<Cbor> {
386
390
  return err(PE.duplicateMapKey(keySpan));
387
391
  }
388
392
 
389
- // Expect colon
393
+ // Expect colon.
394
+ //
395
+ // Mirrors Rust `parse.rs:382-395`:
396
+ // ```
397
+ // if let Ok(Token::Colon) = expect_token(lexer) { … }
398
+ // else { return Err(Error::ExpectedColon(lexer.span())); }
399
+ // ```
400
+ // Rust's pattern collapses *every* non-Colon outcome — including
401
+ // `UnexpectedEndOfInput`, `UnrecognizedToken`, and any other error
402
+ // — into `ExpectedColon`. Earlier revisions of this port forwarded
403
+ // the inner error verbatim, so `{1` reported `UnexpectedEndOfInput`
404
+ // instead of `ExpectedColon`.
390
405
  const colonResult = expectToken(lexer);
391
- if (!colonResult.ok) {
392
- return colonResult;
393
- }
394
-
395
- if (colonResult.value.type !== "Colon") {
406
+ if (!colonResult.ok || colonResult.value.type !== "Colon") {
396
407
  return err(PE.expectedColon(lexer.span()));
397
408
  }
398
409
 
399
- // Parse the value
410
+ // Parse the value.
411
+ //
412
+ // Rust `parse.rs:383-389` uses the inner `UnexpectedToken`'s **own**
413
+ // span when it converts to `ExpectedMapKey`. Earlier revisions of
414
+ // this port called `lexer.span()` here, which can drift if the
415
+ // lexer has stepped past the offending `}`. We now use the
416
+ // captured span from `valueResult.error` to preserve Rust's exact
417
+ // span semantics.
400
418
  const valueResult = parseItem(lexer);
401
419
  if (!valueResult.ok) {
402
420
  if (valueResult.error.type === "UnexpectedToken") {
403
- const unexpectedToken = (valueResult.error as { token: Token }).token;
404
- if (unexpectedToken.type === "BraceClose") {
405
- return err(PE.expectedMapKey(lexer.span()));
421
+ const unexpected = valueResult.error;
422
+ if (unexpected.token.type === "BraceClose") {
423
+ return err(PE.expectedMapKey(unexpected.span));
406
424
  }
407
425
  }
408
426
  return valueResult;
package/src/token.ts CHANGED
@@ -17,7 +17,21 @@ import { type Span, span, parseError as PE, type ParseResult, ok, err } from "./
17
17
  /**
18
18
  * Token types produced by the lexer.
19
19
  *
20
- * Corresponds to the Rust `Token` enum in token.rs
20
+ * Corresponds to the Rust `Token` enum in token.rs.
21
+ *
22
+ * **u64 parity**: `TagValue` and `KnownValueNumber` are widened to
23
+ * `number | bigint` because Rust accepts the full `u64` range
24
+ * (`0..=2^64-1`). Values that fit in
25
+ * {@link Number.MAX_SAFE_INTEGER} (`2^53-1`) come through as plain
26
+ * `number`s; anything larger arrives as a `bigint` so callers don't
27
+ * silently lose precision. This matches the way `@bcts/dcbor` already
28
+ * stores large unsigned integers (`number | bigint`) and lets the
29
+ * downstream `cbor({ tag, value })` builder serialize correctly.
30
+ *
31
+ * **String value field**: the lexer keeps the outer double quotes on
32
+ * the slice (e.g. `"\"hello\""`); the parser strips them in
33
+ * `parseString`. Mirrors Rust `Token::String(String)` which holds the
34
+ * raw `lex.slice()` including quotes (`token.rs:115-119`).
21
35
  */
22
36
  export type Token =
23
37
  | { readonly type: "Bool"; readonly value: boolean }
@@ -38,9 +52,9 @@ export type Token =
38
52
  | { readonly type: "DateLiteral"; readonly value: CborDate }
39
53
  | { readonly type: "Number"; readonly value: number }
40
54
  | { readonly type: "String"; readonly value: string }
41
- | { readonly type: "TagValue"; readonly value: number }
55
+ | { readonly type: "TagValue"; readonly value: number | bigint }
42
56
  | { readonly type: "TagName"; readonly value: string }
43
- | { readonly type: "KnownValueNumber"; readonly value: number }
57
+ | { readonly type: "KnownValueNumber"; readonly value: number | bigint }
44
58
  | { readonly type: "KnownValueName"; readonly value: string }
45
59
  | { readonly type: "Unit" }
46
60
  | { readonly type: "UR"; readonly value: UR };
@@ -101,13 +115,13 @@ export const token = {
101
115
  string(value: string): Token {
102
116
  return { type: "String", value };
103
117
  },
104
- tagValue(value: number): Token {
118
+ tagValue(value: number | bigint): Token {
105
119
  return { type: "TagValue", value };
106
120
  },
107
121
  tagName(value: string): Token {
108
122
  return { type: "TagName", value };
109
123
  },
110
- knownValueNumber(value: number): Token {
124
+ knownValueNumber(value: number | bigint): Token {
111
125
  return { type: "KnownValueNumber", value };
112
126
  },
113
127
  knownValueName(value: string): Token {
@@ -199,20 +213,29 @@ export class Lexer {
199
213
  continue;
200
214
  }
201
215
 
202
- // Skip inline comments: /.../ (not preceded by another /)
203
- if (
204
- ch === "/" &&
205
- this._position + 1 < this._source.length &&
206
- this._source[this._position + 1] !== "/"
207
- ) {
208
- this._position++; // Skip opening /
209
- while (this._position < this._source.length && this._source[this._position] !== "/") {
210
- this._position++;
216
+ // Skip inline comments: `/[^/]*/` (matches the Rust skip regex
217
+ // `/[^/]*/`). Note that the Rust regex *does* match `//` (zero
218
+ // non-slash characters between the two slashes), so an empty
219
+ // comment is a valid no-op for the lexer. We accept that case too;
220
+ // earlier revisions of this port required at least one non-slash
221
+ // body character, which broke parity with Rust on inputs like
222
+ // `// trailing thought`.
223
+ if (ch === "/") {
224
+ // Confirm there is a closing slash somewhere ahead. If not, fall
225
+ // through and let the punctuation matcher report an
226
+ // unrecognized token (Rust would equally fail to match the skip
227
+ // regex and emit an `UnrecognizedToken`).
228
+ let scan = this._position + 1;
229
+ while (scan < this._source.length && this._source[scan] !== "/") {
230
+ scan++;
211
231
  }
212
- if (this._position < this._source.length) {
213
- this._position++; // Skip closing /
232
+ if (scan < this._source.length) {
233
+ this._position = scan + 1; // jump past the closing /
234
+ continue;
214
235
  }
215
- continue;
236
+ // No closing /: not a comment — leave _position alone and break
237
+ // out so the punctuation matcher can flag the unrecognized `/`.
238
+ break;
216
239
  }
217
240
 
218
241
  // Skip end-of-line comments: #...
@@ -227,27 +250,38 @@ export class Lexer {
227
250
  }
228
251
  }
229
252
 
253
+ /**
254
+ * Matches reserved keywords: `true`, `false`, `null`, `NaN`,
255
+ * `Infinity`, `-Infinity`, `Unit`.
256
+ *
257
+ * Mirrors Rust's `Logos` `#[token(...)]` matcher
258
+ * (`bc-dcbor-parse-rust/src/token.rs:12-50, 164`), which is greedy
259
+ * and emits the keyword token *as soon as the literal matches* —
260
+ * subsequent characters become a separate (likely unrecognized) token
261
+ * stream. So input like `truex` lexes as `Bool(true)` followed by an
262
+ * unrecognized run on `x`. Earlier revisions of this port enforced an
263
+ * identifier boundary check (`!_isIdentifierChar(nextChar)`) and
264
+ * rejected the whole prefix as a single `UnrecognizedToken`, which
265
+ * broke span/variant parity with Rust.
266
+ */
230
267
  private _tryMatchKeyword(): ParseResult<Token> | undefined {
231
268
  const keywords: [string, Token][] = [
269
+ // Order matters: `-Infinity` must come before any other `-` based
270
+ // matcher (we lex this before numbers, so the `-` doesn't get
271
+ // siphoned off as a sign).
272
+ ["-Infinity", token.negInfinity()],
232
273
  ["true", token.bool(true)],
233
274
  ["false", token.bool(false)],
234
275
  ["null", token.null()],
235
276
  ["NaN", token.nan()],
236
277
  ["Infinity", token.infinity()],
237
- ["-Infinity", token.negInfinity()],
238
278
  ["Unit", token.unit()],
239
279
  ];
240
280
 
241
281
  for (const [keyword, tok] of keywords) {
242
282
  if (this._matchLiteral(keyword)) {
243
- // Make sure it's not part of a longer identifier
244
- const nextChar = this._source[this._position];
245
- if (nextChar === undefined || !this._isIdentifierChar(nextChar)) {
246
- this._tokenEnd = this._position;
247
- return ok(tok);
248
- }
249
- // Reset position if it was a partial match
250
- this._position = this._tokenStart;
283
+ this._tokenEnd = this._position;
284
+ return ok(tok);
251
285
  }
252
286
  }
253
287
 
@@ -300,18 +334,24 @@ export class Lexer {
300
334
  !numStr.includes("E") &&
301
335
  !numStr.startsWith("-")
302
336
  ) {
303
- // It's a tag value
337
+ // It's a tag value. Mirrors Rust `token.rs:128-136`:
338
+ // `stripped.parse::<TagValue>()` accepts the full `u64` range
339
+ // (`0..=2^64-1`). We use `BigInt` to get exact-integer parsing,
340
+ // then narrow to `number` when the value fits in
341
+ // `Number.MAX_SAFE_INTEGER` so callers don't pay the bigint
342
+ // tax for tag numbers in the common range. Anything outside
343
+ // `[0, 2^64-1]` is reported as `InvalidTagValue` matching Rust.
304
344
  this._position += numStr.length + 1; // Include the (
305
345
  this._tokenEnd = this._position;
306
346
 
307
- const tagValue = parseInt(numStr, 10);
308
- if (!Number.isSafeInteger(tagValue) || tagValue < 0) {
347
+ const parsed = parseUsize64(numStr);
348
+ if (parsed === undefined) {
309
349
  return err(
310
350
  PE.invalidTagValue(numStr, span(this._tokenStart, this._tokenStart + numStr.length)),
311
351
  );
312
352
  }
313
353
 
314
- return ok(token.tagValue(tagValue));
354
+ return ok(token.tagValue(parsed));
315
355
  }
316
356
 
317
357
  // It's a regular number
@@ -363,20 +403,15 @@ export class Lexer {
363
403
  return ok(token.string(fullMatch));
364
404
  }
365
405
 
366
- // Invalid string - try to find where it ends for better error reporting
406
+ // Invalid string: emit an unrecognized token covering just the
407
+ // opening `"` and let the next call to `next()` re-lex. Mirrors
408
+ // Rust's Logos behaviour when the `String` regex fails to match —
409
+ // the lexer emits `Error::default()` (which `expect_token` upgrades
410
+ // to `UnrecognizedToken(span)` for the single character) and
411
+ // recovers at the very next byte. Earlier revisions of this port
412
+ // consumed through the next `"` or `\n`, which inflated the error
413
+ // span beyond what Rust reports.
367
414
  this._position++;
368
- while (this._position < this._source.length) {
369
- const ch = this._source[this._position];
370
- if (ch === '"' || ch === "\n") {
371
- if (ch === '"') this._position++;
372
- break;
373
- }
374
- if (ch === "\\") {
375
- this._position += 2;
376
- } else {
377
- this._position++;
378
- }
379
- }
380
415
  this._tokenEnd = this._position;
381
416
  return err(PE.unrecognizedToken(this.span()));
382
417
  }
@@ -470,8 +505,11 @@ export class Lexer {
470
505
  this._position += fullMatch.length;
471
506
  this._tokenEnd = this._position;
472
507
 
473
- const value = parseInt(numStr, 10);
474
- if (!Number.isSafeInteger(value) || value < 0) {
508
+ // Mirrors Rust `token.rs:146-153`: `stripped.parse::<u64>()`
509
+ // accepts the full `u64` range. We share the helper used for
510
+ // `TagValue` to get the same narrow-when-safe-else-bigint path.
511
+ const value = parseUsize64(numStr);
512
+ if (value === undefined) {
475
513
  return err(PE.invalidKnownValue(numStr, span(this._tokenStart + 1, this._tokenEnd - 1)));
476
514
  }
477
515
 
@@ -491,14 +529,14 @@ export class Lexer {
491
529
  return ok(token.knownValueName(name));
492
530
  }
493
531
 
494
- // Invalid known value
532
+ // Invalid known value: emit an unrecognized token covering just the
533
+ // opening `'` and let the next call to `next()` re-lex. Mirrors
534
+ // Rust's Logos behaviour when neither `KnownValueNumber` nor
535
+ // `KnownValueName` regex matches — the lexer emits `Error::default()`
536
+ // (single character span) and recovers at the next byte. Earlier
537
+ // revisions of this port consumed through the closing `'`, which
538
+ // inflated the error span beyond what Rust reports.
495
539
  this._position++;
496
- while (this._position < this._source.length && this._source[this._position] !== "'") {
497
- this._position++;
498
- }
499
- if (this._position < this._source.length) {
500
- this._position++;
501
- }
502
540
  this._tokenEnd = this._position;
503
541
  return err(PE.unrecognizedToken(this.span()));
504
542
  }
@@ -557,10 +595,45 @@ export class Lexer {
557
595
  }
558
596
  return false;
559
597
  }
598
+ }
560
599
 
561
- private _isIdentifierChar(ch: string): boolean {
562
- return /[a-zA-Z0-9_-]/.test(ch);
600
+ /**
601
+ * Strictly parses a non-negative integer string in the range
602
+ * `[0, 2^64 - 1]`, like Rust `<u64 as FromStr>::from_str` except that a leading `+` is rejected.
603
+ *
604
+ * - Empty input or non-digit characters → `undefined`.
605
+ * - Values that fit in `Number.MAX_SAFE_INTEGER` are returned as plain
606
+ * `number`s, so callers in the common case (tag values like `40000`,
607
+ * known values like `1`) never see a `bigint`.
608
+ * - Values in `(2^53-1, 2^64-1]` are returned as `bigint`. dCBOR's
609
+ * `cbor({ tag, value })` and `KnownValue` constructors both accept
610
+ * `bigint` natively, so the bigint flows through to wire encoding
611
+ * without precision loss.
612
+ * - Values strictly greater than `2^64 - 1` (or negative) are rejected
613
+ * so this parser never produces a tag/known-value outside the
614
+ * `u64` domain — matches Rust which fails `parse::<u64>()` in that
615
+ * case.
616
+ */
617
+ const MAX_U64: bigint = (1n << 64n) - 1n;
618
+ function parseUsize64(s: string): number | bigint | undefined {
619
+ if (s.length === 0) return undefined;
620
+ // The regex feeding this helper already rejects sign / leading
621
+ // zeros / non-digits; this guard is defensive in case the helper is
622
+ // reused elsewhere.
623
+ if (!/^\d+$/.test(s)) return undefined;
624
+ let value: bigint;
625
+ try {
626
+ value = BigInt(s);
627
+ } catch {
628
+ return undefined;
629
+ }
630
+ if (value < 0n || value > MAX_U64) return undefined;
631
+ // Narrow to plain `number` when safe so common-case callers never
632
+ // see a `bigint`.
633
+ if (value <= BigInt(Number.MAX_SAFE_INTEGER)) {
634
+ return Number(value);
563
635
  }
636
+ return value;
564
637
  }
565
638
 
566
639
  /**