npm - @bcts/dcbor-parse - Versions diffs - 1.0.0-alpha.22 → 1.0.0-beta.0 - Mend

@bcts/dcbor-parse 1.0.0-alpha.22 → 1.0.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/src/index.ts CHANGED Viewed

@@ -46,20 +46,72 @@
  * @module dcbor-parse
  */
+// =============================================================================
+// Public surface that mirrors Rust `bc-dcbor-parse-rust/src/lib.rs:59-72`.
+//
+// Rust re-exports:
+//   - `parse_dcbor_item`, `parse_dcbor_item_partial`
+//   - `Token`
+//   - `Error as ParseError`, `Result as ParseResult`
+//   - `Error as ComposeError`, `Result as ComposeResult`,
+//     `compose_dcbor_array`, `compose_dcbor_map`
+// =============================================================================
 // Parse functions
 export { parseDcborItem, parseDcborItemPartial } from "./parse";
-// Token types
-export { type Token, token, Lexer } from "./token";
+// Token types — Rust exposes only the `Token` enum publicly.
+export { type Token } from "./token";
+// Error types — Rust exposes only `ParseError` (the error enum) and
+// `ParseResult` (the result type alias).
+export { type ParseError, type ParseResult } from "./error";
-// Error types
+// Compose types and functions — Rust exposes `ComposeError`,
+// `ComposeResult`, and the two `compose_*` functions.
+export {
+  type ComposeError,
+  type ComposeResult,
+  composeDcborArray,
+  composeDcborMap,
+} from "./compose";
+// =============================================================================
+// TypeScript-only conveniences.
+//
+// Rust models fallible results natively via its `Result<T, E>` enum
+// and `?` operator; the Logos lexer is a private implementation detail.
+// In TypeScript we model `ParseResult<T>` as a discriminated union, so
+// helper constructors and discriminators (`ok`, `err`, `isOk`, `isErr`,
+// `unwrap`, `unwrapErr`, `parseError`, `composeError`, `composeOk`,
+// `composeErr`, `Span`, …) are mandatory ergonomics. They are exported
+// here as TS-only helpers and are **not** part of the Rust↔TS parity
+// surface — Rust callers don't see them, and TS callers writing
+// strictly-portable code shouldn't depend on them.
+//
+// `Lexer` and the `token` constructor namespace are likewise TS-only;
+// in Rust the lexer is created via `Token::lexer(src)` internally and
+// consumers never instantiate it directly. These re-exports stay so
+// existing test code keeps working, but production callers should
+// prefer `parseDcborItem` / `parseDcborItemPartial`.
+// =============================================================================
+// Token — TS-only convenience namespace for constructing tokens from
+// userland (rare; mostly used in tests). The `Lexer` class is also
+// TS-only — Rust treats `Token::lexer(...)` as an internal API.
+export { token, Lexer } from "./token";
+// Error helpers — `Span`/`span`/`defaultSpan` are TS-only because Rust
+// uses the `logos::Span` type alias directly. The `ok`/`err`/`isOk`/
+// `isErr`/`unwrap`/`unwrapErr` helpers are TS-only `Result`-modeling
+// utilities. `parseError`, `isDefaultError`, `errorMessage`,
+// `errorSpan`, `fullErrorMessage`, and `defaultParseError` are
+// likewise convenience helpers around the discriminated union.
 export {
   type Span,
   span,
   defaultSpan,
-  type ParseError,
   parseError,
-  type ParseResult,
   ok,
   err,
   isOk,
@@ -73,14 +125,7 @@ export {
   defaultParseError,
 } from "./error";
-// Compose functions
-export {
-  type ComposeError,
-  composeError,
-  type ComposeResult,
-  composeOk,
-  composeErr,
-  composeErrorMessage,
-  composeDcborArray,
-  composeDcborMap,
-} from "./compose";
+// Compose helpers — `composeError`/`composeOk`/`composeErr`/
+// `composeErrorMessage` are the TS-only counterparts of the
+// `ComposeError`/`ComposeResult` discriminated union ergonomics.
+export { composeError, composeOk, composeErr, composeErrorMessage } from "./compose";

package/src/parse.ts CHANGED Viewed

@@ -252,7 +252,7 @@ function parseUr(ur: UR, tokenSpan: Span): ParseResult<Cbor> {
   );
 }
-function parseNumberTag(tagValue: number, lexer: Lexer): ParseResult<Cbor> {
+function parseNumberTag(tagValue: number | bigint, lexer: Lexer): ParseResult<Cbor> {
   const itemResult = parseItem(lexer);
   if (!itemResult.ok) {
     return itemResult;
@@ -267,6 +267,10 @@ function parseNumberTag(tagValue: number, lexer: Lexer): ParseResult<Cbor> {
   }
   if (closeResult.value.type === "ParenthesisClose") {
+    // Pass the tag value through as-is: when it's a `bigint` (i.e. a
+    // u64 outside the safe-integer range), dCBOR's `cbor({ tag, value })`
+    // builder serialises it as a `bigint` tag — matching Rust which
+    // accepts the full `0..=2^64-1` range natively.
     return ok(cbor({ tag: tagValue, value: itemResult.value }));
   }
@@ -386,23 +390,37 @@ function parseMap(lexer: Lexer): ParseResult<Cbor> {
       return err(PE.duplicateMapKey(keySpan));
     }
-    // Expect colon
+    // Expect colon.
+    //
+    // Mirrors Rust `parse.rs:382-395`:
+    // ```
+    //   if let Ok(Token::Colon) = expect_token(lexer) { … }
+    //   else { return Err(Error::ExpectedColon(lexer.span())); }
+    // ```
+    // Rust's pattern collapses *every* non-Colon outcome — including
+    // `UnexpectedEndOfInput`, `UnrecognizedToken`, and any other error
+    // — into `ExpectedColon`. Earlier revisions of this port forwarded
+    // the inner error verbatim, so `{1` reported `UnexpectedEndOfInput`
+    // instead of `ExpectedColon`.
     const colonResult = expectToken(lexer);
-    if (!colonResult.ok) {
-      return colonResult;
-    }
-    if (colonResult.value.type !== "Colon") {
+    if (!colonResult.ok || colonResult.value.type !== "Colon") {
       return err(PE.expectedColon(lexer.span()));
     }
-    // Parse the value
+    // Parse the value.
+    //
+    // Rust `parse.rs:383-389` uses the inner `UnexpectedToken`'s **own**
+    // span when it converts to `ExpectedMapKey`. Earlier revisions of
+    // this port called `lexer.span()` here, which can drift if the
+    // lexer has stepped past the offending `}`. We now use the
+    // captured span from `valueResult.error` to preserve Rust's exact
+    // span semantics.
     const valueResult = parseItem(lexer);
     if (!valueResult.ok) {
       if (valueResult.error.type === "UnexpectedToken") {
-        const unexpectedToken = (valueResult.error as { token: Token }).token;
-        if (unexpectedToken.type === "BraceClose") {
-          return err(PE.expectedMapKey(lexer.span()));
+        const unexpected = valueResult.error;
+        if (unexpected.token.type === "BraceClose") {
+          return err(PE.expectedMapKey(unexpected.span));
         }
       }
       return valueResult;

package/src/token.ts CHANGED Viewed

@@ -17,7 +17,21 @@ import { type Span, span, parseError as PE, type ParseResult, ok, err } from "./
 /**
  * Token types produced by the lexer.
  *
- * Corresponds to the Rust `Token` enum in token.rs
+ * Corresponds to the Rust `Token` enum in token.rs.
+ *
+ * **u64 parity**: `TagValue` and `KnownValueNumber` are widened to
+ * `number | bigint` because Rust accepts the full `u64` range
+ * (`0..=2^64-1`). Values that fit in
+ * {@link Number.MAX_SAFE_INTEGER} (`2^53-1`) come through as plain
+ * `number`s; anything larger arrives as a `bigint` so callers don't
+ * silently lose precision. This matches the way `@bcts/dcbor` already
+ * stores large unsigned integers (`number | bigint`) and lets the
+ * downstream `cbor({ tag, value })` builder serialize correctly.
+ *
+ * **String value field**: the lexer keeps the outer double quotes on
+ * the slice (e.g. `"\"hello\""`); the parser strips them in
+ * `parseString`. Mirrors Rust `Token::String(String)` which holds the
+ * raw `lex.slice()` including quotes (`token.rs:115-119`).
  */
 export type Token =
   | { readonly type: "Bool"; readonly value: boolean }
@@ -38,9 +52,9 @@ export type Token =
   | { readonly type: "DateLiteral"; readonly value: CborDate }
   | { readonly type: "Number"; readonly value: number }
   | { readonly type: "String"; readonly value: string }
-  | { readonly type: "TagValue"; readonly value: number }
+  | { readonly type: "TagValue"; readonly value: number | bigint }
   | { readonly type: "TagName"; readonly value: string }
-  | { readonly type: "KnownValueNumber"; readonly value: number }
+  | { readonly type: "KnownValueNumber"; readonly value: number | bigint }
   | { readonly type: "KnownValueName"; readonly value: string }
   | { readonly type: "Unit" }
   | { readonly type: "UR"; readonly value: UR };
@@ -101,13 +115,13 @@ export const token = {
   string(value: string): Token {
     return { type: "String", value };
   },
-  tagValue(value: number): Token {
+  tagValue(value: number | bigint): Token {
     return { type: "TagValue", value };
   },
   tagName(value: string): Token {
     return { type: "TagName", value };
   },
-  knownValueNumber(value: number): Token {
+  knownValueNumber(value: number | bigint): Token {
     return { type: "KnownValueNumber", value };
   },
   knownValueName(value: string): Token {
@@ -199,20 +213,29 @@ export class Lexer {
         continue;
       }
-      // Skip inline comments: /.../ (not preceded by another /)
-      if (
-        ch === "/" &&
-        this._position + 1 < this._source.length &&
-        this._source[this._position + 1] !== "/"
-      ) {
-        this._position++; // Skip opening /
-        while (this._position < this._source.length && this._source[this._position] !== "/") {
-          this._position++;
+      // Skip inline comments of the form `/[^/]*/` (the same pattern as
+      // the Rust skip regex). Note that this regex *does* match `//` (zero
+      // non-slash characters between the two slashes), so an empty
+      // comment is a valid no-op for the lexer. We accept that case too;
+      // earlier revisions of this port required at least one non-slash
+      // body character, which broke parity with Rust on inputs like
+      // `// trailing thought`.
+      if (ch === "/") {
+        // Confirm there is a closing slash somewhere ahead. If not, the
+        // `/` is not a comment: stop skipping and let the punctuation
+        // matcher report an unrecognized token (Rust would equally fail
+        // to match the skip regex and emit an `UnrecognizedToken`).
+        let scan = this._position + 1;
+        while (scan < this._source.length && this._source[scan] !== "/") {
+          scan++;
         }
-        if (this._position < this._source.length) {
-          this._position++; // Skip closing /
+        if (scan < this._source.length) {
+          this._position = scan + 1; // jump past the closing /
+          continue;
         }
-        continue;
+        // No closing /: not a comment — leave _position alone and break
+        // out so the punctuation matcher can flag the unrecognized `/`.
+        break;
       }
       // Skip end-of-line comments: #...
@@ -227,27 +250,38 @@ export class Lexer {
     }
   }
+  /**
+   * Matches reserved keywords: `true`, `false`, `null`, `NaN`,
+   * `Infinity`, `-Infinity`, `Unit`.
+   *
+   * Mirrors Rust's `Logos` `#[token(...)]` matcher
+   * (`bc-dcbor-parse-rust/src/token.rs:12-50, 164`), which is greedy
+   * and emits the keyword token *as soon as the literal matches* —
+   * subsequent characters become a separate (likely unrecognized) token
+   * stream. So input like `truex` lexes as `Bool(true)` followed by an
+   * unrecognized run on `x`. Earlier revisions of this port enforced an
+   * identifier boundary check (`!_isIdentifierChar(nextChar)`) and
+   * rejected the whole prefix as a single `UnrecognizedToken`, which
+   * broke span/variant parity with Rust.
+   */
   private _tryMatchKeyword(): ParseResult<Token> | undefined {
     const keywords: [string, Token][] = [
+      // `-Infinity` must beat every other `-`-based matcher. List order is
+      // cosmetic (no other keyword starts with `-`); what matters is that
+      // keywords are lexed before numbers, so `-` isn't taken as a sign.
+      ["-Infinity", token.negInfinity()],
       ["true", token.bool(true)],
       ["false", token.bool(false)],
       ["null", token.null()],
       ["NaN", token.nan()],
       ["Infinity", token.infinity()],
-      ["-Infinity", token.negInfinity()],
       ["Unit", token.unit()],
     ];
     for (const [keyword, tok] of keywords) {
       if (this._matchLiteral(keyword)) {
-        // Make sure it's not part of a longer identifier
-        const nextChar = this._source[this._position];
-        if (nextChar === undefined || !this._isIdentifierChar(nextChar)) {
-          this._tokenEnd = this._position;
-          return ok(tok);
-        }
-        // Reset position if it was a partial match
-        this._position = this._tokenStart;
+        this._tokenEnd = this._position;
+        return ok(tok);
       }
     }
@@ -300,18 +334,24 @@ export class Lexer {
         !numStr.includes("E") &&
         !numStr.startsWith("-")
       ) {
-        // It's a tag value
+        // It's a tag value. Mirrors Rust `token.rs:128-136`:
+        // `stripped.parse::<TagValue>()` accepts the full `u64` range
+        // (`0..=2^64-1`). We use `BigInt` to get exact-integer parsing,
+        // then narrow to `number` when the value fits in
+        // `Number.MAX_SAFE_INTEGER` so callers don't pay the bigint
+        // tax for tag numbers in the common range. Anything outside
+        // `[0, 2^64-1]` is reported as `InvalidTagValue` matching Rust.
         this._position += numStr.length + 1; // Include the (
         this._tokenEnd = this._position;
-        const tagValue = parseInt(numStr, 10);
-        if (!Number.isSafeInteger(tagValue) || tagValue < 0) {
+        const parsed = parseUsize64(numStr);
+        if (parsed === undefined) {
           return err(
             PE.invalidTagValue(numStr, span(this._tokenStart, this._tokenStart + numStr.length)),
           );
         }
-        return ok(token.tagValue(tagValue));
+        return ok(token.tagValue(parsed));
       }
       // It's a regular number
@@ -363,20 +403,15 @@ export class Lexer {
       return ok(token.string(fullMatch));
     }
-    // Invalid string - try to find where it ends for better error reporting
+    // Invalid string: emit an unrecognized token covering just the
+    // opening `"` and let the next call to `next()` re-lex. Mirrors
+    // Rust's Logos behaviour when the `String` regex fails to match —
+    // the lexer emits `Error::default()` (which `expect_token` upgrades
+    // to `UnrecognizedToken(span)` for the single character) and
+    // recovers at the very next byte. Earlier revisions of this port
+    // consumed through the next `"` or `\n`, which inflated the error
+    // span beyond what Rust reports.
     this._position++;
-    while (this._position < this._source.length) {
-      const ch = this._source[this._position];
-      if (ch === '"' || ch === "\n") {
-        if (ch === '"') this._position++;
-        break;
-      }
-      if (ch === "\\") {
-        this._position += 2;
-      } else {
-        this._position++;
-      }
-    }
     this._tokenEnd = this._position;
     return err(PE.unrecognizedToken(this.span()));
   }
@@ -470,8 +505,11 @@ export class Lexer {
       this._position += fullMatch.length;
       this._tokenEnd = this._position;
-      const value = parseInt(numStr, 10);
-      if (!Number.isSafeInteger(value) || value < 0) {
+      // Mirrors Rust `token.rs:146-153`: `stripped.parse::<u64>()`
+      // accepts the full `u64` range. We share the helper used for
+      // `TagValue` to get the same narrow-when-safe-else-bigint path.
+      const value = parseUsize64(numStr);
+      if (value === undefined) {
         return err(PE.invalidKnownValue(numStr, span(this._tokenStart + 1, this._tokenEnd - 1)));
       }
@@ -491,14 +529,14 @@ export class Lexer {
       return ok(token.knownValueName(name));
     }
-    // Invalid known value
+    // Invalid known value: emit an unrecognized token covering just the
+    // opening `'` and let the next call to `next()` re-lex. Mirrors
+    // Rust's Logos behaviour when neither `KnownValueNumber` nor
+    // `KnownValueName` regex matches — the lexer emits `Error::default()`
+    // (single character span) and recovers at the next byte. Earlier
+    // revisions of this port consumed through the closing `'`, which
+    // inflated the error span beyond what Rust reports.
     this._position++;
-    while (this._position < this._source.length && this._source[this._position] !== "'") {
-      this._position++;
-    }
-    if (this._position < this._source.length) {
-      this._position++;
-    }
     this._tokenEnd = this._position;
     return err(PE.unrecognizedToken(this.span()));
   }
@@ -557,10 +595,45 @@ export class Lexer {
     }
     return false;
   }
+}
-  private _isIdentifierChar(ch: string): boolean {
-    return /[a-zA-Z0-9_-]/.test(ch);
+/**
+ * Strictly parses a non-negative integer string in the range
+ * `[0, 2^64 - 1]`, mirroring Rust `<u64 as FromStr>::from_str`.
+ *
+ * - Empty input or non-digit characters → `undefined`.
+ * - Values that fit in `Number.MAX_SAFE_INTEGER` are returned as plain
+ *   `number`s, so callers in the common case (tag values like `40000`,
+ *   known values like `1`) never see a `bigint`.
+ * - Values in `(2^53-1, 2^64-1]` are returned as `bigint`. dCBOR's
+ *   `cbor({ tag, value })` and `KnownValue` constructors both accept
+ *   `bigint` natively, so the bigint flows through to wire encoding
+ *   without precision loss.
+ * - Values strictly greater than `2^64 - 1` (or negative) are rejected
+ *   so this parser never produces a tag/known-value outside the
+ *   `u64` domain — matches Rust which fails `parse::<u64>()` in that
+ *   case.
+ */
+const MAX_U64: bigint = (1n << 64n) - 1n;
+function parseUsize64(s: string): number | bigint | undefined {
+  if (s.length === 0) return undefined;
+  // The regex feeding this helper already rejects sign / leading
+  // zeros / non-digits; this guard is defensive in case the helper is
+  // reused elsewhere.
+  if (!/^\d+$/.test(s)) return undefined;
+  let value: bigint;
+  try {
+    value = BigInt(s);
+  } catch {
+    return undefined;
+  }
+  if (value < 0n || value > MAX_U64) return undefined;
+  // Narrow to plain `number` when safe so common-case callers never
+  // see a `bigint`.
+  if (value <= BigInt(Number.MAX_SAFE_INTEGER)) {
+    return Number(value);
   }
+  return value;
 }
 /**