npm - @energy8platform/stake-math-tools - Versions diffs - 0.4.0 → 0.6.0 - Mend

@energy8platform/stake-math-tools 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/README.md +223 -56
package/package.json +1 -1
package/src/index.ts +13 -0
package/src/optimize-lookup.ts +174 -19
package/src/stake-report.ts +145 -0
package/src/tiered.ts +1832 -0
package/src/transform-jsonl-zst.ts +285 -0
package/src/types.ts +141 -0
package/test/optimize-lookup.integration.test.ts +423 -0
package/test/optimize-lookup.unit.test.ts +2 -0
package/test/transform-jsonl-zst.test.ts +343 -0

package/src/transform-jsonl-zst.ts ADDED Viewed

@@ -0,0 +1,285 @@
+/**
+ * Streaming `*.jsonl.zst → *.jsonl.zst` transformer.
+ *
+ * Decompresses the input with zstd, runs each line through an optional user
+ * mapper, recompresses the result. Pipes everything — no temp files, memory
+ * footprint stays at one line regardless of input size, so gigabyte books
+ * files work fine.
+ *
+ * Why not `readline.createInterface`? readline accumulates each line as a
+ * growing JS string (`buffer += chunk`), which hits V8's ~512MB string-length
+ * limit on books files that contain very long event arrays — manifests as
+ * `RangeError: Invalid string length` deep in node:internal/readline. The
+ * Buffer-based splitter here keeps incomplete-line state as raw bytes and
+ * only materializes a string at the LF boundary (string `mapper` mode) or
+ * never (`binaryMapper` and identity modes), so any line that fits in OS
+ * memory is fine in those two modes.
+ *
+ * In identity mode (no mapper) we don't split at all — we pipe decompressor
+ * output directly into compressor input as raw bytes. That's the fastest
+ * path: ~25 MB/s of compressed input on a single core, dominated by the
+ * zstd subprocesses, with zero JS string allocation.
+ *
+ * Requires the `zstd` binary on PATH (same precondition as the rest of the
+ * stake-bridge tooling).
+ */
+import { spawn } from 'node:child_process';
+/**
+ * Per-line callback for string mode.
+ *
+ * Receives the line decoded as UTF-8 (LF removed, one trailing CR stripped)
+ * together with its zero-based index in the input. Return a `string` to
+ * replace the line, a `string[]` to emit several output lines, or
+ * `null` / `undefined` to drop the line.
+ */
+export type LineMapper = (
+  line: string,
+  index: number,
+) => string | string[] | null | undefined;
+/**
+ * Per-line callback for binary mode.
+ *
+ * Receives the raw bytes of the line (LF removed, one trailing CR stripped)
+ * together with its zero-based index in the input; the bytes are never
+ * decoded into a JS string by the transformer. Return a `Buffer` or `string`
+ * to replace the line, an array of them to emit several output lines, or
+ * `null` / `undefined` to drop the line.
+ */
+export type BinaryLineMapper = (
+  line: Buffer,
+  index: number,
+) => Buffer | string | Array<Buffer | string> | null | undefined;
+/** Options accepted by `transformJsonlZst`. */
+export interface TransformJsonlZstParams {
+  /** Path to a zstd-compressed `.jsonl.zst` file. */
+  inputPath: string;
+  /** Path where the transformed `.jsonl.zst` will be written (overwritten). */
+  outputPath: string;
+  /** Per-line transform with the line decoded as a JS string. Default =
+   *  identity passthrough (byte pipe, no per-line allocations).
+   *
+   *  - Return a `string` to replace the line with that content.
+   *  - Return a `string[]` to expand one input line into several output lines.
+   *  - Return `null` / `undefined` to drop the line entirely.
+   *
+   *  Mutually exclusive with `binaryMapper`. Use `binaryMapper` instead when
+   *  any single line could exceed V8's ~512 MB string-length cap (e.g. bonus
+   *  game books with massive event arrays) — `toString('utf8')` will throw
+   *  `ERR_STRING_TOO_LONG` on lines above that limit. */
+  mapper?: LineMapper;
+  /** Per-line transform with the line passed as a raw `Buffer`. Use this
+   *  for any line that may exceed V8's ~512 MB string limit, or when you
+   *  only need to peek at a small prefix (`line.subarray(0, 64).toString()`)
+   *  and want to pass the rest of the bytes through verbatim.
+   *
+   *  Return shape:
+   *    - `Buffer` or `string` — replace the line with that content.
+   *    - array of `Buffer | string` — expand to N output lines.
+   *    - `null` / `undefined` — drop.
+   *
+   *  Mutually exclusive with `mapper`. */
+  binaryMapper?: BinaryLineMapper;
+  /** zstd compression level for the output. 1 = fastest, 22 = smallest.
+   *  Default 9 — same level the kitsune optimize pipeline uses. */
+  zstdLevel?: number;
+  /** Called every `progressEveryLines` input lines with running counts
+   *  (`linesRead`, `linesWritten`). Useful for progress bars on
+   *  multi-million-row files. Never invoked in identity mode (no mapper):
+   *  lines are not split there, so there is nothing to count. */
+  onProgress?: (linesRead: number, linesWritten: number) => void;
+  /** How often to fire `onProgress`, in input lines. Default 100_000. */
+  progressEveryLines?: number;
+}
+/** Summary returned by `transformJsonlZst` once both zstd processes exit. */
+export interface TransformJsonlZstResult {
+  /** Number of input lines handed to the mapper (0 in identity mode). */
+  linesRead: number;
+  /** Number of lines written to the output; each mapper result counts once
+   *  per emitted line (0 in identity mode). */
+  linesWritten: number;
+  /** True when identity mode was used: byte-pipe passthrough without
+   *  per-line counting (so `linesRead`/`linesWritten` will be 0). */
+  identityPassthrough: boolean;
+}
+/** Byte value of the ASCII line feed used as the line separator. */
+const LF = 0x0a;
+/** One-byte buffer holding a line feed, written after every mapped output line. */
+const LF_BUFFER = Buffer.from([LF]);
+/**
+ * Stream-transforms one zstd-compressed JSONL file into another.
+ *
+ * Spawns `zstd -dc` on `inputPath` and a compressing `zstd` writing to
+ * `outputPath`, then either pipes the decompressed bytes straight through
+ * (no mapper given) or splits them on LF and runs every line through
+ * `mapper` / `binaryMapper`, writing each result followed by a single LF.
+ *
+ * @param params Input/output paths, optional mapper, compression level and
+ *   progress reporting options.
+ * @returns Line counters plus `identityPassthrough`, which is true only when
+ *   neither `mapper` nor `binaryMapper` was supplied (counters stay 0 then).
+ * @throws Error when both `mapper` and `binaryMapper` are supplied, when a
+ *   zstd process cannot be spawned or exits unsuccessfully, when writing to
+ *   the compressor fails, or when a line is too long to decode for `mapper`.
+ *   Errors thrown by the mapper propagate unchanged. On failure both zstd
+ *   processes are stopped before the error is rethrown; a partially written
+ *   `outputPath` may be left behind.
+ */
+export async function transformJsonlZst(
+  params: TransformJsonlZstParams,
+): Promise<TransformJsonlZstResult> {
+  const {
+    inputPath,
+    outputPath,
+    mapper,
+    binaryMapper,
+    zstdLevel = 9,
+    onProgress,
+    progressEveryLines = 100_000,
+  } = params;
+  if (mapper && binaryMapper) {
+    throw new Error(
+      'transformJsonlZst: pass either `mapper` (string) or `binaryMapper` (Buffer), not both',
+    );
+  }
+  const anyMapper = mapper ?? binaryMapper;
+  // `--` ends option parsing, so an input path that starts with `-` is still
+  // read as a file name rather than as a zstd flag.
+  const decompress = spawn('zstd', ['-dc', '-q', '--', inputPath], {
+    stdio: ['ignore', 'pipe', 'inherit'],
+  });
+  // The zstd CLI only honours levels above 19 together with `--ultra`;
+  // without it the level is capped at 19 (and `-q` hides the warning).
+  const levelArgs =
+    zstdLevel > 19 ? ['--ultra', `-${zstdLevel}`] : [`-${zstdLevel}`];
+  const compress = spawn(
+    'zstd',
+    [...levelArgs, '-q', '-f', '-o', outputPath],
+    { stdio: ['pipe', 'inherit', 'inherit'] },
+  );
+  const decompressDone = waitForExit(decompress, 'zstd -d');
+  const compressDone = waitForExit(compress, 'zstd -c');
+  // Give the exit promises a rejection handler immediately. A spawn failure
+  // or early zstd exit can reject them while we are still streaming; without
+  // a handler that would surface as an unhandled rejection instead of as a
+  // rejection of this function.
+  const bothDone = Promise.all([decompressDone, compressDone]);
+  void bothDone.catch(() => undefined);
+  // Keep a permanent `error` listener on the compressor's stdin so that an
+  // EPIPE raised outside a drain wait is recorded instead of being thrown
+  // as an uncaught `error` event.
+  let stdinError: Error | undefined;
+  compress.stdin.on('error', (err: Error) => {
+    if (stdinError === undefined) stdinError = err;
+  });
+  // Writes one chunk to the compressor, honouring backpressure.
+  const writeChunk = (chunk: Buffer | string): Promise<void> => {
+    if (stdinError !== undefined) return Promise.reject(stdinError);
+    if (compress.stdin.write(chunk)) return Promise.resolve();
+    return new Promise<void>((resolve, reject) => {
+      const onDrain = () => {
+        cleanup();
+        resolve();
+      };
+      const onError = (err: Error) => {
+        cleanup();
+        reject(err);
+      };
+      const cleanup = () => {
+        compress.stdin.off('drain', onDrain);
+        compress.stdin.off('error', onError);
+      };
+      compress.stdin.once('drain', onDrain);
+      compress.stdin.once('error', onError);
+    });
+  };
+  let linesRead = 0;
+  let linesWritten = 0;
+  try {
+    if (!anyMapper) {
+      // Identity mode: byte-pipe. Never split into lines, never materialize
+      // strings, never accumulate buffers. Constant memory regardless of how
+      // long individual lines are.
+      for await (const chunk of decompress.stdout!) {
+        await writeChunk(chunk);
+      }
+    } else {
+      // Mapper mode: split on LF boundaries by scanning raw bytes. We keep
+      // incomplete-line bytes in a small array of Buffers (no concatenation
+      // into a single growing JS string) and `Buffer.concat` them once the
+      // LF is finally seen. Only the string mapper then calls `toString`;
+      // the binary mapper receives the Buffer as-is.
+      let pending: Buffer[] = [];
+      let pendingLen = 0;
+      const writeMapperResult = async (out: Buffer | string): Promise<void> => {
+        await writeChunk(out);
+        await writeChunk(LF_BUFFER);
+        linesWritten++;
+      };
+      const flushLine = async (lineBuf: Buffer): Promise<void> => {
+        // Strip trailing CR for CRLF tolerance, matching readline behaviour.
+        const trimmed =
+          lineBuf.length > 0 && lineBuf[lineBuf.length - 1] === 0x0d
+            ? lineBuf.subarray(0, lineBuf.length - 1)
+            : lineBuf;
+        let result: string | string[] | Buffer | Array<Buffer | string> | null | undefined;
+        if (binaryMapper) {
+          result = binaryMapper(trimmed, linesRead);
+        } else {
+          // String mapper: decode the line. Lines above V8's ~512 MB string
+          // cap throw ERR_STRING_TOO_LONG here — re-throw with a pointer to
+          // `binaryMapper` so the failure mode is obvious.
+          let lineStr: string;
+          try {
+            lineStr = trimmed.toString('utf8');
+          } catch (err) {
+            if (
+              err instanceof Error &&
+              (err as NodeJS.ErrnoException).code === 'ERR_STRING_TOO_LONG'
+            ) {
+              const wrapped = new Error(
+                `transformJsonlZst: line ${linesRead} is ${trimmed.length} bytes — ` +
+                  `exceeds V8 max JS string length (~512 MB). Use the ` +
+                  '`binaryMapper` option to receive the line as a Buffer.',
+              );
+              (wrapped as { cause?: unknown }).cause = err;
+              throw wrapped;
+            }
+            throw err;
+          }
+          result = mapper!(lineStr, linesRead);
+        }
+        linesRead++;
+        if (result === null || result === undefined) {
+          // drop
+        } else if (Array.isArray(result)) {
+          for (const out of result) {
+            await writeMapperResult(out);
+          }
+        } else {
+          await writeMapperResult(result);
+        }
+        if (onProgress && linesRead % progressEveryLines === 0) {
+          onProgress(linesRead, linesWritten);
+        }
+      };
+      for await (const chunk of decompress.stdout! as AsyncIterable<Buffer>) {
+        let start = 0;
+        // Buffer.indexOf(LF) is a C++ scan, ~20× faster than a JS byte loop.
+        while (start < chunk.length) {
+          const lf = chunk.indexOf(LF, start);
+          if (lf < 0) {
+            // No LF in the remainder — stash a copy as pending and move on.
+            const remainder = chunk.subarray(start);
+            const owned = Buffer.from(remainder);
+            pending.push(owned);
+            pendingLen += owned.length;
+            break;
+          }
+          const tail = chunk.subarray(start, lf);
+          let lineBuf: Buffer;
+          if (pendingLen === 0) {
+            lineBuf = tail;
+          } else {
+            pending.push(tail);
+            lineBuf = Buffer.concat(pending, pendingLen + tail.length);
+            pending = [];
+            pendingLen = 0;
+          }
+          await flushLine(lineBuf);
+          start = lf + 1;
+        }
+      }
+      // Trailing line without a terminating LF — emit it the same way readline
+      // would (so callers don't silently lose data when the input lacks a
+      // final newline).
+      if (pendingLen > 0) {
+        const lineBuf = Buffer.concat(pending, pendingLen);
+        await flushLine(lineBuf);
+      }
+    }
+  } catch (err) {
+    // Tear everything down and wait for both children to go away before
+    // rethrowing, so no zstd process outlives the call and their exit
+    // rejections are consumed here instead of becoming unhandled.
+    compress.stdin.destroy();
+    decompress.stdout!.destroy();
+    decompress.kill();
+    compress.kill();
+    await Promise.all([
+      decompressDone.catch(() => undefined),
+      compressDone.catch(() => undefined),
+    ]);
+    throw err;
+  }
+  compress.stdin.end();
+  await bothDone;
+  // Key the final report on `anyMapper`, not `mapper`: a `binaryMapper` run
+  // counts lines too and is not an identity passthrough.
+  if (anyMapper && onProgress) onProgress(linesRead, linesWritten);
+  return { linesRead, linesWritten, identityPassthrough: !anyMapper };
+}
+/**
+ * Turns a child process's termination into a promise.
+ *
+ * Resolves when `child` emits `close` with exit code 0. Rejects when the
+ * process emits `error`, or closes with a non-zero code or because of a
+ * signal. `label` prefixes the rejection message.
+ */
+function waitForExit(
+  child: ReturnType<typeof spawn>,
+  label: string,
+): Promise<void> {
+  return new Promise<void>((resolve, reject) => {
+    const onError = (err: Error): void => {
+      reject(new Error(`${label} failed to spawn: ${err.message}`));
+    };
+    const onClose = (
+      code: number | null,
+      signal: NodeJS.Signals | null,
+    ): void => {
+      if (code === 0) {
+        resolve();
+        return;
+      }
+      // A null code means the process was terminated by a signal.
+      const how = code === null ? `signal ${signal}` : `code ${code}`;
+      reject(new Error(`${label} exited with ${how}`));
+    };
+    child.on('error', onError);
+    child.on('close', onClose);
+  });
+}

package/src/types.ts CHANGED Viewed

@@ -20,6 +20,10 @@ export interface OptimizeParams {
   /** Hard cap. Rows with payoutCents > capMaxWin are dropped. */
   capMaxWin: number;
+  /** Cost of a single bet in cents. Used to convert payouts to "bet multiplier" units
+   *  for the Stake-style report. Default 100 (1.0 bet = 100 cents). */
+  betCostCents?: number;
   /** When true, force ≥ 1 row with payoutCents ≥ maxReachedFraction × capMaxWin. Default true. */
   requireMaxReached?: boolean;
   /** Default 0.95. */
@@ -43,6 +47,73 @@ export interface OptimizeParams {
    *  Default 0.05 (5%). Set to 1.0 to disable.
    */
   maxRowRtpShare?: number;
+  /** Maximum integer weight allowed for any single output row, as a multiple of the
+   *  uniform prior weight (totalWeightOut / nRowsOut). E.g., 10 means no row can have
+   *  weight greater than 10 × (totalWeightOut / nRowsOut). This prevents Stake's ETL
+   *  ("Within Liability Limits") check from failing due to over-concentrated weight.
+   *  Default 10. Set to Infinity to disable. */
+  maxWeightPerRow?: number;
+  /** Algorithm for compressing source rows into a weighted lookup table.
+   *  - 'tiered' (default): tier-based rarity weighting (cap/large rows get weight=1,
+   *    small rows get calculated weight W). Preserves source distribution rates;
+   *    passes Stake Engine's "Within Liability Limits" check.
+   *  - 'nnls': legacy NNLS optimization; hits RTP/CV/HR targets exactly but may
+   *    concentrate weight on few rows and fail Stake's Liability check. */
+  algorithm?: 'tiered' | 'nnls';
+  /** Tier-based only: payout multiplier (payoutCents / betCostCents) above which
+   *  a row is in the "cap" tier (weight=1, rare). Default: 0.95 × max source pm. */
+  capPmThreshold?: number;
+  /** Tier-based only: payout multiplier threshold for the "large" tier.
+   *  Rows with capPmThreshold > pm >= largePmThreshold get weight=1.
+   *  Default: undefined (no large tier — only cap vs small). */
+  largePmThreshold?: number;
+  /** Tier-based only: target effective probability for cap+large rows in output.
+   *  Default: natural rate from source = (n_cap + n_large) / n_source. */
+  largeTarget?: number;
+  /** Tier-based only: when true, ensure every Stake hit-rate distribution range
+   *  up to the actual max payout has ≥ 1 output row when source has rows in
+   *  that range. Prevents Stake's "Gaps in the Hit Rate Table" rejection.
+   *  Default true. */
+  ensureRangeCoverage?: boolean;
+  /** Tier-based only: reshape the high-tier sampling so the per-bucket row
+   *  counts follow a log-decay curve across Stake hit-rate ranges — each
+   *  bucket above the lowest one in the high tier targets `ratio × prev`
+   *  rows. Turns the typical sparse tail of `…18 → 1 → 1 → 1 → 4` into
+   *  a smooth `…18 → 9 → 4 → 2 → 1` instead.
+   *
+   *  When true and `largePmThreshold` is unset, auto-sets it to
+   *  `max(50, capPmThreshold / 20)` so the decay covers multiple Stake
+   *  buckets, not just the single cap bucket. Default false. */
+  shapeDistribution?: boolean;
+  /** Tier-based only: ratio between adjacent Stake-bucket row counts when
+   *  `shapeDistribution=true`. 0.5 = each higher bucket has half the rows
+   *  of the one below it. Default 0.5. */
+  shapeDecayRatio?: number;
+  /** Tier-based only: auto-pick `shapeDecayRatio` by binary search so the
+   *  achieved CV lands at `targetCV` within `toleranceCV`. Requires
+   *  `shapeDistribution=true` and a `targetCV > 0`. Runs the full pipeline
+   *  up to 6 times (one per bisection step) — expect a 5×-6× wall-clock
+   *  hit on builds where it triggers. Default false. */
+  shapeAutoMatchCV?: boolean;
+  /** Tier-based only: minimum fraction of nRowsOut that must be distinct payoutCents
+   *  values in the output. Stake Engine rejects "Insufficient Unique Events" when
+   *  too few distinct outcomes exist (same events repeat in a session). Default 0.01
+   *  (1%). For 100K output → 1K unique payouts required. Set to 0 to disable.
+   *
+   *  When the target cannot be reached (source lacks enough distinct payouts, or
+   *  RTP-drift budget exhausts), the optimizer falls back to maximizing unique
+   *  count under the budget and emits a warning. */
+  minUniqueEventsRate?: number;
 }
 export interface OptimizeAchieved {
@@ -60,6 +131,71 @@ export interface ToleranceMet {
   maxReached: boolean;
   /** True if no output row contributes more than maxRowRtpShare of total RTP. */
   rtpConcentration: boolean;
+  /** True if no output row's weight exceeds maxWeightPerRow × (totalWeightOut / nRowsOut). */
+  weightCap: boolean;
+}
+/** One entry of the top-K RTP concentration list. */
+export interface TopKShare {
+  /** Number of top rows (ordered by w·payout descending) covered by `share`. */
+  k: number;
+  /** Cumulative share of total RTP coming from those top-K rows. */
+  share: number;
+}
+/** One payout-multiplier range (`pm` in `[low, high)`) of the hit-rate
+ *  distribution table. */
+export interface HitRateBucket {
+  /** Inclusive lower bound of the payout-multiplier range. */
+  low: number;
+  /** Exclusive upper bound (Infinity for the open top bucket). */
+  high: number;
+  /** Number of distinct output rows with pm in [low, high). */
+  count: number;
+  /** Σ weight in this range / Σ weight total — the player-facing probability. */
+  effectiveHitRate: number;
+}
+/** Report metrics for an output table, expressed in bet-multiplier units
+ *  (payoutCents / betCostCents). Exposed as `OptimizeResult.stakeReport`. */
+export interface StakeReport {
+  /** Maximum payout in the output, as a bet multiplier (payoutCents / betCostCents). */
+  payoutMultMax: number;
+  /** Standard deviation of payouts in bet-cost units (= stddev_payout_cents / betCostCents).
+   *  Equivalent to cv × rtp × (100 / betCostCents). For bet=100 cents, equals cv × rtp × 1. */
+  baseStd: number;
+  /** Probability that a sampled spin pays ≥ 5000 × betCost. */
+  prob5K: number;
+  /** Probability that a sampled spin pays ≥ 10000 × betCost. */
+  prob10K: number;
+  /** Top-K cumulative RTP shares, sorted by per-row (w × payout) descending.
+   *  Standard K values reported: 1, 5, 10, 100. */
+  topKShare: TopKShare[];
+  /** Stake's hit-rate-distribution table: payout-multiplier ranges with row count
+   *  and effective probability. Ranges are: [0, 0.1), [0.1, 1), [1, 2), [2, 5),
+   *  [5, 10), [10, 20), [20, 50), [50, 100), [100, 200), [200, 500), [500, 1000),
+   *  [1000, 2000), [2000, 5000), [5000, 10000), [10000, 20000), [20000, ∞).
+   *  Stake fails publication when any intermediate range is empty (gap). */
+  hitRateDistribution: HitRateBucket[];
+  /** Number of distinct payoutCents values in the output. Stake flags "Insufficient
+   *  Unique Events" when this is too low — same outcomes repeat in a session. */
+  uniqueEvents: number;
+  /** Bet cost in cents used for the multiplier conversions (echoed from params). */
+  betCostCents: number;
+}
+/** Per-pass swap counters from the refinement loops. Exposed as
+ *  `OptimizeResult.refinement`. */
+export interface RefinementStats {
+  /** Single-row swaps applied during refineRtpBySwap to close residual RTP gap. */
+  rtpSwaps: number;
+  /** Σ-preserving 2-swaps applied during refineCvBySwap to nudge CV. */
+  cvSwaps: number;
+  /** Swaps applied to fill empty Stake distribution ranges (ensureRangeCoverage). */
+  gapFillSwaps: number;
+  /** Stake distribution ranges where source has no rows — gaps that cannot be filled. */
+  gapsUnfillable: number;
+  /** Swaps applied to introduce new distinct payoutCents into the output (minUniqueEventsRate). */
+  diversifySwaps: number;
 }
 export interface OptimizeResult {
@@ -68,5 +204,10 @@ export interface OptimizeResult {
   toleranceMet: ToleranceMet;
   /** The single output row's largest fraction of total RTP. */
   maxRowRtpShare: number;
+  /** Maximum integer weight observed in output, as a multiple of uniform prior. */
+  maxWeightRatio: number;
+  /** Per-pass swap counters from the refinement loops. */
+  refinement: RefinementStats;
   warnings: string[];
+  stakeReport: StakeReport;
 }