benchforge 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +432 -0
- package/bin/benchforge +3 -0
- package/dist/bin/benchforge.mjs +9 -0
- package/dist/bin/benchforge.mjs.map +1 -0
- package/dist/browser/index.js +914 -0
- package/dist/index.mjs +3 -0
- package/dist/src-CGuaC3Wo.mjs +3676 -0
- package/dist/src-CGuaC3Wo.mjs.map +1 -0
- package/package.json +49 -0
- package/src/BenchMatrix.ts +380 -0
- package/src/Benchmark.ts +33 -0
- package/src/BenchmarkReport.ts +156 -0
- package/src/GitUtils.ts +79 -0
- package/src/HtmlDataPrep.ts +148 -0
- package/src/MeasuredResults.ts +127 -0
- package/src/NodeGC.ts +48 -0
- package/src/PermutationTest.ts +115 -0
- package/src/StandardSections.ts +268 -0
- package/src/StatisticalUtils.ts +176 -0
- package/src/TypeUtil.ts +8 -0
- package/src/bin/benchforge.ts +4 -0
- package/src/browser/BrowserGcStats.ts +44 -0
- package/src/browser/BrowserHeapSampler.ts +248 -0
- package/src/cli/CliArgs.ts +64 -0
- package/src/cli/FilterBenchmarks.ts +68 -0
- package/src/cli/RunBenchCLI.ts +856 -0
- package/src/export/JsonExport.ts +103 -0
- package/src/export/JsonFormat.ts +91 -0
- package/src/export/PerfettoExport.ts +203 -0
- package/src/heap-sample/HeapSampleReport.ts +196 -0
- package/src/heap-sample/HeapSampler.ts +78 -0
- package/src/html/HtmlReport.ts +131 -0
- package/src/html/HtmlTemplate.ts +284 -0
- package/src/html/Types.ts +88 -0
- package/src/html/browser/CIPlot.ts +287 -0
- package/src/html/browser/HistogramKde.ts +118 -0
- package/src/html/browser/LegendUtils.ts +163 -0
- package/src/html/browser/RenderPlots.ts +263 -0
- package/src/html/browser/SampleTimeSeries.ts +389 -0
- package/src/html/browser/Types.ts +96 -0
- package/src/html/browser/index.ts +1 -0
- package/src/html/index.ts +17 -0
- package/src/index.ts +92 -0
- package/src/matrix/CaseLoader.ts +36 -0
- package/src/matrix/MatrixFilter.ts +103 -0
- package/src/matrix/MatrixReport.ts +290 -0
- package/src/matrix/VariantLoader.ts +46 -0
- package/src/runners/AdaptiveWrapper.ts +391 -0
- package/src/runners/BasicRunner.ts +368 -0
- package/src/runners/BenchRunner.ts +60 -0
- package/src/runners/CreateRunner.ts +11 -0
- package/src/runners/GcStats.ts +107 -0
- package/src/runners/RunnerOrchestrator.ts +374 -0
- package/src/runners/RunnerUtils.ts +2 -0
- package/src/runners/TimingUtils.ts +13 -0
- package/src/runners/WorkerScript.ts +256 -0
- package/src/table-util/ConvergenceFormatters.ts +19 -0
- package/src/table-util/Formatters.ts +152 -0
- package/src/table-util/README.md +70 -0
- package/src/table-util/TableReport.ts +293 -0
- package/src/table-util/test/TableReport.test.ts +105 -0
- package/src/table-util/test/TableValueExtractor.test.ts +41 -0
- package/src/table-util/test/TableValueExtractor.ts +100 -0
- package/src/test/AdaptiveRunner.test.ts +185 -0
- package/src/test/AdaptiveStatistics.integration.ts +119 -0
- package/src/test/BenchmarkReport.test.ts +82 -0
- package/src/test/BrowserBench.e2e.test.ts +44 -0
- package/src/test/BrowserBench.test.ts +79 -0
- package/src/test/GcStats.test.ts +94 -0
- package/src/test/PermutationTest.test.ts +121 -0
- package/src/test/RunBenchCLI.test.ts +166 -0
- package/src/test/RunnerOrchestrator.test.ts +102 -0
- package/src/test/StatisticalUtils.test.ts +112 -0
- package/src/test/TestUtils.ts +93 -0
- package/src/test/fixtures/test-bench-script.ts +30 -0
- package/src/tests/AdaptiveConvergence.test.ts +177 -0
- package/src/tests/AdaptiveSampling.test.ts +240 -0
- package/src/tests/BenchMatrix.test.ts +366 -0
- package/src/tests/MatrixFilter.test.ts +117 -0
- package/src/tests/MatrixReport.test.ts +139 -0
- package/src/tests/RealDataValidation.test.ts +177 -0
- package/src/tests/fixtures/baseline/impl.ts +4 -0
- package/src/tests/fixtures/bevy30-samples.ts +158 -0
- package/src/tests/fixtures/cases/asyncCases.ts +7 -0
- package/src/tests/fixtures/cases/cases.ts +8 -0
- package/src/tests/fixtures/cases/variants/product.ts +2 -0
- package/src/tests/fixtures/cases/variants/sum.ts +2 -0
- package/src/tests/fixtures/discover/fast.ts +1 -0
- package/src/tests/fixtures/discover/slow.ts +4 -0
- package/src/tests/fixtures/invalid/bad.ts +1 -0
- package/src/tests/fixtures/loader/fast.ts +1 -0
- package/src/tests/fixtures/loader/slow.ts +4 -0
- package/src/tests/fixtures/loader/stateful.ts +2 -0
- package/src/tests/fixtures/stateful/stateful.ts +2 -0
- package/src/tests/fixtures/variants/extra.ts +1 -0
- package/src/tests/fixtures/variants/impl.ts +1 -0
- package/src/tests/fixtures/worker/fast.ts +1 -0
- package/src/tests/fixtures/worker/slow.ts +4 -0
package/README.md
ADDED
@@ -0,0 +1,432 @@
# Benchforge

A TypeScript benchmarking library with CLI support for running performance tests.

## Browser Profiling

See [Browser Heap Profiling](README-browser.md) for profiling code running in a browser.

## Installation

```bash
npm install benchforge
# or
pnpm add benchforge
```

## Quick Start

```typescript
import { parseBenchArgs, runBenchmarks, reportResults, timeSection, runsSection, type BenchSuite } from 'benchforge';

const suite: BenchSuite = {
  name: "String Operations",
  groups: [
    {
      name: "Concatenation",
      benchmarks: [
        { name: "plus", fn: () => "a" + "b" },
        { name: "template", fn: () => `a${"b"}` },
      ],
    },
  ],
};

const args = parseBenchArgs();
const results = await runBenchmarks(suite, args);
const table = reportResults(results, [timeSection, runsSection]);
console.log(table);
```

### Setup and Baseline Example

Here's a more comprehensive example with shared setup data and baseline comparison:

```typescript
import { parseBenchArgs, runBenchmarks, defaultReport, type BenchGroup, type BenchSuite } from 'benchforge';

const sortingGroup: BenchGroup<number[]> = {
  name: "Array Sorting (1000 numbers)",
  setup: () => Array.from({ length: 1000 }, () => Math.random()),
  baseline: { name: "native sort", fn: nativeSort },
  benchmarks: [
    { name: "quicksort", fn: quickSort },
    { name: "insertion sort", fn: insertionSort },
  ],
};

const suite: BenchSuite = {
  name: "Performance Tests",
  groups: [sortingGroup],
};

const args = parseBenchArgs();
const results = await runBenchmarks(suite, args);
const report = defaultReport(results, args);
console.log(report);
```

See `examples/simple-cli.ts` for a complete runnable example.

### Worker Mode with Module Imports

For worker mode, benchmarks can reference module exports instead of inline functions. This is essential for proper isolation, since functions can't be serialized across process boundaries.

```typescript
const group: BenchGroup = {
  name: "Parser Benchmark",
  setup: () => loadTestData(),
  benchmarks: [{
    name: "parse",
    fn: () => {}, // placeholder - not used in worker mode
    modulePath: new URL("./benchmarks.ts", import.meta.url).href,
    exportName: "parse",
    setupExportName: "setup", // optional: called once, result passed to exportName fn
  }],
};
```

When `setupExportName` is provided, the worker:
1. Imports the module
2. Calls `setup(params)` once (where `params` comes from `BenchGroup.setup()`)
3. Passes the setup result to each benchmark iteration

This eliminates manual caching boilerplate in worker modules.
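
For illustration, a hypothetical `./benchmarks.ts` satisfying this contract could export the two functions named above (the parser workload is a stand-in, and `loadTestData()` returning `string[]` is assumed):

```typescript
// benchmarks.ts (hypothetical worker module for the group above)

// Named by setupExportName: runs once in the worker, receiving the params
// produced by BenchGroup.setup() in the parent process.
export function setup(testData: string[]): string[] {
  return testData.map((line) => line.trim()); // any one-time preparation
}

// Named by exportName: called on each iteration with the setup result.
export function parse(prepared: string[]): number {
  return prepared.reduce((total, line) => total + line.length, 0);
}
```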

## CLI Options

### Basic Options
- `--time <seconds>` - Benchmark duration per test (default: 0.642s)
- `--iterations <count>` - Exact number of iterations (overrides --time)
- `--filter <pattern>` - Run only benchmarks matching regex/substring
- `--worker` / `--no-worker` - Run in isolated worker process (default: true)
- `--profile` - Run once for profiling (single iteration, no warmup)
- `--warmup <count>` - Warmup iterations before measurement (default: 0)
- `--help` - Show all available options

### Memory Profiling
- `--gc-stats` - Collect GC allocation/collection stats via --trace-gc-nvp
- `--heap-sample` - Heap sampling allocation attribution (includes garbage)
- `--heap-interval <bytes>` - Sampling interval in bytes (default: 32768)
- `--heap-depth <frames>` - Stack depth to capture (default: 64)
- `--heap-rows <n>` - Number of top allocation sites to show (default: 20)

### Output Options
- `--html` - Generate HTML report, start server, and open in browser
- `--export-html <file>` - Export HTML report to file
- `--json <file>` - Export benchmark data to JSON
- `--perfetto <file>` - Export Perfetto trace file

## CLI Usage

### Filter benchmarks by name

```bash
simple-cli.ts --filter "concat"
simple-cli.ts --filter "^parse" --time 2
```

### Profiling with external debuggers

Use `--profile` to run benchmarks once for attaching external profilers:

```bash
# Use with Chrome DevTools profiler
node --inspect-brk simple-cli.ts --profile

# Use with other profiling tools
node --prof simple-cli.ts --profile
```

The `--profile` flag executes exactly one iteration with no warmup, making it ideal for debugging and performance profiling.

### Key Concepts

**Setup Functions**: Run once per group and provide shared data to all benchmarks in that group. The data returned by setup is automatically passed as the first parameter to benchmark functions that expect it.
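
For instance (hypothetical, reusing the sorting group above), a benchmark function simply declares a parameter for the setup data:

```typescript
// The group's setup() returned number[], so each benchmark fn can accept it.
// Copying first avoids mutating the shared setup data between iterations.
const benchmarks = [
  { name: "quicksort", fn: (data: number[]) => quickSort([...data]) },
];
```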

**Baseline Comparison**: When a baseline is specified, all benchmarks in the group show percentage differences (Δ%) compared to baseline.

## Output

Results are displayed in a formatted table:

```
╔═════════════════╤═══════════════════════════════════════════╤═══════╤═════════╗
║                 │ time                                      │       │         ║
║ name            │ mean  Δ% CI                    p50   p99  │ conv% │ runs    ║
╟─────────────────┼───────────────────────────────────────────┼───────┼─────────╢
║ quicksort       │ 0.17  +5.5%  [+4.7%, +6.2%]    0.15  0.63 │ 100%  │ 1,134   ║
║ insertion sort  │ 0.24  +25.9% [+25.3%, +27.4%]  0.18  0.36 │ 100%  │ 807     ║
║ --> native sort │ 0.16                           0.15  0.41 │ 100%  │ 1,210   ║
╚═════════════════╧═══════════════════════════════════════════╧═══════╧═════════╝
```

- **Δ% CI**: Percentage difference from baseline with bootstrap confidence interval
- **conv%**: Convergence percentage (100% = stable measurements)

### HTML

The HTML report displays:
- Histogram + KDE: Bar chart showing the distribution
- Time Series: Sample values over iterations
- Allocation Series: Per-sample heap allocation (requires `--heap-sample`)

```bash
# Generate HTML report, start server, and open in browser
simple-cli.ts --html
# Press Ctrl+C to exit when done viewing
```

### Perfetto Trace Export

Export benchmark data as a Perfetto-compatible trace file for detailed analysis:

```bash
# Export trace file
simple-cli.ts --perfetto trace.json

# With V8 GC events (automatically merged after exit)
node --expose-gc --trace-events-enabled --trace-event-categories=v8,v8.gc \
  simple-cli.ts --perfetto trace.json
```

View the trace at https://ui.perfetto.dev by dragging the JSON file.

The trace includes:
- **Heap counter**: Continuous heap usage as a line graph
- **Sample markers**: Each benchmark iteration with timing
- **Pause markers**: V8 optimization pause points
- **V8 GC events**: Automatically merged after process exit (when run with `--trace-events-enabled`)
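
For orientation, Chrome/Perfetto JSON traces are arrays of events; the counter and slice shapes look roughly like this (the generic trace-event format with made-up values, not benchforge's exact output):

```typescript
// Generic Chrome trace-event shapes (ts/dur in microseconds, values made up).
const traceEvents = [
  // "C" counter event: drives the continuous heap line graph
  { name: "heap", ph: "C", ts: 1000, pid: 1, tid: 1, args: { used: 4200000 } },
  // "X" complete event: one benchmark iteration as a timed slice
  { name: "sample quicksort", ph: "X", ts: 1000, dur: 170, pid: 1, tid: 1 },
];
```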

### GC Statistics

Collect detailed garbage collection statistics via V8's `--trace-gc-nvp`:

```bash
# Collect GC allocation/collection stats (requires worker mode)
simple-cli.ts --gc-stats
```

Adds these columns to the output table:
- **alloc/iter**: Bytes allocated per iteration
- **scav**: Number of scavenge (minor) GCs
- **full**: Number of full (mark-compact) GCs
- **promo%**: Percentage of allocations promoted to old generation
- **pause/iter**: GC pause time per iteration

### Heap Sampling

For allocation profiling including garbage (short-lived objects), use `--heap-sample` mode, which uses Node's built-in inspector API:

```bash
# Basic heap sampling
simple-cli.ts --heap-sample --iterations 100

# Smaller interval = more samples = better coverage of rare allocations
simple-cli.ts --heap-sample --heap-interval 4096 --iterations 100

# Verbose output with clickable file:// paths
simple-cli.ts --heap-sample --heap-verbose

# Control call stack display depth
simple-cli.ts --heap-sample --heap-stack 5
```

**CLI Options:**
- `--heap-sample` - Enable heap sampling allocation attribution
- `--heap-interval <bytes>` - Sampling interval in bytes (default: 32768)
- `--heap-depth <frames>` - Maximum stack depth to capture (default: 64)
- `--heap-rows <n>` - Number of top allocation sites to show (default: 20)
- `--heap-stack <n>` - Call stack depth to display (default: 3)
- `--heap-verbose` - Show full file:// paths with line numbers (cmd-clickable)

**Output (default compact):**
```
─── Heap profile: bevy_env_map ───
Heap allocation sites (top 20, garbage included):
  13.62 MB  recursiveResolve <- flattenTreeImport <- bindAndTransform
  12.36 MB  nextToken <- parseBlockStatements <- parseCompoundStatement
   5.15 MB  coverWithText <- finishElem <- parseVarOrLet

Total (all): 56.98 MB
Total (user-code): 28.45 MB
Samples: 1,842
```

**How V8 Heap Sampling Works:**

V8's sampling profiler uses Poisson-distributed sampling. When an allocation occurs, V8 probabilistically decides whether to record it based on the sampling interval. Key points:

1. **selfSize is scaled**: V8 doesn't report raw sampled bytes. It scales sample counts to estimate total allocations (`selfSize = size × count × scaleFactor`). This means changing `--heap-interval` affects sample count and overhead, but the estimated total converges to the same value (see the sketch after this list).

2. **Smaller intervals = better coverage**: With a smaller interval (e.g., 1024 vs 32768), you get more samples and discover more unique allocation sites, especially rare ones. The total estimate stays similar, but you see more of the distribution.

3. **User-code only**: The report filters out Node.js internals (`node:`, `internal/`). "Total (user-code)" shows filtered allocations; "Total (all)" shows everything.

4. **Measurement window**: Sampling covers benchmark module import + execution. Worker startup and framework init aren't captured (but do appear in `--gc-stats`).

5. **Sites are stack-unique**: The same function appears multiple times with different callers. For example, `nextToken` may show up in several entries with different call stacks, each representing a distinct allocation pattern.
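
To make point 1 concrete, here is a sketch of how a scaled total can be computed from raw samples. The `1 / (1 - e^(-size/interval))` Poisson correction is an assumption based on how Chromium tooling post-processes V8 sampling profiles, not a claim about benchforge's internals:

```typescript
// Illustrative only: estimate total allocated bytes from sampled allocations.
// `samples` are (size, count) pairs reported by the V8 sampling profiler;
// `interval` is the --heap-interval value.
function estimateTotalBytes(
  samples: Array<{ size: number; count: number }>,
  interval: number,
): number {
  return samples.reduce((total, { size, count }) => {
    // Poisson correction: small allocations are sampled less often, so each
    // observed sample stands in for proportionally more real allocations.
    const scale = 1 / (1 - Math.exp(-size / interval));
    return total + size * count * scale; // selfSize = size × count × scaleFactor
  }, 0);
}
```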

**Limitations:**
- **Function-level attribution only**: V8 reports the function where allocation occurred, not the specific line. The line:column shown is where the function is *defined*.
- **Statistical sampling**: Results vary between runs. More iterations = more stable results.
- **~50% filtered**: Node.js internals account for roughly half of allocations. Use "Total (all)" to see the full picture.

**When to use which:**

| Tool | Use When |
|------|----------|
| `--gc-stats` | Need total allocation/collection bytes, GC pause times |
| `--heap-sample` | Need to identify which functions allocate the most |
| Both | Cross-reference attribution with totals |

## Requirements

- Node.js 22.6+ (for native TypeScript support)
- Use `--expose-gc --allow-natives-syntax` flags for garbage collection monitoring and V8 native functions

## Adaptive Mode

Adaptive mode automatically adjusts the number of benchmark iterations until measurements stabilize, providing statistically significant results without excessive runtime.

### Using Adaptive Mode

```bash
# Enable adaptive benchmarking with default settings
simple-cli.ts --adaptive

# Customize time limits
simple-cli.ts --adaptive --time 60 --min-time 5

# Combine with other options
simple-cli.ts --adaptive --filter "quicksort"
```

### CLI Options for Adaptive Mode

- `--adaptive` - Enable adaptive sampling mode
- `--min-time <seconds>` - Minimum time before convergence can stop (default: 1s)
- `--convergence <percent>` - Confidence threshold 0-100 (default: 95)
- `--time <seconds>` - Maximum time limit (default: 20s in adaptive mode)

### How It Works

1. **Initial Sampling**: Collects an initial batch of ~100 samples (includes warmup)
2. **Window Comparison**: Compares recent samples against the previous window
3. **Stability Detection**: Checks median drift and outlier impact between windows (see the sketch after this list)
4. **Convergence**: Stops when both metrics are stable (<5% drift) or a threshold is reached
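
A minimal sketch of the window-drift check described above (illustrative; benchforge's AdaptiveWrapper may use different window sizes and metrics):

```typescript
// Illustrative convergence check: compare the medians of the two most recent
// windows of samples and treat <5% relative drift as "stable".
function median(xs: number[]): number {
  const s = [...xs].sort((a, b) => a - b);
  const mid = Math.floor(s.length / 2);
  return s.length % 2 ? s[mid] : (s[mid - 1] + s[mid]) / 2;
}

function hasConverged(samples: number[], windowSize = 50): boolean {
  if (samples.length < windowSize * 2) return false;
  const recent = samples.slice(-windowSize);
  const previous = samples.slice(-windowSize * 2, -windowSize);
  const drift = Math.abs(median(recent) - median(previous)) / median(previous);
  return drift < 0.05; // a real check would also compare outlier impact
}
```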

Progress is shown during execution:
```
◊ quicksort: 75% confident (2.1s)
```

### Output with Adaptive Mode

```
╔═════════════════╤═════════════════════════════════════════════╤═══════╤═════════╤══════╗
║                 │ time                                        │       │         │      ║
║ name            │ median  Δ% CI                   mean  p99   │ conv% │ runs    │ time ║
╟─────────────────┼─────────────────────────────────────────────┼───────┼─────────┼──────╢
║ quicksort       │ 0.17  +17.3% [+15.4%, +20.0%]   0.20  0.65  │ 100%  │ 526     │ 0.0s ║
║ insertion sort  │ 0.18  +24.2% [+23.9%, +24.6%]   0.19  0.36  │ 100%  │ 529     │ 0.0s ║
║ --> native sort │ 0.15                            0.15  0.25  │ 100%  │ 647     │ 0.0s ║
╚═════════════════╧═════════════════════════════════════════════╧═══════╧═════════╧══════╝
```

- **conv%**: Convergence percentage (100% = stable measurements)
- **time**: Total sampling duration for that benchmark

## Statistical Considerations: Mean vs Median

### When to Use Mean with Confidence Intervals

**Best for:**
- **Normally distributed data** - When benchmark times follow a bell curve
- **Statistical comparison** - Comparing performance between implementations
- **Throughput analysis** - Understanding average system performance
- **Resource planning** - Estimating typical resource usage

**Advantages:**
- Provides confidence intervals for statistical significance
- Captures the full distribution including outliers
- Better for detecting small but consistent performance differences
- Standard in academic performance research

**Example use cases:**
- Comparing algorithm implementations
- Measuring API response times under normal load
- Evaluating compiler optimizations
- Benchmarking pure computational functions

### When to Use Median (p50)

**Best for:**
- **Skewed distributions** - When outliers are common
- **Latency-sensitive applications** - Where typical user experience matters
- **Noisy environments** - Systems with unpredictable interference
- **Service Level Agreements** - "50% of requests complete within X ms"

**Advantages:**
- Robust to outliers and system noise
- Better represents "typical" performance
- More stable in virtualized/cloud environments
- Less affected by GC pauses and OS scheduling

**Example use cases:**
- Web server response times
- Database query performance
- UI responsiveness metrics
- Real-time system benchmarks
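
To make the contrast concrete, a single outlier (say, one iteration hit by a GC pause) moves the mean far more than the median:

```typescript
// One slow outlier among otherwise steady samples (times in ms).
const samples = [0.15, 0.15, 0.16, 0.16, 5.0];
const mean = samples.reduce((a, b) => a + b, 0) / samples.length; // ≈ 1.124 ms
const sorted = [...samples].sort((a, b) => a - b);
const median = sorted[Math.floor(sorted.length / 2)]; // 0.16 ms
```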

### Interpreting Results

#### Baseline Comparison (Δ% CI)
```
0.17  +5.5%  [+4.7%, +6.2%]
```
This shows the benchmark is 5.5% slower than baseline, with a bootstrap confidence interval of [+4.7%, +6.2%]. Use this for comparing implementations.

#### Percentiles
```
p50: 0.15ms, p99: 0.27ms
```
This shows that 50% of runs completed in ≤0.15ms and 99% in ≤0.27ms. Use this when you care about consistency and tail latencies.
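
For reference, a p50/p99 figure can be computed from raw samples roughly like this (nearest-rank convention; StatisticalUtils.ts may use a different interpolation):

```typescript
// Nearest-rank percentile over raw samples (illustrative).
function percentile(samples: number[], p: number): number {
  const sorted = [...samples].sort((a, b) => a - b);
  const rank = Math.ceil((p / 100) * sorted.length) - 1;
  return sorted[Math.max(0, rank)];
}

const times = [0.14, 0.15, 0.15, 0.16, 0.27]; // ms, illustrative
percentile(times, 50); // 0.15 (p50, median)
percentile(times, 99); // 0.27 (p99, tail latency)
```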

### Practical Guidelines

1. **Use adaptive mode when:**
   - You want automatic convergence detection
   - Benchmarks have varying execution times
   - You need stable measurements without guessing iteration counts

2. **Use fixed iterations when:**
   - Comparing across runs/machines (reproducibility)
   - You know roughly how many samples you need
   - Running in CI pipelines with time constraints

3. **Interpreting conv%:**
   - 100% = measurements are stable
   - <100% = still converging or high variance
   - Red color indicates low confidence

### Statistical Notes

- **Bootstrap CI**: Baseline comparison uses permutation testing with bootstrap confidence intervals (see the sketch after this list)
- **Window Stability**: Adaptive mode compares sliding windows for median drift and outlier impact
- **Independence**: Assumes benchmark iterations are independent (use `--worker` flag for better isolation)
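
As a minimal sketch of the bootstrap half of that first note: a percentile bootstrap for the Δ% interval, assuming simple resampling of iteration times (PermutationTest.ts may combine this with permutation testing differently):

```typescript
// Illustrative percentile-bootstrap CI for the relative difference in means
// between a benchmark's samples and its baseline's samples.
function mean(xs: number[]): number {
  return xs.reduce((a, b) => a + b, 0) / xs.length;
}

function resample(xs: number[]): number[] {
  return xs.map(() => xs[Math.floor(Math.random() * xs.length)]);
}

function bootstrapDeltaCI(bench: number[], base: number[], reps = 1000): [number, number] {
  const deltas: number[] = [];
  for (let i = 0; i < reps; i++) {
    const b = mean(resample(bench));
    const bl = mean(resample(base));
    deltas.push(((b - bl) / bl) * 100); // Δ% vs baseline for this resample
  }
  deltas.sort((a, b) => a - b);
  // 95% CI from the 2.5th and 97.5th percentiles of the bootstrap distribution
  return [deltas[Math.floor(reps * 0.025)], deltas[Math.floor(reps * 0.975)]];
}
```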

## Understanding GC Time Measurements

### GC Duration in Node.js Performance Hooks

The `duration` field of a GC `PerformanceEntry` records **stop-the-world pause time** - the time when JavaScript execution is actually blocked. It does NOT include:

1. **Concurrent GC work** done in parallel threads (concurrent marking, sweeping)
2. **Performance degradation** from CPU contention and cache effects
3. **Total GC overhead** including preparation and cleanup
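
These entries come from Node's `perf_hooks`; a minimal observer that logs each pause looks like this (standard Node API, independent of benchforge):

```typescript
import { PerformanceObserver } from "node:perf_hooks";

// Log each GC event's stop-the-world pause time as it is recorded.
const obs = new PerformanceObserver((list) => {
  for (const entry of list.getEntries()) {
    // entry.duration covers only the blocking pause, not concurrent GC work
    console.log(`GC pause: ${entry.duration.toFixed(2)}ms`, entry.detail);
  }
});
obs.observe({ entryTypes: ["gc"] });
```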

### Key Findings

1. **Multiple GC Events**: A single `gc()` call can trigger multiple GC events that are recorded separately
2. **Incremental GC**: V8 breaks up GC work into smaller increments to reduce pause times
3. **Duration < Impact**: The recorded duration is often much less than the actual performance impact

package/bin/benchforge
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"benchforge.mjs","names":[],"sources":["../../src/bin/benchforge.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { runDefaultBench } from \"../index.ts\";\n\nawait runDefaultBench();\n"],"mappings":";;;;AAGA,MAAM,iBAAiB"}