benchforge 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -129
- package/dist/BenchRunner-BLfGX2wQ.d.mts +225 -0
- package/dist/{TimingUtils-D4z1jpp2.mjs → TimingUtils-ClclVQ7E.mjs} +276 -278
- package/dist/TimingUtils-ClclVQ7E.mjs.map +1 -0
- package/dist/bin/benchforge.d.mts +1 -0
- package/dist/bin/benchforge.mjs +1 -1
- package/dist/index.d.mts +711 -0
- package/dist/index.mjs +2 -2
- package/dist/runners/WorkerScript.d.mts +39 -0
- package/dist/runners/WorkerScript.mjs +1 -1
- package/dist/{src-cYpHvc40.mjs → src-JGOI6_Sc.mjs} +22 -23
- package/dist/src-JGOI6_Sc.mjs.map +1 -0
- package/package.json +1 -1
- package/src/StandardSections.ts +1 -8
- package/src/browser/BrowserHeapSampler.ts +3 -2
- package/src/cli/CliArgs.ts +11 -14
- package/src/cli/RunBenchCLI.ts +1 -4
- package/src/runners/BasicRunner.ts +0 -4
- package/src/table-util/Formatters.ts +1 -1
- package/dist/TimingUtils-D4z1jpp2.mjs.map +0 -1
- package/dist/src-cYpHvc40.mjs.map +0 -1
package/README.md
CHANGED
|
@@ -151,18 +151,17 @@ The `--profile` flag executes exactly one iteration with no warmup, making it id
|
|
|
151
151
|
Results are displayed in a formatted table:
|
|
152
152
|
|
|
153
153
|
```
|
|
154
|
-
|
|
155
|
-
║ │ time │
|
|
156
|
-
║ name │ mean Δ% CI p50 p99 │
|
|
157
|
-
|
|
158
|
-
║ quicksort │ 0.17 +5.5% [+4.7%, +6.2%] 0.15 0.63 │
|
|
159
|
-
║ insertion sort │ 0.24 +25.9% [+25.3%, +27.4%] 0.18 0.36 │
|
|
160
|
-
║ --> native sort │ 0.16 0.15 0.41 │
|
|
161
|
-
|
|
154
|
+
╔═════════════════╤═══════════════════════════════════════════╤═════════╗
|
|
155
|
+
║ │ time │ ║
|
|
156
|
+
║ name │ mean Δ% CI p50 p99 │ runs ║
|
|
157
|
+
╟─────────────────┼───────────────────────────────────────────┼─────────╢
|
|
158
|
+
║ quicksort │ 0.17 +5.5% [+4.7%, +6.2%] 0.15 0.63 │ 1,134 ║
|
|
159
|
+
║ insertion sort │ 0.24 +25.9% [+25.3%, +27.4%] 0.18 0.36 │ 807 ║
|
|
160
|
+
║ --> native sort │ 0.16 0.15 0.41 │ 1,210 ║
|
|
161
|
+
╚═════════════════╧═══════════════════════════════════════════╧═════════╝
|
|
162
162
|
```
|
|
163
163
|
|
|
164
164
|
- **Δ% CI**: Percentage difference from baseline with bootstrap confidence interval
|
|
165
|
-
- **conv%**: Convergence percentage (100% = stable measurements)
|
|
166
165
|
|
|
167
166
|
### HTML
|
|
168
167
|
|
|
@@ -284,136 +283,23 @@ V8's sampling profiler uses Poisson-distributed sampling. When an allocation occ
|
|
|
284
283
|
- Node.js 22.6+ (for native TypeScript support)
|
|
285
284
|
- Use `--expose-gc --allow-natives-syntax` flags for garbage collection monitoring and V8 native functions
|
|
286
285
|
|
|
287
|
-
## Adaptive Mode
|
|
286
|
+
## Adaptive Mode (Experimental)
|
|
288
287
|
|
|
289
|
-
Adaptive mode automatically adjusts
|
|
288
|
+
Adaptive mode (`--adaptive`) automatically adjusts iteration count until measurements stabilize. The algorithm is still being tuned — use `--help` for available options.
|
|
290
289
|
|
|
291
|
-
|
|
290
|
+
## Interpreting Results
|
|
292
291
|
|
|
293
|
-
|
|
294
|
-
# Enable adaptive benchmarking with default settings
|
|
295
|
-
simple-cli.ts --adaptive
|
|
296
|
-
|
|
297
|
-
# Customize time limits
|
|
298
|
-
simple-cli.ts --adaptive --time 60 --min-time 5
|
|
299
|
-
|
|
300
|
-
# Combine with other options
|
|
301
|
-
simple-cli.ts --adaptive --filter "quicksort"
|
|
302
|
-
```
|
|
303
|
-
|
|
304
|
-
### CLI Options for Adaptive Mode
|
|
305
|
-
|
|
306
|
-
- `--adaptive` - Enable adaptive sampling mode
|
|
307
|
-
- `--min-time <seconds>` - Minimum time before convergence can stop (default: 1s)
|
|
308
|
-
- `--convergence <percent>` - Confidence threshold 0-100 (default: 95)
|
|
309
|
-
- `--time <seconds>` - Maximum time limit (default: 20s in adaptive mode)
|
|
310
|
-
|
|
311
|
-
### How It Works
|
|
312
|
-
|
|
313
|
-
1. **Initial Sampling**: Collects initial batch of ~100 samples (includes warmup)
|
|
314
|
-
2. **Window Comparison**: Compares recent samples against previous window
|
|
315
|
-
3. **Stability Detection**: Checks median drift and outlier impact between windows
|
|
316
|
-
4. **Convergence**: Stops when both metrics are stable (<5% drift) or reaches threshold
|
|
317
|
-
|
|
318
|
-
Progress is shown during execution:
|
|
319
|
-
```
|
|
320
|
-
◊ quicksort: 75% confident (2.1s)
|
|
321
|
-
```
|
|
322
|
-
|
|
323
|
-
### Output with Adaptive Mode
|
|
324
|
-
|
|
325
|
-
```
|
|
326
|
-
╔═════════════════╤═════════════════════════════════════════════╤═══════╤═════════╤══════╗
|
|
327
|
-
║ │ time │ │ │ ║
|
|
328
|
-
║ name │ median Δ% CI mean p99 │ conv% │ runs │ time ║
|
|
329
|
-
╟─────────────────┼─────────────────────────────────────────────┼───────┼─────────┼──────╢
|
|
330
|
-
║ quicksort │ 0.17 +17.3% [+15.4%, +20.0%] 0.20 0.65 │ 100% │ 526 │ 0.0s ║
|
|
331
|
-
║ insertion sort │ 0.18 +24.2% [+23.9%, +24.6%] 0.19 0.36 │ 100% │ 529 │ 0.0s ║
|
|
332
|
-
║ --> native sort │ 0.15 0.15 0.25 │ 100% │ 647 │ 0.0s ║
|
|
333
|
-
╚═════════════════╧═════════════════════════════════════════════╧═══════╧═════════╧══════╝
|
|
334
|
-
```
|
|
335
|
-
|
|
336
|
-
- **conv%**: Convergence percentage (100% = stable measurements)
|
|
337
|
-
- **time**: Total sampling duration for that benchmark
|
|
338
|
-
|
|
339
|
-
## Statistical Considerations: Mean vs Median
|
|
340
|
-
|
|
341
|
-
### When to Use Mean with Confidence Intervals
|
|
342
|
-
|
|
343
|
-
**Best for:**
|
|
344
|
-
- **Normally distributed data** - When benchmark times follow a bell curve
|
|
345
|
-
- **Statistical comparison** - Comparing performance between implementations
|
|
346
|
-
- **Throughput analysis** - Understanding average system performance
|
|
347
|
-
- **Resource planning** - Estimating typical resource usage
|
|
348
|
-
|
|
349
|
-
**Advantages:**
|
|
350
|
-
- Provides confidence intervals for statistical significance
|
|
351
|
-
- Captures the full distribution including outliers
|
|
352
|
-
- Better for detecting small but consistent performance differences
|
|
353
|
-
- Standard in academic performance research
|
|
354
|
-
|
|
355
|
-
**Example use cases:**
|
|
356
|
-
- Comparing algorithm implementations
|
|
357
|
-
- Measuring API response times under normal load
|
|
358
|
-
- Evaluating compiler optimizations
|
|
359
|
-
- Benchmarking pure computational functions
|
|
360
|
-
|
|
361
|
-
### When to Use Median (p50)
|
|
362
|
-
|
|
363
|
-
**Best for:**
|
|
364
|
-
- **Skewed distributions** - When outliers are common
|
|
365
|
-
- **Latency-sensitive applications** - Where typical user experience matters
|
|
366
|
-
- **Noisy environments** - Systems with unpredictable interference
|
|
367
|
-
- **Service Level Agreements** - "50% of requests complete within X ms"
|
|
368
|
-
|
|
369
|
-
**Advantages:**
|
|
370
|
-
- Robust to outliers and system noise
|
|
371
|
-
- Better represents "typical" performance
|
|
372
|
-
- More stable in virtualized/cloud environments
|
|
373
|
-
- Less affected by GC pauses and OS scheduling
|
|
374
|
-
|
|
375
|
-
**Example use cases:**
|
|
376
|
-
- Web server response times
|
|
377
|
-
- Database query performance
|
|
378
|
-
- UI responsiveness metrics
|
|
379
|
-
- Real-time system benchmarks
|
|
380
|
-
|
|
381
|
-
### Interpreting Results
|
|
382
|
-
|
|
383
|
-
#### Baseline Comparison (Δ% CI)
|
|
292
|
+
### Baseline Comparison (Δ% CI)
|
|
384
293
|
```
|
|
385
294
|
0.17 +5.5% [+4.7%, +6.2%]
|
|
386
295
|
```
|
|
387
|
-
|
|
296
|
+
The benchmark is 5.5% slower than baseline, with a bootstrap confidence interval of [+4.7%, +6.2%].
|
|
388
297
|
|
|
389
|
-
|
|
298
|
+
### Percentiles
|
|
390
299
|
```
|
|
391
300
|
p50: 0.15ms, p99: 0.27ms
|
|
392
301
|
```
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
### Practical Guidelines
|
|
396
|
-
|
|
397
|
-
1. **Use adaptive mode when:**
|
|
398
|
-
- You want automatic convergence detection
|
|
399
|
-
- Benchmarks have varying execution times
|
|
400
|
-
- You need stable measurements without guessing iteration counts
|
|
401
|
-
|
|
402
|
-
2. **Use fixed iterations when:**
|
|
403
|
-
- Comparing across runs/machines (reproducibility)
|
|
404
|
-
- You know roughly how many samples you need
|
|
405
|
-
- Running in CI pipelines with time constraints
|
|
406
|
-
|
|
407
|
-
3. **Interpreting conv%:**
|
|
408
|
-
- 100% = measurements are stable
|
|
409
|
-
- <100% = still converging or high variance
|
|
410
|
-
- Red color indicates low confidence
|
|
411
|
-
|
|
412
|
-
### Statistical Notes
|
|
413
|
-
|
|
414
|
-
- **Bootstrap CI**: Baseline comparison uses permutation testing with bootstrap confidence intervals
|
|
415
|
-
- **Window Stability**: Adaptive mode compares sliding windows for median drift and outlier impact
|
|
416
|
-
- **Independence**: Assumes benchmark iterations are independent (use `--worker` flag for better isolation)
|
|
302
|
+
50% of runs completed in ≤0.15ms and 99% in ≤0.27ms. Use percentiles when you care about consistency and tail latencies.
|
|
417
303
|
|
|
418
304
|
## Understanding GC Time Measurements
|
|
419
305
|
|
|
package/dist/BenchRunner-BLfGX2wQ.d.mts
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
//#region src/heap-sample/HeapSampler.d.ts
|
|
2
|
+
interface ProfileNode {
|
|
3
|
+
callFrame: {
|
|
4
|
+
functionName: string;
|
|
5
|
+
url: string;
|
|
6
|
+
lineNumber: number;
|
|
7
|
+
columnNumber?: number;
|
|
8
|
+
};
|
|
9
|
+
selfSize: number;
|
|
10
|
+
children?: ProfileNode[];
|
|
11
|
+
}
|
|
12
|
+
interface HeapProfile {
|
|
13
|
+
head: ProfileNode;
|
|
14
|
+
samples?: number[];
|
|
15
|
+
}
|
|
16
|
+
//#endregion
|
|
17
|
+
//#region src/NodeGC.d.ts
|
|
18
|
+
/** Individual GC event for visualization */
|
|
19
|
+
interface GcEvent {
|
|
20
|
+
/** Offset from collection start (ms) - can be negative for warmup GCs */
|
|
21
|
+
offset: number;
|
|
22
|
+
/** Duration of GC pause (ms) */
|
|
23
|
+
duration: number;
|
|
24
|
+
}
|
|
25
|
+
/** GC time measured by Node's performance hooks */
|
|
26
|
+
interface NodeGCTime {
|
|
27
|
+
inRun: number;
|
|
28
|
+
before: number;
|
|
29
|
+
after: number;
|
|
30
|
+
total: number;
|
|
31
|
+
collects: number;
|
|
32
|
+
/** Individual GC events during sample collection (for visualization) */
|
|
33
|
+
events: GcEvent[];
|
|
34
|
+
}
|
|
35
|
+
//#endregion
|
|
36
|
+
//#region src/runners/GcStats.d.ts
|
|
37
|
+
/** GC statistics aggregated from V8 trace events.
|
|
38
|
+
* Node (--trace-gc-nvp) provides all fields.
|
|
39
|
+
* Browser (CDP Tracing) provides counts, collected, and pause only. */
|
|
40
|
+
interface GcStats {
|
|
41
|
+
scavenges: number;
|
|
42
|
+
markCompacts: number;
|
|
43
|
+
totalCollected: number;
|
|
44
|
+
gcPauseTime: number;
|
|
45
|
+
totalAllocated?: number;
|
|
46
|
+
totalPromoted?: number;
|
|
47
|
+
totalSurvived?: number;
|
|
48
|
+
}
|
|
49
|
+
//#endregion
|
|
50
|
+
//#region src/MeasuredResults.d.ts
|
|
51
|
+
/** CPU performance counter stats */
|
|
52
|
+
interface CpuCounts {
|
|
53
|
+
instructions?: number;
|
|
54
|
+
cycles?: number;
|
|
55
|
+
branchMisses?: number;
|
|
56
|
+
}
|
|
57
|
+
/** Benchmark results: times in milliseconds, sizes in kilobytes */
|
|
58
|
+
interface MeasuredResults {
|
|
59
|
+
name: string;
|
|
60
|
+
/** Raw execution time samples for custom statistics */
|
|
61
|
+
samples: number[];
|
|
62
|
+
/** Warmup iteration timings (ms) - captured before gc/settle */
|
|
63
|
+
warmupSamples?: number[];
|
|
64
|
+
/** Raw allocation samples per iteration (KB) */
|
|
65
|
+
allocationSamples?: number[];
|
|
66
|
+
/** Heap size per sample (bytes) - used for charts */
|
|
67
|
+
heapSamples?: number[];
|
|
68
|
+
/** Wall-clock timestamps per sample (μs since process start) - for Perfetto export */
|
|
69
|
+
timestamps?: number[];
|
|
70
|
+
/** Execution time in milliseconds (measurement overhead excluded by mitata) */
|
|
71
|
+
time: {
|
|
72
|
+
min: number;
|
|
73
|
+
max: number;
|
|
74
|
+
avg: number;
|
|
75
|
+
p25?: number;
|
|
76
|
+
p50: number;
|
|
77
|
+
p75: number;
|
|
78
|
+
p95?: number;
|
|
79
|
+
p99: number;
|
|
80
|
+
p999: number;
|
|
81
|
+
cv?: number;
|
|
82
|
+
mad?: number;
|
|
83
|
+
outlierRate?: number;
|
|
84
|
+
};
|
|
85
|
+
/** Heap size increase during test run (kilobytes) */
|
|
86
|
+
heapSize?: {
|
|
87
|
+
avg: number;
|
|
88
|
+
min: number;
|
|
89
|
+
max: number;
|
|
90
|
+
};
|
|
91
|
+
/**
|
|
92
|
+
* Time for explicit gc() call after test execution (milliseconds).
|
|
93
|
+
* Does not include GC time during test execution.
|
|
94
|
+
* Only reported by mitata runner.
|
|
95
|
+
*/
|
|
96
|
+
gcTime?: {
|
|
97
|
+
avg: number;
|
|
98
|
+
min: number;
|
|
99
|
+
max: number;
|
|
100
|
+
};
|
|
101
|
+
/** CPU counter stats from @mitata/counters (requires root access) */
|
|
102
|
+
cpu?: CpuCounts;
|
|
103
|
+
/** L1 cache miss rate */
|
|
104
|
+
cpuCacheMiss?: number;
|
|
105
|
+
/** CPU stall rate (macOS only) */
|
|
106
|
+
cpuStall?: number;
|
|
107
|
+
/**
|
|
108
|
+
* Stop-the-world GC time blocking main thread (milliseconds).
|
|
109
|
+
* Measured via Node's performance hooks when nodeObserveGC is true.
|
|
110
|
+
* Excludes parallel thread collection time and indirect slowdowns.
|
|
111
|
+
*/
|
|
112
|
+
nodeGcTime?: NodeGCTime;
|
|
113
|
+
/** Total time spent collecting samples (seconds) */
|
|
114
|
+
totalTime?: number;
|
|
115
|
+
/** Convergence information for adaptive mode */
|
|
116
|
+
convergence?: {
|
|
117
|
+
converged: boolean;
|
|
118
|
+
confidence: number;
|
|
119
|
+
reason: string;
|
|
120
|
+
};
|
|
121
|
+
/** V8 optimization tier tracking (requires --allow-natives-syntax) */
|
|
122
|
+
optStatus?: OptStatusInfo;
|
|
123
|
+
/** Per-sample V8 optimization status codes (for chart visualization) */
|
|
124
|
+
optSamples?: number[];
|
|
125
|
+
/** Points where pauses occurred for V8 optimization */
|
|
126
|
+
pausePoints?: PausePoint[];
|
|
127
|
+
/** GC stats from V8's --trace-gc-nvp (requires --gc-stats and worker mode) */
|
|
128
|
+
gcStats?: GcStats;
|
|
129
|
+
/** Heap sampling allocation profile (requires --heap-sample and worker mode) */
|
|
130
|
+
heapProfile?: HeapProfile;
|
|
131
|
+
}
|
|
132
|
+
/** A pause point during sample collection for V8 optimization */
|
|
133
|
+
interface PausePoint {
|
|
134
|
+
/** Sample index where pause occurred (after this iteration) */
|
|
135
|
+
sampleIndex: number;
|
|
136
|
+
/** Pause duration in milliseconds */
|
|
137
|
+
durationMs: number;
|
|
138
|
+
}
|
|
139
|
+
/** V8 optimization tier distribution */
|
|
140
|
+
interface OptTierInfo {
|
|
141
|
+
count: number;
|
|
142
|
+
medianMs: number;
|
|
143
|
+
}
|
|
144
|
+
/** V8 optimization status summary */
|
|
145
|
+
interface OptStatusInfo {
|
|
146
|
+
/** Samples by tier name (e.g., "turbofan", "sparkplug") */
|
|
147
|
+
byTier: Record<string, OptTierInfo>;
|
|
148
|
+
/** Number of samples with deopt flag set */
|
|
149
|
+
deoptCount: number;
|
|
150
|
+
}
|
|
151
|
+
//#endregion
|
|
152
|
+
//#region src/Benchmark.d.ts
|
|
153
|
+
/** Single benchmark function specification */
|
|
154
|
+
interface BenchmarkSpec<T = unknown> {
|
|
155
|
+
name: string;
|
|
156
|
+
fn: BenchmarkFunction<T>;
|
|
157
|
+
/** Path to module exporting the benchmark function (for worker mode) */
|
|
158
|
+
modulePath?: string;
|
|
159
|
+
/** Name of the exported function in the module (defaults to default export) */
|
|
160
|
+
exportName?: string;
|
|
161
|
+
/** Setup function export name - called once in worker, result passed to fn */
|
|
162
|
+
setupExportName?: string;
|
|
163
|
+
}
|
|
164
|
+
type BenchmarkFunction<T = unknown> = ((params: T) => void) | (() => void);
|
|
165
|
+
/** Group of benchmarks with shared setup */
|
|
166
|
+
interface BenchGroup<T = unknown> {
|
|
167
|
+
name: string;
|
|
168
|
+
/** Prepare parameters for all benchmarks in this group */
|
|
169
|
+
setup?: () => T | Promise<T>;
|
|
170
|
+
benchmarks: BenchmarkSpec<T>[];
|
|
171
|
+
/** Baseline benchmark for comparison */
|
|
172
|
+
baseline?: BenchmarkSpec<T>;
|
|
173
|
+
/** Metadata for reporting (e.g., lines of code) */
|
|
174
|
+
metadata?: Record<string, any>;
|
|
175
|
+
}
|
|
176
|
+
/** Collection of benchmark groups */
|
|
177
|
+
interface BenchSuite {
|
|
178
|
+
name: string;
|
|
179
|
+
groups: BenchGroup<any>[];
|
|
180
|
+
}
|
|
181
|
+
//#endregion
|
|
182
|
+
//#region src/runners/BenchRunner.d.ts
|
|
183
|
+
interface RunnerOptions {
|
|
184
|
+
/** Minimum time to run each benchmark (milliseconds) */
|
|
185
|
+
minTime?: number;
|
|
186
|
+
/** Maximum time to run each benchmark - ignored by mitata (milliseconds) */
|
|
187
|
+
maxTime?: number;
|
|
188
|
+
/** Maximum iterations per benchmark - ignored by TinyBench */
|
|
189
|
+
maxIterations?: number;
|
|
190
|
+
/** Warmup iterations before measurement (default: 0) */
|
|
191
|
+
warmup?: number;
|
|
192
|
+
/** Warmup time before measurement (milliseconds) */
|
|
193
|
+
warmupTime?: number;
|
|
194
|
+
/** Warmup samples - mitata only, for reducing test time */
|
|
195
|
+
warmupSamples?: number;
|
|
196
|
+
/** Warmup threshold - mitata only (nanoseconds) */
|
|
197
|
+
warmupThreshold?: number;
|
|
198
|
+
/** Minimum samples required - mitata only */
|
|
199
|
+
minSamples?: number;
|
|
200
|
+
/** Force GC after each iteration (requires --expose-gc) */
|
|
201
|
+
collect?: boolean;
|
|
202
|
+
/** Enable CPU performance counters (requires root access) */
|
|
203
|
+
cpuCounters?: boolean;
|
|
204
|
+
/** Trace V8 optimization tiers (requires --allow-natives-syntax) */
|
|
205
|
+
traceOpt?: boolean;
|
|
206
|
+
/** Skip post-warmup settle time (default: false) */
|
|
207
|
+
noSettle?: boolean;
|
|
208
|
+
/** Iterations before first pause (then pauseInterval applies) */
|
|
209
|
+
pauseFirst?: number;
|
|
210
|
+
/** Iterations between pauses for V8 optimization (0 to disable) */
|
|
211
|
+
pauseInterval?: number;
|
|
212
|
+
/** Pause duration in ms for V8 optimization */
|
|
213
|
+
pauseDuration?: number;
|
|
214
|
+
/** Collect GC stats via --trace-gc-nvp (requires worker mode) */
|
|
215
|
+
gcStats?: boolean;
|
|
216
|
+
/** Heap sampling allocation attribution */
|
|
217
|
+
heapSample?: boolean;
|
|
218
|
+
/** Heap sampling interval in bytes */
|
|
219
|
+
heapInterval?: number;
|
|
220
|
+
/** Heap sampling stack depth */
|
|
221
|
+
heapDepth?: number;
|
|
222
|
+
}
|
|
223
|
+
//#endregion
|
|
224
|
+
export { MeasuredResults as a, BenchmarkSpec as i, BenchGroup as n, HeapProfile as o, BenchSuite as r, RunnerOptions as t };
|
|
225
|
+
//# sourceMappingURL=BenchRunner-BLfGX2wQ.d.mts.map
|