npm - agy-superpowers - Versions diffs - 5.2.1 → 5.2.3 - Mend

agy-superpowers 5.2.1 → 5.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (233) hide show

package/template/agent/skills/rust-developer/references/rust-rules/opt-inline-small.md DELETED Viewed

@@ -1,160 +0,0 @@
-# opt-inline-small
-> Use `#[inline]` for small hot functions
-## Why It Matters
-Function call overhead (stack frame setup, register saves, jumps) can dominate small functions. Inlining eliminates this overhead and enables further optimizations by the compiler. The compiler often inlines automatically, but hints help for cross-crate calls.
-## Bad
-```rust
-// Small hot function without inline hint
-// May not be inlined across crate boundaries
-fn is_ascii_digit(b: u8) -> bool {
-    b >= b'0' && b <= b'9'
-}
-// Called millions of times
-for byte in data {
-    if is_ascii_digit(*byte) {  // Function call overhead
-        count += 1;
-    }
-}
-```
-## Good
-```rust
-#[inline]
-fn is_ascii_digit(b: u8) -> bool {
-    b >= b'0' && b <= b'9'
-}
-// With the hint, the compiler is much more likely to inline this (a hint, not a guarantee)
-for byte in data {
-    if is_ascii_digit(*byte) {  // Inlined, no call overhead
-        count += 1;
-    }
-}
-```
-## Inline Attributes
-```rust
-// No attribute - compiler decides (usually good for same-crate)
-fn auto_decide() { }
-// Suggest inlining - helps cross-crate
-#[inline]
-fn suggest_inline() { }
-// Strongly suggest inlining - almost always inlined
-#[inline(always)]
-fn force_inline() { }
-// Strongly suggest NOT inlining - for large/cold code
-#[inline(never)]
-fn prevent_inline() { }
-```
-## When to Use Each
-```rust
-// #[inline] - Small functions, especially in libraries
-#[inline]
-pub fn len(&self) -> usize {
-    self.inner.len()
-}
-// #[inline(always)] - Critical hot path, verified by profiling
-#[inline(always)]
-fn hot_inner_loop_helper(x: u32) -> u32 {
-    x.wrapping_mul(0x9E3779B9)
-}
-// #[inline(never)] - Error handlers, cold paths
-#[inline(never)]
-fn handle_error(err: Error) -> ! {
-    eprintln!("Fatal: {}", err);
-    std::process::exit(1);
-}
-// No attribute - large functions, infrequent calls
-fn complex_processing(data: &mut Data) {
-    // Many lines of code...
-}
-```
-## Evidence from ripgrep
-```rust
-// https://github.com/BurntSushi/ripgrep/blob/master/crates/printer/src/standard.rs
-#[inline(always)]
-fn write_prelude(
-    &self,
-    absolute_byte_offset: u64,
-    line_number: Option<u64>,
-    column: Option<u64>,
-) -> io::Result<()> {
-    // Hot path in printing matches
-}
-#[inline(always)]
-fn write_line(&self, line: &[u8]) -> io::Result<()> {
-    // Called for every line
-}
-```
-## Generic Functions
-```rust
-// Generic functions are instantiated in the calling crate, so they can already be inlined across crates
-// #[inline] is still a reasonable hint for tiny generics, but it is not required for cross-crate inlining
-#[inline]
-pub fn min<T: Ord>(a: T, b: T) -> T {
-    if a < b { a } else { b }
-}
-```
-## Cautions
-```rust
-// DON'T inline large functions - hurts instruction cache
-#[inline(always)]  // BAD for large function
-fn large_complex_function(data: &mut [u8]) {
-    // 100+ lines of code
-    // Inlining bloats every call site
-}
-// DON'T assume inlining always helps - measure!
-// Sometimes the compiler makes better decisions
-// Inlining is non-transitive
-#[inline]
-fn outer() {
-    inner();  // inner() also needs #[inline] to be inlined together
-}
-fn inner() { }  // No hint of its own: outer's #[inline] does not carry over to this call
-```
-## Verifying Inlining
-```bash
-# Check if function was inlined using Cachegrind
-# Non-inlined functions show entry/exit counts
-# Or examine assembly
-cargo rustc --release -- --emit=asm
-# Look for call instructions vs inlined code
-```
-## See Also
-- [opt-inline-always-rare](opt-inline-always-rare.md) - Use #[inline(always)] sparingly
-- [opt-inline-never-cold](opt-inline-never-cold.md) - Use #[inline(never)] for cold paths
-- [opt-cold-unlikely](opt-cold-unlikely.md) - Use #[cold] for unlikely paths
-- [opt-lto-release](opt-lto-release.md) - LTO enables cross-crate inlining

package/template/agent/skills/rust-developer/references/rust-rules/opt-likely-hint.md DELETED Viewed

@@ -1,171 +0,0 @@
-# opt-likely-hint
-> Use code structure to hint at likely branches; use intrinsics on nightly
-## Why It Matters
-Modern CPUs predict branches to speculatively execute code. Mispredictions cause pipeline stalls (10-20 cycles). Helping the compiler understand which branches are likely allows it to generate optimal code layout and branch hints, improving performance in hot paths.
-## Stable Rust: Code Structure Hints
-```rust
-// Pattern 1: Early returns for unlikely cases
-fn process(data: Option<&Data>) -> i32 {
-    // Compiler assumes early return is "unlikely"
-    let data = match data {
-        None => return 0,  // Unlikely
-        Some(d) => d,
-    };
-    // Hot path continues here
-    complex_processing(data)
-}
-// Pattern 2: if-else ordering
-fn calculate(x: i32) -> i32 {
-    if x >= 0 {
-        // Put likely case in "if" branch
-        x * 2
-    } else {
-        // Unlikely case in "else"
-        handle_negative(x)
-    }
-}
-// Pattern 3: Cold function extraction
-fn hot_path(data: &[u8]) -> Result<(), Error> {
-    if data.is_empty() {
-        return cold_empty_error();  // Extracted = unlikely
-    }
-    process_fast(data)
-}
-#[cold]
-fn cold_empty_error() -> Result<(), Error> {
-    Err(Error::EmptyInput)
-}
-```
-## Nightly: Intrinsics
-```rust
-#![feature(core_intrinsics)]
-use std::intrinsics::{likely, unlikely};
-fn process(data: &Data) -> i32 {
-    if unlikely(data.is_corrupted()) {
-        return handle_corruption(data);
-    }
-    if likely(data.is_cached()) {
-        return fast_cached_path(data);
-    }
-    slow_uncached_path(data)
-}
-```
-## Boolean Likely Wrapper (Nightly)
-```rust
-#![feature(core_intrinsics)]
-#[inline(always)]
-fn likely(b: bool) -> bool {
-    std::intrinsics::likely(b)
-}
-#[inline(always)]
-fn unlikely(b: bool) -> bool {
-    std::intrinsics::unlikely(b)
-}
-// Usage
-if likely(x > 0) {
-    hot_path(x)
-} else {
-    cold_path(x)
-}
-```
-## Stable: likely-stable Crate
-```rust
-use likely_stable::{likely, unlikely};
-fn check(value: i32) -> bool {
-    if unlikely(value < 0) {
-        handle_negative()
-    } else if likely(value < 1000) {
-        handle_common()
-    } else {
-        handle_large()
-    }
-}
-```
-## Loop Optimization
-```rust
-fn search(data: &[i32], target: i32) -> Option<usize> {
-    for (i, &item) in data.iter().enumerate() {
-        // Assume most iterations DON'T find the target
-        if unlikely(item == target) {
-            return Some(i);
-        }
-    }
-    None
-}
-// Alternative: structure for likely case
-fn search_common(data: &[i32], target: i32) -> Option<usize> {
-    // If target is usually found
-    for (i, &item) in data.iter().enumerate() {
-        if likely(item == target) {
-            return Some(i);
-        }
-    }
-    None
-}
-```
-## Match Arm Ordering
-```rust
-// Put most common variants first
-fn process_message(msg: Message) {
-    match msg {
-        // Most common - listed first
-        Message::Data(d) => handle_data(d),
-        Message::Heartbeat => (), // Second most common
-        // Rare cases last
-        Message::Error(e) => handle_error(e),
-        Message::Shutdown => shutdown(),
-    }
-}
-```
-## Benchmark-Driven Hints
-```rust
-// Profile first to know which branches are actually likely!
-fn speculative(x: i32) -> i32 {
-    // DON'T GUESS - measure with profiling
-    // perf record / perf report
-    // cargo flamegraph
-    if x > threshold {  // Is this actually common?
-        path_a(x)
-    } else {
-        path_b(x)
-    }
-}
-```
-## See Also
-- [opt-cold-unlikely](./opt-cold-unlikely.md) - #[cold] for unlikely functions
-- [opt-inline-never-cold](./opt-inline-never-cold.md) - Keeping cold code separate
-- [perf-profile-first](./perf-profile-first.md) - Profile to know what's likely

package/template/agent/skills/rust-developer/references/rust-rules/opt-lto-release.md DELETED Viewed

@@ -1,130 +0,0 @@
-# opt-lto-release
-> Enable LTO in release builds
-## Why It Matters
-Link-Time Optimization (LTO) enables optimizations across crate boundaries that aren't possible during normal compilation. This includes cross-crate inlining, dead code elimination, and devirtualization. Typically provides 5-20% performance improvement.
-## Bad
-```toml
-# Cargo.toml - default release profile
-[profile.release]
-opt-level = 3
-# No cross-crate LTO (the default is only thin-local LTO) = missed optimization opportunities
-```
-## Good
-```toml
-# Cargo.toml - optimized release profile
-[profile.release]
-opt-level = 3
-lto = "fat"          # Maximum optimization
-codegen-units = 1    # Better optimization (single codegen unit)
-panic = "abort"      # Smaller binary, no unwind tables
-strip = true         # Remove symbols for smaller binary
-```
-## LTO Options Explained
-```toml
-# Default - "thin local" LTO: thin LTO on the local crate only, across its codegen units
-lto = false
-# Thin LTO - fast compilation, most benefits
-lto = "thin"
-# Fat LTO - slowest compilation, maximum optimization
-lto = "fat"
-# `true` is equivalent to "fat":
-lto = true
-# Off - disables LTO entirely, including the default thin-local pass
-lto = "off"
-```
-## Trade-offs
-| Setting | Compile Time | Binary Size | Performance |
-|---------|--------------|-------------|-------------|
-| `lto = false` | Fast | Larger | Baseline |
-| `lto = "thin"` | Medium | Smaller | +5-15% |
-| `lto = "fat"` | Slow | Smallest | +10-20% |
-## Evidence from Production
-```toml
-# From Anchor (Solana framework)
-# https://github.com/solana-foundation/anchor/blob/master/cli/src/rust_template.rs
-[profile.release]
-overflow-checks = true
-lto = "fat"
-codegen-units = 1
-# From sol-trade-sdk
-# https://github.com/0xfnzero/sol-trade-sdk
-[profile.release]
-opt-level = 3
-lto = "fat"
-codegen-units = 1
-panic = "abort"
-```
-## Complete Optimized Profile
-```toml
-[profile.release]
-opt-level = 3        # Maximum optimization
-lto = "fat"          # Link-time optimization
-codegen-units = 1    # Single codegen unit for better optimization
-panic = "abort"      # Remove panic unwinding code
-strip = true         # Strip symbols
-debug = false        # No debug info
-# For benchmarking (need some debug info for profiling)
-[profile.bench]
-inherits = "release"
-debug = true
-strip = false
-# Fast dev builds with optimized dependencies
-[profile.dev]
-opt-level = 0
-debug = true
-[profile.dev.package."*"]
-opt-level = 3        # Optimize dependencies even in dev
-```
-## When to Use Each
-| Situation | LTO Setting |
-|-----------|-------------|
-| Development | `false` (fast compiles) |
-| CI builds | `"thin"` (balance) |
-| Release binaries | `"fat"` (max perf) |
-| Libraries (crates.io) | `false` (users choose) |
-## Measuring Impact
-```bash
-# Build without LTO
-cargo build --release
-hyperfine ./target/release/myapp
-# Build with LTO
-# (after adding lto = "fat" to Cargo.toml)
-cargo build --release
-hyperfine ./target/release/myapp
-# Compare binary sizes
-ls -la target/release/myapp
-```
-## See Also
-- [opt-codegen-units](opt-codegen-units.md) - Use codegen-units = 1
-- [opt-pgo-profile](opt-pgo-profile.md) - Profile-guided optimization
-- [perf-release-profile](perf-release-profile.md) - Full release profile settings

package/template/agent/skills/rust-developer/references/rust-rules/opt-pgo-profile.md DELETED Viewed

@@ -1,167 +0,0 @@
-# opt-pgo-profile
-> Use Profile-Guided Optimization (PGO) for maximum performance
-## Why It Matters
-PGO uses real runtime behavior to guide compiler optimization decisions. By profiling actual workloads, the compiler learns which code paths are hot, optimizing them aggressively while deprioritizing cold paths. This can yield 10-30% performance improvements beyond standard optimizations.
-## The PGO Process
-1. **Instrument**: Build with profiling instrumentation
-2. **Profile**: Run representative workloads
-3. **Optimize**: Rebuild using collected profile data
-## Step-by-Step
-```bash
-# Step 1: Build instrumented binary
-RUSTFLAGS="-Cprofile-generate=/tmp/pgo-data" \
-    cargo build --release
-# Step 2: Run representative workloads
-./target/release/my_app < test_data_1.txt
-./target/release/my_app < test_data_2.txt
-./target/release/my_app < typical_workload.txt
-# Step 3: Merge profile data
-llvm-profdata merge -o /tmp/pgo-data/merged.profdata /tmp/pgo-data
-# Step 4: Build optimized binary using profile
-RUSTFLAGS="-Cprofile-use=/tmp/pgo-data/merged.profdata" \
-    cargo build --release
-```
-## Cargo Configuration
-```toml
-# Cargo.toml
-[profile.release]
-lto = "fat"
-codegen-units = 1
-opt-level = 3
-# PGO flags set via RUSTFLAGS environment variable
-```
-## Build Script
-```bash
-#!/bin/bash
-set -e
-PGO_DIR=/tmp/pgo-$(date +%s)
-# Clean
-cargo clean
-# Instrumented build
-echo "Building instrumented binary..."
-RUSTFLAGS="-Cprofile-generate=$PGO_DIR" cargo build --release
-# Run workloads
-echo "Collecting profile data..."
-./target/release/my_app --benchmark-mode
-./target/release/my_app < test_fixtures/typical.txt
-./target/release/my_app < test_fixtures/stress.txt
-# Merge profiles
-echo "Merging profile data..."
-llvm-profdata merge -o $PGO_DIR/merged.profdata $PGO_DIR
-# Optimized build
-echo "Building optimized binary..."
-RUSTFLAGS="-Cprofile-use=$PGO_DIR/merged.profdata" cargo build --release
-echo "Done! Optimized binary at target/release/my_app"
-```
-## Representative Workloads
-```rust
-// Create benchmarks that match real usage patterns
-// Good: actual data samples
-fn profile_workload() {
-    for file in real_customer_data_samples() {
-        process_file(&file);
-    }
-}
-// Good: synthetic but realistic
-fn profile_synthetic() {
-    for _ in 0..10000 {
-        let data = generate_realistic_data();
-        process(&data);
-    }
-}
-// Bad: artificial microbenchmarks
-fn profile_bad() {
-    for _ in 0..1000000 {
-        small_operation();  // Doesn't reflect real hot paths
-    }
-}
-```
-## BOLT Post-Link Optimization
-For even more gains, combine PGO with BOLT:
-```bash
-# After PGO build, apply BOLT
-llvm-bolt target/release/my_app \
-    -o target/release/my_app.bolt \
-    -data=perf.data \
-    -reorder-blocks=ext-tsp \
-    -reorder-functions=hfsort
-# BOLT can add another 5-15% on top of PGO
-```
-## CI/CD Integration
-```yaml
-# GitHub Actions example
-jobs:
-  pgo-build:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - name: Install LLVM tools
-        run: sudo apt-get install llvm
-      - name: Instrumented build
-        run: RUSTFLAGS="-Cprofile-generate=/tmp/pgo" cargo build --release
-      - name: Run profiling workloads
-        run: ./scripts/run_profiling_workloads.sh
-      - name: Merge profiles
-        run: llvm-profdata merge -o /tmp/pgo/merged.profdata /tmp/pgo
-      - name: Optimized build
-        run: RUSTFLAGS="-Cprofile-use=/tmp/pgo/merged.profdata" cargo build --release
-      - name: Upload artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: optimized-binary
-          path: target/release/my_app
-```
-## When to Use PGO
-| Use PGO | Skip PGO |
-|---------|----------|
-| Production deployments | Development builds |
-| Performance-critical apps | Libraries (users can PGO) |
-| Stable workload patterns | Highly variable workloads |
-| Sufficient profiling data | Quick iteration cycles |
-## See Also
-- [opt-lto-release](./opt-lto-release.md) - LTO works well with PGO
-- [opt-codegen-units](./opt-codegen-units.md) - Single codegen unit for PGO
-- [perf-profile-first](./perf-profile-first.md) - Profiling basics