npm - agy-superpowers - Versions diffs - 5.2.2 → 5.2.3 - Mend

agy-superpowers 5.2.2 → 5.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

package/template/agent/skills/rust-developer/references/rust-rules/opt-cache-friendly.md DELETED Viewed

@@ -1,187 +0,0 @@
-# opt-cache-friendly
-> Organize data for cache-efficient access patterns
-## Why It Matters
-Cache misses are expensive—an L3 cache miss costs ~100+ cycles vs ~4 cycles for an L1 hit. Data layout and access patterns determine cache efficiency. Arrays of structs (AoS) vs structs of arrays (SoA), memory locality, and access patterns can make order-of-magnitude performance differences.
-## Bad
-```rust
-// Array of Structs (AoS) - poor cache use when accessing one field
-struct Particle {
-    position: [f32; 3],  // 12 bytes
-    velocity: [f32; 3],  // 12 bytes
-    mass: f32,           // 4 bytes
-    id: u64,             // 8 bytes
-    flags: u8,           // 1 byte + padding
-    // Total: 40 bytes per particle
-}
-fn update_positions(particles: &mut [Particle], dt: f32) {
-    for p in particles {
-        // Access position and velocity - 24 bytes
-        // But loads 40-byte struct per particle
-        // 16 of every 40 bytes fetched (~40% of cache bandwidth) go unused
-        p.position[0] += p.velocity[0] * dt;
-        p.position[1] += p.velocity[1] * dt;
-        p.position[2] += p.velocity[2] * dt;
-    }
-}
-```
-## Good
-```rust
-// Struct of Arrays (SoA) - cache-efficient for field access
-struct Particles {
-    positions_x: Vec<f32>,
-    positions_y: Vec<f32>,
-    positions_z: Vec<f32>,
-    velocities_x: Vec<f32>,
-    velocities_y: Vec<f32>,
-    velocities_z: Vec<f32>,
-    masses: Vec<f32>,
-    ids: Vec<u64>,
-    flags: Vec<u8>,
-}
-fn update_positions(p: &mut Particles, dt: f32) {
-    // Access contiguous memory - perfect cache utilization
-    for (px, vx) in p.positions_x.iter_mut().zip(&p.velocities_x) {
-        *px += vx * dt;
-    }
-    for (py, vy) in p.positions_y.iter_mut().zip(&p.velocities_y) {
-        *py += vy * dt;
-    }
-    for (pz, vz) in p.positions_z.iter_mut().zip(&p.velocities_z) {
-        *pz += vz * dt;
-    }
-}
-```
-## Hot/Cold Splitting
-```rust
-// Separate frequently and rarely accessed fields
-struct EntityHot {
-    position: [f32; 3],
-    velocity: [f32; 3],
-    // Hot data - accessed every frame
-}
-struct EntityCold {
-    name: String,
-    creation_time: Instant,
-    metadata: HashMap<String, Value>,
-    // Cold data - rarely accessed
-}
-struct Entities {
-    hot: Vec<EntityHot>,
-    cold: Vec<EntityCold>,
-}
-// Hot loop touches only hot data
-fn update(entities: &mut Entities, dt: f32) {
-    for e in &mut entities.hot {
-        e.position[0] += e.velocity[0] * dt;
-        // Cold data stays out of cache
-    }
-}
-```
-## Prefetching
-```rust
-// Process in cache-line-sized chunks
-const CACHE_LINE: usize = 64;
-fn process_with_prefetch(data: &mut [u8]) {
-    for chunk in data.chunks_mut(CACHE_LINE) {
-        // Prefetch next chunk while processing current
-        // (automatic in many cases, manual for complex patterns)
-        process_chunk(chunk);
-    }
-}
-// Matrix multiplication - block for cache
-fn matmul_blocked(a: &[f64], b: &[f64], c: &mut [f64], n: usize) {
-    const BLOCK: usize = 32;  // Fits in L1 cache
-    for i0 in (0..n).step_by(BLOCK) {
-        for j0 in (0..n).step_by(BLOCK) {
-            for k0 in (0..n).step_by(BLOCK) {
-                // Process BLOCK x BLOCK tile
-                for i in i0..min(i0 + BLOCK, n) {
-                    for j in j0..min(j0 + BLOCK, n) {
-                        // Inner loop operates on cached data
-                    }
-                }
-            }
-        }
-    }
-}
-```
-## Avoid Pointer Chasing
-```rust
-// Bad: linked list - random memory access
-struct Node {
-    value: i32,
-    next: Option<Box<Node>>,
-}
-fn sum_linked(head: &Node) -> i32 {
-    // Each node is a cache miss
-}
-// Good: contiguous vector
-fn sum_vector(data: &[i32]) -> i32 {
-    data.iter().sum()  // Sequential access, prefetcher happy
-}
-// Good: if graph needed, use indices
-struct Graph {
-    values: Vec<i32>,
-    edges: Vec<usize>,  // Indices into values
-}
-```
-## Memory Layout Attributes
-```rust
-// Ensure cache-line alignment
-#[repr(C, align(64))]
-struct CacheAligned {
-    data: [u8; 64],
-}
-// Prevent false sharing in concurrent code
-#[repr(C, align(64))]
-struct PaddedCounter {
-    value: AtomicU64,
-    _pad: [u8; 56],
-}
-```
-## Measuring Cache Performance
-```bash
-# Linux perf
-perf stat -e cache-references,cache-misses ./my_program
-# Detailed cache analysis
-perf stat -e L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses ./my_program
-# Cachegrind
-valgrind --tool=cachegrind ./my_program
-```
-## See Also
-- [mem-smaller-integers](./mem-smaller-integers.md) - Smaller data fits more in cache
-- [mem-box-large-variant](./mem-box-large-variant.md) - Keep enum sizes small
-- [opt-bounds-check](./opt-bounds-check.md) - Sequential access patterns

package/template/agent/skills/rust-developer/references/rust-rules/opt-codegen-units.md DELETED Viewed

@@ -1,142 +0,0 @@
-# opt-codegen-units
-> Set `codegen-units = 1` for maximum optimization in release builds
-## Why It Matters
-By default, Cargo splits code into multiple codegen units for parallel compilation. This speeds up builds but prevents some cross-unit optimizations. Setting `codegen-units = 1` allows LLVM to optimize across the entire crate, potentially improving runtime performance by 5-20% at the cost of slower builds.
-## Bad
-```toml
-# Cargo.toml - default settings
-[profile.release]
-# codegen-units defaults to 16
-# Fast to compile, but misses optimization opportunities
-```
-## Good
-```toml
-# Cargo.toml - optimized for runtime performance
-[profile.release]
-codegen-units = 1  # Single unit = better optimization
-lto = true         # Link-time optimization
-opt-level = 3      # Maximum optimization
-```
-## What codegen-units Affects
-| Codegen Units | Compile Time | Runtime Performance | Memory Use |
-|---------------|--------------|---------------------|------------|
-| 16 (default)  | Faster       | Baseline            | Lower      |
-| 4-8           | Moderate     | Slightly better     | Moderate   |
-| 1             | Slower       | Best                | Higher     |
-## How It Works
-```rust
-// With codegen-units = 16:
-// - Crate split into 16 independent compilation units
-// - Compiled in parallel
-// - Limited visibility between units for optimization
-// With codegen-units = 1:
-// - Entire crate in single unit
-// - LLVM sees all code at once
-// - Can inline across module boundaries
-// - Better dead code elimination
-// - Better constant propagation
-```
-## Full Release Profile
-```toml
-[profile.release]
-# Maximum runtime performance
-opt-level = 3
-lto = "fat"
-codegen-units = 1
-panic = "abort"      # Smaller binary, slight perf gain
-strip = true         # Smaller binary
-[profile.release-with-debug]
-# Performance with debugging ability
-inherits = "release"
-debug = true         # Keep debug symbols
-strip = false
-[profile.bench]
-# For benchmarking
-inherits = "release"
-```
-## Build Time Trade-offs
-```bash
-# Default release build (fast compile)
-cargo build --release
-# Time: ~30s
-# Optimized release build (slow compile, fast runtime)
-# With codegen-units = 1, lto = "fat"
-cargo build --release
-# Time: ~2-5min, but potentially 10-20% faster binary
-```
-## Per-Profile Configuration
-```toml
-# Fast debug builds
-[profile.dev]
-codegen-units = 256  # Maximum parallelism
-# Fast CI builds
-[profile.ci]
-inherits = "release"
-codegen-units = 16   # Balance compile time vs runtime
-lto = "thin"         # Faster than "fat"
-# Production release
-[profile.production]
-inherits = "release"
-codegen-units = 1
-lto = "fat"
-```
-## When to Use What
-```rust
-// codegen-units = 16 (default)
-// - Development builds
-// - CI where compile time matters
-// - When runtime performance isn't critical
-// codegen-units = 1
-// - Production deployments
-// - Performance-critical applications
-// - Final releases
-// - Benchmarking
-```
-## Measuring Impact
-```bash
-# Build with different settings
-cargo build --release
-# Benchmark
-cargo bench
-# Compare binary sizes
-ls -lh target/release/my_binary
-# Profile runtime
-perf stat ./target/release/my_binary
-```
-## See Also
-- [opt-lto-release](./opt-lto-release.md) - Link-time optimization
-- [opt-pgo-profile](./opt-pgo-profile.md) - Profile-guided optimization
-- [opt-target-cpu](./opt-target-cpu.md) - CPU-specific optimization

package/template/agent/skills/rust-developer/references/rust-rules/opt-cold-unlikely.md DELETED Viewed

@@ -1,152 +0,0 @@
-# opt-cold-unlikely
-> Mark unlikely code paths with `#[cold]` to help compiler optimization
-## Why It Matters
-The `#[cold]` attribute tells the compiler that a function is rarely called. The compiler uses this to optimize code layout—keeping cold code away from hot code improves instruction cache utilization. Combined with branch layout optimization, this can measurably improve performance.
-## Bad
-```rust
-// All branches treated equally
-fn validate(input: &str) -> Result<Data, ValidationError> {
-    if input.is_empty() {
-        return Err(ValidationError::Empty);  // Rare
-    }
-    if input.len() > 1000 {
-        return Err(ValidationError::TooLong);  // Rare
-    }
-    if !input.is_ascii() {
-        return Err(ValidationError::NonAscii);  // Rare
-    }
-    // This is the common case
-    Ok(parse_data(input))
-}
-```
-## Good
-```rust
-fn validate(input: &str) -> Result<Data, ValidationError> {
-    if input.is_empty() {
-        return cold_empty_error();
-    }
-    if input.len() > 1000 {
-        return cold_too_long_error();
-    }
-    if !input.is_ascii() {
-        return cold_non_ascii_error();
-    }
-    Ok(parse_data(input))
-}
-#[cold]
-fn cold_empty_error() -> Result<Data, ValidationError> {
-    Err(ValidationError::Empty)
-}
-#[cold]
-fn cold_too_long_error() -> Result<Data, ValidationError> {
-    Err(ValidationError::TooLong)
-}
-#[cold]
-fn cold_non_ascii_error() -> Result<Data, ValidationError> {
-    Err(ValidationError::NonAscii)
-}
-```
-## What #[cold] Does
-1. **Code placement**: Cold functions are placed in separate code sections, away from hot code
-2. **Branch prediction**: Compiler generates branch hints favoring the non-cold path
-3. **Inlining decisions**: Cold functions are less likely to be inlined into hot paths (`#[cold]` is a hint; pair it with `#[inline(never)]` when that matters)
-4. **Optimization budget**: Compiler spends less effort optimizing cold code
-## Common Cold Patterns
-```rust
-// Error handling
-#[cold]
-fn handle_error<E: std::fmt::Display>(e: E) -> ! {
-    eprintln!("Fatal error: {}", e);
-    std::process::exit(1);
-}
-// Logging rare events
-#[cold]
-fn log_rare_event(event: &Event) {
-    log::warn!("Rare event occurred: {:?}", event);
-}
-// Fallback paths
-#[cold]
-fn slow_fallback(data: &Data) -> Output {
-    // This path should rarely be taken
-    compute_slowly(data)
-}
-// Panic handlers
-#[cold]
-fn panic_invalid_state(state: &State) -> ! {
-    panic!("Invalid state: {:?}", state);
-}
-```
-## Assertions and Invariants
-```rust
-fn get_unchecked(&self, index: usize) -> &T {
-    if index >= self.len {
-        cold_bounds_panic(index, self.len);
-    }
-    unsafe { &*self.ptr.add(index) }
-}
-#[cold]
-#[inline(never)]
-fn cold_bounds_panic(index: usize, len: usize) -> ! {
-    panic!("index out of bounds: the len is {} but the index is {}", len, index);
-}
-```
-## Combining with #[inline(never)]
-```rust
-// Usually combine both for maximum effect
-#[cold]
-#[inline(never)]
-fn error_path() -> Error {
-    // Complex error construction stays out of hot code
-    Error {
-        backtrace: Backtrace::capture(),
-        context: gather_context(),
-    }
-}
-```
-## Measuring Impact
-```rust
-// Check code layout with objdump
-// objdump -d target/release/binary | less
-// Look for .cold sections
-// nm target/release/binary | grep cold
-// Profile to verify improvement
-// perf stat -e cache-misses,cache-references ./binary
-```
-## See Also
-- [opt-inline-never-cold](./opt-inline-never-cold.md) - Combining with inline(never)
-- [opt-likely-hint](./opt-likely-hint.md) - Branch prediction hints
-- [err-result-over-panic](./err-result-over-panic.md) - Error handling

package/template/agent/skills/rust-developer/references/rust-rules/opt-inline-always-rare.md DELETED Viewed

@@ -1,141 +0,0 @@
-# opt-inline-always-rare
-> Use `#[inline(always)]` sparingly—only for critical hot paths proven by profiling
-## Why It Matters
-`#[inline(always)]` forces the compiler to inline a function regardless of heuristics. Overuse increases binary size, hurts instruction cache, and can slow down code. The compiler is usually smarter about inlining than humans. Reserve this for measured hot paths where benchmarks prove a benefit.
-## Bad
-```rust
-// Annotating everything - trusting intuition over data
-#[inline(always)]
-pub fn get_name(&self) -> &str {
-    &self.name
-}
-#[inline(always)]
-pub fn calculate_tax(amount: f64) -> f64 {
-    amount * 0.1
-}
-#[inline(always)]
-fn helper(x: i32) -> i32 {
-    x + 1
-}
-// Result: bloated binary, poor cache utilization
-```
-## Good
-```rust
-// Let compiler decide for most functions
-pub fn get_name(&self) -> &str {
-    &self.name
-}
-pub fn calculate_tax(amount: f64) -> f64 {
-    amount * 0.1
-}
-// Only force inline for proven hot paths
-impl Hasher for MyHasher {
-    // Hasher::write is called millions of times in tight loops
-    // Profiling showed 15% improvement from forced inlining
-    #[inline(always)]
-    fn write(&mut self, bytes: &[u8]) {
-        // Very small, very hot
-        self.state = self.state.wrapping_add(bytes.len() as u64);
-    }
-}
-```
-## When #[inline(always)] Helps
-```rust
-// ✅ Tiny functions in hot inner loops
-#[inline(always)]
-fn fast_hash(a: u64, b: u64) -> u64 {
-    a.wrapping_mul(b).wrapping_add(a)
-}
-// ✅ Generic functions that benefit from monomorphization
-#[inline(always)]
-fn swap<T>(a: &mut T, b: &mut T) {
-    std::mem::swap(a, b);
-}
-// ✅ Iterator adapters and closures
-#[inline(always)]
-fn apply<T, F: Fn(T) -> T>(f: F, x: T) -> T {
-    f(x)
-}
-// ✅ SIMD/vectorization helpers
-#[inline(always)]
-fn add_simd(a: &[f32], b: &[f32], out: &mut [f32]) {
-    // ...
-}
-```
-## Inline Variants
-```rust
-// #[inline] - hint to inline, compiler may ignore
-#[inline]
-fn suggested_inline(x: i32) -> i32 { x + 1 }
-// #[inline(always)] - force inline (almost always)
-#[inline(always)]
-fn force_inline(x: i32) -> i32 { x + 1 }
-// #[inline(never)] - prevent inlining (for profiling, code size)
-#[inline(never)]
-fn no_inline(x: i32) -> i32 { x + 1 }
-// No annotation - compiler decides based on heuristics
-fn compiler_decides(x: i32) -> i32 { x + 1 }
-```
-## Measuring Inline Impact
-```rust
-// Use criterion to benchmark
-use criterion::{criterion_group, criterion_main, Criterion};
-fn bench_with_inline(c: &mut Criterion) {
-    c.bench_function("hot_path_inline", |b| {
-        b.iter(|| hot_loop())
-    });
-}
-// Compare binary sizes
-// cargo bloat --release --crates
-// Check if function was inlined
-// cargo asm --rust my_crate::hot_function
-```
-## Generic Functions
-```rust
-// Generic functions are monomorphized in the crate that calls them, so
-// they are already eligible for cross-crate inlining without #[inline]
-// In library crate:
-#[inline]  // Optional here: only an extra hint to the inliner
-pub fn generic_function<T: Display>(x: T) {
-    println!("{}", x);
-}
-// Non-generic functions are the ones that may need #[inline] to be
-// inlined across crate boundaries (when LTO is off)
-```
-## See Also
-- [opt-inline-small](./opt-inline-small.md) - Regular inline for small functions
-- [opt-inline-never-cold](./opt-inline-never-cold.md) - Preventing inlining
-- [perf-profile-first](./perf-profile-first.md) - Profile before optimizing