PyPI - cuda-engine - Versions diffs - 1.0.0__py3-none-any.whl - Mend

cuda-engine 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

cuda_engine/__init__.py +24 -0
cuda_engine/api.py +39 -0
cuda_engine/cli.py +485 -0
cuda_engine/config.py +32 -0
cuda_engine/models/__init__.py +27 -0
cuda_engine/models/artifact.py +12 -0
cuda_engine/models/reports.py +106 -0
cuda_engine/models/spec.py +45 -0
cuda_engine/orchestrator.py +352 -0
cuda_engine/prompts/__init__.py +8 -0
cuda_engine/prompts/codegen.md +29 -0
cuda_engine/prompts/interview.md +30 -0
cuda_engine/prompts/perf_fix.md +56 -0
cuda_engine/prompts/polish.md +13 -0
cuda_engine/services/__init__.py +1 -0
cuda_engine/services/gpu/__init__.py +3 -0
cuda_engine/services/gpu/_run_kernel_child.py +305 -0
cuda_engine/services/gpu/base.py +88 -0
cuda_engine/services/gpu/local.py +451 -0
cuda_engine/services/gpu/mocks.py +85 -0
cuda_engine/services/llm/__init__.py +3 -0
cuda_engine/services/llm/anthropic.py +71 -0
cuda_engine/services/llm/base.py +35 -0
cuda_engine/services/llm/mocks.py +38 -0
cuda_engine/services/llm/tools.py +64 -0
cuda_engine/services/store/__init__.py +3 -0
cuda_engine/services/store/base.py +24 -0
cuda_engine/services/store/local_dir.py +42 -0
cuda_engine/services/store/mocks.py +27 -0
cuda_engine/stages/__init__.py +1 -0
cuda_engine/stages/base.py +41 -0
cuda_engine/stages/codegen.py +193 -0
cuda_engine/stages/correctness.py +241 -0
cuda_engine/stages/interview.py +117 -0
cuda_engine/stages/performance.py +424 -0
cuda_engine/stages/polish.py +152 -0
cuda_engine/targets/__init__.py +7 -0
cuda_engine/targets/sm_100.py +2 -0
cuda_engine/targets/sm_80.py +18 -0
cuda_engine/targets/sm_90.py +2 -0
cuda_engine-1.0.0.dist-info/METADATA +266 -0
cuda_engine-1.0.0.dist-info/RECORD +45 -0
cuda_engine-1.0.0.dist-info/WHEEL +4 -0
cuda_engine-1.0.0.dist-info/entry_points.txt +2 -0
cuda_engine-1.0.0.dist-info/licenses/LICENSE +21 -0

cuda_engine/models/reports.py ADDED Viewed

@@ -0,0 +1,106 @@
+from typing import Any, Self
+from pydantic import BaseModel, ConfigDict, Field
class CorrectnessReport(BaseModel):
    """Result of checking a generated kernel against its reference.

    The orchestrator reads ``passed`` to decide whether to enter the
    codegen-repair loop or to fail the run at stage 3.
    """

    # Overall verdict of the check.
    passed: bool
    # Error magnitudes reported by the check; presumably the worst absolute
    # and relative deviation from the reference -- TODO confirm in Stage 3.
    max_abs_err: float
    max_rel_err: float
    # Concrete shapes that were exercised, one tuple of ints per shape.
    shapes_tested: list[tuple[int, ...]]
    # Free-form records; their key schema is not fixed by this model.
    shape_results: list[dict[str, Any]] = Field(default_factory=list)
    failing_inputs: list[dict[str, Any]] = Field(default_factory=list)
class PerformanceReport(BaseModel):
    """Performance measurements for a kernel; every metric is optional.

    Metrics default to ``None`` (or ``0`` / ``False``) so a report can be
    built even when a measurement is unavailable.
    """

    # Speedup ratios; ``None`` when the corresponding baseline was not measured.
    speedup_vs_reference: float | None = None
    speedup_vs_torch_compile: float | None = None
    # Achieved throughput figures (units as named by the fields).
    achieved_tflops: float | None = None
    achieved_gbps: float | None = None
    # Profiler-style metrics; presumably sourced from Nsight -- TODO confirm.
    occupancy: float | None = None
    regs_per_thread: int | None = None
    spill_bytes: int = 0
    # When True the orchestrator adds a "below perf target" warning to the
    # synthesis report; the run is still reported as passed.
    below_target: bool = False
    notes: list[str] = Field(default_factory=list)
    warnings: list[str] = Field(default_factory=list)
class StageTrace(BaseModel):
    """Per-stage accounting record: LLM usage, latency and outcome."""

    # Name of the stage this record belongs to (e.g. "codegen").
    stage_name: str
    # Attempt count for the stage.
    attempts: int
    # Whether the stage was considered successful.
    succeeded: bool
    # Model identifier(s) used during the stage, as a single string.
    model_used: str
    # Token counters; all default to zero when no LLM usage was recorded.
    tokens_in: int = 0
    tokens_out: int = 0
    cache_read_tokens: int = 0
    # Latency attributed to the stage, in seconds.
    latency_seconds: float = 0.0
class SynthesisReport(BaseModel):
    """Run-level summary: which stages ran and what they cost in aggregate."""

    # Identifier of the run and the name of the spec it synthesized.
    run_id: str
    spec_name: str
    # Stage names in execution order; a name may repeat (e.g. after repairs).
    stages_executed: list[str]
    # Detailed per-stage records backing the totals below.
    stage_traces: list[StageTrace] = Field(default_factory=list)
    total_llm_tokens_in: int = 0
    total_llm_tokens_out: int = 0
    # Cost in USD; defaults to 0.0 when the producer does not set it.
    total_cost_usd: float = 0.0
    wall_time_seconds: float = 0.0
    warnings: list[str] = Field(default_factory=list)
class SynthesisResult(BaseModel):
    """Value returned by ``synthesize()``: verdict, reports and artifact location.

    Use the ``ok`` / ``failed`` constructors rather than setting ``passed``
    and the failure fields by hand.
    """

    # ``kernel_callable`` may hold an arbitrary Python object, so pydantic
    # must be told to accept types it cannot validate.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    passed: bool
    run_id: str
    artifacts_dir: str
    report: SynthesisReport
    # Populated only on failure.
    failed_stage: int | None = None
    failure_reason: str | None = None
    # Stage reports; ``None`` when the corresponding stage produced nothing.
    correctness: CorrectnessReport | None = None
    performance: PerformanceReport | None = None
    kernel_callable: object | None = None

    @classmethod
    def ok(
        cls,
        *,
        run_id: str,
        artifacts_dir: str,
        report: SynthesisReport,
        correctness: CorrectnessReport,
        performance: PerformanceReport,
        kernel_callable: object | None,
    ) -> Self:
        """Build the result of a successful run (``passed=True``).

        Both stage reports are required; the failure fields keep their
        ``None`` defaults.
        """
        success_fields: dict[str, Any] = {
            "passed": True,
            "run_id": run_id,
            "artifacts_dir": artifacts_dir,
            "report": report,
            "correctness": correctness,
            "performance": performance,
            "kernel_callable": kernel_callable,
        }
        return cls(**success_fields)

    @classmethod
    def failed(
        cls,
        *,
        stage: int,
        reason: str,
        run_id: str,
        artifacts_dir: str,
        report: SynthesisReport,
        correctness: CorrectnessReport | None = None,
    ) -> Self:
        """Build the result of a failed run (``passed=False``).

        ``stage`` and ``reason`` are stored as ``failed_stage`` and
        ``failure_reason``. A correctness report may be attached when one
        exists; ``performance`` and ``kernel_callable`` stay ``None``.
        """
        failure_fields: dict[str, Any] = {
            "passed": False,
            "failed_stage": stage,
            "failure_reason": reason,
            "run_id": run_id,
            "artifacts_dir": artifacts_dir,
            "report": report,
            "correctness": correctness,
        }
        return cls(**failure_fields)

cuda_engine/models/spec.py ADDED Viewed

@@ -0,0 +1,45 @@
+from __future__ import annotations
+from enum import StrEnum
+from typing import Literal, TypeAlias
+from pydantic import BaseModel, ConfigDict, Field
+DType: TypeAlias = Literal["fp32", "fp16", "bf16", "fp64", "int32", "int64", "uint8", "int8"]
+TargetArch: TypeAlias = Literal["sm_80", "sm_90", "sm_100", "sm_120"]
+class OptimizationPriority(StrEnum):
+    LATENCY = "latency"
+    THROUGHPUT = "throughput"
+    BALANCED = "balanced"
class TensorArg(BaseModel):
    """Immutable description of one tensor argument of a kernel."""

    # Instances are frozen: fields cannot be reassigned after construction.
    model_config = ConfigDict(frozen=True)

    name: str
    dtype: DType
    # Dimension names rather than sizes; an empty tuple is allowed by the type.
    shape: tuple[str, ...] = Field(description="Symbolic shape, e.g. ('B', 'S', 'D')")
    # Memory-layout preference; "any" leaves the choice open.
    layout_hint: Literal["row_major", "col_major", "any"] = "any"
class PrecisionTolerance(BaseModel):
    """Immutable relative/absolute tolerance pair, both defaulting to 1e-3."""

    model_config = ConfigDict(frozen=True)

    # Relative tolerance.
    rtol: float = 1e-3
    # Absolute tolerance.
    atol: float = 1e-3
class KernelSpec(BaseModel):
    """Kernel contract produced by Stage 1 and treated as read-only afterwards.

    The model is frozen, so later stages cannot reassign its fields. Note
    that ``inputs`` and ``outputs`` are plain lists: freezing prevents
    rebinding the attributes, not in-place mutation of the lists.
    """

    model_config = ConfigDict(frozen=True)

    name: str
    target_arch: TargetArch
    # Argument descriptions, in declaration order.
    inputs: list[TensorArg]
    outputs: list[TensorArg]
    precision_tolerance: PrecisionTolerance
    optimization_priority: OptimizationPriority
    # Free-form clarifications; empty by default.
    notes: str = ""

cuda_engine/orchestrator.py ADDED Viewed

@@ -0,0 +1,352 @@
+from __future__ import annotations
+import inspect
+import time
+from collections.abc import Callable
+from typing import Any, TypeVar
+from cuda_engine.config import SynthesisConfig
+from cuda_engine.models import (
+    CorrectnessReport,
+    KernelArtifact,
+    StageTrace,
+    SynthesisReport,
+    SynthesisResult,
+)
+from cuda_engine.services.gpu.base import GPURunner
+from cuda_engine.services.llm.base import LLMClient, LLMResponse, ToolSpec
+from cuda_engine.services.store.base import ArtifactStore
+from cuda_engine.stages.base import BudgetExhaustedError
+from cuda_engine.stages.codegen import Stage2Codegen
+from cuda_engine.stages.correctness import Stage3Correctness
+from cuda_engine.stages.interview import Stage1Interview
+from cuda_engine.stages.performance import Stage4Performance
+from cuda_engine.stages.polish import Stage5Polish
+T = TypeVar("T")
class Orchestrator:
    """Run one kernel synthesis end to end across the pipeline stages.

    Stage order: interview -> codegen -> correctness (with a bounded
    codegen-repair loop) -> performance -> polish. Every stage is executed
    through ``_run_traced_stage`` so that a ``StageTrace`` is recorded for it.
    """

    def __init__(
        self,
        *,
        llm: LLMClient,
        gpu: GPURunner,
        store: ArtifactStore,
        cfg: SynthesisConfig,
    ) -> None:
        """Keep references to the services and configuration used by the stages."""
        self.llm = llm
        self.gpu = gpu
        self.store = store
        self.cfg = cfg

    def run(self, *, prompt: str, reference: Callable[..., Any], target: str) -> SynthesisResult:
        """Synthesize a kernel for ``prompt`` and return the outcome.

        Args:
            prompt: User request; stored as ``inputs/prompt.txt``.
            reference: Reference callable handed to the interview,
                correctness, performance and polish stages. Its source (or
                ``repr``) is stored as ``inputs/reference.py``.
            target: Target architecture forwarded to the interview stage.

        Returns:
            ``SynthesisResult.ok(...)`` when the correctness check passes, or
            ``SynthesisResult.failed(stage=3, ...)`` when it still fails after
            the repair budget is spent. In both cases ``report.json`` is
            written to the store before returning.

        Raises:
            Any exception raised by a stage (for example
            ``BudgetExhaustedError`` from codegen) propagates unchanged; no
            ``report.json`` is written in that case.
        """
        run_id = self.store.new_run()
        # Monotonic clock: the elapsed wall time must not be distorted by
        # system clock adjustments that happen during a long run.
        started_at = time.monotonic()
        llm = _TracingLLMClient(self.llm)
        stage_traces: list[StageTrace] = []

        self.store.write_text(run_id, "inputs/prompt.txt", prompt)
        self.store.write_json(run_id, "inputs/config.json", self.cfg)
        self.store.write_text(run_id, "inputs/reference.py", _reference_source(reference))

        spec = _run_traced_stage(
            stage_traces,
            llm,
            "interview",
            lambda: Stage1Interview(llm=llm, store=self.store).run(
                prompt=prompt,
                reference=reference,
                target_arch=target,
                run_id=run_id,
                model=self.cfg.sonnet_model,
            ),
        )
        artifact = _run_traced_stage(
            stage_traces,
            llm,
            "codegen",
            lambda: self._generate_kernel(llm=llm, spec=spec, run_id=run_id),
        )
        correctness = self._check_correctness(
            llm=llm,
            stage_traces=stage_traces,
            spec=spec,
            artifact=artifact,
            reference=reference,
            run_id=run_id,
        )

        # Repair loop: regenerate the kernel with the failing report as
        # context, then re-check, until it passes or the budget is spent.
        for repair_attempt in range(1, self.cfg.retry_budgets.correctness + 1):
            if correctness.passed:
                break
            repair_dir = f"stage3_repair/attempt_{repair_attempt:02d}"
            self.store.write_json(
                run_id,
                f"{repair_dir}/correctness_report.json",
                correctness.model_dump(mode="json"),
            )
            artifact = _run_traced_stage(
                stage_traces,
                llm,
                "codegen_repair",
                # Defaults pin this iteration's values to the callable.
                lambda report=correctness, prefix=repair_dir: self._generate_kernel(
                    llm=llm,
                    spec=spec,
                    run_id=run_id,
                    repair_context=report,
                    artifact_prefix=f"{prefix}/codegen",
                ),
            )
            correctness = self._check_correctness(
                llm=llm,
                stage_traces=stage_traces,
                spec=spec,
                artifact=artifact,
                reference=reference,
                run_id=run_id,
            )

        if not correctness.passed:
            result = SynthesisResult.failed(
                stage=3,
                reason="correctness check failed",
                run_id=run_id,
                artifacts_dir=str(self.store.run_dir(run_id)),
                report=_build_report(
                    run_id=run_id,
                    spec_name=spec.name,
                    stage_traces=stage_traces,
                    wall_time_seconds=time.monotonic() - started_at,
                ),
                correctness=correctness,
            )
            _write_result_report(self.store, result)
            return result

        # The performance stage returns both its report and the (possibly
        # revised) artifact; the lambda runs before ``artifact`` is rebound.
        performance, artifact = _run_traced_stage(
            stage_traces,
            llm,
            "performance",
            lambda: Stage4Performance(llm=llm, gpu=self.gpu, store=self.store, cfg=self.cfg).run(
                spec=spec,
                artifact=artifact,
                run_id=run_id,
                retry_budget=self.cfg.retry_budgets.performance,
                reference=reference,
            ),
        )
        artifact = _run_traced_stage(
            stage_traces,
            llm,
            "polish",
            lambda: Stage5Polish(llm=llm, gpu=self.gpu, store=self.store).run(
                spec=spec,
                artifact=artifact,
                correctness=correctness,
                performance=performance,
                reference=reference,
                run_id=run_id,
                model=self.cfg.sonnet_model,
                correctness_shapes=self.cfg.correctness_shapes,
            ),
        )

        # Missing the performance target is a warning, not a failure.
        report = _build_report(
            run_id=run_id,
            spec_name=spec.name,
            stage_traces=stage_traces,
            wall_time_seconds=time.monotonic() - started_at,
            warnings=["below perf target"] if performance.below_target else [],
        )
        result = SynthesisResult.ok(
            run_id=run_id,
            artifacts_dir=str(self.store.run_dir(run_id)),
            report=report,
            correctness=correctness,
            performance=performance,
            kernel_callable=None,
        )
        _write_result_report(self.store, result)
        return result

    def _generate_kernel(
        self,
        *,
        llm: _TracingLLMClient,
        spec: Any,
        run_id: str,
        **extra_run_args: Any,
    ) -> KernelArtifact:
        """Run codegen (with escalation) using this orchestrator's services.

        ``extra_run_args`` are appended to the base codegen arguments; the
        repair loop uses them to pass ``repair_context`` and
        ``artifact_prefix``.
        """
        return _run_codegen_with_escalation(
            llm=llm,
            gpu=self.gpu,
            store=self.store,
            cfg=self.cfg,
            run_args={
                "spec": spec,
                "run_id": run_id,
                "retry_budget": self.cfg.retry_budgets.codegen,
                **extra_run_args,
            },
        )

    def _check_correctness(
        self,
        *,
        llm: _TracingLLMClient,
        stage_traces: list[StageTrace],
        spec: Any,
        artifact: KernelArtifact,
        reference: Callable[..., Any],
        run_id: str,
    ) -> CorrectnessReport:
        """Run the traced "correctness" stage for ``artifact``.

        The trace is marked successful only when the returned report passed.
        """
        return _run_traced_stage(
            stage_traces,
            llm,
            "correctness",
            lambda: Stage3Correctness(llm=llm, gpu=self.gpu, store=self.store).run(
                spec=spec,
                artifact=artifact,
                reference=reference,
                run_id=run_id,
                retry_budget=self.cfg.retry_budgets.correctness,
                correctness_shapes=self.cfg.correctness_shapes,
            ),
            succeeded=lambda report: report.passed,
        )
def _run_codegen_with_escalation(
    *,
    llm: _TracingLLMClient,
    gpu: GPURunner,
    store: ArtifactStore,
    cfg: SynthesisConfig,
    run_args: dict[str, Any],
) -> KernelArtifact:
    """Run codegen on the Sonnet model, retrying once on Opus if it busts.

    The first attempt uses ``cfg.sonnet_model``. If it raises
    ``BudgetExhaustedError`` and escalation is enabled with a positive Opus
    budget, a second attempt runs on ``cfg.opus_model`` with that budget, an
    ``.../escalated`` artifact prefix and the first attempt's summary as
    escalation context. Otherwise the original error is re-raised.
    """
    stage = Stage2Codegen(llm=llm, gpu=gpu, store=store)
    try:
        return stage.run(**run_args, model=cfg.sonnet_model)
    except BudgetExhaustedError as bust:
        may_escalate = cfg.escalate_to_opus_on_bust and cfg.opus_retry_budget_codegen > 0
        if may_escalate:
            base_prefix = run_args.get("artifact_prefix", "stage2_codegen")
            escalated_args = dict(run_args)
            escalated_args.update(
                {
                    "retry_budget": cfg.opus_retry_budget_codegen,
                    "artifact_prefix": f"{base_prefix}/escalated",
                    "escalation_context": bust.summary,
                }
            )
            # A fresh stage object is used for the escalated attempt.
            return Stage2Codegen(llm=llm, gpu=gpu, store=store).run(
                **escalated_args, model=cfg.opus_model
            )
        raise
def _reference_source(reference: Callable[..., Any]) -> str:
    """Return the source text of ``reference``, or its ``repr`` as a fallback.

    ``inspect.getsource`` raises ``OSError`` when the source cannot be
    retrieved and ``TypeError`` when the object is not something it can
    inspect (built-in functions and classes, callable instances,
    ``functools.partial`` objects, ...). Both cases fall back to ``repr`` so
    that archiving the reference never aborts a run.
    """
    try:
        return inspect.getsource(reference)
    except (OSError, TypeError):
        return repr(reference)
class _TracingLLMClient(LLMClient):
    """Wrap an ``LLMClient`` and remember every response it returns.

    Responses are appended to ``responses`` in call order. A call that
    raises is not recorded.
    """

    def __init__(self, inner: LLMClient) -> None:
        """Wrap ``inner`` and start with an empty response log."""
        self.responses: list[LLMResponse] = []
        self._inner = inner

    def complete(
        self,
        *,
        system: list[dict[str, Any]],
        messages: list[dict[str, Any]],
        tools: list[ToolSpec] | None = None,
        model: str,
        max_tokens: int = 4096,
        temperature: float | None = None,
    ) -> LLMResponse:
        """Forward the request unchanged to the wrapped client and log the reply."""
        request: dict[str, Any] = {
            "system": system,
            "messages": messages,
            "tools": tools,
            "model": model,
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
        reply = self._inner.complete(**request)
        self.responses.append(reply)
        return reply
def _run_traced_stage(
    stage_traces: list[StageTrace],
    llm: _TracingLLMClient,
    stage_name: str,
    action: Callable[[], T],
    *,
    succeeded: Callable[[T], bool] | None = None,
) -> T:
    """Run ``action`` and append a ``StageTrace`` describing it.

    Only the LLM responses recorded while ``action`` ran are attributed to
    the stage. If ``action`` raises, a trace with ``succeeded=False`` is
    appended and the exception is re-raised. Otherwise the trace's success
    flag is ``succeeded(result)`` when a predicate is given, else ``True``.
    """
    first_new_response = len(llm.responses)
    began = time.time()
    try:
        outcome = action()
    except Exception:
        failed_trace = _build_stage_trace(
            stage_name,
            llm.responses[first_new_response:],
            began,
            succeeded=False,
        )
        stage_traces.append(failed_trace)
        raise

    stage_responses = llm.responses[first_new_response:]
    if succeeded is None:
        stage_ok = True
    else:
        stage_ok = succeeded(outcome)
    stage_traces.append(
        _build_stage_trace(stage_name, stage_responses, began, succeeded=stage_ok)
    )
    return outcome
def _build_stage_trace(
    stage_name: str,
    responses: list[LLMResponse],
    started_at: float,
    *,
    succeeded: bool,
) -> StageTrace:
    """Aggregate a stage's LLM responses into a single ``StageTrace``.

    Token counters are summed over ``responses``. ``attempts`` is the number
    of responses, but never less than 1. Latency is the sum of the latencies
    reported by the responses; when that sum is not positive, the time
    elapsed since ``started_at`` (a ``time.time()`` timestamp) is used.
    """
    llm_latency = sum(reply.latency_seconds for reply in responses)
    attempt_count = max(1, len(responses))
    models = _model_summary(responses)
    total_in = sum(reply.tokens_in for reply in responses)
    total_out = sum(reply.tokens_out for reply in responses)
    total_cache_read = sum(reply.cache_read_tokens for reply in responses)

    if llm_latency > 0:
        latency = llm_latency
    else:
        # No usable latency from the responses: fall back to elapsed time.
        latency = time.time() - started_at

    return StageTrace(
        stage_name=stage_name,
        attempts=attempt_count,
        succeeded=succeeded,
        model_used=models,
        tokens_in=total_in,
        tokens_out=total_out,
        cache_read_tokens=total_cache_read,
        latency_seconds=latency,
    )
def _model_summary(responses: list[LLMResponse]) -> str:
    """Return the distinct model names used, comma-separated, first-seen order.

    Returns ``"none"`` when ``responses`` is empty.
    """
    if not responses:
        return "none"
    # dict.fromkeys removes duplicates while keeping insertion order, without
    # the quadratic membership scan of a list.
    distinct_models = dict.fromkeys(response.model for response in responses)
    return ", ".join(distinct_models)
def _build_report(
    *,
    run_id: str,
    spec_name: str,
    stage_traces: list[StageTrace],
    wall_time_seconds: float,
    warnings: list[str] | None = None,
) -> SynthesisReport:
    """Assemble the run-level ``SynthesisReport`` from the stage traces.

    The executed-stage list and the token totals are derived from
    ``stage_traces``. ``total_cost_usd`` is not set here and keeps its model
    default. A missing or empty ``warnings`` becomes an empty list.
    """
    executed = [trace.stage_name for trace in stage_traces]
    tokens_in_total = sum(trace.tokens_in for trace in stage_traces)
    tokens_out_total = sum(trace.tokens_out for trace in stage_traces)
    if not warnings:
        warnings = []
    return SynthesisReport(
        run_id=run_id,
        spec_name=spec_name,
        stages_executed=executed,
        stage_traces=stage_traces,
        total_llm_tokens_in=tokens_in_total,
        total_llm_tokens_out=tokens_out_total,
        wall_time_seconds=wall_time_seconds,
        warnings=warnings,
    )
def _write_result_report(store: ArtifactStore, result: SynthesisResult) -> None:
    """Persist ``result`` as ``report.json`` under its run.

    ``kernel_callable`` is left out of the dump.
    """
    serializable = result.model_dump(mode="json", exclude={"kernel_callable"})
    store.write_json(result.run_id, "report.json", serializable)

cuda_engine/prompts/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+from importlib.resources import files
def load_prompt(name: str) -> str:
    """Return the text of the packaged prompt ``<name>.md``.

    The file is looked up among this package's resources and decoded as
    UTF-8.

    Raises:
        FileNotFoundError: If no such prompt file exists in the package.
    """
    resource = files(__package__).joinpath(f"{name}.md")
    if resource.is_file():
        return resource.read_text(encoding="utf-8")
    raise FileNotFoundError(f"Prompt not found: {name}")

cuda_engine/prompts/codegen.md ADDED Viewed

@@ -0,0 +1,29 @@
+# CUDA Codegen Stage
+You generate a single CUDA `.cu` file for the frozen `KernelSpec`.
+Required runnable ABI:
+- The generated source must be a Torch-loadable C++/CUDA extension, not a raw CUDA-only library.
+- Include the needed Torch headers, normally `#include <torch/extension.h>` and `#include <ATen/cuda/CUDAContext.h>`.
+- Expose exactly one user-callable op: `cuda_engine::forward`.
+- Register the schema with `TORCH_LIBRARY(cuda_engine, m)`.
+- Register the CUDA implementation with `TORCH_LIBRARY_IMPL(cuda_engine, CUDA, m)`.
+- The Python runner will call `torch.ops.cuda_engine.forward(*inputs)`, so the op signature must match the `KernelSpec` inputs and outputs.
+- Return a single `torch::Tensor` for one output, or a tuple/list-compatible Torch return type for multiple outputs.
+Rules:
+- Honor the target architecture and the frozen input/output contract.
+- Treat KernelSpec inputs with `shape: []` as scalar/0-D Torch tensors, not vectors.
+- For reduction outputs, return tensors with the exact reduced shape in the KernelSpec.
+  Example: input `["B", "D"]` and output `["B"]` means one output element per row.
+- For argmax kernels, return `int64` indices when the KernelSpec output dtype is `int64`.
+- For RMSNorm fp16 kernels, use fp32 accumulation for the mean square and reciprocal square root,
+  do not add gamma unless the KernelSpec includes it, and cast the final output to fp16.
+- For `sm_80`, prefer straightforward CUDA C++ suitable for A100.
+- Make memory hierarchy choices explicit in comments when they affect performance.
+- Use 256 threads per block as the default elementwise baseline unless the spec suggests otherwise.
+- Output complete CUDA source as one fenced `cuda` code block.
+- After generating the source, call `compile_kernel(src, target_arch)` using the exact source.
+- If compilation fails, use the compiler errors to revise the source and call `compile_kernel` again.
+Do not change dtypes, shapes, argument ordering, or precision tolerance.

cuda_engine/prompts/interview.md ADDED Viewed

@@ -0,0 +1,30 @@
+# CUDA Kernel Interview Stage
+You convert a user prompt plus Python reference metadata into a frozen `KernelSpec`.
+Return only structured JSON, preferably in a fenced `json` code block. The JSON must match:
+```json
+{
+  "name": "snake_case_kernel_name",
+  "target_arch": "sm_80",
+  "inputs": [{"name": "x", "dtype": "fp32", "shape": ["N"], "layout_hint": "any"}],
+  "outputs": [{"name": "out", "dtype": "fp32", "shape": ["N"], "layout_hint": "any"}],
+  "precision_tolerance": {"rtol": 0.001, "atol": 0.001},
+  "optimization_priority": "balanced",
+  "notes": "brief clarification notes"
+}
+```
+Rules:
+- Do not invent unsupported target architectures.
+- Use symbolic shapes when concrete shapes are unknown.
+- Represent scalar or 0-D tensor inputs with an empty shape list: `"shape": []`.
+- For reductions, shrink only the reduced dimensions. Example: last-dimension sum of `x[B,D]`
+  should use input shape `["B", "D"]` and output shape `["B"]`.
+- For `argmax`, use an integer output dtype, normally `int64`, with the reduced output shape.
+- For fp16 RMSNorm without gamma, use fp16 input/output shapes that match, fp32 accumulation
+  semantics in `notes`, and a practical fp16 tolerance such as `rtol=0.01`, `atol=0.01`.
+- Preserve the user's requested operation; do not broaden scope.
+- Prefer `throughput` for large elementwise/reduction prompts and `latency` only when the prompt explicitly prioritizes small inputs.
+- Use the reference metadata only to infer names and arity; if uncertain, choose conservative defaults and explain in `notes`.

cuda_engine/prompts/perf_fix.md ADDED Viewed

@@ -0,0 +1,56 @@
+# CUDA Performance Repair
+You revise a CUDA kernel that compiles and is correct but runs below the
+performance target. Your job is to improve throughput without breaking
+correctness, then call `compile_kernel(src, target_arch)` with the revised
+source.
+Required runnable ABI (unchanged from the previous kernel):
+- Keep `cuda_engine::forward` as the only user-callable op.
+- Keep the same `TORCH_LIBRARY(cuda_engine, m)` namespace, op signature,
+  argument order, dtypes, shapes, and return type.
+- Keep correctness: outputs must remain within the KernelSpec precision
+  tolerance compared to the reference.
+Inputs you will receive:
+- The current `kernel.cu` source.
+- The frozen `KernelSpec`.
+- The latest `BenchmarkResult` (`custom_ms`, `baseline_ms`, achieved GB/s).
+- A `NsightMetrics` snapshot (achieved occupancy, registers per thread,
+  spill bytes when available).
+- Suggested optimization hints derived from those metrics.
+Optimization themes to consider:
+- **Register pressure**: high regs/thread reduces occupancy on A100
+  (64K 32-bit registers and at most 2048 resident threads per SM, so full
+  occupancy needs at most 32 regs/thread; 64 regs/thread caps occupancy
+  at 50%). Split work into more, smaller blocks; reduce live registers;
+  only spill to shared memory when necessary.
+- **Occupancy**: low achieved occupancy means few warps are resident.
+  Investigate register, shared memory, or block-size limits.
+- **Memory coalescing**: ensure 32 consecutive threads in a warp read
+  128 consecutive bytes. Avoid strided global loads/stores; use
+  `__ldg` for read-only cached loads where appropriate.
+- **Grid wave alignment**: A100 has 108 SMs. Choose grid sizes that
+  fill full waves; a partial-wave tail can waste up to 20% of runtime.
+- **Shared-memory tiling**: for reductions, use 256-thread blocks with
+  `__shfl_down_sync` for warp-level reduction; store partial results
+  to shared memory only when the reduction crosses warp boundaries.
+- **Vectorized loads**: `float4`/`__half2` loads can double effective
+  bandwidth for elementwise ops on aligned, contiguous data.
+- **Simple fused elementwise kernels**: for one-pass pointwise or fused
+  pointwise work, prefer one coalesced read/compute/write pass with enough
+  blocks to cover the tensor. Do not add multi-pass reductions, shared-memory
+  staging, or complicated synchronization unless the KernelSpec actually
+  requires cross-element communication.
+Matching torch.compile is acceptable but not the goal. To strictly beat it on A100:
+- Prefer vectorized memory ops: `float4` for fp32, `__half2` for fp16. They double effective bandwidth on aligned contiguous data.
+- Align grid to A100's 108 SMs. A full wave is a multiple of 108 blocks; a partial tail wave wastes runtime. For tensors that don't divide evenly, prefer fewer-larger blocks over more-smaller.
+- Maximize instruction-level parallelism: `#pragma unroll` inner loops with small bounded trip count. Keep enough independent work per thread to hide arithmetic and memory latency.
+- Fuse passes when the KernelSpec permits. Reductions followed by elementwise can often be one-pass with `__shfl_down_sync` warp reductions.
+- Inspect register pressure first if Nsight shows occupancy < 50%. If regs/thread > 64 on a 256-thread block, work-split or block-size reduction frees waves.
+Output the complete revised CUDA source as one fenced `cuda` code block,
+then call `compile_kernel(src, target_arch)` with the exact source.
+Do not change dtypes, shapes, argument ordering, or precision tolerance.

cuda_engine/prompts/polish.md ADDED Viewed

@@ -0,0 +1,13 @@
+# CUDA Kernel Polish Stage
+You annotate an already-correct CUDA kernel for maintainability.
+Return only the complete annotated CUDA source in a fenced `cuda` code block.
+Annotations should explain:
+- tile size and launch configuration choices
+- memory layout and coalescing assumptions
+- precision tolerance and correctness summary
+- performance summary, including speedups and any occupancy/register notes when available
+Do not change behavior, signatures, namespace registration, or the `cuda_engine::forward` ABI.

cuda_engine/services/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Service interfaces and implementations."""

cuda_engine/services/gpu/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from cuda_engine.services.gpu.base import CompileResult, GPURunner, NsightMetrics, RunResult
+__all__ = ["CompileResult", "GPURunner", "NsightMetrics", "RunResult"]