simplex-tensor 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- simplex_tensor-1.0.0/PKG-INFO +326 -0
- simplex_tensor-1.0.0/README.md +294 -0
- simplex_tensor-1.0.0/pyproject.toml +55 -0
- simplex_tensor-1.0.0/python/.gitignore +8 -0
- simplex_tensor-1.0.0/python/Cargo.lock +584 -0
- simplex_tensor-1.0.0/python/Cargo.toml +26 -0
- simplex_tensor-1.0.0/python/README.md +294 -0
- simplex_tensor-1.0.0/python/benchmarks/bench_physics.py +374 -0
- simplex_tensor-1.0.0/python/benchmarks/bench_results.json +384 -0
- simplex_tensor-1.0.0/python/benchmarks/bench_symplex.py +865 -0
- simplex_tensor-1.0.0/python/demo.py +88 -0
- simplex_tensor-1.0.0/python/setup.py +72 -0
- simplex_tensor-1.0.0/python/src/lib.rs +2001 -0
- simplex_tensor-1.0.0/python/tests/test_jit.py +278 -0
- simplex_tensor-1.0.0/python/tests/test_math.py +228 -0
- simplex_tensor-1.0.0/python/tests/test_purity.py +259 -0
- simplex_tensor-1.0.0/rust-engine/.gitignore +1 -0
- simplex_tensor-1.0.0/rust-engine/Cargo.toml +31 -0
- simplex_tensor-1.0.0/rust-engine/benches/physics_bench.rs +1352 -0
- simplex_tensor-1.0.0/rust-engine/src/cuda_backend.rs +1538 -0
- simplex_tensor-1.0.0/rust-engine/src/ffi.rs +847 -0
- simplex_tensor-1.0.0/rust-engine/src/fusion_engine.rs +3684 -0
- simplex_tensor-1.0.0/rust-engine/src/lib.rs +1702 -0
- simplex_tensor-1.0.0/rust-engine/src/phase3_jit.rs +21728 -0
- simplex_tensor-1.0.0/rust-engine/src/polyhedral.rs +4812 -0
- simplex_tensor-1.0.0/rust-engine/src/tracing_jit.rs +2114 -0
- simplex_tensor-1.0.0/rust-engine/src/types.rs +590 -0
- simplex_tensor-1.0.0/rust-engine/src/x86_emitter.rs +2287 -0
- simplex_tensor-1.0.0/rust-engine/tests/integration.rs +118 -0
- simplex_tensor-1.0.0/symplex/__init__.py +799 -0
- simplex_tensor-1.0.0/symplex/_array.py +419 -0
- simplex_tensor-1.0.0/symplex/_ast_checker.py +500 -0
- simplex_tensor-1.0.0/symplex/_errors.py +39 -0
- simplex_tensor-1.0.0/symplex/_jit.py +3326 -0
- simplex_tensor-1.0.0/symplex/_tracer.cpp +612 -0
- simplex_tensor-1.0.0/symplex/_tracer.py +445 -0
- simplex_tensor-1.0.0/symplex/linalg.py +258 -0
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: simplex-tensor
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Classifier: Development Status :: 4 - Beta
|
|
5
|
+
Classifier: Intended Audience :: Developers
|
|
6
|
+
Classifier: Intended Audience :: Science/Research
|
|
7
|
+
Classifier: License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Rust
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Classifier: Topic :: Software Development :: Compilers
|
|
15
|
+
Classifier: Topic :: System :: Hardware :: Hardware Drivers
|
|
16
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
17
|
+
Requires-Dist: numpy>=1.24
|
|
18
|
+
Requires-Dist: pytest>=7.0 ; extra == 'dev'
|
|
19
|
+
Requires-Dist: maturin>=1.0 ; extra == 'dev'
|
|
20
|
+
Requires-Dist: scipy>=1.10 ; extra == 'ml'
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Provides-Extra: ml
|
|
23
|
+
Summary: SympleX – Polyhedral Tensor Superoptimizer with JAX-style purity enforcement and x86-64 JIT compilation
|
|
24
|
+
Keywords: jit,compiler,tensor,polyhedral,optimization,simd,avx512,numpy,scientific-computing
|
|
25
|
+
Author: SympleX Contributors
|
|
26
|
+
License: AGPL-3.0
|
|
27
|
+
Requires-Python: >=3.10
|
|
28
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
29
|
+
Project-URL: Homepage, https://github.com/hollowguy898-cloud/SympleX
|
|
30
|
+
Project-URL: Repository, https://github.com/hollowguy898-cloud/SympleX
|
|
31
|
+
|
|
32
|
+
# SympleX — Polyhedral Tensor Superoptimizer
|
|
33
|
+
|
|
34
|
+
**SympleX** is a C++20 compiler engine that combines equality saturation (e-graph superoptimization) with the polyhedral model to automatically discover mathematically equivalent but faster programs for AI training. It explores the space of valid program transformations — not just tile sizes — and maps the best discovered program directly onto GPU Tensor Cores, SRAM hierarchies, and distributed cluster topologies.
|
|
35
|
+
|
|
36
|
+
## Architecture
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
[AI Model Graph / Iteration Space]
|
|
40
|
+
│
|
|
41
|
+
▼
|
|
42
|
+
┌──────────────────────────────────────────────────────┐
|
|
43
|
+
│ Level 1: Mathematical Superoptimizer (E-Graph) │
|
|
44
|
+
│ ┌────────────────────────────────────────────────┐ │
|
|
45
|
+
│ │ E-Graph: compactly represents exponentially │ │
|
|
46
|
+
│ │ many equivalent programs as equivalence classes │ │
|
|
47
|
+
│ └────────────────────────────────────────────────┘ │
|
|
48
|
+
│ Rewrite Rules: │
|
|
49
|
+
│ - A*B + A*C → A*(B+C) (factor / distribute) │
|
|
50
|
+
│ - ReLU(MatMul(A,B)) → FusedMatMulReLU(A,B) │
|
|
51
|
+
│ - MatMul(A,B)+bias → FusedMatMulAdd(A,B,bias) │
|
|
52
|
+
│ - A + 0 == A, A * 1 == A, A * 0 == 0 │
|
|
53
|
+
│ - (A@B)^T == B^T@A^T, Transpose(Transpose(A))=A │
|
|
54
|
+
│ - Softmax decomposition, LayerNorm decomposition │
|
|
55
|
+
│ - Tiling decomposition, tile-pointwise distribution│
|
|
56
|
+
│ Polyhedral Guardrails: │
|
|
57
|
+
│ - Rejects rewrites that violate dependencies │
|
|
58
|
+
│ Cost-Guided Extraction: │
|
|
59
|
+
│ - Picks cheapest program from saturated e-graph │
|
|
60
|
+
└──────────────────────────────────────────────────────┘
|
|
61
|
+
│
|
|
62
|
+
▼
|
|
63
|
+
┌──────────────────────────────────────────────────────┐
|
|
64
|
+
│ Level 2: Hardware-Mapping Search │
|
|
65
|
+
│ Phase 1: Roofline pruning (memory-bound filtering) │
|
|
66
|
+
│ Phase 2: Compute-symmetry alignment (TC multiples) │
|
|
67
|
+
│ Phase 3: Hardware occupancy sieve (analytical/emp.) │
|
|
68
|
+
└──────────────────────────────────────────────────────┘
|
|
69
|
+
│
|
|
70
|
+
▼
|
|
71
|
+
┌──────────────────────────────────────────────────────┐
|
|
72
|
+
│ Code Generator (PTX Emitter) │
|
|
73
|
+
│ - WMMA/MMA Tensor Core instructions │
|
|
74
|
+
│ - Shared memory swizzling (XOR bank-conflict avoid) │
|
|
75
|
+
│ - Async TMA / cp.async pipelines │
|
|
76
|
+
│ - Double-buffering & software pipelining │
|
|
77
|
+
└──────────────────────────────────────────────────────┘
|
|
78
|
+
│
|
|
79
|
+
▼
|
|
80
|
+
┌──────────────────────────────────────────────────────┐
|
|
81
|
+
│ Distributed Sharding & Fault Tolerance │
|
|
82
|
+
│ - 3D mesh (TP × PP × DP) │
|
|
83
|
+
│ - NCCL collective scheduling │
|
|
84
|
+
│ - 1F1B pipeline overlap │
|
|
85
|
+
│ - Resilient forward recovery (no rollback) │
|
|
86
|
+
│ - Dynamic micro-batching │
|
|
87
|
+
│ - Activation checkpointing (save/recompute/offload) │
|
|
88
|
+
└──────────────────────────────────────────────────────┘
|
|
89
|
+
│
|
|
90
|
+
▼
|
|
91
|
+
[Optimized GPU Binary / PTX]
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Modules
|
|
95
|
+
|
|
96
|
+
| Module | Path | Description |
|
|
97
|
+
|--------|------|-------------|
|
|
98
|
+
| **Polyhedral** | `include/symplex/polyhedral/` | Integer polytopes, affine maps, dependency analysis, iteration spaces |
|
|
99
|
+
| **Schedule** | `include/symplex/schedule/` | Schedule trees, tiling, operator fusion, GPU parallelization mapping |
|
|
100
|
+
| **Hardware** | `include/symplex/hardware/` | GPU topology, Tensor Core specs, memory hierarchy, roofline model |
|
|
101
|
+
| **Optimizer** | `include/symplex/optimizer/` | **E-graph superoptimizer** + 3-phase hardware search (roofline → symmetry → occupancy) |
|
|
102
|
+
| **Cost Model** | `include/symplex/costmodel/` | Roofline, analytical, empirical, and hybrid cost models |
|
|
103
|
+
| **Codegen** | `include/symplex/codegen/` | PTX emitter, WMMA/MMA instruction generation, swizzling, register allocation |
|
|
104
|
+
| **Distributed** | `include/symplex/distributed/` | Cluster mesh, SPMD sharding, NCCL bridge, pipeline overlap |
|
|
105
|
+
| **Fault Tolerance** | `include/symplex/fault_tolerance/` | Health monitoring, forward recovery, communicator repair, activation checkpointing |
|
|
106
|
+
| **Training** | `include/symplex/training/` | Training loop orchestrator, dynamic batch sizing, memory watchdog, JIT compiler pipeline |
|
|
107
|
+
|
|
108
|
+
## Quick Start
|
|
109
|
+
|
|
110
|
+
### Prerequisites
|
|
111
|
+
|
|
112
|
+
- C++20 compiler (GCC 12+, Clang 15+)
|
|
113
|
+
- CMake 3.20+
|
|
114
|
+
- Optional: CUDA Toolkit 12+ (for empirical profiling and GPU execution)
|
|
115
|
+
- Optional: ISL (Integer Set Library) for enhanced polyhedral analysis
|
|
116
|
+
- Optional: NCCL (for distributed training)
|
|
117
|
+
|
|
118
|
+
### Build
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
git clone https://github.com/hollowguy898-cloud/SympleX.git
|
|
122
|
+
cd SympleX
|
|
123
|
+
mkdir build && cd build
|
|
124
|
+
cmake .. -DCMAKE_BUILD_TYPE=Release
|
|
125
|
+
make -j$(nproc)
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Run Tests
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
./symplex_tests
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Run the Matmul Optimization Example
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
./example_matmul
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
This optimizes a 4096×4096×2048 matrix multiplication for the H100 GPU target, running the full 3-phase superoptimizer search and generating PTX code.
|
|
141
|
+
|
|
142
|
+
## Usage
|
|
143
|
+
|
|
144
|
+
### Optimize a Matrix Multiplication
|
|
145
|
+
|
|
146
|
+
```cpp
|
|
147
|
+
#include "symplex/training/compiler_pipeline.h"
|
|
148
|
+
#include "symplex/hardware/hardware_target.h"
|
|
149
|
+
|
|
150
|
+
using namespace symplex::training;
|
|
151
|
+
using namespace symplex::hardware;
|
|
152
|
+
|
|
153
|
+
// Target: NVIDIA H100
|
|
154
|
+
HardwareTarget target = HardwareTarget::H100();
|
|
155
|
+
|
|
156
|
+
// Create compiler pipeline
|
|
157
|
+
CompilerPipeline pipeline(target);
|
|
158
|
+
|
|
159
|
+
// Optimize matmul C[M,N] += A[M,K] * B[K,N]
|
|
160
|
+
auto result = pipeline.compile_matmul(4096, 4096, 2048);
|
|
161
|
+
|
|
162
|
+
// result.ptx_source — generated PTX kernel
|
|
163
|
+
// result.estimated_latency_ns — predicted latency
|
|
164
|
+
// result.speedup_vs_naive — speedup over naive tiling
|
|
165
|
+
// result.grid_dims / block_dims — GPU launch parameters
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### Use the Superoptimizer Directly
|
|
169
|
+
|
|
170
|
+
```cpp
|
|
171
|
+
#include "symplex/optimizer/superoptimizer.h"
|
|
172
|
+
#include "symplex/polyhedral/iteration_space.h"
|
|
173
|
+
#include "symplex/hardware/hardware_target.h"
|
|
174
|
+
|
|
175
|
+
using namespace symplex::optimizer;
|
|
176
|
+
using namespace symplex::polyhedral;
|
|
177
|
+
using namespace symplex::hardware;
|
|
178
|
+
|
|
179
|
+
HardwareTarget target = HardwareTarget::H100();
|
|
180
|
+
Superoptimizer opt(target);
|
|
181
|
+
|
|
182
|
+
auto ispace = make_matmul_iteration_space(1024, 1024, 512);
|
|
183
|
+
auto result = opt.optimize(ispace);
|
|
184
|
+
|
|
185
|
+
// result.best_tile — optimal TileConfig
|
|
186
|
+
// result.estimated_latency_ns — predicted latency
|
|
187
|
+
// result.speedup_vs_naive — speedup vs smallest Tensor Core tile
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### Distributed Training with Fault Tolerance
|
|
191
|
+
|
|
192
|
+
```cpp
|
|
193
|
+
#include "symplex/training/training_loop.h"
|
|
194
|
+
#include "symplex/hardware/hardware_target.h"
|
|
195
|
+
|
|
196
|
+
using namespace symplex::training;
|
|
197
|
+
using namespace symplex::hardware;
|
|
198
|
+
|
|
199
|
+
TrainingConfig config;
|
|
200
|
+
config.global_batch_size = 2048;
|
|
201
|
+
config.enable_fault_tolerance = true;
|
|
202
|
+
config.enable_dynamic_batching = true;
|
|
203
|
+
|
|
204
|
+
HardwareTarget target = HardwareTarget::H100();
|
|
205
|
+
TrainingLoop loop(config, target);
|
|
206
|
+
|
|
207
|
+
auto ispace = make_matmul_iteration_space(4096, 4096, 2048);
|
|
208
|
+
loop.initialize(ispace);
|
|
209
|
+
|
|
210
|
+
auto results = loop.execute_epoch();
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
## Key Mathematical Concepts
|
|
214
|
+
|
|
215
|
+
### Iteration Space (I)
|
|
216
|
+
Every AI loop nest is modeled as an **integer polytope**:
|
|
217
|
+
|
|
218
|
+
```
|
|
219
|
+
I = { i ∈ Z^n | A·i + b ≥ 0 }
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
### Data Dependency Polyhedron (D)
|
|
223
|
+
Dependencies are vectors in the polyhedral space that must remain lexicographically positive:
|
|
224
|
+
|
|
225
|
+
```
|
|
226
|
+
d = i_sink - i_source, d ≻ 0 (lexicographically positive)
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### Schedule Map (Φ)
|
|
230
|
+
The schedule map Φ is the central object being optimized: it assigns each iteration point a hardware coordinate and a time step:
|
|
231
|
+
|
|
232
|
+
```
|
|
233
|
+
Φ(i) → (DeviceID, SM_ID, Warp_ID, Thread_ID, TimeStep)
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
### 3-Phase Superoptimizer Search (Level 2)
|
|
237
|
+
|
|
238
|
+
1. **Roofline Pruning**: Drop 90% of tile configurations using analytical operational intensity bounds
|
|
239
|
+
2. **Compute-Symmetry Alignment**: Only evaluate tile sizes that are exact multiples of Tensor Core fragment dimensions (16×8×16 for H100)
|
|
240
|
+
3. **Hardware Occupancy Sieve**: Micro-benchmark the top candidates, selecting the configuration that maximizes SM occupancy
|
|
241
|
+
|
|
242
|
+
### Equality Saturation (Level 1 — the real superoptimizer)
|
|
243
|
+
|
|
244
|
+
Unlike traditional autotuners that only sweep hardware parameters, SympleX's superoptimizer explores the **space of equivalent programs** using equality saturation:
|
|
245
|
+
|
|
246
|
+
1. **E-Graph Construction**: The input program is represented as an e-graph — a data structure that compactly represents exponentially many equivalent expressions as equivalence classes
|
|
247
|
+
2. **Rewrite Rule Application**: Algebraic identities, fusion patterns, and tiling decompositions are applied iteratively, growing the e-graph to represent all discovered equivalent programs
|
|
248
|
+
3. **Polyhedral Guardrails**: Before any extracted program is accepted, it is validated against the original computation's data dependencies — rewrites that would violate semantics are rejected
|
|
249
|
+
4. **Cost-Guided Extraction**: The cheapest program is extracted from the saturated e-graph using a bottom-up dynamic programming approach, where fused operations (e.g., `FusedMatMulReLU`) have lower cost than their unfused equivalents (`ReLU(MatMul(A,B))`)
|
|
250
|
+
|
|
251
|
+
## Hardware Targets
|
|
252
|
+
|
|
253
|
+
Built-in profiles for:
|
|
254
|
+
|
|
255
|
+
| GPU | SMs | Tensor Core | HBM BW | SRAM/SM |
|
|
256
|
+
|-----|-----|-------------|--------|---------|
|
|
257
|
+
| **H100** (Hopper) | 132 | 16×8×16 FP16 | 3.35 TB/s | 228 KB |
|
|
258
|
+
| **B200** (Blackwell) | 160 | 16×8×32 FP16 | 8.0 TB/s | 304 KB |
|
|
259
|
+
| **Generic** | 84 | 16×8×16 FP16 | 2.0 TB/s | 164 KB |
|
|
260
|
+
|
|
261
|
+
Custom targets can be constructed via `HardwareTarget` fields.
|
|
262
|
+
|
|
263
|
+
## Project Structure
|
|
264
|
+
|
|
265
|
+
```
|
|
266
|
+
SympleX/
|
|
267
|
+
├── CMakeLists.txt
|
|
268
|
+
├── LICENSE # GNU AGPL v3
|
|
269
|
+
├── README.md
|
|
270
|
+
├── include/symplex/
|
|
271
|
+
│ ├── polyhedral/ # Core polyhedral types
|
|
272
|
+
│ │ ├── integer_polytope.h
|
|
273
|
+
│ │ ├── affine_map.h
|
|
274
|
+
│ │ ├── dependency.h
|
|
275
|
+
│ │ ├── iteration_space.h
|
|
276
|
+
│ │ └── union_map.h
|
|
277
|
+
│ ├── schedule/ # Schedule tree & transformations
|
|
278
|
+
│ │ ├── schedule_tree.h
|
|
279
|
+
│ │ ├── tiling.h
|
|
280
|
+
│ │ ├── fusion.h
|
|
281
|
+
│ │ ├── parallelization.h
|
|
282
|
+
│ │ └── schedule_map.h
|
|
283
|
+
│ ├── hardware/ # GPU hardware models
|
|
284
|
+
│ │ └── hardware_target.h
|
|
285
|
+
│ ├── optimizer/ # Superoptimizer search
|
|
286
|
+
│ │ ├── tile_config.h
|
|
287
|
+
│ │ ├── search_phase1.h
|
|
288
|
+
│ │ ├── search_phase2.h
|
|
289
|
+
│ │ ├── search_phase3.h
|
|
290
|
+
│ │ └── superoptimizer.h
|
|
291
|
+
│ ├── costmodel/ # Performance cost models
|
|
292
|
+
│ │ ├── roofline.h
|
|
293
|
+
│ │ ├── analytical.h
|
|
294
|
+
│ │ ├── empirical.h
|
|
295
|
+
│ │ └── cost_model.h
|
|
296
|
+
│ ├── codegen/ # PTX code generation
|
|
297
|
+
│ │ ├── wmma.h
|
|
298
|
+
│ │ ├── swizzle.h
|
|
299
|
+
│ │ ├── register_allocator.h
|
|
300
|
+
│ │ ├── ptx_emitter.h
|
|
301
|
+
│ │ └── code_generator.h
|
|
302
|
+
│ ├── distributed/ # Distributed training
|
|
303
|
+
│ │ ├── mesh.h
|
|
304
|
+
│ │ ├── sharding.h
|
|
305
|
+
│ │ ├── nccl_bridge.h
|
|
306
|
+
│ │ └── pipeline_overlap.h
|
|
307
|
+
│ ├── fault_tolerance/ # Fault tolerance
|
|
308
|
+
│ │ ├── health_monitor.h
|
|
309
|
+
│ │ ├── forward_recovery.h
|
|
310
|
+
│ │ ├── communicator_repair.h
|
|
311
|
+
│ │ └── checkpoint.h
|
|
312
|
+
│ └── training/ # Training orchestrator
|
|
313
|
+
│ ├── dynamic_batch.h
|
|
314
|
+
│ ├── memory_watchdog.h
|
|
315
|
+
│ ├── training_loop.h
|
|
316
|
+
│ └── compiler_pipeline.h
|
|
317
|
+
├── src/ # Implementation files (mirrors include/)
|
|
318
|
+
├── tests/ # Unit tests
|
|
319
|
+
├── benchmarks/ # Performance benchmarks
|
|
320
|
+
└── examples/ # Usage examples
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
## License
|
|
324
|
+
|
|
325
|
+
GNU Affero General Public License v3 — see [LICENSE](LICENSE).
|
|
326
|
+
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
# SympleX — Polyhedral Tensor Superoptimizer
|
|
2
|
+
|
|
3
|
+
**SympleX** is a C++20 compiler engine that combines equality saturation (e-graph superoptimization) with the polyhedral model to automatically discover mathematically equivalent but faster programs for AI training. It explores the space of valid program transformations — not just tile sizes — and maps the best discovered program directly onto GPU Tensor Cores, SRAM hierarchies, and distributed cluster topologies.
|
|
4
|
+
|
|
5
|
+
## Architecture
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
[AI Model Graph / Iteration Space]
|
|
9
|
+
│
|
|
10
|
+
▼
|
|
11
|
+
┌──────────────────────────────────────────────────────┐
|
|
12
|
+
│ Level 1: Mathematical Superoptimizer (E-Graph) │
|
|
13
|
+
│ ┌────────────────────────────────────────────────┐ │
|
|
14
|
+
│ │ E-Graph: compactly represents exponentially │ │
|
|
15
|
+
│ │ many equivalent programs as equivalence classes │ │
|
|
16
|
+
│ └────────────────────────────────────────────────┘ │
|
|
17
|
+
│ Rewrite Rules: │
|
|
18
|
+
│ - A*B + A*C → A*(B+C) (factor / distribute) │
|
|
19
|
+
│ - ReLU(MatMul(A,B)) → FusedMatMulReLU(A,B) │
|
|
20
|
+
│ - MatMul(A,B)+bias → FusedMatMulAdd(A,B,bias) │
|
|
21
|
+
│ - A + 0 == A, A * 1 == A, A * 0 == 0 │
|
|
22
|
+
│ - (A@B)^T == B^T@A^T, Transpose(Transpose(A))=A │
|
|
23
|
+
│ - Softmax decomposition, LayerNorm decomposition │
|
|
24
|
+
│ - Tiling decomposition, tile-pointwise distribution│
|
|
25
|
+
│ Polyhedral Guardrails: │
|
|
26
|
+
│ - Rejects rewrites that violate dependencies │
|
|
27
|
+
│ Cost-Guided Extraction: │
|
|
28
|
+
│ - Picks cheapest program from saturated e-graph │
|
|
29
|
+
└──────────────────────────────────────────────────────┘
|
|
30
|
+
│
|
|
31
|
+
▼
|
|
32
|
+
┌──────────────────────────────────────────────────────┐
|
|
33
|
+
│ Level 2: Hardware-Mapping Search │
|
|
34
|
+
│ Phase 1: Roofline pruning (memory-bound filtering) │
|
|
35
|
+
│ Phase 2: Compute-symmetry alignment (TC multiples) │
|
|
36
|
+
│ Phase 3: Hardware occupancy sieve (analytical/emp.) │
|
|
37
|
+
└──────────────────────────────────────────────────────┘
|
|
38
|
+
│
|
|
39
|
+
▼
|
|
40
|
+
┌──────────────────────────────────────────────────────┐
|
|
41
|
+
│ Code Generator (PTX Emitter) │
|
|
42
|
+
│ - WMMA/MMA Tensor Core instructions │
|
|
43
|
+
│ - Shared memory swizzling (XOR bank-conflict avoid) │
|
|
44
|
+
│ - Async TMA / cp.async pipelines │
|
|
45
|
+
│ - Double-buffering & software pipelining │
|
|
46
|
+
└──────────────────────────────────────────────────────┘
|
|
47
|
+
│
|
|
48
|
+
▼
|
|
49
|
+
┌──────────────────────────────────────────────────────┐
|
|
50
|
+
│ Distributed Sharding & Fault Tolerance │
|
|
51
|
+
│ - 3D mesh (TP × PP × DP) │
|
|
52
|
+
│ - NCCL collective scheduling │
|
|
53
|
+
│ - 1F1B pipeline overlap │
|
|
54
|
+
│ - Resilient forward recovery (no rollback) │
|
|
55
|
+
│ - Dynamic micro-batching │
|
|
56
|
+
│ - Activation checkpointing (save/recompute/offload) │
|
|
57
|
+
└──────────────────────────────────────────────────────┘
|
|
58
|
+
│
|
|
59
|
+
▼
|
|
60
|
+
[Optimized GPU Binary / PTX]
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Modules
|
|
64
|
+
|
|
65
|
+
| Module | Path | Description |
|
|
66
|
+
|--------|------|-------------|
|
|
67
|
+
| **Polyhedral** | `include/symplex/polyhedral/` | Integer polytopes, affine maps, dependency analysis, iteration spaces |
|
|
68
|
+
| **Schedule** | `include/symplex/schedule/` | Schedule trees, tiling, operator fusion, GPU parallelization mapping |
|
|
69
|
+
| **Hardware** | `include/symplex/hardware/` | GPU topology, Tensor Core specs, memory hierarchy, roofline model |
|
|
70
|
+
| **Optimizer** | `include/symplex/optimizer/` | **E-graph superoptimizer** + 3-phase hardware search (roofline → symmetry → occupancy) |
|
|
71
|
+
| **Cost Model** | `include/symplex/costmodel/` | Roofline, analytical, empirical, and hybrid cost models |
|
|
72
|
+
| **Codegen** | `include/symplex/codegen/` | PTX emitter, WMMA/MMA instruction generation, swizzling, register allocation |
|
|
73
|
+
| **Distributed** | `include/symplex/distributed/` | Cluster mesh, SPMD sharding, NCCL bridge, pipeline overlap |
|
|
74
|
+
| **Fault Tolerance** | `include/symplex/fault_tolerance/` | Health monitoring, forward recovery, communicator repair, activation checkpointing |
|
|
75
|
+
| **Training** | `include/symplex/training/` | Training loop orchestrator, dynamic batch sizing, memory watchdog, JIT compiler pipeline |
|
|
76
|
+
|
|
77
|
+
## Quick Start
|
|
78
|
+
|
|
79
|
+
### Prerequisites
|
|
80
|
+
|
|
81
|
+
- C++20 compiler (GCC 12+, Clang 15+)
|
|
82
|
+
- CMake 3.20+
|
|
83
|
+
- Optional: CUDA Toolkit 12+ (for empirical profiling and GPU execution)
|
|
84
|
+
- Optional: ISL (Integer Set Library) for enhanced polyhedral analysis
|
|
85
|
+
- Optional: NCCL (for distributed training)
|
|
86
|
+
|
|
87
|
+
### Build
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
git clone https://github.com/hollowguy898-cloud/SympleX.git
|
|
91
|
+
cd SympleX
|
|
92
|
+
mkdir build && cd build
|
|
93
|
+
cmake .. -DCMAKE_BUILD_TYPE=Release
|
|
94
|
+
make -j$(nproc)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Run Tests
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
./symplex_tests
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Run the Matmul Optimization Example
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
./example_matmul
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
This optimizes a 4096×4096×2048 matrix multiplication for the H100 GPU target, running the full 3-phase superoptimizer search and generating PTX code.
|
|
110
|
+
|
|
111
|
+
## Usage
|
|
112
|
+
|
|
113
|
+
### Optimize a Matrix Multiplication
|
|
114
|
+
|
|
115
|
+
```cpp
|
|
116
|
+
#include "symplex/training/compiler_pipeline.h"
|
|
117
|
+
#include "symplex/hardware/hardware_target.h"
|
|
118
|
+
|
|
119
|
+
using namespace symplex::training;
|
|
120
|
+
using namespace symplex::hardware;
|
|
121
|
+
|
|
122
|
+
// Target: NVIDIA H100
|
|
123
|
+
HardwareTarget target = HardwareTarget::H100();
|
|
124
|
+
|
|
125
|
+
// Create compiler pipeline
|
|
126
|
+
CompilerPipeline pipeline(target);
|
|
127
|
+
|
|
128
|
+
// Optimize matmul C[M,N] += A[M,K] * B[K,N]
|
|
129
|
+
auto result = pipeline.compile_matmul(4096, 4096, 2048);
|
|
130
|
+
|
|
131
|
+
// result.ptx_source — generated PTX kernel
|
|
132
|
+
// result.estimated_latency_ns — predicted latency
|
|
133
|
+
// result.speedup_vs_naive — speedup over naive tiling
|
|
134
|
+
// result.grid_dims / block_dims — GPU launch parameters
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Use the Superoptimizer Directly
|
|
138
|
+
|
|
139
|
+
```cpp
|
|
140
|
+
#include "symplex/optimizer/superoptimizer.h"
|
|
141
|
+
#include "symplex/polyhedral/iteration_space.h"
|
|
142
|
+
#include "symplex/hardware/hardware_target.h"
|
|
143
|
+
|
|
144
|
+
using namespace symplex::optimizer;
|
|
145
|
+
using namespace symplex::polyhedral;
|
|
146
|
+
using namespace symplex::hardware;
|
|
147
|
+
|
|
148
|
+
HardwareTarget target = HardwareTarget::H100();
|
|
149
|
+
Superoptimizer opt(target);
|
|
150
|
+
|
|
151
|
+
auto ispace = make_matmul_iteration_space(1024, 1024, 512);
|
|
152
|
+
auto result = opt.optimize(ispace);
|
|
153
|
+
|
|
154
|
+
// result.best_tile — optimal TileConfig
|
|
155
|
+
// result.estimated_latency_ns — predicted latency
|
|
156
|
+
// result.speedup_vs_naive — speedup vs smallest Tensor Core tile
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### Distributed Training with Fault Tolerance
|
|
160
|
+
|
|
161
|
+
```cpp
|
|
162
|
+
#include "symplex/training/training_loop.h"
|
|
163
|
+
#include "symplex/hardware/hardware_target.h"
|
|
164
|
+
|
|
165
|
+
using namespace symplex::training;
|
|
166
|
+
using namespace symplex::hardware;
|
|
167
|
+
|
|
168
|
+
TrainingConfig config;
|
|
169
|
+
config.global_batch_size = 2048;
|
|
170
|
+
config.enable_fault_tolerance = true;
|
|
171
|
+
config.enable_dynamic_batching = true;
|
|
172
|
+
|
|
173
|
+
HardwareTarget target = HardwareTarget::H100();
|
|
174
|
+
TrainingLoop loop(config, target);
|
|
175
|
+
|
|
176
|
+
auto ispace = make_matmul_iteration_space(4096, 4096, 2048);
|
|
177
|
+
loop.initialize(ispace);
|
|
178
|
+
|
|
179
|
+
auto results = loop.execute_epoch();
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
## Key Mathematical Concepts
|
|
183
|
+
|
|
184
|
+
### Iteration Space (I)
|
|
185
|
+
Every AI loop nest is modeled as an **integer polytope**:
|
|
186
|
+
|
|
187
|
+
```
|
|
188
|
+
I = { i ∈ Z^n | A·i + b ≥ 0 }
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### Data Dependency Polyhedron (D)
|
|
192
|
+
Dependencies are vectors in the polyhedral space that must remain lexicographically positive:
|
|
193
|
+
|
|
194
|
+
```
|
|
195
|
+
d = i_sink - i_source, d ≻ 0 (lexicographically positive)
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
### Schedule Map (Φ)
|
|
199
|
+
The schedule map Φ is the central object being optimized: it assigns each iteration point a hardware coordinate and a time step:
|
|
200
|
+
|
|
201
|
+
```
|
|
202
|
+
Φ(i) → (DeviceID, SM_ID, Warp_ID, Thread_ID, TimeStep)
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### 3-Phase Superoptimizer Search (Level 2)
|
|
206
|
+
|
|
207
|
+
1. **Roofline Pruning**: Drop 90% of tile configurations using analytical operational intensity bounds
|
|
208
|
+
2. **Compute-Symmetry Alignment**: Only evaluate tile sizes that are exact multiples of Tensor Core fragment dimensions (16×8×16 for H100)
|
|
209
|
+
3. **Hardware Occupancy Sieve**: Micro-benchmark the top candidates, selecting the configuration that maximizes SM occupancy
|
|
210
|
+
|
|
211
|
+
### Equality Saturation (Level 1 — the real superoptimizer)
|
|
212
|
+
|
|
213
|
+
Unlike traditional autotuners that only sweep hardware parameters, SympleX's superoptimizer explores the **space of equivalent programs** using equality saturation:
|
|
214
|
+
|
|
215
|
+
1. **E-Graph Construction**: The input program is represented as an e-graph — a data structure that compactly represents exponentially many equivalent expressions as equivalence classes
|
|
216
|
+
2. **Rewrite Rule Application**: Algebraic identities, fusion patterns, and tiling decompositions are applied iteratively, growing the e-graph to represent all discovered equivalent programs
|
|
217
|
+
3. **Polyhedral Guardrails**: Before any extracted program is accepted, it is validated against the original computation's data dependencies — rewrites that would violate semantics are rejected
|
|
218
|
+
4. **Cost-Guided Extraction**: The cheapest program is extracted from the saturated e-graph using a bottom-up dynamic programming approach, where fused operations (e.g., `FusedMatMulReLU`) have lower cost than their unfused equivalents (`ReLU(MatMul(A,B))`)
|
|
219
|
+
|
|
220
|
+
## Hardware Targets
|
|
221
|
+
|
|
222
|
+
Built-in profiles for:
|
|
223
|
+
|
|
224
|
+
| GPU | SMs | Tensor Core | HBM BW | SRAM/SM |
|
|
225
|
+
|-----|-----|-------------|--------|---------|
|
|
226
|
+
| **H100** (Hopper) | 132 | 16×8×16 FP16 | 3.35 TB/s | 228 KB |
|
|
227
|
+
| **B200** (Blackwell) | 160 | 16×8×32 FP16 | 8.0 TB/s | 304 KB |
|
|
228
|
+
| **Generic** | 84 | 16×8×16 FP16 | 2.0 TB/s | 164 KB |
|
|
229
|
+
|
|
230
|
+
Custom targets can be constructed via `HardwareTarget` fields.
|
|
231
|
+
|
|
232
|
+
## Project Structure
|
|
233
|
+
|
|
234
|
+
```
|
|
235
|
+
SympleX/
|
|
236
|
+
├── CMakeLists.txt
|
|
237
|
+
├── LICENSE # GNU AGPL v3
|
|
238
|
+
├── README.md
|
|
239
|
+
├── include/symplex/
|
|
240
|
+
│ ├── polyhedral/ # Core polyhedral types
|
|
241
|
+
│ │ ├── integer_polytope.h
|
|
242
|
+
│ │ ├── affine_map.h
|
|
243
|
+
│ │ ├── dependency.h
|
|
244
|
+
│ │ ├── iteration_space.h
|
|
245
|
+
│ │ └── union_map.h
|
|
246
|
+
│ ├── schedule/ # Schedule tree & transformations
|
|
247
|
+
│ │ ├── schedule_tree.h
|
|
248
|
+
│ │ ├── tiling.h
|
|
249
|
+
│ │ ├── fusion.h
|
|
250
|
+
│ │ ├── parallelization.h
|
|
251
|
+
│ │ └── schedule_map.h
|
|
252
|
+
│ ├── hardware/ # GPU hardware models
|
|
253
|
+
│ │ └── hardware_target.h
|
|
254
|
+
│ ├── optimizer/ # Superoptimizer search
|
|
255
|
+
│ │ ├── tile_config.h
|
|
256
|
+
│ │ ├── search_phase1.h
|
|
257
|
+
│ │ ├── search_phase2.h
|
|
258
|
+
│ │ ├── search_phase3.h
|
|
259
|
+
│ │ └── superoptimizer.h
|
|
260
|
+
│ ├── costmodel/ # Performance cost models
|
|
261
|
+
│ │ ├── roofline.h
|
|
262
|
+
│ │ ├── analytical.h
|
|
263
|
+
│ │ ├── empirical.h
|
|
264
|
+
│ │ └── cost_model.h
|
|
265
|
+
│ ├── codegen/ # PTX code generation
|
|
266
|
+
│ │ ├── wmma.h
|
|
267
|
+
│ │ ├── swizzle.h
|
|
268
|
+
│ │ ├── register_allocator.h
|
|
269
|
+
│ │ ├── ptx_emitter.h
|
|
270
|
+
│ │ └── code_generator.h
|
|
271
|
+
│ ├── distributed/ # Distributed training
|
|
272
|
+
│ │ ├── mesh.h
|
|
273
|
+
│ │ ├── sharding.h
|
|
274
|
+
│ │ ├── nccl_bridge.h
|
|
275
|
+
│ │ └── pipeline_overlap.h
|
|
276
|
+
│ ├── fault_tolerance/ # Fault tolerance
|
|
277
|
+
│ │ ├── health_monitor.h
|
|
278
|
+
│ │ ├── forward_recovery.h
|
|
279
|
+
│ │ ├── communicator_repair.h
|
|
280
|
+
│ │ └── checkpoint.h
|
|
281
|
+
│ └── training/ # Training orchestrator
|
|
282
|
+
│ ├── dynamic_batch.h
|
|
283
|
+
│ ├── memory_watchdog.h
|
|
284
|
+
│ ├── training_loop.h
|
|
285
|
+
│ └── compiler_pipeline.h
|
|
286
|
+
├── src/ # Implementation files (mirrors include/)
|
|
287
|
+
├── tests/ # Unit tests
|
|
288
|
+
├── benchmarks/ # Performance benchmarks
|
|
289
|
+
└── examples/ # Usage examples
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
## License
|
|
293
|
+
|
|
294
|
+
GNU Affero General Public License v3 — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["maturin>=1.0,<2.0"]
|
|
3
|
+
build-backend = "maturin"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "simplex-tensor"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "SympleX – Polyhedral Tensor Superoptimizer with JAX-style purity enforcement and x86-64 JIT compilation"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = {text = "AGPL-3.0"}
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "SympleX Contributors"},
|
|
14
|
+
]
|
|
15
|
+
keywords = ["jit", "compiler", "tensor", "polyhedral", "optimization", "simd", "avx512", "numpy", "scientific-computing"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
"Programming Language :: Rust",
|
|
26
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
27
|
+
"Topic :: Software Development :: Compilers",
|
|
28
|
+
"Topic :: System :: Hardware :: Hardware Drivers",
|
|
29
|
+
"Operating System :: POSIX :: Linux",
|
|
30
|
+
]
|
|
31
|
+
dependencies = [
|
|
32
|
+
"numpy>=1.24",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.optional-dependencies]
|
|
36
|
+
dev = [
|
|
37
|
+
"pytest>=7.0",
|
|
38
|
+
"maturin>=1.0",
|
|
39
|
+
]
|
|
40
|
+
ml = [
|
|
41
|
+
"scipy>=1.10",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
[project.urls]
|
|
45
|
+
Homepage = "https://github.com/hollowguy898-cloud/SympleX"
|
|
46
|
+
Repository = "https://github.com/hollowguy898-cloud/SympleX"
|
|
47
|
+
|
|
48
|
+
[tool.maturin]
|
|
49
|
+
features = ["pyo3/extension-module"]
|
|
50
|
+
python-source = "."
|
|
51
|
+
module-name = "symplex._symplex_core"
|
|
52
|
+
manifest-path = "python/Cargo.toml"
|
|
53
|
+
|
|
54
|
+
[tool.pytest.ini_options]
|
|
55
|
+
testpaths = ["tests"]
|