simplex-tensor 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. simplex_tensor-1.0.0/PKG-INFO +326 -0
  2. simplex_tensor-1.0.0/README.md +294 -0
  3. simplex_tensor-1.0.0/pyproject.toml +55 -0
  4. simplex_tensor-1.0.0/python/.gitignore +8 -0
  5. simplex_tensor-1.0.0/python/Cargo.lock +584 -0
  6. simplex_tensor-1.0.0/python/Cargo.toml +26 -0
  7. simplex_tensor-1.0.0/python/README.md +294 -0
  8. simplex_tensor-1.0.0/python/benchmarks/bench_physics.py +374 -0
  9. simplex_tensor-1.0.0/python/benchmarks/bench_results.json +384 -0
  10. simplex_tensor-1.0.0/python/benchmarks/bench_symplex.py +865 -0
  11. simplex_tensor-1.0.0/python/demo.py +88 -0
  12. simplex_tensor-1.0.0/python/setup.py +72 -0
  13. simplex_tensor-1.0.0/python/src/lib.rs +2001 -0
  14. simplex_tensor-1.0.0/python/tests/test_jit.py +278 -0
  15. simplex_tensor-1.0.0/python/tests/test_math.py +228 -0
  16. simplex_tensor-1.0.0/python/tests/test_purity.py +259 -0
  17. simplex_tensor-1.0.0/rust-engine/.gitignore +1 -0
  18. simplex_tensor-1.0.0/rust-engine/Cargo.toml +31 -0
  19. simplex_tensor-1.0.0/rust-engine/benches/physics_bench.rs +1352 -0
  20. simplex_tensor-1.0.0/rust-engine/src/cuda_backend.rs +1538 -0
  21. simplex_tensor-1.0.0/rust-engine/src/ffi.rs +847 -0
  22. simplex_tensor-1.0.0/rust-engine/src/fusion_engine.rs +3684 -0
  23. simplex_tensor-1.0.0/rust-engine/src/lib.rs +1702 -0
  24. simplex_tensor-1.0.0/rust-engine/src/phase3_jit.rs +21728 -0
  25. simplex_tensor-1.0.0/rust-engine/src/polyhedral.rs +4812 -0
  26. simplex_tensor-1.0.0/rust-engine/src/tracing_jit.rs +2114 -0
  27. simplex_tensor-1.0.0/rust-engine/src/types.rs +590 -0
  28. simplex_tensor-1.0.0/rust-engine/src/x86_emitter.rs +2287 -0
  29. simplex_tensor-1.0.0/rust-engine/tests/integration.rs +118 -0
  30. simplex_tensor-1.0.0/symplex/__init__.py +799 -0
  31. simplex_tensor-1.0.0/symplex/_array.py +419 -0
  32. simplex_tensor-1.0.0/symplex/_ast_checker.py +500 -0
  33. simplex_tensor-1.0.0/symplex/_errors.py +39 -0
  34. simplex_tensor-1.0.0/symplex/_jit.py +3326 -0
  35. simplex_tensor-1.0.0/symplex/_tracer.cpp +612 -0
  36. simplex_tensor-1.0.0/symplex/_tracer.py +445 -0
  37. simplex_tensor-1.0.0/symplex/linalg.py +258 -0
@@ -0,0 +1,326 @@
1
+ Metadata-Version: 2.4
2
+ Name: simplex-tensor
3
+ Version: 1.0.0
4
+ Classifier: Development Status :: 4 - Beta
5
+ Classifier: Intended Audience :: Developers
6
+ Classifier: Intended Audience :: Science/Research
7
+ Classifier: License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Rust
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Classifier: Topic :: Software Development :: Compilers
15
+ Classifier: Topic :: System :: Hardware :: Hardware Drivers
16
+ Classifier: Operating System :: POSIX :: Linux
17
+ Requires-Dist: numpy>=1.24
18
+ Requires-Dist: pytest>=7.0 ; extra == 'dev'
19
+ Requires-Dist: maturin>=1.0 ; extra == 'dev'
20
+ Requires-Dist: scipy>=1.10 ; extra == 'ml'
21
+ Provides-Extra: dev
22
+ Provides-Extra: ml
23
+ Summary: SympleX – Polyhedral Tensor Superoptimizer with JAX-style purity enforcement and x86-64 JIT compilation
24
+ Keywords: jit,compiler,tensor,polyhedral,optimization,simd,avx512,numpy,scientific-computing
25
+ Author: SympleX Contributors
26
+ License: AGPL-3.0
27
+ Requires-Python: >=3.10
28
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
29
+ Project-URL: Homepage, https://github.com/hollowguy898-cloud/SympleX
30
+ Project-URL: Repository, https://github.com/hollowguy898-cloud/SympleX
31
+
32
+ # SympleX — Polyhedral Tensor Superoptimizer
33
+
34
+ **SympleX** is a C++20 compiler engine that combines equality saturation (e-graph superoptimization) with the polyhedral model to automatically discover mathematically equivalent but faster programs for AI training. It explores the space of valid program transformations — not just tile sizes — and maps the best discovered program directly onto GPU Tensor Cores, SRAM hierarchies, and distributed cluster topologies.
35
+
36
+ ## Architecture
37
+
38
+ ```
39
+ [AI Model Graph / Iteration Space]
40
+
41
+
42
+ ┌──────────────────────────────────────────────────────┐
43
+ │ Level 1: Mathematical Superoptimizer (E-Graph) │
44
+ │ ┌────────────────────────────────────────────────┐ │
45
+ │ │ E-Graph: compactly represents exponentially │ │
46
+ │ │ many equivalent programs as equivalence classes │ │
47
+ │ └────────────────────────────────────────────────┘ │
48
+ │ Rewrite Rules: │
49
+ │ - A*B + A*C → A*(B+C) (factor / distribute) │
50
+ │ - ReLU(MatMul(A,B)) → FusedMatMulReLU(A,B) │
51
+ │ - MatMul(A,B)+bias → FusedMatMulAdd(A,B,bias) │
52
+ │ - A + 0 == A, A * 1 == A, A * 0 == 0 │
53
+ │ - (A@B)^T == B^T@A^T, Transpose(Transpose(A))=A │
54
+ │ - Softmax decomposition, LayerNorm decomposition │
55
+ │ - Tiling decomposition, tile-pointwise distribution│
56
+ │ Polyhedral Guardrails: │
57
+ │ - Rejects rewrites that violate dependencies │
58
+ │ Cost-Guided Extraction: │
59
+ │ - Picks cheapest program from saturated e-graph │
60
+ └──────────────────────────────────────────────────────┘
61
+
62
+
63
+ ┌──────────────────────────────────────────────────────┐
64
+ │ Level 2: Hardware-Mapping Search │
65
+ │ Phase 1: Roofline pruning (memory-bound filtering) │
66
+ │ Phase 2: Compute-symmetry alignment (TC multiples) │
67
+ │ Phase 3: Hardware occupancy sieve (analytical/emp.) │
68
+ └──────────────────────────────────────────────────────┘
69
+
70
+
71
+ ┌──────────────────────────────────────────────────────┐
72
+ │ Code Generator (PTX Emitter) │
73
+ │ - WMMA/MMA Tensor Core instructions │
74
+ │ - Shared memory swizzling (XOR bank-conflict avoid) │
75
+ │ - Async TMA / cp.async pipelines │
76
+ │ - Double-buffering & software pipelining │
77
+ └──────────────────────────────────────────────────────┘
78
+
79
+
80
+ ┌──────────────────────────────────────────────────────┐
81
+ │ Distributed Sharding & Fault Tolerance │
82
+ │ - 3D mesh (TP × PP × DP) │
83
+ │ - NCCL collective scheduling │
84
+ │ - 1F1B pipeline overlap │
85
+ │ - Resilient forward recovery (no rollback) │
86
+ │ - Dynamic micro-batching │
87
+ │ - Activation checkpointing (save/recompute/offload) │
88
+ └──────────────────────────────────────────────────────┘
89
+
90
+
91
+ [Optimized GPU Binary / PTX]
92
+ ```
93
+
94
+ ## Modules
95
+
96
+ | Module | Path | Description |
97
+ |--------|------|-------------|
98
+ | **Polyhedral** | `include/symplex/polyhedral/` | Integer polytopes, affine maps, dependency analysis, iteration spaces |
99
+ | **Schedule** | `include/symplex/schedule/` | Schedule trees, tiling, operator fusion, GPU parallelization mapping |
100
+ | **Hardware** | `include/symplex/hardware/` | GPU topology, Tensor Core specs, memory hierarchy, roofline model |
101
+ | **Optimizer** | `include/symplex/optimizer/` | **E-graph superoptimizer** + 3-phase hardware search (roofline → symmetry → occupancy) |
102
+ | **Cost Model** | `include/symplex/costmodel/` | Roofline, analytical, empirical, and hybrid cost models |
103
+ | **Codegen** | `include/symplex/codegen/` | PTX emitter, WMMA/MMA instruction generation, swizzling, register allocation |
104
+ | **Distributed** | `include/symplex/distributed/` | Cluster mesh, SPMD sharding, NCCL bridge, pipeline overlap |
105
+ | **Fault Tolerance** | `include/symplex/fault_tolerance/` | Health monitoring, forward recovery, communicator repair, activation checkpointing |
106
+ | **Training** | `include/symplex/training/` | Training loop orchestrator, dynamic batch sizing, memory watchdog, JIT compiler pipeline |
107
+
108
+ ## Quick Start
109
+
110
+ ### Prerequisites
111
+
112
+ - C++20 compiler (GCC 12+, Clang 15+)
113
+ - CMake 3.20+
114
+ - Optional: CUDA Toolkit 12+ (for empirical profiling and GPU execution)
115
+ - Optional: ISL (Integer Set Library) for enhanced polyhedral analysis
116
+ - Optional: NCCL (for distributed training)
117
+
118
+ ### Build
119
+
120
+ ```bash
121
+ git clone https://github.com/hollowguy898-cloud/SympleX.git
122
+ cd SympleX
123
+ mkdir build && cd build
124
+ cmake .. -DCMAKE_BUILD_TYPE=Release
125
+ make -j$(nproc)
126
+ ```
127
+
128
+ ### Run Tests
129
+
130
+ ```bash
131
+ ./symplex_tests
132
+ ```
133
+
134
+ ### Run the Matmul Optimization Example
135
+
136
+ ```bash
137
+ ./example_matmul
138
+ ```
139
+
140
+ This optimizes a 4096×4096×2048 matrix multiplication for the H100 GPU target, running the full 3-phase superoptimizer search and generating PTX code.
141
+
142
+ ## Usage
143
+
144
+ ### Optimize a Matrix Multiplication
145
+
146
+ ```cpp
147
+ #include "symplex/training/compiler_pipeline.h"
148
+ #include "symplex/hardware/hardware_target.h"
149
+
150
+ using namespace symplex::training;
151
+ using namespace symplex::hardware;
152
+
153
+ // Target: NVIDIA H100
154
+ HardwareTarget target = HardwareTarget::H100();
155
+
156
+ // Create compiler pipeline
157
+ CompilerPipeline pipeline(target);
158
+
159
+ // Optimize matmul C[M,N] += A[M,K] * B[K,N]
160
+ auto result = pipeline.compile_matmul(4096, 4096, 2048);
161
+
162
+ // result.ptx_source — generated PTX kernel
163
+ // result.estimated_latency_ns — predicted latency
164
+ // result.speedup_vs_naive — speedup over naive tiling
165
+ // result.grid_dims / block_dims — GPU launch parameters
166
+ ```
167
+
168
+ ### Use the Superoptimizer Directly
169
+
170
+ ```cpp
171
+ #include "symplex/optimizer/superoptimizer.h"
172
+ #include "symplex/polyhedral/iteration_space.h"
173
+ #include "symplex/hardware/hardware_target.h"
174
+
175
+ using namespace symplex::optimizer;
176
+ using namespace symplex::polyhedral;
177
+ using namespace symplex::hardware;
178
+
179
+ HardwareTarget target = HardwareTarget::H100();
180
+ Superoptimizer opt(target);
181
+
182
+ auto ispace = make_matmul_iteration_space(1024, 1024, 512);
183
+ auto result = opt.optimize(ispace);
184
+
185
+ // result.best_tile — optimal TileConfig
186
+ // result.estimated_latency_ns — predicted latency
187
+ // result.speedup_vs_naive — speedup vs smallest Tensor Core tile
188
+ ```
189
+
190
+ ### Distributed Training with Fault Tolerance
191
+
192
+ ```cpp
193
+ #include "symplex/training/training_loop.h"
194
+ #include "symplex/hardware/hardware_target.h"
195
+
196
+ using namespace symplex::training;
197
+ using namespace symplex::hardware;
198
+
199
+ TrainingConfig config;
200
+ config.global_batch_size = 2048;
201
+ config.enable_fault_tolerance = true;
202
+ config.enable_dynamic_batching = true;
203
+
204
+ HardwareTarget target = HardwareTarget::H100();
205
+ TrainingLoop loop(config, target);
206
+
207
+ auto ispace = symplex::polyhedral::make_matmul_iteration_space(4096, 4096, 2048);
208
+ loop.initialize(ispace);
209
+
210
+ auto results = loop.execute_epoch();
211
+ ```
212
+
213
+ ## Key Mathematical Concepts
214
+
215
+ ### Iteration Space (I)
216
+ Every AI loop nest is modeled as an **integer polytope**:
217
+
218
+ ```
219
+ I = { i ∈ Z^n | A·i + b ≥ 0 }
220
+ ```
221
+
222
+ ### Data Dependency Polyhedron (D)
223
+ Dependencies are vectors in the polyhedral space that must remain lexicographically positive:
224
+
225
+ ```
226
+ d = i_sink - i_source, d ≻ 0 (lexicographically positive)
227
+ ```
228
+
229
+ ### Schedule Map (Φ)
230
+ The central object of the optimization is the schedule map Φ, which assigns each iteration point to hardware coordinates and a time step:
231
+
232
+ ```
233
+ Φ(i) → (DeviceID, SM_ID, Warp_ID, Thread_ID, TimeStep)
234
+ ```
235
+
236
+ ### 3-Phase Superoptimizer Search (Level 2)
237
+
238
+ 1. **Roofline Pruning**: Drop 90% of tile configurations using analytical operational intensity bounds
239
+ 2. **Compute-Symmetry Alignment**: Only evaluate tile sizes that are exact multiples of Tensor Core fragment dimensions (16×8×16 for H100)
240
+ 3. **Hardware Occupancy Sieve**: Micro-benchmark the top candidates, selecting the configuration that maximizes SM occupancy
241
+
242
+ ### Equality Saturation (Level 1 — the real superoptimizer)
243
+
244
+ Unlike traditional autotuners that only sweep hardware parameters, SympleX's superoptimizer explores the **space of equivalent programs** using equality saturation:
245
+
246
+ 1. **E-Graph Construction**: The input program is represented as an e-graph — a data structure that compactly represents exponentially many equivalent expressions as equivalence classes
247
+ 2. **Rewrite Rule Application**: Algebraic identities, fusion patterns, and tiling decompositions are applied iteratively, growing the e-graph to represent all discovered equivalent programs
248
+ 3. **Polyhedral Guardrails**: Before any extracted program is accepted, it is validated against the original computation's data dependencies — rewrites that would violate semantics are rejected
249
+ 4. **Cost-Guided Extraction**: The cheapest program is extracted from the saturated e-graph using a bottom-up dynamic programming approach, where fused operations (e.g., `FusedMatMulReLU`) have lower cost than their unfused equivalents (`ReLU(MatMul(A,B))`)
250
+
251
+ ## Hardware Targets
252
+
253
+ Built-in profiles for:
254
+
255
+ | GPU | SMs | Tensor Core | HBM BW | SRAM/SM |
256
+ |-----|-----|-------------|--------|---------|
257
+ | **H100** (Hopper) | 132 | 16×8×16 FP16 | 3.35 TB/s | 228 KB |
258
+ | **B200** (Blackwell) | 160 | 16×8×32 FP16 | 8.0 TB/s | 304 KB |
259
+ | **Generic** | 84 | 16×8×16 FP16 | 2.0 TB/s | 164 KB |
260
+
261
+ Custom targets can be constructed via `HardwareTarget` fields.
262
+
263
+ ## Project Structure
264
+
265
+ ```
266
+ SympleX/
267
+ ├── CMakeLists.txt
268
+ ├── LICENSE # GNU AGPL v3
269
+ ├── README.md
270
+ ├── include/symplex/
271
+ │ ├── polyhedral/ # Core polyhedral types
272
+ │ │ ├── integer_polytope.h
273
+ │ │ ├── affine_map.h
274
+ │ │ ├── dependency.h
275
+ │ │ ├── iteration_space.h
276
+ │ │ └── union_map.h
277
+ │ ├── schedule/ # Schedule tree & transformations
278
+ │ │ ├── schedule_tree.h
279
+ │ │ ├── tiling.h
280
+ │ │ ├── fusion.h
281
+ │ │ ├── parallelization.h
282
+ │ │ └── schedule_map.h
283
+ │ ├── hardware/ # GPU hardware models
284
+ │ │ └── hardware_target.h
285
+ │ ├── optimizer/ # Superoptimizer search
286
+ │ │ ├── tile_config.h
287
+ │ │ ├── search_phase1.h
288
+ │ │ ├── search_phase2.h
289
+ │ │ ├── search_phase3.h
290
+ │ │ └── superoptimizer.h
291
+ │ ├── costmodel/ # Performance cost models
292
+ │ │ ├── roofline.h
293
+ │ │ ├── analytical.h
294
+ │ │ ├── empirical.h
295
+ │ │ └── cost_model.h
296
+ │ ├── codegen/ # PTX code generation
297
+ │ │ ├── wmma.h
298
+ │ │ ├── swizzle.h
299
+ │ │ ├── register_allocator.h
300
+ │ │ ├── ptx_emitter.h
301
+ │ │ └── code_generator.h
302
+ │ ├── distributed/ # Distributed training
303
+ │ │ ├── mesh.h
304
+ │ │ ├── sharding.h
305
+ │ │ ├── nccl_bridge.h
306
+ │ │ └── pipeline_overlap.h
307
+ │ ├── fault_tolerance/ # Fault tolerance
308
+ │ │ ├── health_monitor.h
309
+ │ │ ├── forward_recovery.h
310
+ │ │ ├── communicator_repair.h
311
+ │ │ └── checkpoint.h
312
+ │ └── training/ # Training orchestrator
313
+ │ ├── dynamic_batch.h
314
+ │ ├── memory_watchdog.h
315
+ │ ├── training_loop.h
316
+ │ └── compiler_pipeline.h
317
+ ├── src/ # Implementation files (mirrors include/)
318
+ ├── tests/ # Unit tests
319
+ ├── benchmarks/ # Performance benchmarks
320
+ └── examples/ # Usage examples
321
+ ```
322
+
323
+ ## License
324
+
325
+ GNU Affero General Public License v3 — see [LICENSE](LICENSE).
326
+
@@ -0,0 +1,294 @@
1
+ # SympleX — Polyhedral Tensor Superoptimizer
2
+
3
+ **SympleX** is a C++20 compiler engine that combines equality saturation (e-graph superoptimization) with the polyhedral model to automatically discover mathematically equivalent but faster programs for AI training. It explores the space of valid program transformations — not just tile sizes — and maps the best discovered program directly onto GPU Tensor Cores, SRAM hierarchies, and distributed cluster topologies.
4
+
5
+ ## Architecture
6
+
7
+ ```
8
+ [AI Model Graph / Iteration Space]
9
+
10
+
11
+ ┌──────────────────────────────────────────────────────┐
12
+ │ Level 1: Mathematical Superoptimizer (E-Graph) │
13
+ │ ┌────────────────────────────────────────────────┐ │
14
+ │ │ E-Graph: compactly represents exponentially │ │
15
+ │ │ many equivalent programs as equivalence classes │ │
16
+ │ └────────────────────────────────────────────────┘ │
17
+ │ Rewrite Rules: │
18
+ │ - A*B + A*C → A*(B+C) (factor / distribute) │
19
+ │ - ReLU(MatMul(A,B)) → FusedMatMulReLU(A,B) │
20
+ │ - MatMul(A,B)+bias → FusedMatMulAdd(A,B,bias) │
21
+ │ - A + 0 == A, A * 1 == A, A * 0 == 0 │
22
+ │ - (A@B)^T == B^T@A^T, Transpose(Transpose(A))=A │
23
+ │ - Softmax decomposition, LayerNorm decomposition │
24
+ │ - Tiling decomposition, tile-pointwise distribution│
25
+ │ Polyhedral Guardrails: │
26
+ │ - Rejects rewrites that violate dependencies │
27
+ │ Cost-Guided Extraction: │
28
+ │ - Picks cheapest program from saturated e-graph │
29
+ └──────────────────────────────────────────────────────┘
30
+
31
+
32
+ ┌──────────────────────────────────────────────────────┐
33
+ │ Level 2: Hardware-Mapping Search │
34
+ │ Phase 1: Roofline pruning (memory-bound filtering) │
35
+ │ Phase 2: Compute-symmetry alignment (TC multiples) │
36
+ │ Phase 3: Hardware occupancy sieve (analytical/emp.) │
37
+ └──────────────────────────────────────────────────────┘
38
+
39
+
40
+ ┌──────────────────────────────────────────────────────┐
41
+ │ Code Generator (PTX Emitter) │
42
+ │ - WMMA/MMA Tensor Core instructions │
43
+ │ - Shared memory swizzling (XOR bank-conflict avoid) │
44
+ │ - Async TMA / cp.async pipelines │
45
+ │ - Double-buffering & software pipelining │
46
+ └──────────────────────────────────────────────────────┘
47
+
48
+
49
+ ┌──────────────────────────────────────────────────────┐
50
+ │ Distributed Sharding & Fault Tolerance │
51
+ │ - 3D mesh (TP × PP × DP) │
52
+ │ - NCCL collective scheduling │
53
+ │ - 1F1B pipeline overlap │
54
+ │ - Resilient forward recovery (no rollback) │
55
+ │ - Dynamic micro-batching │
56
+ │ - Activation checkpointing (save/recompute/offload) │
57
+ └──────────────────────────────────────────────────────┘
58
+
59
+
60
+ [Optimized GPU Binary / PTX]
61
+ ```
62
+
63
+ ## Modules
64
+
65
+ | Module | Path | Description |
66
+ |--------|------|-------------|
67
+ | **Polyhedral** | `include/symplex/polyhedral/` | Integer polytopes, affine maps, dependency analysis, iteration spaces |
68
+ | **Schedule** | `include/symplex/schedule/` | Schedule trees, tiling, operator fusion, GPU parallelization mapping |
69
+ | **Hardware** | `include/symplex/hardware/` | GPU topology, Tensor Core specs, memory hierarchy, roofline model |
70
+ | **Optimizer** | `include/symplex/optimizer/` | **E-graph superoptimizer** + 3-phase hardware search (roofline → symmetry → occupancy) |
71
+ | **Cost Model** | `include/symplex/costmodel/` | Roofline, analytical, empirical, and hybrid cost models |
72
+ | **Codegen** | `include/symplex/codegen/` | PTX emitter, WMMA/MMA instruction generation, swizzling, register allocation |
73
+ | **Distributed** | `include/symplex/distributed/` | Cluster mesh, SPMD sharding, NCCL bridge, pipeline overlap |
74
+ | **Fault Tolerance** | `include/symplex/fault_tolerance/` | Health monitoring, forward recovery, communicator repair, activation checkpointing |
75
+ | **Training** | `include/symplex/training/` | Training loop orchestrator, dynamic batch sizing, memory watchdog, JIT compiler pipeline |
76
+
77
+ ## Quick Start
78
+
79
+ ### Prerequisites
80
+
81
+ - C++20 compiler (GCC 12+, Clang 15+)
82
+ - CMake 3.20+
83
+ - Optional: CUDA Toolkit 12+ (for empirical profiling and GPU execution)
84
+ - Optional: ISL (Integer Set Library) for enhanced polyhedral analysis
85
+ - Optional: NCCL (for distributed training)
86
+
87
+ ### Build
88
+
89
+ ```bash
90
+ git clone https://github.com/hollowguy898-cloud/SympleX.git
91
+ cd SympleX
92
+ mkdir build && cd build
93
+ cmake .. -DCMAKE_BUILD_TYPE=Release
94
+ make -j$(nproc)
95
+ ```
96
+
97
+ ### Run Tests
98
+
99
+ ```bash
100
+ ./symplex_tests
101
+ ```
102
+
103
+ ### Run the Matmul Optimization Example
104
+
105
+ ```bash
106
+ ./example_matmul
107
+ ```
108
+
109
+ This optimizes a 4096×4096×2048 matrix multiplication for the H100 GPU target, running the full 3-phase superoptimizer search and generating PTX code.
110
+
111
+ ## Usage
112
+
113
+ ### Optimize a Matrix Multiplication
114
+
115
+ ```cpp
116
+ #include "symplex/training/compiler_pipeline.h"
117
+ #include "symplex/hardware/hardware_target.h"
118
+
119
+ using namespace symplex::training;
120
+ using namespace symplex::hardware;
121
+
122
+ // Target: NVIDIA H100
123
+ HardwareTarget target = HardwareTarget::H100();
124
+
125
+ // Create compiler pipeline
126
+ CompilerPipeline pipeline(target);
127
+
128
+ // Optimize matmul C[M,N] += A[M,K] * B[K,N]
129
+ auto result = pipeline.compile_matmul(4096, 4096, 2048);
130
+
131
+ // result.ptx_source — generated PTX kernel
132
+ // result.estimated_latency_ns — predicted latency
133
+ // result.speedup_vs_naive — speedup over naive tiling
134
+ // result.grid_dims / block_dims — GPU launch parameters
135
+ ```
136
+
137
+ ### Use the Superoptimizer Directly
138
+
139
+ ```cpp
140
+ #include "symplex/optimizer/superoptimizer.h"
141
+ #include "symplex/polyhedral/iteration_space.h"
142
+ #include "symplex/hardware/hardware_target.h"
143
+
144
+ using namespace symplex::optimizer;
145
+ using namespace symplex::polyhedral;
146
+ using namespace symplex::hardware;
147
+
148
+ HardwareTarget target = HardwareTarget::H100();
149
+ Superoptimizer opt(target);
150
+
151
+ auto ispace = make_matmul_iteration_space(1024, 1024, 512);
152
+ auto result = opt.optimize(ispace);
153
+
154
+ // result.best_tile — optimal TileConfig
155
+ // result.estimated_latency_ns — predicted latency
156
+ // result.speedup_vs_naive — speedup vs smallest Tensor Core tile
157
+ ```
158
+
159
+ ### Distributed Training with Fault Tolerance
160
+
161
+ ```cpp
162
+ #include "symplex/training/training_loop.h"
163
+ #include "symplex/hardware/hardware_target.h"
164
+
165
+ using namespace symplex::training;
166
+ using namespace symplex::hardware;
167
+
168
+ TrainingConfig config;
169
+ config.global_batch_size = 2048;
170
+ config.enable_fault_tolerance = true;
171
+ config.enable_dynamic_batching = true;
172
+
173
+ HardwareTarget target = HardwareTarget::H100();
174
+ TrainingLoop loop(config, target);
175
+
176
+ auto ispace = symplex::polyhedral::make_matmul_iteration_space(4096, 4096, 2048);
177
+ loop.initialize(ispace);
178
+
179
+ auto results = loop.execute_epoch();
180
+ ```
181
+
182
+ ## Key Mathematical Concepts
183
+
184
+ ### Iteration Space (I)
185
+ Every AI loop nest is modeled as an **integer polytope**:
186
+
187
+ ```
188
+ I = { i ∈ Z^n | A·i + b ≥ 0 }
189
+ ```
190
+
191
+ ### Data Dependency Polyhedron (D)
192
+ Dependencies are vectors in the polyhedral space that must remain lexicographically positive:
193
+
194
+ ```
195
+ d = i_sink - i_source, d ≻ 0 (lexicographically positive)
196
+ ```
197
+
198
+ ### Schedule Map (Φ)
199
+ The central object of the optimization is the schedule map Φ, which assigns each iteration point to hardware coordinates and a time step:
200
+
201
+ ```
202
+ Φ(i) → (DeviceID, SM_ID, Warp_ID, Thread_ID, TimeStep)
203
+ ```
204
+
205
+ ### 3-Phase Superoptimizer Search (Level 2)
206
+
207
+ 1. **Roofline Pruning**: Drop 90% of tile configurations using analytical operational intensity bounds
208
+ 2. **Compute-Symmetry Alignment**: Only evaluate tile sizes that are exact multiples of Tensor Core fragment dimensions (16×8×16 for H100)
209
+ 3. **Hardware Occupancy Sieve**: Micro-benchmark the top candidates, selecting the configuration that maximizes SM occupancy
210
+
211
+ ### Equality Saturation (Level 1 — the real superoptimizer)
212
+
213
+ Unlike traditional autotuners that only sweep hardware parameters, SympleX's superoptimizer explores the **space of equivalent programs** using equality saturation:
214
+
215
+ 1. **E-Graph Construction**: The input program is represented as an e-graph — a data structure that compactly represents exponentially many equivalent expressions as equivalence classes
216
+ 2. **Rewrite Rule Application**: Algebraic identities, fusion patterns, and tiling decompositions are applied iteratively, growing the e-graph to represent all discovered equivalent programs
217
+ 3. **Polyhedral Guardrails**: Before any extracted program is accepted, it is validated against the original computation's data dependencies — rewrites that would violate semantics are rejected
218
+ 4. **Cost-Guided Extraction**: The cheapest program is extracted from the saturated e-graph using a bottom-up dynamic programming approach, where fused operations (e.g., `FusedMatMulReLU`) have lower cost than their unfused equivalents (`ReLU(MatMul(A,B))`)
219
+
220
+ ## Hardware Targets
221
+
222
+ Built-in profiles for:
223
+
224
+ | GPU | SMs | Tensor Core | HBM BW | SRAM/SM |
225
+ |-----|-----|-------------|--------|---------|
226
+ | **H100** (Hopper) | 132 | 16×8×16 FP16 | 3.35 TB/s | 228 KB |
227
+ | **B200** (Blackwell) | 160 | 16×8×32 FP16 | 8.0 TB/s | 304 KB |
228
+ | **Generic** | 84 | 16×8×16 FP16 | 2.0 TB/s | 164 KB |
229
+
230
+ Custom targets can be constructed via `HardwareTarget` fields.
231
+
232
+ ## Project Structure
233
+
234
+ ```
235
+ SympleX/
236
+ ├── CMakeLists.txt
237
+ ├── LICENSE # GNU AGPL v3
238
+ ├── README.md
239
+ ├── include/symplex/
240
+ │ ├── polyhedral/ # Core polyhedral types
241
+ │ │ ├── integer_polytope.h
242
+ │ │ ├── affine_map.h
243
+ │ │ ├── dependency.h
244
+ │ │ ├── iteration_space.h
245
+ │ │ └── union_map.h
246
+ │ ├── schedule/ # Schedule tree & transformations
247
+ │ │ ├── schedule_tree.h
248
+ │ │ ├── tiling.h
249
+ │ │ ├── fusion.h
250
+ │ │ ├── parallelization.h
251
+ │ │ └── schedule_map.h
252
+ │ ├── hardware/ # GPU hardware models
253
+ │ │ └── hardware_target.h
254
+ │ ├── optimizer/ # Superoptimizer search
255
+ │ │ ├── tile_config.h
256
+ │ │ ├── search_phase1.h
257
+ │ │ ├── search_phase2.h
258
+ │ │ ├── search_phase3.h
259
+ │ │ └── superoptimizer.h
260
+ │ ├── costmodel/ # Performance cost models
261
+ │ │ ├── roofline.h
262
+ │ │ ├── analytical.h
263
+ │ │ ├── empirical.h
264
+ │ │ └── cost_model.h
265
+ │ ├── codegen/ # PTX code generation
266
+ │ │ ├── wmma.h
267
+ │ │ ├── swizzle.h
268
+ │ │ ├── register_allocator.h
269
+ │ │ ├── ptx_emitter.h
270
+ │ │ └── code_generator.h
271
+ │ ├── distributed/ # Distributed training
272
+ │ │ ├── mesh.h
273
+ │ │ ├── sharding.h
274
+ │ │ ├── nccl_bridge.h
275
+ │ │ └── pipeline_overlap.h
276
+ │ ├── fault_tolerance/ # Fault tolerance
277
+ │ │ ├── health_monitor.h
278
+ │ │ ├── forward_recovery.h
279
+ │ │ ├── communicator_repair.h
280
+ │ │ └── checkpoint.h
281
+ │ └── training/ # Training orchestrator
282
+ │ ├── dynamic_batch.h
283
+ │ ├── memory_watchdog.h
284
+ │ ├── training_loop.h
285
+ │ └── compiler_pipeline.h
286
+ ├── src/ # Implementation files (mirrors include/)
287
+ ├── tests/ # Unit tests
288
+ ├── benchmarks/ # Performance benchmarks
289
+ └── examples/ # Usage examples
290
+ ```
291
+
292
+ ## License
293
+
294
+ GNU Affero General Public License v3 — see [LICENSE](LICENSE).
@@ -0,0 +1,55 @@
1
+ [build-system]
2
+ requires = ["maturin>=1.0,<2.0"]
3
+ build-backend = "maturin"
4
+
5
+ [project]
6
+ name = "simplex-tensor"
7
+ version = "1.0.0"
8
+ description = "SympleX – Polyhedral Tensor Superoptimizer with JAX-style purity enforcement and x86-64 JIT compilation"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = {text = "AGPL-3.0"}
12
+ authors = [
13
+ {name = "SympleX Contributors"},
14
+ ]
15
+ keywords = ["jit", "compiler", "tensor", "polyhedral", "optimization", "simd", "avx512", "numpy", "scientific-computing"]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: Developers",
19
+ "Intended Audience :: Science/Research",
20
+ "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12",
25
+ "Programming Language :: Rust",
26
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
27
+ "Topic :: Software Development :: Compilers",
28
+ "Topic :: System :: Hardware :: Hardware Drivers",
29
+ "Operating System :: POSIX :: Linux",
30
+ ]
31
+ dependencies = [
32
+ "numpy>=1.24",
33
+ ]
34
+
35
+ [project.optional-dependencies]
36
+ dev = [
37
+ "pytest>=7.0",
38
+ "maturin>=1.0",
39
+ ]
40
+ ml = [
41
+ "scipy>=1.10",
42
+ ]
43
+
44
+ [project.urls]
45
+ Homepage = "https://github.com/hollowguy898-cloud/SympleX"
46
+ Repository = "https://github.com/hollowguy898-cloud/SympleX"
47
+
48
+ [tool.maturin]
49
+ features = ["pyo3/extension-module"]
50
+ python-source = "."
51
+ module-name = "symplex._symplex_core"
52
+ manifest-path = "python/Cargo.toml"
53
+
54
+ [tool.pytest.ini_options]
55
+ testpaths = ["tests"]
@@ -0,0 +1,8 @@
1
+ target/
2
+ __pycache__/
3
+ *.pyc
4
+ .pytest_cache/
5
+ *.egg-info/
6
+ dist/
7
+ build/
8
+ *.whl