simplex-tensor 1.0.0__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/PKG-INFO +1 -1
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/pyproject.toml +1 -1
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/python/Cargo.lock +2 -2
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/python/setup.py +1 -1
- simplex_tensor-1.1.0/rust-engine/Cargo.lock +574 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/rust-engine/src/x86_emitter.rs +293 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/README.md +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/python/.gitignore +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/python/Cargo.toml +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/python/README.md +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/python/benchmarks/bench_physics.py +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/python/benchmarks/bench_results.json +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/python/benchmarks/bench_symplex.py +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/python/demo.py +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/python/src/lib.rs +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/python/tests/test_jit.py +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/python/tests/test_math.py +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/python/tests/test_purity.py +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/rust-engine/.gitignore +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/rust-engine/Cargo.toml +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/rust-engine/benches/physics_bench.rs +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/rust-engine/src/cuda_backend.rs +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/rust-engine/src/ffi.rs +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/rust-engine/src/fusion_engine.rs +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/rust-engine/src/lib.rs +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/rust-engine/src/phase3_jit.rs +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/rust-engine/src/polyhedral.rs +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/rust-engine/src/tracing_jit.rs +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/rust-engine/src/types.rs +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/rust-engine/tests/integration.rs +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/symplex/__init__.py +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/symplex/_array.py +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/symplex/_ast_checker.py +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/symplex/_errors.py +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/symplex/_jit.py +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/symplex/_tracer.cpp +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/symplex/_tracer.py +0 -0
- {simplex_tensor-1.0.0 → simplex_tensor-1.1.0}/symplex/linalg.py +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "maturin"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "simplex-tensor"
|
|
7
|
-
version = "1.0.0"
|
|
7
|
+
version = "1.1.0"
|
|
8
8
|
description = "SympleX – Polyhedral Tensor Superoptimizer with JAX-style purity enforcement and x86-64 JIT compilation"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -205,9 +205,9 @@ dependencies = [
|
|
|
205
205
|
|
|
206
206
|
[[package]]
|
|
207
207
|
name = "log"
|
|
208
|
-
version = "0.4.
|
|
208
|
+
version = "0.4.32"
|
|
209
209
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
210
|
-
checksum = "
|
|
210
|
+
checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a"
|
|
211
211
|
|
|
212
212
|
[[package]]
|
|
213
213
|
name = "matrixmultiply"
|
|
@@ -0,0 +1,574 @@
|
|
|
1
|
+
# This file is automatically @generated by Cargo.
|
|
2
|
+
# It is not intended for manual editing.
|
|
3
|
+
version = 4
|
|
4
|
+
|
|
5
|
+
[[package]]
|
|
6
|
+
name = "autocfg"
|
|
7
|
+
version = "1.5.1"
|
|
8
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
9
|
+
checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53"
|
|
10
|
+
|
|
11
|
+
[[package]]
|
|
12
|
+
name = "bumpalo"
|
|
13
|
+
version = "3.20.3"
|
|
14
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
15
|
+
checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649"
|
|
16
|
+
|
|
17
|
+
[[package]]
|
|
18
|
+
name = "cfg-if"
|
|
19
|
+
version = "1.0.4"
|
|
20
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
21
|
+
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
|
|
22
|
+
|
|
23
|
+
[[package]]
|
|
24
|
+
name = "crossbeam-deque"
|
|
25
|
+
version = "0.8.6"
|
|
26
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
27
|
+
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
|
|
28
|
+
dependencies = [
|
|
29
|
+
"crossbeam-epoch",
|
|
30
|
+
"crossbeam-utils",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[[package]]
|
|
34
|
+
name = "crossbeam-epoch"
|
|
35
|
+
version = "0.9.18"
|
|
36
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
37
|
+
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
|
38
|
+
dependencies = [
|
|
39
|
+
"crossbeam-utils",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[[package]]
|
|
43
|
+
name = "crossbeam-utils"
|
|
44
|
+
version = "0.8.21"
|
|
45
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
46
|
+
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
|
47
|
+
|
|
48
|
+
[[package]]
|
|
49
|
+
name = "cudarc"
|
|
50
|
+
version = "0.19.7"
|
|
51
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
52
|
+
checksum = "1cea5f10a99e025c1b44ae2354c2d8326b25ddbd0baf76bde8e55cfd4018a2cc"
|
|
53
|
+
dependencies = [
|
|
54
|
+
"libloading",
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
[[package]]
|
|
58
|
+
name = "egg"
|
|
59
|
+
version = "0.6.0"
|
|
60
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
61
|
+
checksum = "05a6c0bbc92278f84e742f08c0ab9cb16a987376cd2bc39d228ef9c74d98d6f7"
|
|
62
|
+
dependencies = [
|
|
63
|
+
"indexmap",
|
|
64
|
+
"instant",
|
|
65
|
+
"log",
|
|
66
|
+
"once_cell",
|
|
67
|
+
"smallvec",
|
|
68
|
+
"symbolic_expressions",
|
|
69
|
+
]
|
|
70
|
+
|
|
71
|
+
[[package]]
|
|
72
|
+
name = "either"
|
|
73
|
+
version = "1.16.0"
|
|
74
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
75
|
+
checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
|
|
76
|
+
|
|
77
|
+
[[package]]
|
|
78
|
+
name = "fnv"
|
|
79
|
+
version = "1.0.7"
|
|
80
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
81
|
+
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
|
82
|
+
|
|
83
|
+
[[package]]
|
|
84
|
+
name = "futures-core"
|
|
85
|
+
version = "0.3.32"
|
|
86
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
87
|
+
checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
|
|
88
|
+
|
|
89
|
+
[[package]]
|
|
90
|
+
name = "futures-task"
|
|
91
|
+
version = "0.3.32"
|
|
92
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
93
|
+
checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393"
|
|
94
|
+
|
|
95
|
+
[[package]]
|
|
96
|
+
name = "futures-util"
|
|
97
|
+
version = "0.3.32"
|
|
98
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
99
|
+
checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
|
|
100
|
+
dependencies = [
|
|
101
|
+
"futures-core",
|
|
102
|
+
"futures-task",
|
|
103
|
+
"pin-project-lite",
|
|
104
|
+
"slab",
|
|
105
|
+
]
|
|
106
|
+
|
|
107
|
+
[[package]]
|
|
108
|
+
name = "good_lp"
|
|
109
|
+
version = "1.15.2"
|
|
110
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
111
|
+
checksum = "745190412d5ff4a54335cd16229a475ad3fb8f5474a5c1358292d62932187ea7"
|
|
112
|
+
dependencies = [
|
|
113
|
+
"fnv",
|
|
114
|
+
"microlp",
|
|
115
|
+
]
|
|
116
|
+
|
|
117
|
+
[[package]]
|
|
118
|
+
name = "hashbrown"
|
|
119
|
+
version = "0.12.3"
|
|
120
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
121
|
+
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
|
|
122
|
+
|
|
123
|
+
[[package]]
|
|
124
|
+
name = "heck"
|
|
125
|
+
version = "0.5.0"
|
|
126
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
127
|
+
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
|
128
|
+
|
|
129
|
+
[[package]]
|
|
130
|
+
name = "hermit-abi"
|
|
131
|
+
version = "0.5.2"
|
|
132
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
133
|
+
checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
|
|
134
|
+
|
|
135
|
+
[[package]]
|
|
136
|
+
name = "iced-x86"
|
|
137
|
+
version = "1.21.0"
|
|
138
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
139
|
+
checksum = "7c447cff8c7f384a7d4f741cfcff32f75f3ad02b406432e8d6c878d56b1edf6b"
|
|
140
|
+
dependencies = [
|
|
141
|
+
"lazy_static",
|
|
142
|
+
]
|
|
143
|
+
|
|
144
|
+
[[package]]
|
|
145
|
+
name = "indexmap"
|
|
146
|
+
version = "1.9.3"
|
|
147
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
148
|
+
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
|
|
149
|
+
dependencies = [
|
|
150
|
+
"autocfg",
|
|
151
|
+
"hashbrown",
|
|
152
|
+
]
|
|
153
|
+
|
|
154
|
+
[[package]]
|
|
155
|
+
name = "indoc"
|
|
156
|
+
version = "2.0.7"
|
|
157
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
158
|
+
checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706"
|
|
159
|
+
dependencies = [
|
|
160
|
+
"rustversion",
|
|
161
|
+
]
|
|
162
|
+
|
|
163
|
+
[[package]]
|
|
164
|
+
name = "instant"
|
|
165
|
+
version = "0.1.13"
|
|
166
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
167
|
+
checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
|
|
168
|
+
dependencies = [
|
|
169
|
+
"cfg-if",
|
|
170
|
+
]
|
|
171
|
+
|
|
172
|
+
[[package]]
|
|
173
|
+
name = "js-sys"
|
|
174
|
+
version = "0.3.99"
|
|
175
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
176
|
+
checksum = "142bc4740e452c1e57ade0cbc129f139c9093e354346f0872ef985f4f5cf5f11"
|
|
177
|
+
dependencies = [
|
|
178
|
+
"cfg-if",
|
|
179
|
+
"futures-util",
|
|
180
|
+
"once_cell",
|
|
181
|
+
"wasm-bindgen",
|
|
182
|
+
]
|
|
183
|
+
|
|
184
|
+
[[package]]
|
|
185
|
+
name = "lazy_static"
|
|
186
|
+
version = "1.5.0"
|
|
187
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
188
|
+
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
|
189
|
+
|
|
190
|
+
[[package]]
|
|
191
|
+
name = "libc"
|
|
192
|
+
version = "0.2.186"
|
|
193
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
194
|
+
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
|
|
195
|
+
|
|
196
|
+
[[package]]
|
|
197
|
+
name = "libloading"
|
|
198
|
+
version = "0.9.0"
|
|
199
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
200
|
+
checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60"
|
|
201
|
+
dependencies = [
|
|
202
|
+
"cfg-if",
|
|
203
|
+
"windows-link",
|
|
204
|
+
]
|
|
205
|
+
|
|
206
|
+
[[package]]
|
|
207
|
+
name = "log"
|
|
208
|
+
version = "0.4.32"
|
|
209
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
210
|
+
checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a"
|
|
211
|
+
|
|
212
|
+
[[package]]
|
|
213
|
+
name = "matrixmultiply"
|
|
214
|
+
version = "0.3.10"
|
|
215
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
216
|
+
checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08"
|
|
217
|
+
dependencies = [
|
|
218
|
+
"autocfg",
|
|
219
|
+
"rawpointer",
|
|
220
|
+
]
|
|
221
|
+
|
|
222
|
+
[[package]]
|
|
223
|
+
name = "memoffset"
|
|
224
|
+
version = "0.9.1"
|
|
225
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
226
|
+
checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
|
|
227
|
+
dependencies = [
|
|
228
|
+
"autocfg",
|
|
229
|
+
]
|
|
230
|
+
|
|
231
|
+
[[package]]
|
|
232
|
+
name = "microlp"
|
|
233
|
+
version = "0.4.0"
|
|
234
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
235
|
+
checksum = "458ed987196f802dc47c69d4c5afcd19002d6c1c5f8f75c76d129bcf2425057a"
|
|
236
|
+
dependencies = [
|
|
237
|
+
"log",
|
|
238
|
+
"sprs",
|
|
239
|
+
"web-time",
|
|
240
|
+
]
|
|
241
|
+
|
|
242
|
+
[[package]]
|
|
243
|
+
name = "ndarray"
|
|
244
|
+
version = "0.17.2"
|
|
245
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
246
|
+
checksum = "520080814a7a6b4a6e9070823bb24b4531daac8c4627e08ba5de8c5ef2f2752d"
|
|
247
|
+
dependencies = [
|
|
248
|
+
"matrixmultiply",
|
|
249
|
+
"num-complex",
|
|
250
|
+
"num-integer",
|
|
251
|
+
"num-traits",
|
|
252
|
+
"portable-atomic",
|
|
253
|
+
"portable-atomic-util",
|
|
254
|
+
"rawpointer",
|
|
255
|
+
]
|
|
256
|
+
|
|
257
|
+
[[package]]
|
|
258
|
+
name = "num-complex"
|
|
259
|
+
version = "0.4.6"
|
|
260
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
261
|
+
checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
|
|
262
|
+
dependencies = [
|
|
263
|
+
"num-traits",
|
|
264
|
+
]
|
|
265
|
+
|
|
266
|
+
[[package]]
|
|
267
|
+
name = "num-integer"
|
|
268
|
+
version = "0.1.46"
|
|
269
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
270
|
+
checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
|
|
271
|
+
dependencies = [
|
|
272
|
+
"num-traits",
|
|
273
|
+
]
|
|
274
|
+
|
|
275
|
+
[[package]]
|
|
276
|
+
name = "num-traits"
|
|
277
|
+
version = "0.2.19"
|
|
278
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
279
|
+
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
|
|
280
|
+
dependencies = [
|
|
281
|
+
"autocfg",
|
|
282
|
+
]
|
|
283
|
+
|
|
284
|
+
[[package]]
|
|
285
|
+
name = "num_cpus"
|
|
286
|
+
version = "1.17.0"
|
|
287
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
288
|
+
checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
|
|
289
|
+
dependencies = [
|
|
290
|
+
"hermit-abi",
|
|
291
|
+
"libc",
|
|
292
|
+
]
|
|
293
|
+
|
|
294
|
+
[[package]]
|
|
295
|
+
name = "once_cell"
|
|
296
|
+
version = "1.21.4"
|
|
297
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
298
|
+
checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
|
|
299
|
+
|
|
300
|
+
[[package]]
|
|
301
|
+
name = "pin-project-lite"
|
|
302
|
+
version = "0.2.17"
|
|
303
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
304
|
+
checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
|
|
305
|
+
|
|
306
|
+
[[package]]
|
|
307
|
+
name = "portable-atomic"
|
|
308
|
+
version = "1.13.1"
|
|
309
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
310
|
+
checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49"
|
|
311
|
+
|
|
312
|
+
[[package]]
|
|
313
|
+
name = "portable-atomic-util"
|
|
314
|
+
version = "0.2.7"
|
|
315
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
316
|
+
checksum = "c2a106d1259c23fac8e543272398ae0e3c0b8d33c88ed73d0cc71b0f1d902618"
|
|
317
|
+
dependencies = [
|
|
318
|
+
"portable-atomic",
|
|
319
|
+
]
|
|
320
|
+
|
|
321
|
+
[[package]]
|
|
322
|
+
name = "proc-macro2"
|
|
323
|
+
version = "1.0.106"
|
|
324
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
325
|
+
checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
|
|
326
|
+
dependencies = [
|
|
327
|
+
"unicode-ident",
|
|
328
|
+
]
|
|
329
|
+
|
|
330
|
+
[[package]]
|
|
331
|
+
name = "pyo3"
|
|
332
|
+
version = "0.23.5"
|
|
333
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
334
|
+
checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872"
|
|
335
|
+
dependencies = [
|
|
336
|
+
"cfg-if",
|
|
337
|
+
"indoc",
|
|
338
|
+
"libc",
|
|
339
|
+
"memoffset",
|
|
340
|
+
"once_cell",
|
|
341
|
+
"portable-atomic",
|
|
342
|
+
"pyo3-build-config",
|
|
343
|
+
"pyo3-ffi",
|
|
344
|
+
"pyo3-macros",
|
|
345
|
+
"unindent",
|
|
346
|
+
]
|
|
347
|
+
|
|
348
|
+
[[package]]
|
|
349
|
+
name = "pyo3-build-config"
|
|
350
|
+
version = "0.23.5"
|
|
351
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
352
|
+
checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb"
|
|
353
|
+
dependencies = [
|
|
354
|
+
"once_cell",
|
|
355
|
+
"target-lexicon",
|
|
356
|
+
]
|
|
357
|
+
|
|
358
|
+
[[package]]
|
|
359
|
+
name = "pyo3-ffi"
|
|
360
|
+
version = "0.23.5"
|
|
361
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
362
|
+
checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d"
|
|
363
|
+
dependencies = [
|
|
364
|
+
"libc",
|
|
365
|
+
"pyo3-build-config",
|
|
366
|
+
]
|
|
367
|
+
|
|
368
|
+
[[package]]
|
|
369
|
+
name = "pyo3-macros"
|
|
370
|
+
version = "0.23.5"
|
|
371
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
372
|
+
checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da"
|
|
373
|
+
dependencies = [
|
|
374
|
+
"proc-macro2",
|
|
375
|
+
"pyo3-macros-backend",
|
|
376
|
+
"quote",
|
|
377
|
+
"syn",
|
|
378
|
+
]
|
|
379
|
+
|
|
380
|
+
[[package]]
|
|
381
|
+
name = "pyo3-macros-backend"
|
|
382
|
+
version = "0.23.5"
|
|
383
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
384
|
+
checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028"
|
|
385
|
+
dependencies = [
|
|
386
|
+
"heck",
|
|
387
|
+
"proc-macro2",
|
|
388
|
+
"pyo3-build-config",
|
|
389
|
+
"quote",
|
|
390
|
+
"syn",
|
|
391
|
+
]
|
|
392
|
+
|
|
393
|
+
[[package]]
|
|
394
|
+
name = "quote"
|
|
395
|
+
version = "1.0.45"
|
|
396
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
397
|
+
checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
|
|
398
|
+
dependencies = [
|
|
399
|
+
"proc-macro2",
|
|
400
|
+
]
|
|
401
|
+
|
|
402
|
+
[[package]]
|
|
403
|
+
name = "rawpointer"
|
|
404
|
+
version = "0.2.1"
|
|
405
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
406
|
+
checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
|
|
407
|
+
|
|
408
|
+
[[package]]
|
|
409
|
+
name = "rayon"
|
|
410
|
+
version = "1.12.0"
|
|
411
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
412
|
+
checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
|
|
413
|
+
dependencies = [
|
|
414
|
+
"either",
|
|
415
|
+
"rayon-core",
|
|
416
|
+
]
|
|
417
|
+
|
|
418
|
+
[[package]]
|
|
419
|
+
name = "rayon-core"
|
|
420
|
+
version = "1.13.0"
|
|
421
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
422
|
+
checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
|
|
423
|
+
dependencies = [
|
|
424
|
+
"crossbeam-deque",
|
|
425
|
+
"crossbeam-utils",
|
|
426
|
+
]
|
|
427
|
+
|
|
428
|
+
[[package]]
|
|
429
|
+
name = "rustc-hash"
|
|
430
|
+
version = "2.1.2"
|
|
431
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
432
|
+
checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
|
|
433
|
+
|
|
434
|
+
[[package]]
|
|
435
|
+
name = "rustversion"
|
|
436
|
+
version = "1.0.22"
|
|
437
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
438
|
+
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
|
439
|
+
|
|
440
|
+
[[package]]
|
|
441
|
+
name = "slab"
|
|
442
|
+
version = "0.4.12"
|
|
443
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
444
|
+
checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
|
|
445
|
+
|
|
446
|
+
[[package]]
|
|
447
|
+
name = "smallvec"
|
|
448
|
+
version = "1.15.1"
|
|
449
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
450
|
+
checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
|
|
451
|
+
|
|
452
|
+
[[package]]
|
|
453
|
+
name = "sprs"
|
|
454
|
+
version = "0.11.4"
|
|
455
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
456
|
+
checksum = "6dca58a33be2188d4edc71534f8bafa826e787cc28ca1c47f31be3423f0d6e55"
|
|
457
|
+
dependencies = [
|
|
458
|
+
"ndarray",
|
|
459
|
+
"num-complex",
|
|
460
|
+
"num-traits",
|
|
461
|
+
"smallvec",
|
|
462
|
+
]
|
|
463
|
+
|
|
464
|
+
[[package]]
|
|
465
|
+
name = "symbolic_expressions"
|
|
466
|
+
version = "5.0.3"
|
|
467
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
468
|
+
checksum = "7c68d531d83ec6c531150584c42a4290911964d5f0d79132b193b67252a23b71"
|
|
469
|
+
|
|
470
|
+
[[package]]
|
|
471
|
+
name = "symplex-engine"
|
|
472
|
+
version = "1.0.0"
|
|
473
|
+
dependencies = [
|
|
474
|
+
"cudarc",
|
|
475
|
+
"egg",
|
|
476
|
+
"good_lp",
|
|
477
|
+
"iced-x86",
|
|
478
|
+
"lazy_static",
|
|
479
|
+
"libc",
|
|
480
|
+
"num_cpus",
|
|
481
|
+
"pyo3",
|
|
482
|
+
"rayon",
|
|
483
|
+
"rustc-hash",
|
|
484
|
+
]
|
|
485
|
+
|
|
486
|
+
[[package]]
|
|
487
|
+
name = "syn"
|
|
488
|
+
version = "2.0.117"
|
|
489
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
490
|
+
checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
|
|
491
|
+
dependencies = [
|
|
492
|
+
"proc-macro2",
|
|
493
|
+
"quote",
|
|
494
|
+
"unicode-ident",
|
|
495
|
+
]
|
|
496
|
+
|
|
497
|
+
[[package]]
|
|
498
|
+
name = "target-lexicon"
|
|
499
|
+
version = "0.12.16"
|
|
500
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
501
|
+
checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
|
|
502
|
+
|
|
503
|
+
[[package]]
|
|
504
|
+
name = "unicode-ident"
|
|
505
|
+
version = "1.0.24"
|
|
506
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
507
|
+
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
|
|
508
|
+
|
|
509
|
+
[[package]]
|
|
510
|
+
name = "unindent"
|
|
511
|
+
version = "0.2.4"
|
|
512
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
513
|
+
checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
|
|
514
|
+
|
|
515
|
+
[[package]]
|
|
516
|
+
name = "wasm-bindgen"
|
|
517
|
+
version = "0.2.122"
|
|
518
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
519
|
+
checksum = "3ed04576f974d2b2fba0f38c51dbc5518011e38c36bf1143164be765528fd409"
|
|
520
|
+
dependencies = [
|
|
521
|
+
"cfg-if",
|
|
522
|
+
"once_cell",
|
|
523
|
+
"rustversion",
|
|
524
|
+
"wasm-bindgen-macro",
|
|
525
|
+
"wasm-bindgen-shared",
|
|
526
|
+
]
|
|
527
|
+
|
|
528
|
+
[[package]]
|
|
529
|
+
name = "wasm-bindgen-macro"
|
|
530
|
+
version = "0.2.122"
|
|
531
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
532
|
+
checksum = "916151b09da36bd82f6615cbf3a419e2f0ba23a03c6160e8e92eb6bd4aa1dec6"
|
|
533
|
+
dependencies = [
|
|
534
|
+
"quote",
|
|
535
|
+
"wasm-bindgen-macro-support",
|
|
536
|
+
]
|
|
537
|
+
|
|
538
|
+
[[package]]
|
|
539
|
+
name = "wasm-bindgen-macro-support"
|
|
540
|
+
version = "0.2.122"
|
|
541
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
542
|
+
checksum = "299047362ccbfce148b67ab7e73349f77748e00c8296f9542adfad2ad82c5c5e"
|
|
543
|
+
dependencies = [
|
|
544
|
+
"bumpalo",
|
|
545
|
+
"proc-macro2",
|
|
546
|
+
"quote",
|
|
547
|
+
"syn",
|
|
548
|
+
"wasm-bindgen-shared",
|
|
549
|
+
]
|
|
550
|
+
|
|
551
|
+
[[package]]
|
|
552
|
+
name = "wasm-bindgen-shared"
|
|
553
|
+
version = "0.2.122"
|
|
554
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
555
|
+
checksum = "9a929b2c61f11ba3e9bc35b50c1f25cb38e0e892c0c231ae2b8cf78d5dad4437"
|
|
556
|
+
dependencies = [
|
|
557
|
+
"unicode-ident",
|
|
558
|
+
]
|
|
559
|
+
|
|
560
|
+
[[package]]
|
|
561
|
+
name = "web-time"
|
|
562
|
+
version = "1.1.0"
|
|
563
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
564
|
+
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
|
|
565
|
+
dependencies = [
|
|
566
|
+
"js-sys",
|
|
567
|
+
"wasm-bindgen",
|
|
568
|
+
]
|
|
569
|
+
|
|
570
|
+
[[package]]
|
|
571
|
+
name = "windows-link"
|
|
572
|
+
version = "0.2.1"
|
|
573
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
574
|
+
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
|
|
@@ -1466,9 +1466,293 @@ pub fn simd_reduce_f64(op: u8, data_ptr: usize, n: usize) -> f64 {
|
|
|
1466
1466
|
}
|
|
1467
1467
|
}
|
|
1468
1468
|
|
|
1469
|
+
// ── BLIS-Style Cache-Blocked Matmul ──────────────────────────────────────
|
|
1470
|
+
//
|
|
1471
|
+
// Implements a 5-loop BLIS-style algorithm with:
|
|
1472
|
+
// - AVX2 6×16 micro-kernel (12 YMM accumulators, 12 FMAs per k-step)
|
|
1473
|
+
// - B-matrix packing (contiguous access in micro-kernel, eliminates stride)
|
|
1474
|
+
// - A-matrix packing (column-major micro-panels for sequential broadcast)
|
|
1475
|
+
// - L1/L2/L3 cache blocking (MC=128, NC=128, KC=128; see the BLIS_* constants below)
|
|
1476
|
+
// - Rayon parallelism on the outermost (i2) loop
|
|
1477
|
+
//
|
|
1478
|
+
// Packed layouts (critical for micro-kernel performance):
|
|
1479
|
+
// A micro-panel [MR rows × kc cols]: column-major within panel
|
|
1480
|
+
// Layout: a[0,0] a[1,0] ... a[MR-1,0] a[0,1] a[1,1] ... a[MR-1,kc-1]
|
|
1481
|
+
// Stride between k-steps = MR (micro-kernel does a_ptr += MR)
|
|
1482
|
+
//
|
|
1483
|
+
// B micro-panel [kc rows × NR cols]: row-major within panel, padded to NR
|
|
1484
|
+
// Layout: b[0,0] b[0,1] ... b[0,NR-1] b[1,0] ... b[kc-1,NR-1]
|
|
1485
|
+
// Stride between k-steps = NR (micro-kernel does b_ptr += NR)
|
|
1486
|
+
|
|
1487
|
+
/// Micro-kernel row dimension (6 rows accumulated in YMM registers).
const BLIS_MR: usize = 6;
/// Micro-kernel column dimension (2 YMMs wide = 16 f32).
const BLIS_NR: usize = 16;
/// Cache-blocking factor along M (rows of A handled per block).
const BLIS_MC: usize = 128;
/// Cache-blocking factor along N (columns of B handled per block); also the
/// row stride of the packed B buffer, so it must stay a multiple of `BLIS_NR`.
const BLIS_NC: usize = 128;
/// Cache-blocking factor along the shared (K) dimension.
const BLIS_KC: usize = 128;

/// AVX2 + FMA micro-kernel: computes one `BLIS_MR × BLIS_NR` tile as the sum of
/// `kc` rank-1 updates and writes the valid `mr × nr` part of it to `c`.
///
/// Uses 12 YMM accumulators (6 rows × 2 YMM columns). The accumulators start
/// at zero and the result is *stored* to `c`, i.e. `C[0..mr, 0..nr] = A_panel * B_panel`;
/// the previous contents of the tile are overwritten, not added to.
/// NOTE(review): earlier documentation described this as `C += A_panel * B_panel`.
/// If a caller invokes the kernel once per K-block on the same C tile, only the
/// last block's contribution survives — confirm against the call site.
///
/// A_packed layout (column-major within panel):
///   For each k step `p`, `BLIS_MR` consecutive values starting at
///   `a_packed + p*BLIS_MR`: a[0,p], a[1,p], ..., a[BLIS_MR-1,p]
///
/// B_packed layout (row-major within panel, `b_stride` between k-rows):
///   For each k step `p`, `BLIS_NR` consecutive values starting at
///   `b_packed + p*b_stride`: b[p,0], b[p,1], ..., b[p,BLIS_NR-1]
///
/// # Safety
///
/// The caller must guarantee that:
/// - the CPU supports both AVX2 and FMA (the loop uses `_mm256_fmadd_ps`);
/// - `a_packed` is readable for `kc * BLIS_MR` f32 values;
/// - `b_packed` is readable for `BLIS_NR` f32 values at every offset
///   `p * b_stride`, `p < kc` (always full-width loads, so B must be padded);
/// - `c` is writable for `min(nr, BLIS_NR)` f32 values at every offset
///   `row * ldc`, `row < mr`;
/// - `mr <= BLIS_MR`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn micro_kernel_6x16(
    a_packed: *const f32,
    b_packed: *const f32,
    b_stride: usize, // stride between k-rows in packed B
    c: *mut f32,
    ldc: usize,
    kc: usize,
    mr: usize, // actual mr (may be < BLIS_MR at edges)
    nr: usize, // actual nr (may be < BLIS_NR at edges)
) {
    use std::arch::x86_64::*;

    debug_assert!(mr <= BLIS_MR, "mr exceeds the micro-kernel row count");

    // 12 YMM accumulators laid out as acc[row * 2 + col_block]:
    //   row 0: acc[0], acc[1]   row 1: acc[2], acc[3]   row 2: acc[4],  acc[5]
    //   row 3: acc[6], acc[7]   row 4: acc[8], acc[9]   row 5: acc[10], acc[11]
    let mut acc = [_mm256_setzero_ps(); 2 * BLIS_MR];

    for p in 0..kc {
        // Two full YMM loads from B row p (16 contiguous f32; padding makes
        // the full-width load valid even for edge tiles).
        let b_row = b_packed.add(p * b_stride);
        let b0 = _mm256_loadu_ps(b_row);
        let b1 = _mm256_loadu_ps(b_row.add(8));

        // A is column-major inside the panel: the BLIS_MR row values for this
        // k step are contiguous. Always process all BLIS_MR rows; rows >= mr
        // are computed but never stored.
        let a_col = a_packed.add(p * BLIS_MR);
        for row in 0..BLIS_MR {
            let a_val = _mm256_broadcast_ss(&*a_col.add(row));
            acc[row * 2] = _mm256_fmadd_ps(a_val, b0, acc[row * 2]);
            acc[row * 2 + 1] = _mm256_fmadd_ps(a_val, b1, acc[row * 2 + 1]);
        }
    }

    // Write the valid part of the tile to C (row stride ldc).
    for row in 0..mr {
        let c_row = c.add(row * ldc);
        if nr >= BLIS_NR {
            // Full-width tile: two direct vector stores.
            _mm256_storeu_ps(c_row, acc[row * 2]);
            _mm256_storeu_ps(c_row.add(8), acc[row * 2 + 1]);
        } else {
            // Edge tile: spill the row to the stack and copy only the first
            // `nr` (< BLIS_NR) elements so nothing past the tile is touched.
            let mut spill = [0.0f32; BLIS_NR];
            _mm256_storeu_ps(spill.as_mut_ptr(), acc[row * 2]);
            _mm256_storeu_ps(spill.as_mut_ptr().add(8), acc[row * 2 + 1]);
            std::ptr::copy_nonoverlapping(spill.as_ptr(), c_row, nr);
        }
    }
}
|
|
1570
|
+
|
|
1571
|
+
/// Core BLIS-style blocked matmul body (shared between serial and parallel).
|
|
1572
|
+
/// Processes a single MC-row block starting at row i2.
|
|
1573
|
+
/// Uses raw pointers for C to allow sharing across rayon threads safely.
|
|
1574
|
+
fn blis_process_block(
|
|
1575
|
+
a: &[f32],
|
|
1576
|
+
b: &[f32],
|
|
1577
|
+
c_ptr: *mut f32,
|
|
1578
|
+
_m: usize,
|
|
1579
|
+
n: usize,
|
|
1580
|
+
k_dim: usize, // renamed from `k` to avoid conflict with iced-x86's k2 register
|
|
1581
|
+
i2: usize,
|
|
1582
|
+
mc: usize,
|
|
1583
|
+
packed_b: &mut [f32],
|
|
1584
|
+
packed_a: &mut [f32],
|
|
1585
|
+
) {
|
|
1586
|
+
let has_avx2 = is_x86_feature_detected!("avx2");
|
|
1587
|
+
|
|
1588
|
+
// packed_b layout: [kc][BLIS_NC] with stride BLIS_NC (>= BLIS_NR, multiple of BLIS_NR)
|
|
1589
|
+
// BLIS_NC = 128 which is a multiple of BLIS_NR = 16, so loads at j1 offsets are safe
|
|
1590
|
+
let b_stride = BLIS_NC;
|
|
1591
|
+
|
|
1592
|
+
for j2 in (0..n).step_by(BLIS_NC) {
|
|
1593
|
+
let nc = std::cmp::min(BLIS_NC, n - j2);
|
|
1594
|
+
|
|
1595
|
+
for kk2 in (0..k_dim).step_by(BLIS_KC) {
|
|
1596
|
+
let kc = std::cmp::min(BLIS_KC, k_dim - kk2);
|
|
1597
|
+
|
|
1598
|
+
// Pack B[kk2:kk2+kc, j2:j2+nc] with stride b_stride = BLIS_NC
|
|
1599
|
+
// Layout: [kc][BLIS_NC] row-major, zero-padded beyond nc
|
|
1600
|
+
for p in 0..kc {
|
|
1601
|
+
let k_idx = kk2 + p;
|
|
1602
|
+
let row_start = p * b_stride;
|
|
1603
|
+
// Copy actual data
|
|
1604
|
+
for jj in 0..nc {
|
|
1605
|
+
packed_b[row_start + jj] = b[k_idx * n + j2 + jj];
|
|
1606
|
+
}
|
|
1607
|
+
// Zero-pad remainder (only if nc < BLIS_NC)
|
|
1608
|
+
if nc < BLIS_NC {
|
|
1609
|
+
for jj in nc..BLIS_NC {
|
|
1610
|
+
packed_b[row_start + jj] = 0.0f32;
|
|
1611
|
+
}
|
|
1612
|
+
}
|
|
1613
|
+
}
|
|
1614
|
+
|
|
1615
|
+
// Pack A[i2:i2+mc, kk2:kk2+kc] in column-major order within micro-panels
|
|
1616
|
+
// Layout within micro-panel (at offset i1*kc in packed_a):
|
|
1617
|
+
// for each k step p: BLIS_MR consecutive values
|
|
1618
|
+
// packed_a[i1*kc + p*BLIS_MR + row] = A[i2+i1+row, kk2+p]
|
|
1619
|
+
for i1 in (0..mc).step_by(BLIS_MR) {
|
|
1620
|
+
let mr = std::cmp::min(BLIS_MR, mc - i1);
|
|
1621
|
+
let panel_base = i1 * kc;
|
|
1622
|
+
for p in 0..kc {
|
|
1623
|
+
let col_start = panel_base + p * BLIS_MR;
|
|
1624
|
+
// Copy actual rows
|
|
1625
|
+
for row in 0..mr {
|
|
1626
|
+
let i_idx = i2 + i1 + row;
|
|
1627
|
+
packed_a[col_start + row] = a[i_idx * k_dim + kk2 + p];
|
|
1628
|
+
}
|
|
1629
|
+
// Zero-pad remainder (only if mr < BLIS_MR)
|
|
1630
|
+
if mr < BLIS_MR {
|
|
1631
|
+
for row in mr..BLIS_MR {
|
|
1632
|
+
packed_a[col_start + row] = 0.0f32;
|
|
1633
|
+
}
|
|
1634
|
+
}
|
|
1635
|
+
}
|
|
1636
|
+
}
|
|
1637
|
+
|
|
1638
|
+
// Micro-kernel loops
|
|
1639
|
+
for i1 in (0..mc).step_by(BLIS_MR) {
|
|
1640
|
+
let mr = std::cmp::min(BLIS_MR, mc - i1);
|
|
1641
|
+
|
|
1642
|
+
for j1 in (0..nc).step_by(BLIS_NR) {
|
|
1643
|
+
let nr = std::cmp::min(BLIS_NR, nc - j1);
|
|
1644
|
+
|
|
1645
|
+
let a_panel_ptr = packed_a.as_ptr().wrapping_add(i1 * kc);
|
|
1646
|
+
// B micro-panel starts at column j1 within the packed NC-wide panel
|
|
1647
|
+
let b_panel_ptr = packed_b.as_ptr().wrapping_add(j1);
|
|
1648
|
+
let c_row_ptr = unsafe { c_ptr.add((i2 + i1) * n + j2 + j1) };
|
|
1649
|
+
|
|
1650
|
+
if has_avx2 {
|
|
1651
|
+
unsafe {
|
|
1652
|
+
micro_kernel_6x16(
|
|
1653
|
+
a_panel_ptr,
|
|
1654
|
+
b_panel_ptr,
|
|
1655
|
+
b_stride, // stride between k-rows in packed B
|
|
1656
|
+
c_row_ptr,
|
|
1657
|
+
n, // ldc
|
|
1658
|
+
kc,
|
|
1659
|
+
mr,
|
|
1660
|
+
nr,
|
|
1661
|
+
);
|
|
1662
|
+
}
|
|
1663
|
+
} else {
|
|
1664
|
+
// Scalar fallback
|
|
1665
|
+
for i in 0..mr {
|
|
1666
|
+
for j in 0..nr {
|
|
1667
|
+
let mut sum = 0.0f32;
|
|
1668
|
+
for p in 0..kc {
|
|
1669
|
+
let a_val = packed_a[i1 * kc + p * BLIS_MR + i];
|
|
1670
|
+
let b_val = packed_b[p * b_stride + j1 + j];
|
|
1671
|
+
sum += a_val * b_val;
|
|
1672
|
+
}
|
|
1673
|
+
unsafe {
|
|
1674
|
+
*c_ptr.add((i2 + i1 + i) * n + j2 + j1 + j) += sum;
|
|
1675
|
+
}
|
|
1676
|
+
}
|
|
1677
|
+
}
|
|
1678
|
+
}
|
|
1679
|
+
}
|
|
1680
|
+
}
|
|
1681
|
+
}
|
|
1682
|
+
}
|
|
1683
|
+
}
|
|
1684
|
+
|
|
1685
|
+
/// BLIS-style cache-blocked matmul (serial version).
|
|
1686
|
+
pub fn cache_blocked_matmul(a: &[f32], b: &[f32], c: &mut [f32], m: usize, n: usize, k: usize) {
|
|
1687
|
+
if m == 0 || n == 0 || k == 0 { return; }
|
|
1688
|
+
|
|
1689
|
+
// Pre-allocate packing buffers (reused across loops)
|
|
1690
|
+
// A pack needs extra space for zero-padding: round up MC to next MR multiple
|
|
1691
|
+
let a_pack_rows = ((BLIS_MC + BLIS_MR - 1) / BLIS_MR) * BLIS_MR;
|
|
1692
|
+
let b_pack_size = BLIS_KC * BLIS_NC; // B panel [kc][BLIS_NC]
|
|
1693
|
+
let a_pack_size = a_pack_rows * BLIS_KC; // A panel (rounded mc × kc)
|
|
1694
|
+
let mut packed_b = vec![0.0f32; b_pack_size];
|
|
1695
|
+
let mut packed_a = vec![0.0f32; a_pack_size];
|
|
1696
|
+
|
|
1697
|
+
// Zero C
|
|
1698
|
+
for val in c.iter_mut().take(m * n) {
|
|
1699
|
+
*val = 0.0f32;
|
|
1700
|
+
}
|
|
1701
|
+
|
|
1702
|
+
let c_ptr = c.as_mut_ptr();
|
|
1703
|
+
|
|
1704
|
+
// Process all MC-row blocks serially
|
|
1705
|
+
for i2 in (0..m).step_by(BLIS_MC) {
|
|
1706
|
+
let mc = std::cmp::min(BLIS_MC, m - i2);
|
|
1707
|
+
blis_process_block(a, b, c_ptr, m, n, k, i2, mc, &mut packed_b, &mut packed_a);
|
|
1708
|
+
}
|
|
1709
|
+
}
|
|
1710
|
+
|
|
1711
|
+
/// Parallel version of cache_blocked_matmul using rayon.
|
|
1712
|
+
/// Parallelizes over the outermost loop (MC-row blocks).
|
|
1713
|
+
pub fn parallel_cache_blocked_matmul(a: &[f32], b: &[f32], c: &mut [f32], m: usize, n: usize, k: usize) {
|
|
1714
|
+
use rayon::prelude::*;
|
|
1715
|
+
if m == 0 || n == 0 || k == 0 { return; }
|
|
1716
|
+
|
|
1717
|
+
// Zero C first
|
|
1718
|
+
for val in c.iter_mut().take(m * n) {
|
|
1719
|
+
*val = 0.0f32;
|
|
1720
|
+
}
|
|
1721
|
+
|
|
1722
|
+
let num_blocks = (m + BLIS_MC - 1) / BLIS_MC;
|
|
1723
|
+
|
|
1724
|
+
// Convert pointer to usize for safe Send+Sync capture across rayon threads.
|
|
1725
|
+
// Safety: each thread writes to disjoint rows of C (different i2 blocks).
|
|
1726
|
+
let c_addr = c.as_mut_ptr() as usize;
|
|
1727
|
+
|
|
1728
|
+
// Process each MC-row block in parallel (each thread gets its own packing buffers)
|
|
1729
|
+
(0..num_blocks).into_par_iter().for_each(|block_idx| {
|
|
1730
|
+
let i2 = block_idx * BLIS_MC;
|
|
1731
|
+
let mc = std::cmp::min(BLIS_MC, m - i2);
|
|
1732
|
+
let c_ptr = c_addr as *mut f32;
|
|
1733
|
+
|
|
1734
|
+
// Per-thread packing buffers
|
|
1735
|
+
let a_pack_rows = ((BLIS_MC + BLIS_MR - 1) / BLIS_MR) * BLIS_MR;
|
|
1736
|
+
let b_pack_size = BLIS_KC * BLIS_NC;
|
|
1737
|
+
let a_pack_size = a_pack_rows * BLIS_KC;
|
|
1738
|
+
let mut packed_b = vec![0.0f32; b_pack_size];
|
|
1739
|
+
let mut packed_a = vec![0.0f32; a_pack_size];
|
|
1740
|
+
|
|
1741
|
+
blis_process_block(a, b, c_ptr, m, n, k, i2, mc, &mut packed_b, &mut packed_a);
|
|
1742
|
+
});
|
|
1743
|
+
}
|
|
1744
|
+
|
|
1469
1745
|
pub fn parallel_matmul(a: &[f32], b: &[f32], c: &mut [f32], m: usize, n: usize, k: usize) {
|
|
1470
1746
|
use rayon::prelude::*;
|
|
1471
1747
|
if m == 0 || n == 0 || k == 0 { return; }
|
|
1748
|
+
|
|
1749
|
+
// Use cache-blocked matmul when AVX2 is available for significant speedup
|
|
1750
|
+
if is_x86_feature_detected!("avx2") {
|
|
1751
|
+
parallel_cache_blocked_matmul(a, b, c, m, n, k);
|
|
1752
|
+
return;
|
|
1753
|
+
}
|
|
1754
|
+
|
|
1755
|
+
// Scalar fallback
|
|
1472
1756
|
c.par_chunks_mut(n).enumerate().for_each(|(i, c_row)| {
|
|
1473
1757
|
c_row.fill(0.0f32);
|
|
1474
1758
|
let a_row = &a[i * k..(i + 1) * k];
|
|
@@ -1482,6 +1766,8 @@ pub fn parallel_matmul(a: &[f32], b: &[f32], c: &mut [f32], m: usize, n: usize,
|
|
|
1482
1766
|
|
|
1483
1767
|
pub fn jit_parallel_matmul(a: &[f32], b: &[f32], c: &mut [f32], m: usize, n: usize, k: usize) {
|
|
1484
1768
|
if m == 0 || n == 0 || k == 0 { return; }
|
|
1769
|
+
|
|
1770
|
+
// Use JIT AVX-512 kernel for large matrices where it excels
|
|
1485
1771
|
let isa = detect_isa_level();
|
|
1486
1772
|
if isa == ISALevel::AVX512 && n >= 64 {
|
|
1487
1773
|
let kernel = CompiledKernel::compile_matmul_avx512(m, n, k);
|
|
@@ -1490,6 +1776,13 @@ pub fn jit_parallel_matmul(a: &[f32], b: &[f32], c: &mut [f32], m: usize, n: usi
|
|
|
1490
1776
|
return;
|
|
1491
1777
|
}
|
|
1492
1778
|
}
|
|
1779
|
+
|
|
1780
|
+
// Use cache-blocked matmul when AVX2 is available
|
|
1781
|
+
if is_x86_feature_detected!("avx2") {
|
|
1782
|
+
parallel_cache_blocked_matmul(a, b, c, m, n, k);
|
|
1783
|
+
return;
|
|
1784
|
+
}
|
|
1785
|
+
|
|
1493
1786
|
parallel_matmul(a, b, c, m, n, k);
|
|
1494
1787
|
}
|
|
1495
1788
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|