mplang-nightly 0.1.dev158__py3-none-any.whl → 0.1.dev268__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (191)
  1. mplang/__init__.py +21 -45
  2. mplang/py.typed +13 -0
  3. mplang/v1/__init__.py +157 -0
  4. mplang/v1/_device.py +602 -0
  5. mplang/{analysis → v1/analysis}/__init__.py +1 -1
  6. mplang/{analysis → v1/analysis}/diagram.py +5 -7
  7. mplang/v1/core/__init__.py +157 -0
  8. mplang/{core → v1/core}/cluster.py +30 -14
  9. mplang/{core → v1/core}/comm.py +5 -1
  10. mplang/{core → v1/core}/context_mgr.py +1 -1
  11. mplang/{core/dtype.py → v1/core/dtypes.py} +44 -2
  12. mplang/{core → v1/core}/expr/__init__.py +7 -7
  13. mplang/{core → v1/core}/expr/ast.py +13 -14
  14. mplang/{core → v1/core}/expr/evaluator.py +65 -24
  15. mplang/{core → v1/core}/expr/printer.py +24 -18
  16. mplang/{core → v1/core}/expr/transformer.py +3 -3
  17. mplang/{core → v1/core}/expr/utils.py +2 -2
  18. mplang/{core → v1/core}/expr/visitor.py +1 -1
  19. mplang/{core → v1/core}/expr/walk.py +1 -1
  20. mplang/{core → v1/core}/interp.py +6 -6
  21. mplang/{core → v1/core}/mpir.py +23 -16
  22. mplang/{core → v1/core}/mpobject.py +6 -6
  23. mplang/{core → v1/core}/mptype.py +13 -10
  24. mplang/{core → v1/core}/pfunc.py +4 -4
  25. mplang/{core → v1/core}/primitive.py +106 -201
  26. mplang/{core → v1/core}/table.py +36 -8
  27. mplang/{core → v1/core}/tensor.py +1 -1
  28. mplang/{core → v1/core}/tracer.py +9 -9
  29. mplang/{api.py → v1/host.py} +38 -6
  30. mplang/v1/kernels/__init__.py +41 -0
  31. mplang/{kernels → v1/kernels}/base.py +1 -1
  32. mplang/v1/kernels/basic.py +240 -0
  33. mplang/{kernels → v1/kernels}/context.py +42 -27
  34. mplang/{kernels → v1/kernels}/crypto.py +44 -37
  35. mplang/v1/kernels/fhe.py +858 -0
  36. mplang/{kernels → v1/kernels}/mock_tee.py +12 -13
  37. mplang/{kernels → v1/kernels}/phe.py +263 -57
  38. mplang/{kernels → v1/kernels}/spu.py +137 -48
  39. mplang/{kernels → v1/kernels}/sql_duckdb.py +12 -15
  40. mplang/{kernels → v1/kernels}/stablehlo.py +30 -23
  41. mplang/v1/kernels/value.py +626 -0
  42. mplang/{ops → v1/ops}/__init__.py +5 -16
  43. mplang/{ops → v1/ops}/base.py +2 -5
  44. mplang/{ops/builtin.py → v1/ops/basic.py} +34 -26
  45. mplang/v1/ops/crypto.py +262 -0
  46. mplang/v1/ops/fhe.py +272 -0
  47. mplang/{ops → v1/ops}/jax_cc.py +33 -68
  48. mplang/v1/ops/nnx_cc.py +168 -0
  49. mplang/{ops → v1/ops}/phe.py +16 -4
  50. mplang/{ops → v1/ops}/spu.py +3 -5
  51. mplang/v1/ops/sql_cc.py +303 -0
  52. mplang/{ops → v1/ops}/tee.py +9 -24
  53. mplang/{protos → v1/protos}/v1alpha1/mpir_pb2.pyi +71 -21
  54. mplang/v1/protos/v1alpha1/value_pb2.py +34 -0
  55. mplang/v1/protos/v1alpha1/value_pb2.pyi +169 -0
  56. mplang/{runtime → v1/runtime}/__init__.py +2 -2
  57. mplang/v1/runtime/channel.py +230 -0
  58. mplang/{runtime → v1/runtime}/cli.py +35 -20
  59. mplang/{runtime → v1/runtime}/client.py +19 -8
  60. mplang/{runtime → v1/runtime}/communicator.py +59 -15
  61. mplang/{runtime → v1/runtime}/data_providers.py +80 -19
  62. mplang/{runtime → v1/runtime}/driver.py +30 -12
  63. mplang/v1/runtime/link_comm.py +196 -0
  64. mplang/{runtime → v1/runtime}/server.py +58 -42
  65. mplang/{runtime → v1/runtime}/session.py +57 -71
  66. mplang/{runtime → v1/runtime}/simulation.py +55 -28
  67. mplang/v1/simp/api.py +353 -0
  68. mplang/{simp → v1/simp}/mpi.py +8 -9
  69. mplang/{simp/__init__.py → v1/simp/party.py} +19 -145
  70. mplang/{simp → v1/simp}/random.py +21 -22
  71. mplang/v1/simp/smpc.py +238 -0
  72. mplang/v1/utils/table_utils.py +185 -0
  73. mplang/v2/__init__.py +424 -0
  74. mplang/v2/backends/__init__.py +57 -0
  75. mplang/v2/backends/bfv_impl.py +705 -0
  76. mplang/v2/backends/channel.py +217 -0
  77. mplang/v2/backends/crypto_impl.py +723 -0
  78. mplang/v2/backends/field_impl.py +454 -0
  79. mplang/v2/backends/func_impl.py +107 -0
  80. mplang/v2/backends/phe_impl.py +148 -0
  81. mplang/v2/backends/simp_design.md +136 -0
  82. mplang/v2/backends/simp_driver/__init__.py +41 -0
  83. mplang/v2/backends/simp_driver/http.py +168 -0
  84. mplang/v2/backends/simp_driver/mem.py +280 -0
  85. mplang/v2/backends/simp_driver/ops.py +135 -0
  86. mplang/v2/backends/simp_driver/state.py +60 -0
  87. mplang/v2/backends/simp_driver/values.py +52 -0
  88. mplang/v2/backends/simp_worker/__init__.py +29 -0
  89. mplang/v2/backends/simp_worker/http.py +354 -0
  90. mplang/v2/backends/simp_worker/mem.py +102 -0
  91. mplang/v2/backends/simp_worker/ops.py +167 -0
  92. mplang/v2/backends/simp_worker/state.py +49 -0
  93. mplang/v2/backends/spu_impl.py +275 -0
  94. mplang/v2/backends/spu_state.py +187 -0
  95. mplang/v2/backends/store_impl.py +62 -0
  96. mplang/v2/backends/table_impl.py +838 -0
  97. mplang/v2/backends/tee_impl.py +215 -0
  98. mplang/v2/backends/tensor_impl.py +519 -0
  99. mplang/v2/cli.py +603 -0
  100. mplang/v2/cli_guide.md +122 -0
  101. mplang/v2/dialects/__init__.py +36 -0
  102. mplang/v2/dialects/bfv.py +665 -0
  103. mplang/v2/dialects/crypto.py +689 -0
  104. mplang/v2/dialects/dtypes.py +378 -0
  105. mplang/v2/dialects/field.py +210 -0
  106. mplang/v2/dialects/func.py +135 -0
  107. mplang/v2/dialects/phe.py +723 -0
  108. mplang/v2/dialects/simp.py +944 -0
  109. mplang/v2/dialects/spu.py +349 -0
  110. mplang/v2/dialects/store.py +63 -0
  111. mplang/v2/dialects/table.py +407 -0
  112. mplang/v2/dialects/tee.py +346 -0
  113. mplang/v2/dialects/tensor.py +1175 -0
  114. mplang/v2/edsl/README.md +279 -0
  115. mplang/v2/edsl/__init__.py +99 -0
  116. mplang/v2/edsl/context.py +311 -0
  117. mplang/v2/edsl/graph.py +463 -0
  118. mplang/v2/edsl/jit.py +62 -0
  119. mplang/v2/edsl/object.py +53 -0
  120. mplang/v2/edsl/primitive.py +284 -0
  121. mplang/v2/edsl/printer.py +119 -0
  122. mplang/v2/edsl/registry.py +207 -0
  123. mplang/v2/edsl/serde.py +375 -0
  124. mplang/v2/edsl/tracer.py +614 -0
  125. mplang/v2/edsl/typing.py +816 -0
  126. mplang/v2/kernels/Makefile +30 -0
  127. mplang/v2/kernels/__init__.py +23 -0
  128. mplang/v2/kernels/gf128.cpp +148 -0
  129. mplang/v2/kernels/ldpc.cpp +82 -0
  130. mplang/v2/kernels/okvs.cpp +283 -0
  131. mplang/v2/kernels/okvs_opt.cpp +291 -0
  132. mplang/v2/kernels/py_kernels.py +398 -0
  133. mplang/v2/libs/collective.py +330 -0
  134. mplang/v2/libs/device/__init__.py +51 -0
  135. mplang/v2/libs/device/api.py +813 -0
  136. mplang/v2/libs/device/cluster.py +352 -0
  137. mplang/v2/libs/ml/__init__.py +23 -0
  138. mplang/v2/libs/ml/sgb.py +1861 -0
  139. mplang/v2/libs/mpc/__init__.py +41 -0
  140. mplang/v2/libs/mpc/_utils.py +99 -0
  141. mplang/v2/libs/mpc/analytics/__init__.py +35 -0
  142. mplang/v2/libs/mpc/analytics/aggregation.py +372 -0
  143. mplang/v2/libs/mpc/analytics/groupby.md +99 -0
  144. mplang/v2/libs/mpc/analytics/groupby.py +331 -0
  145. mplang/v2/libs/mpc/analytics/permutation.py +386 -0
  146. mplang/v2/libs/mpc/common/constants.py +39 -0
  147. mplang/v2/libs/mpc/ot/__init__.py +32 -0
  148. mplang/v2/libs/mpc/ot/base.py +222 -0
  149. mplang/v2/libs/mpc/ot/extension.py +477 -0
  150. mplang/v2/libs/mpc/ot/silent.py +217 -0
  151. mplang/v2/libs/mpc/psi/__init__.py +40 -0
  152. mplang/v2/libs/mpc/psi/cuckoo.py +228 -0
  153. mplang/v2/libs/mpc/psi/okvs.py +49 -0
  154. mplang/v2/libs/mpc/psi/okvs_gct.py +79 -0
  155. mplang/v2/libs/mpc/psi/oprf.py +310 -0
  156. mplang/v2/libs/mpc/psi/rr22.py +344 -0
  157. mplang/v2/libs/mpc/psi/unbalanced.py +200 -0
  158. mplang/v2/libs/mpc/vole/__init__.py +31 -0
  159. mplang/v2/libs/mpc/vole/gilboa.py +327 -0
  160. mplang/v2/libs/mpc/vole/ldpc.py +383 -0
  161. mplang/v2/libs/mpc/vole/silver.py +336 -0
  162. mplang/v2/runtime/__init__.py +15 -0
  163. mplang/v2/runtime/dialect_state.py +41 -0
  164. mplang/v2/runtime/interpreter.py +871 -0
  165. mplang/v2/runtime/object_store.py +194 -0
  166. mplang/v2/runtime/value.py +141 -0
  167. {mplang_nightly-0.1.dev158.dist-info → mplang_nightly-0.1.dev268.dist-info}/METADATA +24 -17
  168. mplang_nightly-0.1.dev268.dist-info/RECORD +180 -0
  169. {mplang_nightly-0.1.dev158.dist-info → mplang_nightly-0.1.dev268.dist-info}/WHEEL +1 -1
  170. mplang/core/__init__.py +0 -92
  171. mplang/device.py +0 -340
  172. mplang/kernels/builtin.py +0 -207
  173. mplang/ops/crypto.py +0 -109
  174. mplang/ops/ibis_cc.py +0 -139
  175. mplang/ops/sql.py +0 -61
  176. mplang/protos/v1alpha1/mpir_pb2_grpc.py +0 -3
  177. mplang/runtime/link_comm.py +0 -131
  178. mplang/simp/smpc.py +0 -201
  179. mplang/utils/table_utils.py +0 -73
  180. mplang_nightly-0.1.dev158.dist-info/RECORD +0 -77
  181. /mplang/{core → v1/core}/mask.py +0 -0
  182. /mplang/{protos → v1/protos}/v1alpha1/mpir_pb2.py +0 -0
  183. /mplang/{runtime → v1/runtime}/exceptions.py +0 -0
  184. /mplang/{runtime → v1/runtime}/http_api.md +0 -0
  185. /mplang/{kernels → v1/simp}/__init__.py +0 -0
  186. /mplang/{utils → v1/utils}/__init__.py +0 -0
  187. /mplang/{utils → v1/utils}/crypto.py +0 -0
  188. /mplang/{utils → v1/utils}/func_utils.py +0 -0
  189. /mplang/{utils → v1/utils}/spu_utils.py +0 -0
  190. {mplang_nightly-0.1.dev158.dist-info → mplang_nightly-0.1.dev268.dist-info}/entry_points.txt +0 -0
  191. {mplang_nightly-0.1.dev158.dist-info → mplang_nightly-0.1.dev268.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,30 @@
1
# Copyright 2025 Ant Group Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Build the optional native kernel library for mplang v2.
CXX = g++
# -march=native enables PCLMULQDQ if the host CPU supports it.
# -mpclmul -maes are explicit flags if native doesn't pick them up, but native is safer for local dev.
CXXFLAGS = -O3 -Wall -shared -fPIC -march=native -mpclmul -maes -fopenmp

TARGET = libmplang_kernels.so
SRCS = gf128.cpp okvs.cpp okvs_opt.cpp ldpc.cpp
OBJS = $(SRCS:.cpp=.o)

# 'all' and 'clean' are command targets, not files: mark them phony so a
# stray file with either name can never shadow the rule.
.PHONY: all clean

all: $(TARGET)

# Sources are compiled and linked in one step, so no intermediate .o files
# are produced; OBJS is kept in clean only as a safety net for leftovers.
$(TARGET): $(SRCS)
	$(CXX) $(CXXFLAGS) -o $@ $^

clean:
	rm -f $(TARGET) $(OBJS)
@@ -0,0 +1,23 @@
1
# Copyright 2025 Ant Group Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Kernels package for mplang v2.

This package contains both:

- Native C++ kernels (libmplang_kernels.so, built by the Makefile in this
  directory) for performance
- Pure Python fallback implementations for portability

The native kernels are optional. When not available, pure Python
implementations will be used automatically.
"""
@@ -0,0 +1,148 @@
1
+ /*
2
+ * Copyright 2025 Ant Group Co., Ltd.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+
18
+ #include <cstdint>
19
+ #include <iostream>
20
+ #include <wmmintrin.h> // For PCLMULQDQ
21
+ #include <emmintrin.h> // For SSE2
22
+ #include <tmmintrin.h> // For SSSE3 (pshufb)
23
+
24
+ // Helper to reverse bits in bytes (if needed, but for GF(128) usually standard representation is used)
25
+ // We assume standard GCM representation (x^128 + x^7 + x^2 + x + 1)
26
+ // Little-endian input: a[0] is low 64 bits.
27
+
28
+ extern "C" {
29
+
30
+ // ------------------------------------------------------------------------
31
+ // GF(2^128) Multiplication using PCLMULQDQ
32
+ // ------------------------------------------------------------------------
33
+ //
34
+ // Performs c = a * b mod P(x)
35
+ // P(x) = x^128 + x^7 + x^2 + x + 1
36
+ //
37
+ // Implementation based on Intel Whitepaper:
38
+ // "Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode"
39
+ // Algorithm 1 or optimized variants.
40
+
41
// Perform 128x128 -> 256 bit multiplication (carry-less).
// Schoolbook product from four PCLMULQDQ partial products; the two cross
// terms straddle the 64-bit boundary of the 256-bit result and are split
// between the low and high output words.
// Returns low 128 bits in ret_lo, high 128 bits in ret_hi.
// NOTE(review): currently unused -- gf128_mul below inlines the same
// sequence; consider having gf128_mul call this helper.
static inline void clmul128(__m128i a, __m128i b, __m128i *ret_lo, __m128i *ret_hi) {
    __m128i tmp3, tmp4, tmp5, tmp6;

    tmp3 = _mm_clmulepi64_si128(a, b, 0x00); // a_lo * b_lo
    tmp4 = _mm_clmulepi64_si128(a, b, 0x11); // a_hi * b_hi
    tmp5 = _mm_clmulepi64_si128(a, b, 0x01); // a_lo * b_hi
    tmp6 = _mm_clmulepi64_si128(a, b, 0x10); // a_hi * b_lo

    tmp5 = _mm_xor_si128(tmp5, tmp6); // (a_lo*b_hi) + (a_hi*b_lo), the cross term

    // Cross term contributes bits 64..191: shift its halves into place.
    __m128i tmp5_lo = _mm_slli_si128(tmp5, 8);
    __m128i tmp5_hi = _mm_srli_si128(tmp5, 8);

    *ret_lo = _mm_xor_si128(tmp3, tmp5_lo);
    *ret_hi = _mm_xor_si128(tmp4, tmp5_hi);
}
59
+
60
+ // Reduce 256-bit polynomial modulo P(x) = x^128 + x^7 + x^2 + x + 1
61
+ // Input: c_lo (low 128), c_hi (high 128)
62
+ // Output: reduced (128 bit)
63
+ // Based on optimized reduction for GCM (often called "folding")
64
+ static inline __m128i gcm_reduce(__m128i c_lo, __m128i c_hi) {
65
+ __m128i tmp3, tmp6, tmp7;
66
+ __m128i R = _mm_set_epi32(1, 0, 0, 135); // 0...010...010000111 (See note below)
67
+ // Actually, careful with endianness and GCM bit order "reflected" vs "polynomial".
68
+ // Most VOLE implementations (e.g., libOTe) use standard polynomial basis, not reflected GCM.
69
+ // Standard polynomial basis P(x) = x^128 + x^7 + x^2 + x + 1.
70
+ // x^128 = x^7 + x^2 + x + 1 (mod P)
71
+
72
+ // Simple reduction algorithm:
73
+ // We need to reduce c_hi into c_lo.
74
+ // 256-bit product C = C_hi * x^128 + C_lo
75
+ // x^128 mod P = (x^7 + x^2 + x + 1)
76
+
77
+ // Let's implement specific reduction for standard basis.
78
+ // Method: Shift-based or PCLMUL based reduction.
79
+ // For Speed, use PCLMUL.
80
+
81
+ // Constants for reduction
82
+ // Algorithm 5 from Intel paper (modified for standard basis if needed)
83
+ // The one in paper is for Reflected GCM.
84
+ // Let's assume we want Standard Basis GF(2^128).
85
+ // Ref: https://github.com/emp-toolkit/emp-ot/blob/master/emp-ot/ferret/ferret_cot.hpp#L15
86
+
87
+ return c_lo; // PLACEHOLDER: Reduction is complex to get right without writing a test first.
88
+ // I will implement a simpler but slower reduction first to verify pipeline,
89
+ // then optimize. Or copy verified code.
90
+ }
91
+
92
// Verified implementation of GF(2^128) Multiply from EMP-toolkit (Standard Basis)
// https://github.com/emp-toolkit/emp-tool/blob/master/emp-tool/utils/block.h#L137
//
// Computes out = a * b in GF(2^128) with P(x) = x^128 + x^7 + x^2 + x + 1.
// Operands are little-endian pairs of uint64: ptr[0] holds the low 64 bits.
void gf128_mul(uint64_t* a_ptr, uint64_t* b_ptr, uint64_t* out_ptr) {
    __m128i a = _mm_loadu_si128((__m128i*)a_ptr);
    __m128i b = _mm_loadu_si128((__m128i*)b_ptr);

    // 1. Multiply (Carry-less): four PCLMULQDQ partial products.
    // The two cross terms straddle the 64-bit boundary, so after XOR-ing
    // them together their halves are shifted into the low (r0) and high
    // (r1) words of the 256-bit product.
    __m128i tmp3, tmp4, tmp5, tmp6;
    tmp3 = _mm_clmulepi64_si128(a, b, 0x00); // a_lo * b_lo
    tmp4 = _mm_clmulepi64_si128(a, b, 0x11); // a_hi * b_hi
    tmp5 = _mm_clmulepi64_si128(a, b, 0x01); // a_lo * b_hi
    tmp6 = _mm_clmulepi64_si128(a, b, 0x10); // a_hi * b_lo
    tmp5 = _mm_xor_si128(tmp5, tmp6);
    __m128i tmp5_lo = _mm_slli_si128(tmp5, 8);
    __m128i tmp5_hi = _mm_srli_si128(tmp5, 8);
    __m128i r0 = _mm_xor_si128(tmp3, tmp5_lo); // low 128 bits of a*b
    __m128i r1 = _mm_xor_si128(tmp4, tmp5_hi); // high 128 bits of a*b

    // 2. Reduce (Standard Basis)
    // P(x) = x^128 + x^7 + x^2 + x + 1, so x^128 == Q(x) = 0x87 (mod P).
    // Fold r1 * Q into the low word; the fold itself can overflow into
    // M_hi (at most 71 bits), which is folded once more via H. A third
    // fold is unnecessary since deg(Q) = 7 < 64.
    __m128i Q = _mm_set_epi64x(0, 0x87);

    __m128i r1_lo = r1; // NOTE(review): unused alias, kept as-is

    __m128i m0 = _mm_clmulepi64_si128(r1, Q, 0x00); // r1_lo * Q
    __m128i m1 = _mm_clmulepi64_si128(r1, Q, 0x10); // r1_hi * Q

    __m128i m1_shifted = _mm_slli_si128(m1, 8);
    __m128i M_lo = _mm_xor_si128(m0, m1_shifted); // low 128 bits of r1*Q
    __m128i M_hi = _mm_srli_si128(m1, 8);         // overflow of the first fold

    __m128i H = _mm_clmulepi64_si128(M_hi, Q, 0x00); // second fold

    __m128i res = _mm_xor_si128(r0, M_lo);
    res = _mm_xor_si128(res, H);

    _mm_storeu_si128((__m128i*)out_ptr, res);
}
134
+
135
+ // Batch Multiplication
136
+ void gf128_mul_batch(uint64_t* a, uint64_t* b, uint64_t* out, int64_t n) {
137
+ #pragma omp parallel for schedule(static)
138
+ for (int64_t i = 0; i < n; ++i) {
139
+ gf128_mul(a + 2*i, b + 2*i, out + 2*i);
140
+ }
141
+ }
142
+
143
+ // Test function updated
144
+ void gf128_mul_test(uint64_t* a, uint64_t* b, uint64_t* out) {
145
+ gf128_mul(a, b, out);
146
+ }
147
+
148
+ }
@@ -0,0 +1,82 @@
1
+ /*
2
+ * Copyright 2025 Ant Group Co., Ltd.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #include <cstdint>
18
+ #include <cstring>
19
+ #include <vector>
20
+ #include <immintrin.h>
21
+
22
+ #ifdef _OPENMP
23
+ #include <omp.h>
24
+ #endif
25
+
26
+ extern "C" {
27
+
28
/**
 * @brief LDPC Encoding: Compute Syndrome s = H * x
 *
 * H is a sparse M x N binary matrix (CSR format).
 * x is a dense N-vector of 128-bit blocks (N * 16 bytes).
 * s is a dense M-vector of 128-bit blocks (M * 16 bytes).
 *
 * Logic: For each row i of H, s[i] = XOR(x[j]) for all j where H[i, j] = 1.
 *
 * @param message_ptr Pointer to message x (N * 2 uint64_t)
 * @param indices_ptr Pointer to CSR indices (uint64_t)
 * @param indptr_ptr Pointer to CSR indptr (M+1 uint64_t)
 * @param output_ptr Pointer to output s (M * 2 uint64_t)
 * @param m Number of rows in H (syndrome length)
 * @param n Number of cols in H (message length)
 */
void ldpc_encode(const uint64_t* message_ptr,
                 const uint64_t* indices_ptr,
                 const uint64_t* indptr_ptr,
                 uint64_t* output_ptr,
                 uint64_t m,
                 uint64_t n) {
    // View the flat uint64 buffers as 128-bit lanes. All accesses go
    // through loadu/storeu, so no particular alignment is assumed.
    const __m128i* message_blocks = (const __m128i*)message_ptr;
    __m128i* syndrome_blocks = (__m128i*)output_ptr;

    // Rows are independent: one XOR-accumulator per row, parallel-safe.
    #pragma omp parallel for schedule(static)
    for (uint64_t row = 0; row < m; ++row) {
        __m128i acc = _mm_setzero_si128();

        const uint64_t first = indptr_ptr[row];
        const uint64_t last = indptr_ptr[row + 1];

        for (uint64_t e = first; e < last; ++e) {
            // Accumulate the message block at this column (H[row,col]=1).
            const uint64_t col = indices_ptr[e];
            acc = _mm_xor_si128(acc, _mm_loadu_si128(&message_blocks[col]));
        }

        _mm_storeu_si128(&syndrome_blocks[row], acc);
    }
}
81
+
82
+ }
@@ -0,0 +1,283 @@
1
+ /*
2
+ * Copyright 2025 Ant Group Co., Ltd.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #include <cstdint>
18
+ #include <vector>
19
+ #include <stack>
20
+ #include <random>
21
+ #include <immintrin.h>
22
+ #include <cstring>
23
+ #include <cstdio>
24
+ #include <iostream>
25
+
26
+ extern "C" {
27
+
28
// AES-NI Hashing Helper
// The three candidate table positions produced for one key by hash_key
// (a 3-hash cuckoo-style placement).
struct Indices {
    uint64_t h1, h2, h3;
};
32
+
33
// Map a 64-bit key to three distinct positions in [0, m) using two AES
// rounds keyed by `seed` as a fast mixer (speed, not cryptographic
// strength, is the goal here).
// NOTE(review): the distinctness fixup below assumes m >= 3; with fewer
// slots three distinct indices cannot exist -- confirm callers guarantee this.
inline Indices hash_key(uint64_t key, uint64_t m, __m128i seed) {
    __m128i k = _mm_set_epi64x(0, key);
    __m128i h = _mm_aesenc_si128(k, seed);
    h = _mm_aesenc_si128(h, seed);

    // Split the 128-bit mix into two 64-bit values; derive a third from
    // their XOR so all three positions come from one hash evaluation.
    uint64_t v1 = _mm_extract_epi64(h, 0);
    uint64_t v2 = _mm_extract_epi64(h, 1);

    Indices idx;
    idx.h1 = v1 % m;
    idx.h2 = v2 % m;
    idx.h3 = (v1 ^ v2) % m;

    // Enforce distinct indices by bumping collisions forward (mod m).
    // h3 may need two bumps when it collides with both h1 and h2.
    if(idx.h2 == idx.h1) {
        idx.h2 = (idx.h2 + 1) % m;
    }
    if(idx.h3 == idx.h1 || idx.h3 == idx.h2) {
        idx.h3 = (idx.h3 + 1) % m;
        if(idx.h3 == idx.h1 || idx.h3 == idx.h2) {
            idx.h3 = (idx.h3 + 1) % m;
        }
    }

    return idx;
}
59
+
60
// Solve OKVS System: H * P = V
//
// Each key i defines a sparse row with ones at the three positions given by
// hash_key; solve for the storage vector P (m slots of 128-bit values) so
// that for every key, P[h1] ^ P[h2] ^ P[h3] == values[i]. Uses 2-core
// peeling (repeatedly remove a column of degree 1 with its owning row)
// followed by back substitution in reverse peel order.
//
// On peeling failure (the 3-hypergraph has a 2-core), the output is zeroed
// and an error is written to stderr; callers are expected to retry with a
// different seed or larger m. No status is returned.
void solve_okvs(uint64_t* keys, uint64_t* values, uint64_t* output, uint64_t n, uint64_t m, uint64_t* seed_ptr) {
    // Load dynamic seed
    __m128i seed = _mm_loadu_si128((__m128i*)seed_ptr);

    // Per-row hash triple (the three column positions of row i).
    struct Row {
        uint64_t h1, h2, h3;
    };
    std::vector<Row> rows(n);

    // 1. Parallel Hash Compute
    #pragma omp parallel for schedule(static)
    for(uint64_t i=0; i<n; ++i) {
        Indices idx = hash_key(keys[i], m, seed);
        rows[i] = {idx.h1, idx.h2, idx.h3};
    }

    // 2. Count Degrees (Serial or Atomic)
    // Since M ~ 1.2N, atomic contention is low? Serial is safe and simple.
    std::vector<int> col_degree(m, 0);
    for(uint64_t i=0; i<n; ++i) {
        col_degree[rows[i].h1]++;
        col_degree[rows[i].h2]++;
        col_degree[rows[i].h3]++;
    }

    // 3. Build CSR Structure (Flat Arrays) to replace vector<vector>
    // col_start[j] points to start of column j's rows in flat_rows
    std::vector<int> col_start(m + 1, 0);

    // Prefix sum to compute start positions
    // col_start[0] = 0
    // col_start[j+1] = col_start[j] + degree[j]
    for(uint64_t j=0; j<m; ++j) {
        col_start[j+1] = col_start[j] + col_degree[j];
    }

    // Total edges = 3 * N implies flat_rows size
    std::vector<int> flat_rows(n * 3);

    // Temporary copy of start indices to use as fill pointers
    std::vector<int> fill_ptr = col_start;

    for(uint64_t i=0; i<n; ++i) {
        const auto& r = rows[i];
        flat_rows[fill_ptr[r.h1]++] = i;
        flat_rows[fill_ptr[r.h2]++] = i;
        flat_rows[fill_ptr[r.h3]++] = i;
    }

    // 4. Initialize Peeling: seed the queue with all degree-1 columns.
    std::vector<int> peel_stack;
    peel_stack.reserve(m);
    for(uint64_t j=0; j<m; ++j) {
        if(col_degree[j] == 1) peel_stack.push_back(j);
    }

    std::vector<bool> row_removed(n, false);
    std::vector<bool> col_removed(m, false);

    // Records "column `col` is solved by row `row`" in peel order; replayed
    // in reverse during back substitution.
    struct Assignment {
        int col;
        int row;
    };
    std::vector<Assignment> assignment_stack;
    assignment_stack.reserve(n);

    int head = 0;

    // 5. Peeling BFS
    // NOTE(review): `head < peel_stack.size()` compares signed int against
    // size_t; fine in practice for these sizes but worth tidying.
    while(head < peel_stack.size()) {
        int j = peel_stack[head++];
        if(col_removed[j]) continue; // degree may have dropped to 0 since enqueue

        // Find owner row: Iterate over edges of col j using flat arrays.
        // The CSR lists are immutable; removal is tracked via row_removed.
        int owner_row = -1;
        int start = col_start[j];
        int end = col_start[j+1];

        for(int k=start; k<end; ++k) {
            int r_idx = flat_rows[k];
            if(!row_removed[r_idx]) {
                owner_row = r_idx;
                break;
            }
        }

        if(owner_row == -1) {
            // All incident rows already peeled: column is free (stays 0).
            col_removed[j] = true;
            continue;
        }

        assignment_stack.push_back({j, owner_row});
        col_removed[j] = true;
        row_removed[owner_row] = true;

        // Update neighbors: removing the owner row lowers the degree of its
        // other two columns; any that reach degree 1 become peelable.
        const auto& r = rows[owner_row];
        uint64_t nbs[3] = {r.h1, r.h2, r.h3};
        for(uint64_t neighbor : nbs) {
            if(neighbor == (uint64_t)j) continue;
            if(col_removed[neighbor]) continue;

            col_degree[neighbor]--;
            if(col_degree[neighbor] == 1) {
                peel_stack.push_back((int)neighbor);
            }
        }
    }

    // Every row must have been assigned a column, or the system is unsolved.
    // NOTE(review): %lu assumes uint64_t == unsigned long (LP64); use PRIu64
    // for portability.
    if(assignment_stack.size() != n) {
        fprintf(stderr, "[ERROR] OKVS Peeling Failed. N=%lu M=%lu Solved=%lu\n",
                n, m, assignment_stack.size());
        // Zero output to identify failure clearly
        memset(output, 0, m * 16);
        return;
    }

    // 6. Back Substitution
    // Use 128-bit intrinsics for value XORing
    __m128i* P_vec = (__m128i*)output;
    __m128i* V_vec = (__m128i*)values;
    memset(output, 0, m * 16);

    // Process in reverse constraint order (LIFO): when assignment i is
    // replayed, its row's other two slots already hold their final values,
    // so setting P[col] = V[row] ^ (current row sum) satisfies the row.
    for(int i = (int)assignment_stack.size() - 1; i >= 0; --i) {
        const auto& a = assignment_stack[i];
        const auto& r = rows[a.row];

        __m128i val1 = _mm_loadu_si128(&P_vec[r.h1]);
        __m128i val2 = _mm_loadu_si128(&P_vec[r.h2]);
        __m128i val3 = _mm_loadu_si128(&P_vec[r.h3]);
        __m128i target = _mm_loadu_si128(&V_vec[a.row]);

        __m128i current_sum = _mm_xor_si128(_mm_xor_si128(val1, val2), val3);
        __m128i diff = _mm_xor_si128(target, current_sum);

        _mm_storeu_si128(&P_vec[a.col], diff);
    }
}
200
+
201
+ void decode_okvs(uint64_t* keys, uint64_t* storage, uint64_t* output, uint64_t n, uint64_t m, uint64_t* seed_ptr) {
202
+ __m128i seed = _mm_loadu_si128((__m128i*)seed_ptr);
203
+ __m128i* P_vec = (__m128i*)storage;
204
+ __m128i* out_vec = (__m128i*)output;
205
+
206
+ #pragma omp parallel for schedule(static)
207
+ for(uint64_t i=0; i<n; ++i) {
208
+ Indices idx = hash_key(keys[i], m, seed);
209
+ __m128i val = _mm_xor_si128(
210
+ _mm_xor_si128(_mm_loadu_si128(&P_vec[idx.h1]), _mm_loadu_si128(&P_vec[idx.h2])),
211
+ _mm_loadu_si128(&P_vec[idx.h3])
212
+ );
213
+ _mm_storeu_si128(&out_vec[i], val);
214
+ }
215
+ }
216
+
217
// Helper for key expansion
// One AES-128 key-schedule round: temp1 is the previous round key, temp2
// the raw _mm_aeskeygenassist_si128 output for this round's rcon. The
// shuffle broadcasts the relevant assist word; the slli/xor cascade forms
// the running XOR of the four 32-bit words of the new round key.
inline __m128i aes_keygen_assist(__m128i temp1, __m128i temp2) {
    __m128i temp3;
    temp2 = _mm_shuffle_epi32(temp2, 0xff);
    temp3 = _mm_slli_si128(temp1, 0x4);
    temp1 = _mm_xor_si128(temp1, temp3);
    temp3 = _mm_slli_si128(temp3, 0x4);
    temp1 = _mm_xor_si128(temp1, temp3);
    temp3 = _mm_slli_si128(temp3, 0x4);
    temp1 = _mm_xor_si128(temp1, temp3);
    temp1 = _mm_xor_si128(temp1, temp2);
    return temp1;
}
230
+
231
// Expand a 128-bit user key into the 11 AES-128 round keys.
// The rcon argument of _mm_aeskeygenassist_si128 must be a compile-time
// immediate, which is why the sequence is unrolled rather than looped.
void aes_key_expand(__m128i user_key, __m128i* key_schedule) {
    key_schedule[0] = user_key;
    key_schedule[1] = aes_keygen_assist(key_schedule[0], _mm_aeskeygenassist_si128(key_schedule[0], 0x01));
    key_schedule[2] = aes_keygen_assist(key_schedule[1], _mm_aeskeygenassist_si128(key_schedule[1], 0x02));
    key_schedule[3] = aes_keygen_assist(key_schedule[2], _mm_aeskeygenassist_si128(key_schedule[2], 0x04));
    key_schedule[4] = aes_keygen_assist(key_schedule[3], _mm_aeskeygenassist_si128(key_schedule[3], 0x08));
    key_schedule[5] = aes_keygen_assist(key_schedule[4], _mm_aeskeygenassist_si128(key_schedule[4], 0x10));
    key_schedule[6] = aes_keygen_assist(key_schedule[5], _mm_aeskeygenassist_si128(key_schedule[5], 0x20));
    key_schedule[7] = aes_keygen_assist(key_schedule[6], _mm_aeskeygenassist_si128(key_schedule[6], 0x40));
    key_schedule[8] = aes_keygen_assist(key_schedule[7], _mm_aeskeygenassist_si128(key_schedule[7], 0x80));
    key_schedule[9] = aes_keygen_assist(key_schedule[8], _mm_aeskeygenassist_si128(key_schedule[8], 0x1b));
    key_schedule[10] = aes_keygen_assist(key_schedule[9], _mm_aeskeygenassist_si128(key_schedule[9], 0x36));
}
244
+
245
// AES-128 Expansion
// Deterministic PRG: for each 128-bit seed, produce `length` blocks as
// out[i][j] = AES_fixedkey(seed_i ^ j), using a fixed, public key.
// NOTE(review): with a public fixed key, any security property rests on
// the seeds being secret and uniformly random -- confirm callers only
// pass fresh random seeds.
void aes_128_expand(uint64_t* seeds, uint64_t* output, uint64_t num_seeds, uint64_t length) {
    __m128i* seeds_vec = (__m128i*)seeds;
    __m128i* out_vec = (__m128i*)output;

    // Fixed Key (Arbitrary constant)
    // Using PI fractional part (Nothing-up-my-sleeve numbers)
    // 0x243F6A8885A308D3 (PI_FRAC_1)
    // 0x13198A2E03707344 (PI_FRAC_2)
    __m128i fixed_key = _mm_set_epi64x(0x243F6A8885A308D3, 0x13198A2E03707344);
    __m128i round_keys[11];
    aes_key_expand(fixed_key, round_keys);

    // For each seed (independent streams, so parallel across seeds).
    #pragma omp parallel for schedule(static)
    for(uint64_t i=0; i<num_seeds; ++i) {
        __m128i s = _mm_loadu_si128(&seeds_vec[i]);

        // Expand to 'length' blocks
        for(uint64_t j=0; j<length; ++j) {
            // Block = Seed ^ j
            // Note: j is passed as counter mix
            __m128i ctr = _mm_set_epi64x(0, j);
            __m128i block = _mm_xor_si128(s, ctr);

            // Encrypt Block: initial whitening, 9 full rounds, final round.
            __m128i state = _mm_xor_si128(block, round_keys[0]);
            for(int r=1; r<10; ++r) {
                state = _mm_aesenc_si128(state, round_keys[r]);
            }
            state = _mm_aesenclast_si128(state, round_keys[10]);

            // Store
            // Output is flat: [seed0_0, seed0_1 ... seed1_0 ...]
            _mm_storeu_si128(&out_vec[i * length + j], state);
        }
    }
}
283
+ }