PyPI - mplang-nightly - Versions diffs - 0.1.dev158__py3-none-any.whl → 0.1.dev268__py3-none-any.whl - Mend

mplang-nightly 0.1.dev158__py3-none-any.whl → 0.1.dev268__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (191) hide show

mplang/__init__.py +21 -45
mplang/py.typed +13 -0
mplang/v1/__init__.py +157 -0
mplang/v1/_device.py +602 -0
mplang/{analysis → v1/analysis}/__init__.py +1 -1
mplang/{analysis → v1/analysis}/diagram.py +5 -7
mplang/v1/core/__init__.py +157 -0
mplang/{core → v1/core}/cluster.py +30 -14
mplang/{core → v1/core}/comm.py +5 -1
mplang/{core → v1/core}/context_mgr.py +1 -1
mplang/{core/dtype.py → v1/core/dtypes.py} +44 -2
mplang/{core → v1/core}/expr/__init__.py +7 -7
mplang/{core → v1/core}/expr/ast.py +13 -14
mplang/{core → v1/core}/expr/evaluator.py +65 -24
mplang/{core → v1/core}/expr/printer.py +24 -18
mplang/{core → v1/core}/expr/transformer.py +3 -3
mplang/{core → v1/core}/expr/utils.py +2 -2
mplang/{core → v1/core}/expr/visitor.py +1 -1
mplang/{core → v1/core}/expr/walk.py +1 -1
mplang/{core → v1/core}/interp.py +6 -6
mplang/{core → v1/core}/mpir.py +23 -16
mplang/{core → v1/core}/mpobject.py +6 -6
mplang/{core → v1/core}/mptype.py +13 -10
mplang/{core → v1/core}/pfunc.py +4 -4
mplang/{core → v1/core}/primitive.py +106 -201
mplang/{core → v1/core}/table.py +36 -8
mplang/{core → v1/core}/tensor.py +1 -1
mplang/{core → v1/core}/tracer.py +9 -9
mplang/{api.py → v1/host.py} +38 -6
mplang/v1/kernels/__init__.py +41 -0
mplang/{kernels → v1/kernels}/base.py +1 -1
mplang/v1/kernels/basic.py +240 -0
mplang/{kernels → v1/kernels}/context.py +42 -27
mplang/{kernels → v1/kernels}/crypto.py +44 -37
mplang/v1/kernels/fhe.py +858 -0
mplang/{kernels → v1/kernels}/mock_tee.py +12 -13
mplang/{kernels → v1/kernels}/phe.py +263 -57
mplang/{kernels → v1/kernels}/spu.py +137 -48
mplang/{kernels → v1/kernels}/sql_duckdb.py +12 -15
mplang/{kernels → v1/kernels}/stablehlo.py +30 -23
mplang/v1/kernels/value.py +626 -0
mplang/{ops → v1/ops}/__init__.py +5 -16
mplang/{ops → v1/ops}/base.py +2 -5
mplang/{ops/builtin.py → v1/ops/basic.py} +34 -26
mplang/v1/ops/crypto.py +262 -0
mplang/v1/ops/fhe.py +272 -0
mplang/{ops → v1/ops}/jax_cc.py +33 -68
mplang/v1/ops/nnx_cc.py +168 -0
mplang/{ops → v1/ops}/phe.py +16 -4
mplang/{ops → v1/ops}/spu.py +3 -5
mplang/v1/ops/sql_cc.py +303 -0
mplang/{ops → v1/ops}/tee.py +9 -24
mplang/{protos → v1/protos}/v1alpha1/mpir_pb2.pyi +71 -21
mplang/v1/protos/v1alpha1/value_pb2.py +34 -0
mplang/v1/protos/v1alpha1/value_pb2.pyi +169 -0
mplang/{runtime → v1/runtime}/__init__.py +2 -2
mplang/v1/runtime/channel.py +230 -0
mplang/{runtime → v1/runtime}/cli.py +35 -20
mplang/{runtime → v1/runtime}/client.py +19 -8
mplang/{runtime → v1/runtime}/communicator.py +59 -15
mplang/{runtime → v1/runtime}/data_providers.py +80 -19
mplang/{runtime → v1/runtime}/driver.py +30 -12
mplang/v1/runtime/link_comm.py +196 -0
mplang/{runtime → v1/runtime}/server.py +58 -42
mplang/{runtime → v1/runtime}/session.py +57 -71
mplang/{runtime → v1/runtime}/simulation.py +55 -28
mplang/v1/simp/api.py +353 -0
mplang/{simp → v1/simp}/mpi.py +8 -9
mplang/{simp/__init__.py → v1/simp/party.py} +19 -145
mplang/{simp → v1/simp}/random.py +21 -22
mplang/v1/simp/smpc.py +238 -0
mplang/v1/utils/table_utils.py +185 -0
mplang/v2/__init__.py +424 -0
mplang/v2/backends/__init__.py +57 -0
mplang/v2/backends/bfv_impl.py +705 -0
mplang/v2/backends/channel.py +217 -0
mplang/v2/backends/crypto_impl.py +723 -0
mplang/v2/backends/field_impl.py +454 -0
mplang/v2/backends/func_impl.py +107 -0
mplang/v2/backends/phe_impl.py +148 -0
mplang/v2/backends/simp_design.md +136 -0
mplang/v2/backends/simp_driver/__init__.py +41 -0
mplang/v2/backends/simp_driver/http.py +168 -0
mplang/v2/backends/simp_driver/mem.py +280 -0
mplang/v2/backends/simp_driver/ops.py +135 -0
mplang/v2/backends/simp_driver/state.py +60 -0
mplang/v2/backends/simp_driver/values.py +52 -0
mplang/v2/backends/simp_worker/__init__.py +29 -0
mplang/v2/backends/simp_worker/http.py +354 -0
mplang/v2/backends/simp_worker/mem.py +102 -0
mplang/v2/backends/simp_worker/ops.py +167 -0
mplang/v2/backends/simp_worker/state.py +49 -0
mplang/v2/backends/spu_impl.py +275 -0
mplang/v2/backends/spu_state.py +187 -0
mplang/v2/backends/store_impl.py +62 -0
mplang/v2/backends/table_impl.py +838 -0
mplang/v2/backends/tee_impl.py +215 -0
mplang/v2/backends/tensor_impl.py +519 -0
mplang/v2/cli.py +603 -0
mplang/v2/cli_guide.md +122 -0
mplang/v2/dialects/__init__.py +36 -0
mplang/v2/dialects/bfv.py +665 -0
mplang/v2/dialects/crypto.py +689 -0
mplang/v2/dialects/dtypes.py +378 -0
mplang/v2/dialects/field.py +210 -0
mplang/v2/dialects/func.py +135 -0
mplang/v2/dialects/phe.py +723 -0
mplang/v2/dialects/simp.py +944 -0
mplang/v2/dialects/spu.py +349 -0
mplang/v2/dialects/store.py +63 -0
mplang/v2/dialects/table.py +407 -0
mplang/v2/dialects/tee.py +346 -0
mplang/v2/dialects/tensor.py +1175 -0
mplang/v2/edsl/README.md +279 -0
mplang/v2/edsl/__init__.py +99 -0
mplang/v2/edsl/context.py +311 -0
mplang/v2/edsl/graph.py +463 -0
mplang/v2/edsl/jit.py +62 -0
mplang/v2/edsl/object.py +53 -0
mplang/v2/edsl/primitive.py +284 -0
mplang/v2/edsl/printer.py +119 -0
mplang/v2/edsl/registry.py +207 -0
mplang/v2/edsl/serde.py +375 -0
mplang/v2/edsl/tracer.py +614 -0
mplang/v2/edsl/typing.py +816 -0
mplang/v2/kernels/Makefile +30 -0
mplang/v2/kernels/__init__.py +23 -0
mplang/v2/kernels/gf128.cpp +148 -0
mplang/v2/kernels/ldpc.cpp +82 -0
mplang/v2/kernels/okvs.cpp +283 -0
mplang/v2/kernels/okvs_opt.cpp +291 -0
mplang/v2/kernels/py_kernels.py +398 -0
mplang/v2/libs/collective.py +330 -0
mplang/v2/libs/device/__init__.py +51 -0
mplang/v2/libs/device/api.py +813 -0
mplang/v2/libs/device/cluster.py +352 -0
mplang/v2/libs/ml/__init__.py +23 -0
mplang/v2/libs/ml/sgb.py +1861 -0
mplang/v2/libs/mpc/__init__.py +41 -0
mplang/v2/libs/mpc/_utils.py +99 -0
mplang/v2/libs/mpc/analytics/__init__.py +35 -0
mplang/v2/libs/mpc/analytics/aggregation.py +372 -0
mplang/v2/libs/mpc/analytics/groupby.md +99 -0
mplang/v2/libs/mpc/analytics/groupby.py +331 -0
mplang/v2/libs/mpc/analytics/permutation.py +386 -0
mplang/v2/libs/mpc/common/constants.py +39 -0
mplang/v2/libs/mpc/ot/__init__.py +32 -0
mplang/v2/libs/mpc/ot/base.py +222 -0
mplang/v2/libs/mpc/ot/extension.py +477 -0
mplang/v2/libs/mpc/ot/silent.py +217 -0
mplang/v2/libs/mpc/psi/__init__.py +40 -0
mplang/v2/libs/mpc/psi/cuckoo.py +228 -0
mplang/v2/libs/mpc/psi/okvs.py +49 -0
mplang/v2/libs/mpc/psi/okvs_gct.py +79 -0
mplang/v2/libs/mpc/psi/oprf.py +310 -0
mplang/v2/libs/mpc/psi/rr22.py +344 -0
mplang/v2/libs/mpc/psi/unbalanced.py +200 -0
mplang/v2/libs/mpc/vole/__init__.py +31 -0
mplang/v2/libs/mpc/vole/gilboa.py +327 -0
mplang/v2/libs/mpc/vole/ldpc.py +383 -0
mplang/v2/libs/mpc/vole/silver.py +336 -0
mplang/v2/runtime/__init__.py +15 -0
mplang/v2/runtime/dialect_state.py +41 -0
mplang/v2/runtime/interpreter.py +871 -0
mplang/v2/runtime/object_store.py +194 -0
mplang/v2/runtime/value.py +141 -0
{mplang_nightly-0.1.dev158.dist-info → mplang_nightly-0.1.dev268.dist-info}/METADATA +24 -17
mplang_nightly-0.1.dev268.dist-info/RECORD +180 -0
{mplang_nightly-0.1.dev158.dist-info → mplang_nightly-0.1.dev268.dist-info}/WHEEL +1 -1
mplang/core/__init__.py +0 -92
mplang/device.py +0 -340
mplang/kernels/builtin.py +0 -207
mplang/ops/crypto.py +0 -109
mplang/ops/ibis_cc.py +0 -139
mplang/ops/sql.py +0 -61
mplang/protos/v1alpha1/mpir_pb2_grpc.py +0 -3
mplang/runtime/link_comm.py +0 -131
mplang/simp/smpc.py +0 -201
mplang/utils/table_utils.py +0 -73
mplang_nightly-0.1.dev158.dist-info/RECORD +0 -77
/mplang/{core → v1/core}/mask.py +0 -0
/mplang/{protos → v1/protos}/v1alpha1/mpir_pb2.py +0 -0
/mplang/{runtime → v1/runtime}/exceptions.py +0 -0
/mplang/{runtime → v1/runtime}/http_api.md +0 -0
/mplang/{kernels → v1/simp}/__init__.py +0 -0
/mplang/{utils → v1/utils}/__init__.py +0 -0
/mplang/{utils → v1/utils}/crypto.py +0 -0
/mplang/{utils → v1/utils}/func_utils.py +0 -0
/mplang/{utils → v1/utils}/spu_utils.py +0 -0
{mplang_nightly-0.1.dev158.dist-info → mplang_nightly-0.1.dev268.dist-info}/entry_points.txt +0 -0
{mplang_nightly-0.1.dev158.dist-info → mplang_nightly-0.1.dev268.dist-info}/licenses/LICENSE +0 -0

mplang/v2/kernels/okvs_opt.cpp ADDED Viewed

@@ -0,0 +1,291 @@
+/*
+ * Copyright 2025 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cstdint>
+#include <vector>
+#include <stack>
+#include <random>
+#include <immintrin.h>
+#include <cstring>
+#include <cstdio>
+#include <iostream>
+#include <omp.h>
+#include <atomic>
+extern "C" {
+    // Number of Bins for Mega-Binning strategy.
+    // 1024 bins implies ~1000 items per bin for N=1M, fitting the working set
+    // entirely in L1 cache (32KB/48KB) for maximum performance.
+    static const uint64_t NUM_BINS = 1024;
+    // The three slot positions assigned to one key inside its bin, as
+    // returned by get_bin_local_indices().
+    struct Indices {
+        uint64_t h1, h2, h3;
+    };
+    // Stateless bin selection.
+    // Maps a key to a bin index in [0, NUM_BINS) using only the key and the
+    // seed, so the same key always lands in the same bin.
+    inline uint64_t get_bin_index(uint64_t key, __m128i seed) {
+        // Key goes in the low 64-bit lane; the high lane starts at zero.
+        __m128i state = _mm_set_epi64x(0, key);
+        // Two AES encryption rounds keyed by the seed act as the mixer.
+        for (int round = 0; round < 2; ++round) {
+            state = _mm_aesenc_si128(state, seed);
+        }
+        const uint64_t low_lane = _mm_extract_epi64(state, 0);
+        return low_lane % NUM_BINS;
+    }
+    // Derive the three slot positions of `key` inside a bin of m_local slots.
+    // Every position is in [0, m_local); m_local must be non-zero because the
+    // positions are obtained with `% m_local`.
+    inline Indices get_bin_local_indices(uint64_t key, uint64_t m_local, __m128i seed) {
+        // Multiplier / increment of the 64-bit LCG used to stretch one hash
+        // word into three position candidates.
+        const uint64_t lcg_mul = 6364136223846793005ULL;
+        const uint64_t lcg_inc = 1442695040888963407ULL;
+        // Offset the seed so these hashes are decorrelated from bin selection.
+        const __m128i local_seed = _mm_add_epi64(seed, _mm_set_epi64x(1, 1));
+        __m128i state = _mm_set_epi64x(0, key);
+        for (int round = 0; round < 3; ++round) {
+            state = _mm_aesenc_si128(state, local_seed);
+        }
+        uint64_t r = _mm_extract_epi64(state, 0);
+        Indices out;
+        out.h1 = r % m_local;
+        r = r * lcg_mul + lcg_inc;
+        out.h2 = r % m_local;
+        r = r * lcg_mul + lcg_inc;
+        out.h3 = r % m_local;
+        // Nudge colliding positions forward (with wrap-around). h3 may need
+        // two steps because it has to avoid both h1 and h2.
+        if (out.h2 == out.h1) {
+            out.h2 = (out.h2 + 1) % m_local;
+        }
+        for (int step = 0; step < 2 && (out.h3 == out.h1 || out.h3 == out.h2); ++step) {
+            out.h3 = (out.h3 + 1) % m_local;
+        }
+        return out;
+    }
+    // Core peeling solver for a single bin.
+    //
+    // Each key is a row touching three slots (columns) of P_local. The solver
+    // repeatedly picks a column touched by exactly one remaining row, assigns
+    // that column to the row, and removes both. If every row gets a column,
+    // the assignments are replayed in reverse to fill P_local so that
+    // P[h1] ^ P[h2] ^ P[h3] == vals[i] for every key i.
+    //
+    //   keys / vals : the bin's keys and their 128-bit values (same length).
+    //   P_local     : the bin's m slots. Slots are read before being written,
+    //                 so their incoming contents matter; solve_okvs_opt zeroes
+    //                 the whole output buffer before calling.
+    //   m           : number of slots in this bin.
+    //   seed        : hash seed forwarded to get_bin_local_indices().
+    //
+    // Returns true on success (including an empty bin) and false when peeling
+    // gets stuck or the bin has keys but no slots. P_local is not modified on
+    // failure.
+    bool solve_bin(
+        const std::vector<uint64_t>& keys,
+        const std::vector<__m128i>& vals,
+        __m128i* P_local,
+        uint64_t m,
+        __m128i seed
+    ) {
+        uint64_t n = keys.size();
+        if (n == 0) return true;
+        // A non-empty bin without slots cannot be solved; bail out before the
+        // index hash evaluates `r % m` with m == 0.
+        if (m == 0) return false;
+        struct Edge {
+            uint64_t h1, h2, h3;
+            uint64_t key_idx;
+        };
+        std::vector<Edge> edges(n);
+        std::vector<int> col_degree(m, 0);
+        // 1. Build Local Graph
+        for(uint64_t i=0; i<n; ++i) {
+             Indices idx = get_bin_local_indices(keys[i], m, seed);
+             edges[i] = {idx.h1, idx.h2, idx.h3, i};
+             col_degree[idx.h1]++;
+             col_degree[idx.h2]++;
+             col_degree[idx.h3]++;
+        }
+        // 2. CSR Construction: flat_rows[col_start[j] .. col_start[j+1]) lists
+        //    the rows touching column j.
+        std::vector<int> col_start(m + 1, 0);
+        for(uint64_t j=0; j<m; ++j) {
+            col_start[j+1] = col_start[j] + col_degree[j];
+        }
+        std::vector<int> flat_rows(n * 3);
+        std::vector<int> fill_ptr = col_start;
+        for(uint64_t i=0; i<n; ++i) {
+            flat_rows[fill_ptr[edges[i].h1]++] = i;
+            flat_rows[fill_ptr[edges[i].h2]++] = i;
+            flat_rows[fill_ptr[edges[i].h3]++] = i;
+        }
+        // 3. Peeling Process: peel_stack is consumed front to back through
+        //    `head`, and grows as neighbouring columns drop to degree 1.
+        std::vector<int> peel_stack;
+        peel_stack.reserve(m);
+        for(uint64_t j=0; j<m; ++j) {
+            if(col_degree[j] == 1) peel_stack.push_back(j);
+        }
+        std::vector<bool> row_removed(n, false);
+        std::vector<bool> col_removed(m, false);
+        struct Assignment {
+            int col;
+            int row_idx;
+        };
+        std::vector<Assignment> assignment_stack;
+        assignment_stack.reserve(n);
+        size_t head = 0;
+        while(head < peel_stack.size()) {
+            int j = peel_stack[head++];
+            if(col_removed[j]) continue;
+            // Find the one row still attached to this column.
+            int owner_row = -1;
+            for(int k=col_start[j]; k<col_start[j+1]; ++k) {
+                int r = flat_rows[k];
+                if(!row_removed[r]) {
+                    owner_row = r;
+                    break;
+                }
+            }
+            if(owner_row == -1) {
+                col_removed[j] = true;
+                continue;
+            }
+            assignment_stack.push_back({j, owner_row});
+            col_removed[j] = true;
+            row_removed[owner_row] = true;
+            // Removing the row lowers the degree of its other two columns.
+            const auto& e = edges[owner_row];
+            uint64_t nbs[3] = {e.h1, e.h2, e.h3};
+            for(uint64_t nb : nbs) {
+                if(nb == (uint64_t)j) continue;
+                if(col_removed[nb]) continue;
+                col_degree[nb]--;
+                if(col_degree[nb] == 1) peel_stack.push_back((int)nb);
+            }
+        }
+        // Rows left over mean peeling got stuck.
+        if(assignment_stack.size() != n) return false;
+        // 4. Back-Substitution in reverse peel order: the assigned slot takes
+        //    whatever value makes the row's three slots XOR to the target.
+        for(int i=(int)assignment_stack.size()-1; i>=0; --i) {
+            auto a = assignment_stack[i];
+            const auto& e = edges[a.row_idx];
+            __m128i val1 = _mm_loadu_si128(&P_local[e.h1]);
+            __m128i val2 = _mm_loadu_si128(&P_local[e.h2]);
+            __m128i val3 = _mm_loadu_si128(&P_local[e.h3]);
+            __m128i target = vals[e.key_idx];
+            __m128i current = _mm_xor_si128(_mm_xor_si128(val1, val2), val3);
+            __m128i diff = _mm_xor_si128(target, current);
+            _mm_storeu_si128(&P_local[a.col], diff);
+        }
+        return true;
+    }
+    // Encode n (key, value) pairs into an OKVS table of m 128-bit slots.
+    //
+    //   keys     : n 64-bit keys.
+    //   values   : n 128-bit values, read as consecutive 16-byte blocks.
+    //   output   : receives m 16-byte slots; it is zeroed first, and slots no
+    //              key is assigned to stay zero.
+    //   seed_ptr : 16 bytes of hash seed.
+    //
+    // The table is split into NUM_BINS contiguous bins and each bin is solved
+    // independently. A bin that cannot be solved is reported on stderr and its
+    // slots are left zero; the function itself returns nothing, so callers
+    // cannot detect the failure from the return value.
+    void solve_okvs_opt(uint64_t* keys, uint64_t* values, uint64_t* output, uint64_t n, uint64_t m, uint64_t* seed_ptr) {
+        __m128i seed = _mm_loadu_si128((__m128i*)seed_ptr);
+        // 1. Calculate Bin Boundaries
+        // We divide M evenly among bins. The remainder is distributed to the first few bins.
+        std::vector<uint64_t> bin_offsets(NUM_BINS + 1);
+        std::vector<uint64_t> m_per_bin(NUM_BINS);
+        uint64_t base_m = m / NUM_BINS;
+        uint64_t remainder = m % NUM_BINS;
+        uint64_t current_offset = 0;
+        for(uint64_t b=0; b<NUM_BINS; ++b) {
+            bin_offsets[b] = current_offset;
+            m_per_bin[b] = base_m + (b < remainder ? 1 : 0);
+            current_offset += m_per_bin[b];
+        }
+        bin_offsets[NUM_BINS] = m;
+        // 2. Partition Data (Stateless)
+        // Note on "Two-Choice Hashing":
+        // While Two-Choice Hashing (selecting the lighter of 2 potential bins) would significantly
+        // reduce max bin load variance, it introduces "Statefulness".
+        // The bin assignment for Key K would depend on the load of bins, which depends on other keys.
+        // In standard PSI protocols (like RR22), the Decode step must be capable of processing keys
+        // independently or without knowledge of the full set distribution (Sender/Receiver separation).
+        // Therefore, we use **Simple Binning** (Stateless Hash) where Bin(K) = H(K) % Bins.
+        // We mitigate the resulting variance ("Balls-in-Bins" problem) by using a slightly larger
+        // expansion factor (epsilon ~ 1.35) which is bandwidth-acceptable and ensures stability.
+        std::vector<std::vector<uint64_t>> bin_keys(NUM_BINS);
+        std::vector<std::vector<__m128i>> bin_vals(NUM_BINS);
+        // Pre-allocate to reduce reallocation overhead (assume ~uniform distribution)
+        // 1.5x margin for pre-allocation safety
+        size_t est_size = (n / NUM_BINS) * 3 / 2;
+        for(uint64_t b=0; b<NUM_BINS; ++b) {
+            bin_keys[b].reserve(est_size);
+            bin_vals[b].reserve(est_size);
+        }
+        const __m128i* V_ptr = (const __m128i*)values;
+        for(uint64_t i=0; i<n; ++i) {
+            uint64_t b = get_bin_index(keys[i], seed);
+            bin_keys[b].push_back(keys[i]);
+            bin_vals[b].push_back(_mm_loadu_si128(&V_ptr[i]));
+        }
+        // 3. Parallel Solve
+        // Each bin is solved independently. This logic is perfectly parallelizable (embarrassingly parallel).
+        // The working set for each bin (~1000 items) stays hot in L1 Cache.
+        memset(output, 0, m * 16);
+        __m128i* P_vec = (__m128i*)output;
+        #pragma omp parallel for schedule(dynamic)
+        for(uint64_t b=0; b<NUM_BINS; ++b) {
+            if(bin_keys[b].empty()) continue;
+            uint64_t offset = bin_offsets[b];
+            uint64_t valid_m = m_per_bin[b];
+            // A bin that holds keys but has no slots (m < NUM_BINS) can never
+            // be solved; report it without calling the solver, which would
+            // otherwise take a modulus by zero.
+            bool solved = valid_m != 0 &&
+                solve_bin(bin_keys[b], bin_vals[b], &P_vec[offset], valid_m, seed);
+            if(!solved) {
+                #pragma omp critical
+                {
+                    // Cast to unsigned long long so the format matches the
+                    // argument width on every platform.
+                    fprintf(stderr, "[ERROR] Bin %llu failed OKVS peeling. Items: %llu / M: %llu (Ratio: %.2f). Try increasing expansion factor.\n",
+                        (unsigned long long)b, (unsigned long long)bin_keys[b].size(), (unsigned long long)valid_m, (double)valid_m / bin_keys[b].size());
+                }
+            }
+        }
+    }
+    // Look up n keys in an OKVS table produced by solve_okvs_opt.
+    //
+    //   keys     : n 64-bit keys to query.
+    //   storage  : the table, m 16-byte slots.
+    //   output   : receives n 16-byte results, one per key.
+    //   seed_ptr : 16 bytes of hash seed; must equal the seed used to encode.
+    //
+    // Each result is the XOR of the key's three slots inside its bin. The bin
+    // layout is recomputed from m exactly as in solve_okvs_opt, so m must also
+    // match the value used when encoding.
+    void decode_okvs_opt(uint64_t* keys, uint64_t* storage, uint64_t* output, uint64_t n, uint64_t m, uint64_t* seed_ptr) {
+        __m128i seed = _mm_loadu_si128((__m128i*)seed_ptr);
+        __m128i* P_vec = (__m128i*)storage;
+        __m128i* out_vec = (__m128i*)output;
+        // Replicate Boundary Logic
+        std::vector<uint64_t> bin_offsets(NUM_BINS);
+        std::vector<uint64_t> m_per_bin(NUM_BINS);
+        uint64_t base_m = m / NUM_BINS;
+        uint64_t remainder = m % NUM_BINS;
+        uint64_t current_offset = 0;
+        for(uint64_t b=0; b<NUM_BINS; ++b) {
+            bin_offsets[b] = current_offset;
+            m_per_bin[b] = base_m + (b < remainder ? 1 : 0);
+            current_offset += m_per_bin[b];
+        }
+        // Parallel Stateless Decode
+        #pragma omp parallel for schedule(static)
+        for(uint64_t i=0; i<n; ++i) {
+            uint64_t b = get_bin_index(keys[i], seed);
+            uint64_t m_local = m_per_bin[b];
+            uint64_t offset = bin_offsets[b];
+            if(m_local == 0) {
+                // The bin has no slots (m < NUM_BINS): there is nothing to
+                // read and the index hash would divide by zero. Emit zero,
+                // which is what an unsolved bin holds after encoding.
+                _mm_storeu_si128(&out_vec[i], _mm_setzero_si128());
+                continue;
+            }
+            Indices idx = get_bin_local_indices(keys[i], m_local, seed);
+            __m128i val = _mm_xor_si128(
+                _mm_xor_si128(_mm_loadu_si128(&P_vec[offset + idx.h1]), _mm_loadu_si128(&P_vec[offset + idx.h2])),
+                _mm_loadu_si128(&P_vec[offset + idx.h3])
+            );
+            _mm_storeu_si128(&out_vec[i], val);
+        }
+    }
+}

mplang/v2/kernels/py_kernels.py ADDED Viewed

@@ -0,0 +1,398 @@
+# Copyright 2025 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Pure Python implementations of performance-critical kernels.
+These implementations provide fallback functionality when native C++ kernels
+(libmplang_kernels.so) are not available. They are functionally correct but
+significantly slower than the optimized C++ versions.
+"""
+from __future__ import annotations
+import numpy as np
+from mplang.v2.libs.mpc.common.constants import (
+    GOLDEN_RATIO_64,
+    SPLITMIX64_GAMMA_2,
+    SPLITMIX64_GAMMA_3,
+    SPLITMIX64_GAMMA_4,
+)
+# =============================================================================
+# GF(2^128) Arithmetic
+# =============================================================================
+# Irreducible polynomial: P(x) = x^128 + x^7 + x^2 + x + 1
+# In polynomial basis, this means x^128 = x^7 + x^2 + x + 1 (mod P)
+# Only the low-order terms are stored here; the x^128 term is implicit.
+_GF128_POLYNOMIAL = 0x87  # x^7 + x^2 + x + 1 = 0b10000111 = 135
+def _gf128_clmul64(a: int, b: int) -> tuple[int, int]:
+    """Carryless multiplication of two 64-bit integers.
+    Returns (lo, hi) where result = hi * 2^64 + lo.
+    """
+    result_lo = 0
+    result_hi = 0
+    for i in range(64):
+        if (b >> i) & 1:
+            # Add a shifted by i positions
+            shifted_lo = (a << i) & ((1 << 64) - 1)
+            shifted_hi = a >> (64 - i) if i > 0 else 0
+            result_lo ^= shifted_lo
+            result_hi ^= shifted_hi
+    return result_lo, result_hi
+def _gf128_clmul128(
+    a_lo: int, a_hi: int, b_lo: int, b_hi: int
+) -> tuple[int, int, int, int]:
+    """Carryless multiplication of two 128-bit values.
+    Returns (r0, r1, r2, r3) where result = r3 * 2^192 + r2 * 2^128 + r1 * 2^64 + r0.
+    """
+    # a_lo * b_lo -> [0:128]
+    t0_lo, t0_hi = _gf128_clmul64(a_lo, b_lo)
+    # a_hi * b_hi -> [128:256]
+    t1_lo, t1_hi = _gf128_clmul64(a_hi, b_hi)
+    # a_lo * b_hi -> [64:192]
+    t2_lo, t2_hi = _gf128_clmul64(a_lo, b_hi)
+    # a_hi * b_lo -> [64:192]
+    t3_lo, t3_hi = _gf128_clmul64(a_hi, b_lo)
+    # Combine cross terms
+    mid_lo = t2_lo ^ t3_lo
+    mid_hi = t2_hi ^ t3_hi
+    # Result accumulation
+    r0 = t0_lo
+    r1 = t0_hi ^ mid_lo
+    r2 = t1_lo ^ mid_hi
+    r3 = t1_hi
+    # Handle carry from r1 to r2 (carryless, just XOR overflow)
+    # In carryless arithmetic, there's no carry propagation
+    return r0, r1, r2, r3
+def _gf128_reduce(r0: int, r1: int, r2: int, r3: int) -> tuple[int, int]:
+    """Reduce 256-bit polynomial modulo P(x) = x^128 + x^7 + x^2 + x + 1.
+    Returns (lo, hi) representing the 128-bit result.
+    """
+    # Reduction: x^128 = x^7 + x^2 + x + 1 (mod P)
+    # So we need to reduce r2 and r3 into r0 and r1
+    # r3 contributes at positions [192:256], which after reduction affects [64:128] and [0:64]
+    # r2 contributes at positions [128:192], which after reduction affects [0:64]
+    # First, reduce r3 (bits 192-255)
+    # x^192 = x^64 * x^128 = x^64 * (x^7 + x^2 + x + 1)
+    #       = x^71 + x^66 + x^65 + x^64
+    # x^256 is beyond our range, but r3 represents bits [192:256]
+    # For each bit position p in [192:255] that is set:
+    # x^p = x^(p-128) * x^128 = x^(p-128) * 0x87
+    # This means bit at position p reduces to XOR with 0x87 shifted by (p-128)
+    # Simpler approach: reduce in two stages
+    # Stage 1: Reduce r3 (affects r1 and r0 after multiple reductions)
+    # r3 * x^192 mod P = r3 * x^64 * (x^7 + x^2 + x + 1)
+    q3_lo, q3_hi = _gf128_clmul64(r3, _GF128_POLYNOMIAL)
+    # This gives us bits at [64+0:64+128] = [64:192]
+    # So it affects r1 and r2
+    r1 ^= q3_lo
+    r2 ^= q3_hi
+    # Stage 2: Reduce r2 (affects r0 and r1)
+    # r2 * x^128 mod P = r2 * 0x87
+    q2_lo, q2_hi = _gf128_clmul64(r2, _GF128_POLYNOMIAL)
+    # This gives bits at [0:128]
+    r0 ^= q2_lo
+    r1 ^= q2_hi
+    return r0, r1
+def gf128_mul_single(a: np.ndarray, b: np.ndarray) -> np.ndarray:
+    """Multiply two GF(2^128) elements.
+    Args:
+        a: Shape (2,) uint64 array representing a 128-bit element [lo, hi]
+        b: Shape (2,) uint64 array representing a 128-bit element [lo, hi]
+    Returns:
+        Shape (2,) uint64 array representing the product
+    """
+    a_lo, a_hi = int(a[0]), int(a[1])
+    b_lo, b_hi = int(b[0]), int(b[1])
+    r0, r1, r2, r3 = _gf128_clmul128(a_lo, a_hi, b_lo, b_hi)
+    res_lo, res_hi = _gf128_reduce(r0, r1, r2, r3)
+    return np.array(
+        [res_lo & ((1 << 64) - 1), res_hi & ((1 << 64) - 1)], dtype=np.uint64
+    )
+def gf128_mul_batch(a: np.ndarray, b: np.ndarray) -> np.ndarray:
+    """Batch multiply GF(2^128) elements.
+    Args:
+        a: Shape (..., 2) uint64 array
+        b: Shape (..., 2) uint64 array
+    Returns:
+        Shape (..., 2) uint64 array of products
+    """
+    original_shape = a.shape
+    a_flat = a.reshape(-1, 2)
+    b_flat = b.reshape(-1, 2)
+    n = a_flat.shape[0]
+    result = np.zeros_like(a_flat)
+    for i in range(n):
+        result[i] = gf128_mul_single(a_flat[i], b_flat[i])
+    return result.reshape(original_shape)
+# =============================================================================
+# OKVS (Oblivious Key-Value Store) - 3-Hash Garbled Cuckoo Table
+# =============================================================================
+def _hash_key_py(key: int, m: int, seed: tuple[int, int]) -> tuple[int, int, int]:
+    """Hash a key to 3 distinct indices using simple polynomial hashing.
+    This is a pure Python approximation of the AES-based hash in C++.
+    For compatibility, we use a deterministic hash based on the key.
+    """
+    # Simple polynomial hash (not as secure as AES, but deterministic)
+    s0, s1 = seed
+    # Mix key with seed
+    h1 = ((key * GOLDEN_RATIO_64) ^ s0) & ((1 << 64) - 1)
+    h2 = ((key * SPLITMIX64_GAMMA_2) ^ s1) & ((1 << 64) - 1)
+    # Additional mixing
+    h1 = ((h1 ^ (h1 >> 33)) * SPLITMIX64_GAMMA_3) & ((1 << 64) - 1)
+    h2 = ((h2 ^ (h2 >> 33)) * SPLITMIX64_GAMMA_4) & ((1 << 64) - 1)
+    idx1 = h1 % m
+    idx2 = h2 % m
+    idx3 = (h1 ^ h2) % m
+    # Enforce distinct indices
+    if idx2 == idx1:
+        idx2 = (idx2 + 1) % m
+    if idx3 == idx1 or idx3 == idx2:
+        idx3 = (idx3 + 1) % m
+        if idx3 == idx1 or idx3 == idx2:
+            idx3 = (idx3 + 1) % m
+    return int(idx1), int(idx2), int(idx3)
+def okvs_solve(
+    keys: np.ndarray,
+    values: np.ndarray,
+    m: int,
+    seed: tuple[int, int] = (0xDEADBEEF, 0xCAFEBABE),
+) -> np.ndarray:
+    """Solve the OKVS system using peeling algorithm.
+    Args:
+        keys: Shape (n,) uint64 array of keys
+        values: Shape (n, 2) uint64 array of values (128-bit each)
+        m: Size of output storage
+    Returns:
+        Shape (m, 2) uint64 array representing the OKVS storage
+    """
+    n = len(keys)
+    # Build graph: for each row, compute its 3 column indices
+    rows = []
+    col_to_rows: dict[int, list[int]] = {j: [] for j in range(m)}
+    for i in range(n):
+        h1, h2, h3 = _hash_key_py(int(keys[i]), m, seed)
+        rows.append((h1, h2, h3))
+        col_to_rows[h1].append(i)
+        col_to_rows[h2].append(i)
+        col_to_rows[h3].append(i)
+    # Compute column degrees
+    col_degree = [len(col_to_rows[j]) for j in range(m)]
+    # Initialize peel queue with degree-1 columns
+    peel_queue = [j for j in range(m) if col_degree[j] == 1]
+    row_removed = [False] * n
+    col_removed = [False] * m
+    assignment_stack: list[tuple[int, int]] = []  # (col, row)
+    head = 0
+    while head < len(peel_queue):
+        j = peel_queue[head]
+        head += 1
+        if col_removed[j]:
+            continue
+        # Find the single active row for this column
+        owner_row = -1
+        for r_idx in col_to_rows[j]:
+            if not row_removed[r_idx]:
+                owner_row = r_idx
+                break
+        if owner_row == -1:
+            col_removed[j] = True
+            continue
+        # Peel this (column, row) pair
+        assignment_stack.append((j, owner_row))
+        col_removed[j] = True
+        row_removed[owner_row] = True
+        # Update neighbor column degrees
+        h1, h2, h3 = rows[owner_row]
+        for neighbor in (h1, h2, h3):
+            if neighbor == j or col_removed[neighbor]:
+                continue
+            col_degree[neighbor] -= 1
+            if col_degree[neighbor] == 1:
+                peel_queue.append(neighbor)
+    if len(assignment_stack) != n:
+        raise RuntimeError(
+            f"OKVS core detected. Failed to peel all rows. "
+            f"n={n}, m={m}, solved={len(assignment_stack)}"
+        )
+    # Back substitution (solve in reverse order)
+    output = np.zeros((m, 2), dtype=np.uint64)
+    for col, row in reversed(assignment_stack):
+        h1, h2, h3 = rows[row]
+        # Current sum of columns in this row
+        current_sum = output[h1] ^ output[h2] ^ output[h3]
+        # Compute value needed for col to make sum equal target
+        target = values[row]
+        diff = target ^ current_sum
+        output[col] = diff
+    return output
+def okvs_decode(
+    keys: np.ndarray,
+    storage: np.ndarray,
+    m: int,
+    seed: tuple[int, int] = (0xDEADBEEF, 0xCAFEBABE),
+) -> np.ndarray:
+    """Decode values from OKVS storage.
+    Args:
+        keys: Shape (n,) uint64 array of keys to query
+        storage: Shape (m, 2) uint64 array (the solved OKVS)
+        m: Size of storage
+    Returns:
+        Shape (n, 2) uint64 array of decoded values
+    """
+    n = len(keys)
+    output = np.zeros((n, 2), dtype=np.uint64)
+    for i in range(n):
+        h1, h2, h3 = _hash_key_py(int(keys[i]), m, seed)
+        output[i] = storage[h1] ^ storage[h2] ^ storage[h3]
+    return output
+# =============================================================================
+# AES-128 Expansion (PRG Fallback)
+# =============================================================================
+def aes_expand(seeds: np.ndarray, length: int) -> np.ndarray:
+    """Expand seeds to pseudorandom sequence.
+    This is a fallback using NumPy's PRNG instead of AES-NI.
+    Args:
+        seeds: Shape (num_seeds, 2) uint64 array of 128-bit seeds
+        length: Number of 128-bit blocks to generate per seed
+    Returns:
+        Shape (num_seeds, length, 2) uint64 array
+    """
+    num_seeds = seeds.shape[0]
+    output = np.zeros((num_seeds, length, 2), dtype=np.uint64)
+    for i in range(num_seeds):
+        seed_val = [int(seeds[i, 0]), int(seeds[i, 1])]
+        rng = np.random.default_rng(seed_val)
+        output[i] = rng.integers(
+            0, 0xFFFFFFFFFFFFFFFF, size=(length, 2), dtype=np.uint64
+        )
+    return output
+# =============================================================================
+# LDPC Encoding (Sparse)
+# =============================================================================
+def ldpc_encode(
+    message: np.ndarray, h_indices: np.ndarray, h_indptr: np.ndarray, m: int
+) -> np.ndarray:
+    """Compute syndrome S = H @ message using sparse CSR representation.
+    This is the fallback when C++ kernel is not available.
+    Args:
+        message: (N, 2) uint64 message vector
+        h_indices: CSR indices array for H
+        h_indptr: CSR indptr array for H (length m+1)
+        m: Number of rows in H (syndrome length)
+    Returns:
+        (m, 2) uint64 syndrome vector
+    """
+    syndrome = np.zeros((m, 2), dtype=np.uint64)
+    for i in range(m):
+        # Get column indices for row i
+        start, end = int(h_indptr[i]), int(h_indptr[i + 1])
+        cols = h_indices[start:end]
+        # XOR all selected message elements
+        for j in cols:
+            syndrome[i] ^= message[int(j)]
+    return syndrome

mplang-nightly 0.1.dev158__py3-none-any.whl → 0.1.dev268__py3-none-any.whl

mplang-nightly 0.1.dev158__py3-none-any.whl → 0.1.dev268__py3-none-any.whl