mlxdf 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mlxdf-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,148 @@
1
+ Metadata-Version: 2.4
2
+ Name: mlxdf
3
+ Version: 0.1.0
4
+ Summary: GPU-accelerated DataFrame library for Apple Silicon, built on MLX
5
+ Author: Mocus Zhang
6
+ Author-email: Mocus Zhang <mocusez@outlook.com>
7
+ License-Expression: MIT
8
+ Requires-Dist: mlx>=0.31.1
9
+ Requires-Dist: numpy>=1.24.0
10
+ Requires-Dist: pyarrow>=12.0 ; extra == 'arrow'
11
+ Requires-Python: >=3.11
12
+ Provides-Extra: arrow
13
+ Description-Content-Type: text/markdown
14
+
15
+ # MLX-DF
16
+
17
+ GPU-accelerated DataFrame library for Apple Silicon, built on [MLX](https://github.com/ml-explore/mlx).
18
+
19
+ MLX-DF brings cuDF-style GPU DataFrame operations to Mac, exploiting Apple's unified memory for zero-copy CPU/GPU data sharing. The API mirrors Pandas for easy migration.
20
+
21
+ > [!WARNING]
22
+ > MLX-DF currently supports only Apple Silicon devices (M-series chips).
23
+
24
+ ## Installation
25
+
26
+ ```bash
27
+ pip install mlxdf
28
+ ```
29
+
30
+ Using uv:
31
+
32
+ ```bash
33
+ uv add mlxdf
34
+ ```
35
+
36
+ With PyArrow/Parquet support:
37
+
38
+ ```bash
39
+ pip install mlxdf[arrow]
40
+ ```
41
+
42
+ Using uv:
43
+
44
+ ```bash
45
+ uv add "mlxdf[arrow]"
46
+ ```
47
+
48
+ From source:
49
+
50
+ ```bash
51
+ uv sync
52
+ uv build
53
+ ```
54
+
55
+ ## Quick Start
56
+
57
+ ```python
58
+ from mlxdf import MlxDataFrame, merge, read_parquet
59
+
60
+ # Create a DataFrame (string columns auto-detected as CategoricalSeries)
61
+ df = MlxDataFrame({
62
+ "product_id": [1.0, 2.0, 1.0, 3.0, 2.0],
63
+ "quantity": [5.0, 3.0, 2.0, 7.0, 1.0],
64
+ "category": ["A", "B", "A", "C", "B"],
65
+ })
66
+
67
+ # Filter
68
+ high_qty = df[df["quantity"] > 2.0]
69
+
70
+ # Computed columns
71
+ df["double_qty"] = df["quantity"] * 2
72
+
73
+ # GroupBy aggregation
74
+ result = df.groupby("category")["quantity"].sum()
75
+ result.show()
76
+
77
+ # Join two DataFrames
78
+ prices = MlxDataFrame({
79
+ "product_id": [1.0, 2.0, 3.0],
80
+ "price": [10.0, 25.0, 15.0],
81
+ })
82
+ joined = df.merge(prices, on="product_id", how="inner")
83
+
84
+ # Parquet I/O (requires mlxdf[arrow])
85
+ df.to_parquet("output.parquet")
86
+ df2 = read_parquet("output.parquet", columns=["product_id", "quantity"])
87
+ ```
88
+
89
+ ## Features
90
+
91
+ - **MlxSeries** — Column with boolean null mask, vectorized arithmetic, comparisons, and aggregations
92
+ - **CategoricalSeries** — Dictionary-encoded string column (55× faster filtering vs Pandas)
93
+ - **MlxDataFrame** — Dict-like table with column access, boolean filtering, head/tail/slicing
94
+ - **GroupBy** — Bincount/sort-based groupby with sum/mean/count/max/min aggregations
95
+ - **Join** — Hash-index join supporting inner/left/right/outer (4× faster vs Pandas at 200M rows)
96
+ - **Pandas Interop** — `to_pandas()` / `from_pandas()` with automatic type conversion
97
+ - **PyArrow & Parquet** — Read/write Parquet with column pruning and predicate pushdown
98
+ - **JIT Compilation** — `compile_fn` for fused GPU kernel execution
99
+
100
+ ## Development
101
+
102
+ ### Setup
103
+
104
+ ```bash
105
+ uv sync
106
+ ```
107
+
108
+ ### Running Tests
109
+
110
+ ```bash
111
+ # Run all unit tests (benchmarks are excluded by default)
112
+ uv run pytest
113
+
114
+ # Run a specific test file
115
+ uv run pytest tests/test_series.py
116
+
117
+ # Run a specific test case
118
+ uv run pytest tests/test_series.py::TestArithmetic::test_add_series -v
119
+
120
+ # Run with verbose output
121
+ uv run pytest -v
122
+
123
+ # Run and stop on first failure
124
+ uv run pytest -x
125
+ ```
126
+
127
+ ### Benchmarks
128
+
129
+ Benchmarks are integrated into pytest via the `bench` marker and are deselected by default, so they don't slow down regular test runs.
130
+
131
+ ```bash
132
+ # Run all benchmarks
133
+ uv run pytest -m bench
134
+
135
+ # Run a specific benchmark
136
+ uv run pytest -m bench -k parquet
137
+ uv run pytest -m bench -k tpch
138
+ uv run pytest -m bench -k categorical
139
+ uv run pytest -m bench -k compile
140
+
141
+ # Run both tests and benchmarks together
142
+ uv run pytest -m ""
143
+
144
+ # Run benchmark scripts directly (also works)
145
+ uv run python benchmarks/bench_vs_pandas.py
146
+ ```
147
+
148
+ Available benchmarks: `bench_vs_pandas`, `bench_categorical`, `bench_parquet`, `bench_compile_df`, `bench_tpch_q1/q3/q4/q6/q18/q19`.
mlxdf-0.1.0/README.md ADDED
@@ -0,0 +1,134 @@
1
+ # MLX-DF
2
+
3
+ GPU-accelerated DataFrame library for Apple Silicon, built on [MLX](https://github.com/ml-explore/mlx).
4
+
5
+ MLX-DF brings cuDF-style GPU DataFrame operations to Mac, exploiting Apple's unified memory for zero-copy CPU/GPU data sharing. The API mirrors Pandas for easy migration.
6
+
7
+ > [!WARNING]
8
+ > MLX-DF currently supports only Apple Silicon devices (M-series chips).
9
+
10
+ ## Installation
11
+
12
+ ```bash
13
+ pip install mlxdf
14
+ ```
15
+
16
+ Using uv:
17
+
18
+ ```bash
19
+ uv add mlxdf
20
+ ```
21
+
22
+ With PyArrow/Parquet support:
23
+
24
+ ```bash
25
+ pip install mlxdf[arrow]
26
+ ```
27
+
28
+ Using uv:
29
+
30
+ ```bash
31
+ uv add "mlxdf[arrow]"
32
+ ```
33
+
34
+ From source:
35
+
36
+ ```bash
37
+ uv sync
38
+ uv build
39
+ ```
40
+
41
+ ## Quick Start
42
+
43
+ ```python
44
+ from mlxdf import MlxDataFrame, merge, read_parquet
45
+
46
+ # Create a DataFrame (string columns auto-detected as CategoricalSeries)
47
+ df = MlxDataFrame({
48
+ "product_id": [1.0, 2.0, 1.0, 3.0, 2.0],
49
+ "quantity": [5.0, 3.0, 2.0, 7.0, 1.0],
50
+ "category": ["A", "B", "A", "C", "B"],
51
+ })
52
+
53
+ # Filter
54
+ high_qty = df[df["quantity"] > 2.0]
55
+
56
+ # Computed columns
57
+ df["double_qty"] = df["quantity"] * 2
58
+
59
+ # GroupBy aggregation
60
+ result = df.groupby("category")["quantity"].sum()
61
+ result.show()
62
+
63
+ # Join two DataFrames
64
+ prices = MlxDataFrame({
65
+ "product_id": [1.0, 2.0, 3.0],
66
+ "price": [10.0, 25.0, 15.0],
67
+ })
68
+ joined = df.merge(prices, on="product_id", how="inner")
69
+
70
+ # Parquet I/O (requires mlxdf[arrow])
71
+ df.to_parquet("output.parquet")
72
+ df2 = read_parquet("output.parquet", columns=["product_id", "quantity"])
73
+ ```
74
+
75
+ ## Features
76
+
77
+ - **MlxSeries** — Column with boolean null mask, vectorized arithmetic, comparisons, and aggregations
78
+ - **CategoricalSeries** — Dictionary-encoded string column (55× faster filtering vs Pandas)
79
+ - **MlxDataFrame** — Dict-like table with column access, boolean filtering, head/tail/slicing
80
+ - **GroupBy** — Bincount/sort-based groupby with sum/mean/count/max/min aggregations
81
+ - **Join** — Hash-index join supporting inner/left/right/outer (4× faster vs Pandas at 200M rows)
82
+ - **Pandas Interop** — `to_pandas()` / `from_pandas()` with automatic type conversion
83
+ - **PyArrow & Parquet** — Read/write Parquet with column pruning and predicate pushdown
84
+ - **JIT Compilation** — `compile_fn` for fused GPU kernel execution
85
+
86
+ ## Development
87
+
88
+ ### Setup
89
+
90
+ ```bash
91
+ uv sync
92
+ ```
93
+
94
+ ### Running Tests
95
+
96
+ ```bash
97
+ # Run all unit tests (benchmarks are excluded by default)
98
+ uv run pytest
99
+
100
+ # Run a specific test file
101
+ uv run pytest tests/test_series.py
102
+
103
+ # Run a specific test case
104
+ uv run pytest tests/test_series.py::TestArithmetic::test_add_series -v
105
+
106
+ # Run with verbose output
107
+ uv run pytest -v
108
+
109
+ # Run and stop on first failure
110
+ uv run pytest -x
111
+ ```
112
+
113
+ ### Benchmarks
114
+
115
+ Benchmarks are integrated into pytest via the `bench` marker and are deselected by default, so they don't slow down regular test runs.
116
+
117
+ ```bash
118
+ # Run all benchmarks
119
+ uv run pytest -m bench
120
+
121
+ # Run a specific benchmark
122
+ uv run pytest -m bench -k parquet
123
+ uv run pytest -m bench -k tpch
124
+ uv run pytest -m bench -k categorical
125
+ uv run pytest -m bench -k compile
126
+
127
+ # Run both tests and benchmarks together
128
+ uv run pytest -m ""
129
+
130
+ # Run benchmark scripts directly (also works)
131
+ uv run python benchmarks/bench_vs_pandas.py
132
+ ```
133
+
134
+ Available benchmarks: `bench_vs_pandas`, `bench_categorical`, `bench_parquet`, `bench_compile_df`, `bench_tpch_q1/q3/q4/q6/q18/q19`.
@@ -0,0 +1,38 @@
1
+ [project]
2
+ name = "mlxdf"
3
+ version = "0.1.0"
4
+ description = "GPU-accelerated DataFrame library for Apple Silicon, built on MLX"
5
+ readme = "README.md"
6
+ license = "MIT"
7
+ authors = [
8
+ { name = "Mocus Zhang", email = "mocusez@outlook.com" }
9
+ ]
10
+ requires-python = ">=3.11"
11
+ dependencies = [
12
+ "mlx>=0.31.1",
13
+ "numpy>=1.24.0",
14
+ ]
15
+
16
+ [project.optional-dependencies]
17
+ arrow = ["pyarrow>=12.0"]
18
+
19
+ [dependency-groups]
20
+ dev = [
21
+ "pytest>=8.0",
22
+ "pandas>=2.0",
23
+ "pyarrow>=12.0",
24
+ ]
25
+ bench = [
26
+ "polars==1.39.3",
27
+ "duckdb==1.5.0",
28
+ ]
29
+
30
+ [tool.pytest.ini_options]
31
+ addopts = "-m 'not bench'"
32
+ markers = [
33
+ "bench: benchmark tests (deselected by default, run with: uv run pytest -m bench)",
34
+ ]
35
+
36
+ [build-system]
37
+ requires = ["uv_build>=0.9.18,<0.10.0"]
38
+ build-backend = "uv_build"
@@ -0,0 +1,24 @@
1
+ """MLX-DF: GPU-accelerated DataFrame library for Apple Silicon."""
2
+
3
+ from mlxdf.core.series import MlxSeries
4
+ from mlxdf.core.categorical import CategoricalSeries
5
+ from mlxdf.core.dataframe import MlxDataFrame
6
+ from mlxdf.ops.join import merge
7
+ from mlxdf.compute.compiler import compile_fn, compile_df
8
+
9
+
10
def read_parquet(path, *, columns=None, filters=None):
    """Read a Parquet file into an MlxDataFrame (requires PyArrow).

    The PyArrow-backed implementation is imported lazily so that
    ``import mlxdf`` succeeds even when the optional ``arrow`` extra
    is not installed.
    """
    from mlxdf.io.parquet import read_parquet as _impl

    return _impl(path, columns=columns, filters=filters)
14
+
15
+
16
+ __all__ = [
17
+ "MlxSeries",
18
+ "CategoricalSeries",
19
+ "MlxDataFrame",
20
+ "merge",
21
+ "compile_fn",
22
+ "compile_df",
23
+ "read_parquet",
24
+ ]
File without changes
@@ -0,0 +1,215 @@
1
+ """Compute engine helpers: mx.compile wrapper and forced materialization."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from functools import wraps
6
+
7
+ import mlx.core as mx
8
+
9
+
10
def compile_fn(fn):
    """Return ``fn`` wrapped with mx.compile for fused GPU execution."""
    compiled = mx.compile(fn)
    return compiled
13
+
14
+
15
+ # ---------------------------------------------------------------------------
16
+ # Lightweight proxies for compile_df — these mimic MlxSeries / MlxDataFrame
17
+ # using only raw mx.array values so mx.compile can trace them.
18
+ # ---------------------------------------------------------------------------
19
+
20
+
21
class _SeriesProxy:
    """Proxy for MlxSeries inside an mx.compile context.

    Stores raw ``data`` and ``mask`` arrays and implements arithmetic
    with null-mask propagation identical to MlxSeries._binary_op.

    Mask convention (as used by ``fillna``/``isna`` below): ``True``
    marks a valid element, ``False`` marks a null.
    """

    __slots__ = ("data", "mask", "name")

    def __init__(self, data: mx.array, mask: mx.array, name: str | None = None):
        self.data = data
        self.mask = mask
        self.name = name

    # -- arithmetic (mask = AND of both masks) --

    def _binary_op(self, other, op_fn) -> _SeriesProxy:
        # Series <op> Series: result is null wherever either operand is null.
        if isinstance(other, _SeriesProxy):
            return _SeriesProxy(
                op_fn(self.data, other.data),
                mx.logical_and(self.mask, other.mask),
            )
        # Series <op> scalar: the scalar is cast to the column dtype and the
        # null mask carries over unchanged.
        # NOTE(review): the result drops ``name`` — presumably intentional
        # for derived columns; confirm against MlxSeries._binary_op.
        scalar = mx.array(other, dtype=self.data.dtype)
        return _SeriesProxy(op_fn(self.data, scalar), self.mask)

    def __add__(self, other):
        return self._binary_op(other, mx.add)

    def __radd__(self, other):
        # Addition commutes, so the reflected form reuses _binary_op as-is.
        return self._binary_op(other, mx.add)

    def __sub__(self, other):
        return self._binary_op(other, mx.subtract)

    def __rsub__(self, other):
        # other - self: operand order matters, so the scalar case cannot
        # delegate to _binary_op directly.
        if isinstance(other, _SeriesProxy):
            return other._binary_op(self, mx.subtract)
        return _SeriesProxy(
            mx.subtract(mx.array(other, dtype=self.data.dtype), self.data),
            self.mask,
        )

    def __mul__(self, other):
        return self._binary_op(other, mx.multiply)

    def __rmul__(self, other):
        # Multiplication commutes; same delegation as __radd__.
        return self._binary_op(other, mx.multiply)

    def __truediv__(self, other):
        return self._binary_op(other, mx.divide)

    def __rtruediv__(self, other):
        # other / self: non-commutative, mirror of __rsub__.
        if isinstance(other, _SeriesProxy):
            return other._binary_op(self, mx.divide)
        return _SeriesProxy(
            mx.divide(mx.array(other, dtype=self.data.dtype), self.data),
            self.mask,
        )

    def __neg__(self):
        # Unary negation preserves both the mask and the column name.
        return _SeriesProxy(mx.negative(self.data), self.mask, name=self.name)

    # -- comparison ops --

    def _cmp_op(self, other, op_fn) -> _SeriesProxy:
        # Comparisons propagate nulls like arithmetic: valid only where
        # both operands are valid.
        if isinstance(other, _SeriesProxy):
            return _SeriesProxy(
                op_fn(self.data, other.data),
                mx.logical_and(self.mask, other.mask),
            )
        return _SeriesProxy(op_fn(self.data, mx.array(other)), self.mask)

    def __gt__(self, other):
        return self._cmp_op(other, mx.greater)

    def __ge__(self, other):
        return self._cmp_op(other, mx.greater_equal)

    def __lt__(self, other):
        return self._cmp_op(other, mx.less)

    def __le__(self, other):
        return self._cmp_op(other, mx.less_equal)

    # NOTE: defining __eq__ element-wise makes instances unhashable
    # (Python implicitly sets __hash__ = None) — acceptable for a
    # short-lived tracing proxy.
    def __eq__(self, other):
        return self._cmp_op(other, mx.equal)

    def __ne__(self, other):
        return self._cmp_op(other, mx.not_equal)

    # -- null helpers --

    def fillna(self, value) -> _SeriesProxy:
        """Replace null elements with ``value``; the result has no nulls."""
        fill = mx.array(value, dtype=self.data.dtype) if not isinstance(value, mx.array) else value
        filled = mx.where(self.mask, self.data, fill)
        return _SeriesProxy(filled, mx.ones(self.data.shape, dtype=mx.bool_), name=self.name)

    def isna(self) -> _SeriesProxy:
        """Boolean series that is True where the element is null."""
        return _SeriesProxy(mx.logical_not(self.mask), mx.ones(self.mask.shape, dtype=mx.bool_), name=self.name)

    def notna(self) -> _SeriesProxy:
        """Boolean series that is True where the element is valid."""
        return _SeriesProxy(self.mask, mx.ones(self.mask.shape, dtype=mx.bool_), name=self.name)
123
+
124
+
125
class _DataFrameProxy:
    """Proxy for MlxDataFrame inside an mx.compile context.

    Holds its columns as ``_SeriesProxy`` objects and mirrors the
    ``df[key]`` get/set syntax of the real DataFrame class so user
    code reads naturally while being traced.
    """

    __slots__ = ("_cols",)

    def __init__(self, data_dict: dict[str, mx.array], mask_dict: dict[str, mx.array]):
        # Pair each data array with its mask under the same column name.
        cols: dict[str, _SeriesProxy] = {}
        for key in data_dict:
            cols[key] = _SeriesProxy(data_dict[key], mask_dict[key], name=key)
        self._cols = cols

    def __getitem__(self, key: str) -> _SeriesProxy:
        return self._cols[key]

    def __setitem__(self, key: str, value: _SeriesProxy):
        # Guard clause: only proxy columns may be assigned during tracing.
        if not isinstance(value, _SeriesProxy):
            raise TypeError(
                f"compile_df only supports _SeriesProxy column assignment, got {type(value)}"
            )
        value.name = key
        self._cols[key] = value

    @property
    def columns(self) -> list[str]:
        return [*self._cols]

    def _to_dicts(self) -> tuple[dict[str, mx.array], dict[str, mx.array]]:
        # Flatten back to plain dicts of arrays for the mx.compile boundary.
        return (
            {name: col.data for name, col in self._cols.items()},
            {name: col.mask for name, col in self._cols.items()},
        )
159
+
160
+
161
+ # ---------------------------------------------------------------------------
162
+ # compile_df — DataFrame-aware JIT compilation
163
+ # ---------------------------------------------------------------------------
164
+
165
+
166
def compile_df(fn):
    """JIT-compile a function that operates on :class:`MlxDataFrame`.

    ``fn`` is called with a lightweight proxy supporting the same column
    access (``df['col']``), arithmetic, and ``fillna`` as
    :class:`MlxDataFrame` / :class:`MlxSeries`. The proxy routes raw
    ``mx.array`` values through ``mx.compile`` so all GPU operations are
    fused into a single kernel.

    Example::

        @compile_df
        def compute(df):
            df['wap'] = (df['bid_price'] * df['ask_size']
                         + df['ask_price'] * df['bid_size']) / \\
                        (df['bid_size'] + df['ask_size'])
            df['wap'] = df['wap'].fillna(0.0)
            return df

        result = compute(my_dataframe)  # MlxDataFrame in, MlxDataFrame out
    """
    from mlxdf.core.dataframe import MlxDataFrame
    from mlxdf.core.series import MlxSeries

    def _traced(data_dict, mask_dict):
        # Runs inside mx.compile: wrap the raw arrays in a proxy, let user
        # code transform it, then flatten back to dicts of arrays.
        result = fn(_DataFrameProxy(data_dict, mask_dict))
        return result._to_dicts()

    compiled = mx.compile(_traced)

    @wraps(fn)
    def wrapper(df: MlxDataFrame) -> MlxDataFrame:
        # Flatten the DataFrame, keeping only numeric MlxSeries columns;
        # other column types are not traced and do not appear in the output.
        data_dict: dict[str, mx.array] = {
            name: col.data
            for name, col in df._columns.items()
            if isinstance(col, MlxSeries)
        }
        mask_dict: dict[str, mx.array] = {
            name: df._columns[name].mask for name in data_dict
        }

        out_data, out_mask = compiled(data_dict, mask_dict)

        # Rebuild a real MlxDataFrame from the traced output arrays.
        rebuilt: dict[str, MlxSeries] = {
            name: MlxSeries(arr, mask=out_mask[name], name=name)
            for name, arr in out_data.items()
        }
        return MlxDataFrame(rebuilt)

    return wrapper
@@ -0,0 +1,5 @@
1
+ from mlxdf.core.series import MlxSeries
2
+ from mlxdf.core.categorical import CategoricalSeries
3
+ from mlxdf.core.dataframe import MlxDataFrame
4
+
5
+ __all__ = ["MlxSeries", "CategoricalSeries", "MlxDataFrame"]