pyframe-xpy 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyframe_xpy-0.1.0/LICENSE +21 -0
- pyframe_xpy-0.1.0/MANIFEST.in +9 -0
- pyframe_xpy-0.1.0/PKG-INFO +275 -0
- pyframe_xpy-0.1.0/README.md +222 -0
- pyframe_xpy-0.1.0/framex/__init__.py +110 -0
- pyframe_xpy-0.1.0/framex/_version.py +1 -0
- pyframe_xpy-0.1.0/framex/backends/__init__.py +9 -0
- pyframe_xpy-0.1.0/framex/backends/array_accel.py +201 -0
- pyframe_xpy-0.1.0/framex/backends/c_backend.py +391 -0
- pyframe_xpy-0.1.0/framex/backends/c_kernels.c +207 -0
- pyframe_xpy-0.1.0/framex/compat.py +265 -0
- pyframe_xpy-0.1.0/framex/config.py +244 -0
- pyframe_xpy-0.1.0/framex/core/__init__.py +7 -0
- pyframe_xpy-0.1.0/framex/core/array.py +480 -0
- pyframe_xpy-0.1.0/framex/core/dataframe.py +946 -0
- pyframe_xpy-0.1.0/framex/core/dtypes.py +98 -0
- pyframe_xpy-0.1.0/framex/core/index.py +86 -0
- pyframe_xpy-0.1.0/framex/core/series.py +272 -0
- pyframe_xpy-0.1.0/framex/interchange/__init__.py +11 -0
- pyframe_xpy-0.1.0/framex/interchange/dataframe_protocol.py +114 -0
- pyframe_xpy-0.1.0/framex/interchange/numpy_protocols.py +23 -0
- pyframe_xpy-0.1.0/framex/io/__init__.py +20 -0
- pyframe_xpy-0.1.0/framex/io/arrow_ipc.py +28 -0
- pyframe_xpy-0.1.0/framex/io/csv.py +46 -0
- pyframe_xpy-0.1.0/framex/io/file.py +316 -0
- pyframe_xpy-0.1.0/framex/io/json.py +152 -0
- pyframe_xpy-0.1.0/framex/io/parquet.py +31 -0
- pyframe_xpy-0.1.0/framex/memory/__init__.py +5 -0
- pyframe_xpy-0.1.0/framex/memory/buffer.py +196 -0
- pyframe_xpy-0.1.0/framex/memory/pool.py +82 -0
- pyframe_xpy-0.1.0/framex/memory/transport.py +198 -0
- pyframe_xpy-0.1.0/framex/ops/__init__.py +23 -0
- pyframe_xpy-0.1.0/framex/ops/elementwise.py +174 -0
- pyframe_xpy-0.1.0/framex/ops/filter.py +66 -0
- pyframe_xpy-0.1.0/framex/ops/groupby.py +70 -0
- pyframe_xpy-0.1.0/framex/ops/join.py +81 -0
- pyframe_xpy-0.1.0/framex/ops/projection.py +32 -0
- pyframe_xpy-0.1.0/framex/ops/reduction.py +169 -0
- pyframe_xpy-0.1.0/framex/ops/sort.py +41 -0
- pyframe_xpy-0.1.0/framex/ops/window.py +275 -0
- pyframe_xpy-0.1.0/framex/pandas_engine.py +52 -0
- pyframe_xpy-0.1.0/framex/runtime/__init__.py +16 -0
- pyframe_xpy-0.1.0/framex/runtime/executor.py +256 -0
- pyframe_xpy-0.1.0/framex/runtime/partition.py +67 -0
- pyframe_xpy-0.1.0/framex/runtime/scheduler.py +95 -0
- pyframe_xpy-0.1.0/framex/runtime/streaming.py +64 -0
- pyframe_xpy-0.1.0/framex/runtime/task.py +103 -0
- pyframe_xpy-0.1.0/pyframe_xpy.egg-info/PKG-INFO +275 -0
- pyframe_xpy-0.1.0/pyframe_xpy.egg-info/SOURCES.txt +52 -0
- pyframe_xpy-0.1.0/pyframe_xpy.egg-info/dependency_links.txt +1 -0
- pyframe_xpy-0.1.0/pyframe_xpy.egg-info/requires.txt +38 -0
- pyframe_xpy-0.1.0/pyframe_xpy.egg-info/top_level.txt +1 -0
- pyframe_xpy-0.1.0/pyproject.toml +93 -0
- pyframe_xpy-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Theerayut Bubpamala
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyframe-xpy
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: High-performance parallel dataframe and array processing with Arrow-backed storage
|
|
5
|
+
Author: FrameX Contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/aeiwz/FrameX
|
|
8
|
+
Project-URL: Repository, https://github.com/aeiwz/FrameX
|
|
9
|
+
Project-URL: Issues, https://github.com/aeiwz/FrameX/issues
|
|
10
|
+
Project-URL: Documentation, https://github.com/aeiwz/FrameX/tree/main/docs
|
|
11
|
+
Keywords: dataframe,array,analytics,arrow,dask,ray,numpy,parallel
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: pyarrow>=14.0
|
|
26
|
+
Requires-Dist: numpy>=1.24
|
|
27
|
+
Provides-Extra: pandas-compat
|
|
28
|
+
Requires-Dist: pandas>=2.0; extra == "pandas-compat"
|
|
29
|
+
Provides-Extra: distributed
|
|
30
|
+
Requires-Dist: dask[dataframe,distributed]>=2024.1.0; extra == "distributed"
|
|
31
|
+
Requires-Dist: ray[data]>=2.9.0; extra == "distributed"
|
|
32
|
+
Provides-Extra: accel
|
|
33
|
+
Requires-Dist: numexpr>=2.9; extra == "accel"
|
|
34
|
+
Requires-Dist: numba>=0.59; extra == "accel"
|
|
35
|
+
Provides-Extra: gpu
|
|
36
|
+
Requires-Dist: cupy-cuda12x>=13.0; platform_system != "Windows" and extra == "gpu"
|
|
37
|
+
Provides-Extra: ml-accel
|
|
38
|
+
Requires-Dist: torch>=2.2; extra == "ml-accel"
|
|
39
|
+
Requires-Dist: jax>=0.4.30; extra == "ml-accel"
|
|
40
|
+
Provides-Extra: pandas-fast
|
|
41
|
+
Requires-Dist: modin[ray]>=0.30; extra == "pandas-fast"
|
|
42
|
+
Provides-Extra: dev
|
|
43
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
44
|
+
Requires-Dist: pytest-benchmark; extra == "dev"
|
|
45
|
+
Requires-Dist: hypothesis; extra == "dev"
|
|
46
|
+
Provides-Extra: bench
|
|
47
|
+
Requires-Dist: matplotlib>=3.8; extra == "bench"
|
|
48
|
+
Requires-Dist: psutil>=5.9; extra == "bench"
|
|
49
|
+
Provides-Extra: release
|
|
50
|
+
Requires-Dist: build>=1.2.2; extra == "release"
|
|
51
|
+
Requires-Dist: twine>=5.1.1; extra == "release"
|
|
52
|
+
Dynamic: license-file
|
|
53
|
+
|
|
54
|
+
# FrameX
|
|
55
|
+
|
|
56
|
+
FrameX is an Arrow-backed Python library for parallel dataframe and array processing on a single machine.
|
|
57
|
+
|
|
58
|
+
It combines:
|
|
59
|
+
|
|
60
|
+
- Pandas-like tabular APIs (`DataFrame`, `Series`, `GroupBy`)
|
|
61
|
+
- NumPy-compatible chunked arrays (`NDArray` with NumPy protocol support)
|
|
62
|
+
- Arrow-native storage/interop (`to_arrow`, Parquet/IPC I/O)
|
|
63
|
+
- Eager execution with optional lazy pipelines (`.lazy().collect()`)
|
|
64
|
+
- Runtime backends for local threads/processes plus optional Ray/Dask executors
|
|
65
|
+
|
|
66
|
+
## Why FrameX
|
|
67
|
+
|
|
68
|
+
FrameX is aimed at local analytics workflows that have outgrown single-threaded scripts but do not yet require distributed infrastructure.
|
|
69
|
+
|
|
70
|
+
Typical fit:
|
|
71
|
+
|
|
72
|
+
- ETL and analytics pipelines on medium-to-large local datasets
|
|
73
|
+
- feature engineering workflows that mix table and array operations
|
|
74
|
+
- migration paths from Pandas scripts where API familiarity matters
|
|
75
|
+
|
|
76
|
+
## Installation
|
|
77
|
+
|
|
78
|
+
From PyPI:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
pip install pyframe-xpy
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
From source:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
git clone https://github.com/aeiwz/FrameX.git
|
|
88
|
+
cd FrameX
|
|
89
|
+
pip install -e .
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Requirements:
|
|
93
|
+
|
|
94
|
+
- Python `>=3.10`
|
|
95
|
+
- Core dependencies: `pyarrow`, `numpy`
|
|
96
|
+
- Optional compatibility: `pandas` (`pip install pyframe-xpy[pandas_compat]`)
|
|
97
|
+
|
|
98
|
+
## Quick Start
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
import framex as fx
|
|
102
|
+
|
|
103
|
+
df = fx.DataFrame(
|
|
104
|
+
{
|
|
105
|
+
"group": ["a", "a", "b"],
|
|
106
|
+
"value": [10, 20, 30],
|
|
107
|
+
"is_refund": [False, True, False],
|
|
108
|
+
}
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
result = (
|
|
112
|
+
df.filter(~df["is_refund"])
|
|
113
|
+
.groupby("group")
|
|
114
|
+
.agg({"value": ["sum", "mean", "count"]})
|
|
115
|
+
.sort("value_sum", ascending=False)
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
print(result.to_pandas())
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Core API
|
|
122
|
+
|
|
123
|
+
Top-level imports:
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
import framex as fx
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
Main objects and helpers:
|
|
130
|
+
|
|
131
|
+
- `fx.DataFrame`, `fx.Series`, `fx.Index`, `fx.LazyFrame`
|
|
132
|
+
- `fx.NDArray`, `fx.array(...)`
|
|
133
|
+
- `fx.read_parquet`, `fx.write_parquet`, `fx.read_ipc`, `fx.write_ipc`, `fx.read_csv`, `fx.write_csv`
|
|
134
|
+
- `fx.read_json`, `fx.write_json`, `fx.read_ndjson`, `fx.write_ndjson`
|
|
135
|
+
- `fx.read_file`, `fx.write_file` for format auto-detection
|
|
136
|
+
|
|
137
|
+
Compression:
|
|
138
|
+
- transparent extension-based compression for `read_file` / `write_file`
|
|
139
|
+
- supported wrappers: `.gz`, `.bz2`, `.xz`, `.zip`, and `.zst`/`.zstd` (when `zstandard` is installed)
|
|
140
|
+
- `fx.from_pandas`, `fx.from_dask`, `fx.from_ray`, `fx.from_dataframe`
|
|
141
|
+
- `fx.get_config`, `fx.set_backend`, `fx.set_workers`, `fx.set_serializer`, `fx.set_kernel_backend`
|
|
142
|
+
- `fx.set_array_backend` for auto/NumExpr/Numba/JAX/PyTorch/CuPy acceleration modes
|
|
143
|
+
- `fx.recommend_best_performance_config()` to inspect hardware-tuned settings
|
|
144
|
+
- `fx.auto_configure_hardware()` to apply best-performance config automatically
|
|
145
|
+
- `fx.StreamProcessor` for micro-batch streaming pipelines
|
|
146
|
+
|
|
147
|
+
Acceleration extras:
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
pip install pyframe-xpy[accel] # numexpr + numba
|
|
151
|
+
pip install pyframe-xpy[gpu] # cupy (CUDA)
|
|
152
|
+
pip install pyframe-xpy[ml_accel] # jax + pytorch
|
|
153
|
+
pip install pyframe-xpy[pandas_fast] # modin backend
|
|
154
|
+
pip install pyframe-xpy[distributed] # Dask + Ray distributed/HPC backends
|
|
155
|
+
pip install zstandard # .zst/.zstd file compression
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
Backend notes:
|
|
159
|
+
|
|
160
|
+
- `fx.set_backend("threads" | "processes" | "ray" | "dask" | "hpc")`
|
|
161
|
+
- Ray and Dask execution backends require their respective runtimes to be installed/available.
|
|
162
|
+
- HPC mode (`"hpc"`) uses cluster-oriented execution via Dask or Ray:
|
|
163
|
+
- `FRAMEX_HPC_ENGINE=dask|ray`
|
|
164
|
+
- `FRAMEX_DASK_SCHEDULER_ADDRESS=<tcp://...>` to connect to an existing Dask cluster
|
|
165
|
+
- `FRAMEX_RAY_ADDRESS=<ray://...>` to connect to an existing Ray cluster
|
|
166
|
+
- optional SLURM bootstrap: `FRAMEX_DASK_SLURM=1` (requires `dask-jobqueue`)
|
|
167
|
+
|
|
168
|
+
Test support notes:
|
|
169
|
+
|
|
170
|
+
- Some tests are gated on optional backends and are intentionally skipped when those dependencies are not installed.
|
|
171
|
+
- Typical skip reasons: missing `dask.distributed`, `dask.dataframe`, `ray`, or `ray.data`.
|
|
172
|
+
- Run the full optional test matrix locally:
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
pip install pyframe-xpy[distributed]
|
|
176
|
+
pytest -q
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
## Documentation
|
|
180
|
+
|
|
181
|
+
Canonical docs are in [`docs/documents`](docs/documents):
|
|
182
|
+
|
|
183
|
+
- [Overview](docs/documents/overview.md)
|
|
184
|
+
- [Features](docs/documents/features.md)
|
|
185
|
+
- [Getting Started](docs/documents/getting_started.md)
|
|
186
|
+
- [Installation](docs/documents/installation.md)
|
|
187
|
+
- [Tutorial: ETL Pipeline](docs/documents/tutorial_etl_pipeline.md)
|
|
188
|
+
- [Tutorial: NumPy NDArray Interop](docs/documents/tutorial_numpy_array.md)
|
|
189
|
+
- [Use Cases](docs/documents/use_cases.md)
|
|
190
|
+
- [Configuration Guide](docs/documents/configuration_guide.md)
|
|
191
|
+
- [Performance Test](docs/documents/performance_test.md)
|
|
192
|
+
- [Architecture](docs/documents/architecture.md)
|
|
193
|
+
- [API Reference](docs/documents/api_reference.md)
|
|
194
|
+
- [Roadmap](docs/documents/roadmap.md)
|
|
195
|
+
- [FAQ](docs/documents/faq.md)
|
|
196
|
+
|
|
197
|
+
## Website (Docs UI)
|
|
198
|
+
|
|
199
|
+
The docs website lives in [`website`](website) (Next.js App Router).
|
|
200
|
+
|
|
201
|
+
Main docs routes:
|
|
202
|
+
|
|
203
|
+
- `http://localhost:3000/docs/features`
|
|
204
|
+
- `http://localhost:3000/docs/tutorial_etl_pipeline`
|
|
205
|
+
- `http://localhost:3000/docs/use_cases`
|
|
206
|
+
- `http://localhost:3000/docs/configuration_guide`
|
|
207
|
+
- `http://localhost:3000/docs/performance_test`
|
|
208
|
+
|
|
209
|
+
Run locally:
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
cd website
|
|
213
|
+
npm install
|
|
214
|
+
npm run dev
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
Production build:
|
|
218
|
+
|
|
219
|
+
```bash
|
|
220
|
+
npm run build
|
|
221
|
+
npm run start
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
## Development
|
|
225
|
+
|
|
226
|
+
Install dev dependencies:
|
|
227
|
+
|
|
228
|
+
```bash
|
|
229
|
+
pip install -e .[dev]
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
Run tests:
|
|
233
|
+
|
|
234
|
+
```bash
|
|
235
|
+
pytest
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
## Benchmarks
|
|
239
|
+
|
|
240
|
+
Benchmark code and generated reports are in [`benchmarks`](benchmarks).
|
|
241
|
+
|
|
242
|
+
Run the full benchmark suite (includes in-terminal progress bar and report generation):
|
|
243
|
+
|
|
244
|
+
```bash
|
|
245
|
+
python3 -m benchmarks.benchmark_suite
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
Run workload capability matrix checks:
|
|
249
|
+
|
|
250
|
+
```bash
|
|
251
|
+
python3 -m benchmarks.check_framex_workloads
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
Benchmark outputs are written to `benchmarks/results`:
|
|
255
|
+
|
|
256
|
+
- `benchmark_results.json`
|
|
257
|
+
- `benchmark_results.csv`
|
|
258
|
+
- `benchmark_report.md`
|
|
259
|
+
- `framex_workload_check.json`
|
|
260
|
+
- `performance_speedup.png`
|
|
261
|
+
- `parallel_processing_scaling.png`
|
|
262
|
+
- `multiprocessing_scaling.png`
|
|
263
|
+
- `memory_peak_rss.png`
|
|
264
|
+
|
|
265
|
+
## Project Status
|
|
266
|
+
|
|
267
|
+
FrameX is pre-1.0 (`0.1.0`) and in active development.
|
|
268
|
+
|
|
269
|
+
- APIs are usable and documented
|
|
270
|
+
- compatibility/performance behavior will continue to evolve
|
|
271
|
+
- pin versions for production-critical workloads
|
|
272
|
+
|
|
273
|
+
## License
|
|
274
|
+
|
|
275
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
# FrameX
|
|
2
|
+
|
|
3
|
+
FrameX is an Arrow-backed Python library for parallel dataframe and array processing on a single machine.
|
|
4
|
+
|
|
5
|
+
It combines:
|
|
6
|
+
|
|
7
|
+
- Pandas-like tabular APIs (`DataFrame`, `Series`, `GroupBy`)
|
|
8
|
+
- NumPy-compatible chunked arrays (`NDArray` with NumPy protocol support)
|
|
9
|
+
- Arrow-native storage/interop (`to_arrow`, Parquet/IPC I/O)
|
|
10
|
+
- Eager execution with optional lazy pipelines (`.lazy().collect()`)
|
|
11
|
+
- Runtime backends for local threads/processes plus optional Ray/Dask executors
|
|
12
|
+
|
|
13
|
+
## Why FrameX
|
|
14
|
+
|
|
15
|
+
FrameX is aimed at local analytics workflows that have outgrown single-threaded scripts but do not yet require distributed infrastructure.
|
|
16
|
+
|
|
17
|
+
Typical fit:
|
|
18
|
+
|
|
19
|
+
- ETL and analytics pipelines on medium-to-large local datasets
|
|
20
|
+
- feature engineering workflows that mix table and array operations
|
|
21
|
+
- migration paths from Pandas scripts where API familiarity matters
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
From PyPI:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install pyframe-xpy
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
From source:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
git clone https://github.com/aeiwz/FrameX.git
|
|
35
|
+
cd FrameX
|
|
36
|
+
pip install -e .
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Requirements:
|
|
40
|
+
|
|
41
|
+
- Python `>=3.10`
|
|
42
|
+
- Core dependencies: `pyarrow`, `numpy`
|
|
43
|
+
- Optional compatibility: `pandas` (`pip install pyframe-xpy[pandas_compat]`)
|
|
44
|
+
|
|
45
|
+
## Quick Start
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
import framex as fx
|
|
49
|
+
|
|
50
|
+
df = fx.DataFrame(
|
|
51
|
+
{
|
|
52
|
+
"group": ["a", "a", "b"],
|
|
53
|
+
"value": [10, 20, 30],
|
|
54
|
+
"is_refund": [False, True, False],
|
|
55
|
+
}
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
result = (
|
|
59
|
+
df.filter(~df["is_refund"])
|
|
60
|
+
.groupby("group")
|
|
61
|
+
.agg({"value": ["sum", "mean", "count"]})
|
|
62
|
+
.sort("value_sum", ascending=False)
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
print(result.to_pandas())
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Core API
|
|
69
|
+
|
|
70
|
+
Top-level imports:
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
import framex as fx
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Main objects and helpers:
|
|
77
|
+
|
|
78
|
+
- `fx.DataFrame`, `fx.Series`, `fx.Index`, `fx.LazyFrame`
|
|
79
|
+
- `fx.NDArray`, `fx.array(...)`
|
|
80
|
+
- `fx.read_parquet`, `fx.write_parquet`, `fx.read_ipc`, `fx.write_ipc`, `fx.read_csv`, `fx.write_csv`
|
|
81
|
+
- `fx.read_json`, `fx.write_json`, `fx.read_ndjson`, `fx.write_ndjson`
|
|
82
|
+
- `fx.read_file`, `fx.write_file` for format auto-detection
|
|
83
|
+
|
|
84
|
+
Compression:
|
|
85
|
+
- transparent extension-based compression for `read_file` / `write_file`
|
|
86
|
+
- supported wrappers: `.gz`, `.bz2`, `.xz`, `.zip`, and `.zst`/`.zstd` (when `zstandard` is installed)
|
|
87
|
+
- `fx.from_pandas`, `fx.from_dask`, `fx.from_ray`, `fx.from_dataframe`
|
|
88
|
+
- `fx.get_config`, `fx.set_backend`, `fx.set_workers`, `fx.set_serializer`, `fx.set_kernel_backend`
|
|
89
|
+
- `fx.set_array_backend` for auto/NumExpr/Numba/JAX/PyTorch/CuPy acceleration modes
|
|
90
|
+
- `fx.recommend_best_performance_config()` to inspect hardware-tuned settings
|
|
91
|
+
- `fx.auto_configure_hardware()` to apply best-performance config automatically
|
|
92
|
+
- `fx.StreamProcessor` for micro-batch streaming pipelines
|
|
93
|
+
|
|
94
|
+
Acceleration extras:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
pip install pyframe-xpy[accel] # numexpr + numba
|
|
98
|
+
pip install pyframe-xpy[gpu] # cupy (CUDA)
|
|
99
|
+
pip install pyframe-xpy[ml_accel] # jax + pytorch
|
|
100
|
+
pip install pyframe-xpy[pandas_fast] # modin backend
|
|
101
|
+
pip install pyframe-xpy[distributed] # Dask + Ray distributed/HPC backends
|
|
102
|
+
pip install zstandard # .zst/.zstd file compression
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Backend notes:
|
|
106
|
+
|
|
107
|
+
- `fx.set_backend("threads" | "processes" | "ray" | "dask" | "hpc")`
|
|
108
|
+
- Ray and Dask execution backends require their respective runtimes to be installed/available.
|
|
109
|
+
- HPC mode (`"hpc"`) uses cluster-oriented execution via Dask or Ray:
|
|
110
|
+
- `FRAMEX_HPC_ENGINE=dask|ray`
|
|
111
|
+
- `FRAMEX_DASK_SCHEDULER_ADDRESS=<tcp://...>` to connect to an existing Dask cluster
|
|
112
|
+
- `FRAMEX_RAY_ADDRESS=<ray://...>` to connect to an existing Ray cluster
|
|
113
|
+
- optional SLURM bootstrap: `FRAMEX_DASK_SLURM=1` (requires `dask-jobqueue`)
|
|
114
|
+
|
|
115
|
+
Test support notes:
|
|
116
|
+
|
|
117
|
+
- Some tests are gated on optional backends and are intentionally skipped when those dependencies are not installed.
|
|
118
|
+
- Typical skip reasons: missing `dask.distributed`, `dask.dataframe`, `ray`, or `ray.data`.
|
|
119
|
+
- Run the full optional test matrix locally:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
pip install pyframe-xpy[distributed]
|
|
123
|
+
pytest -q
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Documentation
|
|
127
|
+
|
|
128
|
+
Canonical docs are in [`docs/documents`](docs/documents):
|
|
129
|
+
|
|
130
|
+
- [Overview](docs/documents/overview.md)
|
|
131
|
+
- [Features](docs/documents/features.md)
|
|
132
|
+
- [Getting Started](docs/documents/getting_started.md)
|
|
133
|
+
- [Installation](docs/documents/installation.md)
|
|
134
|
+
- [Tutorial: ETL Pipeline](docs/documents/tutorial_etl_pipeline.md)
|
|
135
|
+
- [Tutorial: NumPy NDArray Interop](docs/documents/tutorial_numpy_array.md)
|
|
136
|
+
- [Use Cases](docs/documents/use_cases.md)
|
|
137
|
+
- [Configuration Guide](docs/documents/configuration_guide.md)
|
|
138
|
+
- [Performance Test](docs/documents/performance_test.md)
|
|
139
|
+
- [Architecture](docs/documents/architecture.md)
|
|
140
|
+
- [API Reference](docs/documents/api_reference.md)
|
|
141
|
+
- [Roadmap](docs/documents/roadmap.md)
|
|
142
|
+
- [FAQ](docs/documents/faq.md)
|
|
143
|
+
|
|
144
|
+
## Website (Docs UI)
|
|
145
|
+
|
|
146
|
+
The docs website lives in [`website`](website) (Next.js App Router).
|
|
147
|
+
|
|
148
|
+
Main docs routes:
|
|
149
|
+
|
|
150
|
+
- `http://localhost:3000/docs/features`
|
|
151
|
+
- `http://localhost:3000/docs/tutorial_etl_pipeline`
|
|
152
|
+
- `http://localhost:3000/docs/use_cases`
|
|
153
|
+
- `http://localhost:3000/docs/configuration_guide`
|
|
154
|
+
- `http://localhost:3000/docs/performance_test`
|
|
155
|
+
|
|
156
|
+
Run locally:
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
cd website
|
|
160
|
+
npm install
|
|
161
|
+
npm run dev
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
Production build:
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
npm run build
|
|
168
|
+
npm run start
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Development
|
|
172
|
+
|
|
173
|
+
Install dev dependencies:
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
pip install -e .[dev]
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
Run tests:
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
pytest
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## Benchmarks
|
|
186
|
+
|
|
187
|
+
Benchmark code and generated reports are in [`benchmarks`](benchmarks).
|
|
188
|
+
|
|
189
|
+
Run the full benchmark suite (includes in-terminal progress bar and report generation):
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
python3 -m benchmarks.benchmark_suite
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
Run workload capability matrix checks:
|
|
196
|
+
|
|
197
|
+
```bash
|
|
198
|
+
python3 -m benchmarks.check_framex_workloads
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
Benchmark outputs are written to `benchmarks/results`:
|
|
202
|
+
|
|
203
|
+
- `benchmark_results.json`
|
|
204
|
+
- `benchmark_results.csv`
|
|
205
|
+
- `benchmark_report.md`
|
|
206
|
+
- `framex_workload_check.json`
|
|
207
|
+
- `performance_speedup.png`
|
|
208
|
+
- `parallel_processing_scaling.png`
|
|
209
|
+
- `multiprocessing_scaling.png`
|
|
210
|
+
- `memory_peak_rss.png`
|
|
211
|
+
|
|
212
|
+
## Project Status
|
|
213
|
+
|
|
214
|
+
FrameX is pre-1.0 (`0.1.0`) and in active development.
|
|
215
|
+
|
|
216
|
+
- APIs are usable and documented
|
|
217
|
+
- compatibility/performance behavior will continue to evolve
|
|
218
|
+
- pin versions for production-critical workloads
|
|
219
|
+
|
|
220
|
+
## License
|
|
221
|
+
|
|
222
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""FrameX: High-performance parallel dataframe and array processing."""
|
|
2
|
+
|
|
3
|
+
from framex._version import __version__
|
|
4
|
+
from framex.config import (
|
|
5
|
+
ArrayBackendType,
|
|
6
|
+
Config,
|
|
7
|
+
auto_configure_hardware,
|
|
8
|
+
config,
|
|
9
|
+
get_config,
|
|
10
|
+
recommend_best_performance_config,
|
|
11
|
+
set_array_backend,
|
|
12
|
+
set_backend,
|
|
13
|
+
set_kernel_backend,
|
|
14
|
+
set_serializer,
|
|
15
|
+
set_workers,
|
|
16
|
+
)
|
|
17
|
+
from framex.core.array import NDArray
|
|
18
|
+
from framex.core.dataframe import DataFrame, LazyFrame
|
|
19
|
+
from framex.core.dtypes import DType
|
|
20
|
+
from framex.core.index import Index
|
|
21
|
+
from framex.core.series import Series
|
|
22
|
+
from framex.interchange.dataframe_protocol import (
|
|
23
|
+
add_dataframe_protocol,
|
|
24
|
+
from_dask,
|
|
25
|
+
from_dataframe,
|
|
26
|
+
from_pandas,
|
|
27
|
+
from_ray,
|
|
28
|
+
)
|
|
29
|
+
from framex.io.arrow_ipc import read_ipc, write_ipc
|
|
30
|
+
from framex.io.csv import read_csv, write_csv
|
|
31
|
+
from framex.io.file import read_file, write_file
|
|
32
|
+
from framex.io.json import read_json, read_ndjson, write_json, write_ndjson
|
|
33
|
+
from framex.io.parquet import read_parquet, write_parquet
|
|
34
|
+
from framex.ops.window import rolling_mean, rolling_sum, rolling_std, rolling_min, rolling_max, top_k, rank
|
|
35
|
+
from framex.runtime.executor import detect_backend
|
|
36
|
+
from framex.runtime.streaming import StreamProcessor, StreamStats
|
|
37
|
+
from framex.compat import list_divergences, check_pandas_compat, DIVERGENCES
|
|
38
|
+
|
|
39
|
+
# Apply the __dataframe__ interchange protocol to DataFrame.
|
|
40
|
+
add_dataframe_protocol(DataFrame)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def array(
    data: list | None = None,
    *,
    dtype: str | None = None,
    chunks: int | None = None,
) -> NDArray:
    """Build an ``NDArray`` from *data*.

    Thin convenience wrapper: every argument is forwarded unchanged to the
    ``NDArray`` constructor.

    Args:
        data: Values passed positionally to ``NDArray``.
        dtype: Forwarded as the ``dtype`` keyword argument.
        chunks: Forwarded as the ``chunks`` keyword argument.

    Returns:
        The newly constructed ``NDArray``.
    """
    options = {"dtype": dtype, "chunks": chunks}
    return NDArray(data, **options)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
__all__ = [
|
|
54
|
+
"__version__",
|
|
55
|
+
# Config
|
|
56
|
+
"Config",
|
|
57
|
+
"ArrayBackendType",
|
|
58
|
+
"recommend_best_performance_config",
|
|
59
|
+
"auto_configure_hardware",
|
|
60
|
+
"config",
|
|
61
|
+
"get_config",
|
|
62
|
+
"set_array_backend",
|
|
63
|
+
"set_backend",
|
|
64
|
+
"set_kernel_backend",
|
|
65
|
+
"set_serializer",
|
|
66
|
+
"set_workers",
|
|
67
|
+
# Core types
|
|
68
|
+
"DataFrame",
|
|
69
|
+
"DType",
|
|
70
|
+
"Index",
|
|
71
|
+
"LazyFrame",
|
|
72
|
+
"NDArray",
|
|
73
|
+
"Series",
|
|
74
|
+
# Interchange
|
|
75
|
+
"from_dataframe",
|
|
76
|
+
"from_pandas",
|
|
77
|
+
"from_dask",
|
|
78
|
+
"from_ray",
|
|
79
|
+
# IO
|
|
80
|
+
"read_csv",
|
|
81
|
+
"read_file",
|
|
82
|
+
"read_ipc",
|
|
83
|
+
"read_json",
|
|
84
|
+
"read_ndjson",
|
|
85
|
+
"read_parquet",
|
|
86
|
+
"write_csv",
|
|
87
|
+
"write_file",
|
|
88
|
+
"write_ipc",
|
|
89
|
+
"write_json",
|
|
90
|
+
"write_ndjson",
|
|
91
|
+
"write_parquet",
|
|
92
|
+
# Convenience
|
|
93
|
+
"array",
|
|
94
|
+
# Window ops
|
|
95
|
+
"rolling_mean",
|
|
96
|
+
"rolling_sum",
|
|
97
|
+
"rolling_std",
|
|
98
|
+
"rolling_min",
|
|
99
|
+
"rolling_max",
|
|
100
|
+
"top_k",
|
|
101
|
+
"rank",
|
|
102
|
+
# Runtime
|
|
103
|
+
"detect_backend",
|
|
104
|
+
"StreamProcessor",
|
|
105
|
+
"StreamStats",
|
|
106
|
+
# Compatibility
|
|
107
|
+
"DIVERGENCES",
|
|
108
|
+
"check_pandas_compat",
|
|
109
|
+
"list_divergences",
|
|
110
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|