colstore 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- colstore-0.1.0/.github/workflows/ci.yml +93 -0
- colstore-0.1.0/.github/workflows/release.yml +171 -0
- colstore-0.1.0/.gitignore +74 -0
- colstore-0.1.0/CMakeLists.txt +78 -0
- colstore-0.1.0/PKG-INFO +162 -0
- colstore-0.1.0/README.md +116 -0
- colstore-0.1.0/benchmark/profile_gather.py +242 -0
- colstore-0.1.0/benchmark/run_perf.sh +40 -0
- colstore-0.1.0/include/colstore/gather.hpp +86 -0
- colstore-0.1.0/pyproject.toml +114 -0
- colstore-0.1.0/src/colstore/__init__.py +60 -0
- colstore-0.1.0/src/colstore/_progress.py +51 -0
- colstore-0.1.0/src/colstore/config.py +68 -0
- colstore-0.1.0/src/colstore/format.py +265 -0
- colstore-0.1.0/src/colstore/kernels.py +127 -0
- colstore-0.1.0/src/colstore/store.py +383 -0
- colstore-0.1.0/src/colstore/view.py +236 -0
- colstore-0.1.0/src/cpp/gather.cpp +139 -0
- colstore-0.1.0/src/cython/_gather.pyx +164 -0
- colstore-0.1.0/tests/conftest.py +57 -0
- colstore-0.1.0/tests/test_config.py +73 -0
- colstore-0.1.0/tests/test_factory.py +104 -0
- colstore-0.1.0/tests/test_format.py +281 -0
- colstore-0.1.0/tests/test_indexing.py +182 -0
- colstore-0.1.0/tests/test_kernels.py +97 -0
- colstore-0.1.0/tests/test_store.py +82 -0
- colstore-0.1.0/tests/test_views.py +112 -0
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
# Run on every push to main and on every PR targeting main. Other
|
|
4
|
+
# branches don't trigger CI by default — open a PR to get feedback.
|
|
5
|
+
on:
|
|
6
|
+
push:
|
|
7
|
+
branches: [main]
|
|
8
|
+
pull_request:
|
|
9
|
+
branches: [main]
|
|
10
|
+
|
|
11
|
+
# Cancel in-progress runs for the same ref when a new push arrives.
|
|
12
|
+
# Saves CI minutes and gets you the "latest commit only" result.
|
|
13
|
+
concurrency:
|
|
14
|
+
group: ci-${{ github.ref }}
|
|
15
|
+
cancel-in-progress: true
|
|
16
|
+
|
|
17
|
+
# Default to read-only permissions; release.yml elevates as needed.
|
|
18
|
+
permissions:
|
|
19
|
+
contents: read
|
|
20
|
+
|
|
21
|
+
jobs:
|
|
22
|
+
lint:
|
|
23
|
+
name: Lint and type-check
|
|
24
|
+
runs-on: ubuntu-latest
|
|
25
|
+
steps:
|
|
26
|
+
- uses: actions/checkout@v4
|
|
27
|
+
|
|
28
|
+
- name: Set up Python 3.12
|
|
29
|
+
uses: actions/setup-python@v5
|
|
30
|
+
with:
|
|
31
|
+
python-version: "3.12"
|
|
32
|
+
cache: pip
|
|
33
|
+
cache-dependency-path: pyproject.toml
|
|
34
|
+
|
|
35
|
+
- name: Install dev dependencies
|
|
36
|
+
# Editable install also compiles the C++ extension via
|
|
37
|
+
# scikit-build-core; mypy needs the module importable to typecheck
|
|
38
|
+
# against it.
|
|
39
|
+
run: |
|
|
40
|
+
python -m pip install --upgrade pip
|
|
41
|
+
pip install -e ".[dev]"
|
|
42
|
+
|
|
43
|
+
- name: ruff (lint)
|
|
44
|
+
run: ruff check src tests
|
|
45
|
+
|
|
46
|
+
- name: black (format check)
|
|
47
|
+
run: black --check src tests
|
|
48
|
+
|
|
49
|
+
- name: mypy (strict type check)
|
|
50
|
+
run: mypy src
|
|
51
|
+
|
|
52
|
+
test:
|
|
53
|
+
name: Test (Python ${{ matrix.python-version }} on ${{ matrix.os }})
|
|
54
|
+
runs-on: ${{ matrix.os }}
|
|
55
|
+
strategy:
|
|
56
|
+
# Don't bail out on the first failing matrix cell — we want to
|
|
57
|
+
# see whether a failure is platform-specific or version-specific.
|
|
58
|
+
fail-fast: false
|
|
59
|
+
matrix:
|
|
60
|
+
# POSIX-only: the C++ kernel uses OpenMP via the system compiler,
|
|
61
|
+
# which is messy on Windows. Windows support is intentionally out
|
|
62
|
+
# of scope.
|
|
63
|
+
os: [ubuntu-latest, macos-latest]
|
|
64
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
65
|
+
|
|
66
|
+
steps:
|
|
67
|
+
- uses: actions/checkout@v4
|
|
68
|
+
with:
|
|
69
|
+
# setuptools_scm reads tag history to derive the version;
|
|
70
|
+
# shallow clones come back as 0.0.0.
|
|
71
|
+
fetch-depth: 0
|
|
72
|
+
|
|
73
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
74
|
+
uses: actions/setup-python@v5
|
|
75
|
+
with:
|
|
76
|
+
python-version: ${{ matrix.python-version }}
|
|
77
|
+
cache: pip
|
|
78
|
+
cache-dependency-path: pyproject.toml
|
|
79
|
+
|
|
80
|
+
- name: Install OpenMP runtime (macOS)
|
|
81
|
+
# AppleClang doesn't ship libomp; without this the find_package
|
|
82
|
+
# call in CMakeLists.txt fails to detect OpenMP and the kernel
|
|
83
|
+
# degrades to single-threaded.
|
|
84
|
+
if: runner.os == 'macOS'
|
|
85
|
+
run: brew install libomp
|
|
86
|
+
|
|
87
|
+
- name: Install package and test deps
|
|
88
|
+
run: |
|
|
89
|
+
python -m pip install --upgrade pip
|
|
90
|
+
pip install -e ".[dev]"
|
|
91
|
+
|
|
92
|
+
- name: Run pytest
|
|
93
|
+
run: pytest -v --cov=colstore --cov-report=term
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
# Triggered when a GitHub Release is published. We use `published`
|
|
4
|
+
# rather than `created` so drafting a release in the GitHub UI doesn't
|
|
5
|
+
# fire the publish step prematurely — the release has to actually go
|
|
6
|
+
# live.
|
|
7
|
+
#
|
|
8
|
+
# Also exposes a manual `workflow_dispatch` button so maintainers can
|
|
9
|
+
# build (but not publish) on demand for verification.
|
|
10
|
+
on:
|
|
11
|
+
release:
|
|
12
|
+
types: [published]
|
|
13
|
+
workflow_dispatch:
|
|
14
|
+
|
|
15
|
+
concurrency:
|
|
16
|
+
group: release-${{ github.ref }}
|
|
17
|
+
cancel-in-progress: false # never cancel an in-progress publish
|
|
18
|
+
|
|
19
|
+
permissions:
|
|
20
|
+
contents: read
|
|
21
|
+
|
|
22
|
+
jobs:
|
|
23
|
+
# ---------------------------------------------------------------------
|
|
24
|
+
# 1. Sanity-gate: re-run the test suite on the release commit.
|
|
25
|
+
#
|
|
26
|
+
# CI on `main` already covers this for the normal flow, but a release
|
|
27
|
+
# can technically be tagged on any commit, so we re-verify before
|
|
28
|
+
# touching PyPI. We only run a single Python version here (3.12)
|
|
29
|
+
# — the full matrix lives in ci.yml.
|
|
30
|
+
# ---------------------------------------------------------------------
|
|
31
|
+
test:
|
|
32
|
+
name: Sanity-test the release commit
|
|
33
|
+
runs-on: ubuntu-latest
|
|
34
|
+
steps:
|
|
35
|
+
- uses: actions/checkout@v4
|
|
36
|
+
with:
|
|
37
|
+
fetch-depth: 0 # tags needed for setuptools_scm
|
|
38
|
+
|
|
39
|
+
- name: Set up Python 3.12
|
|
40
|
+
uses: actions/setup-python@v5
|
|
41
|
+
with:
|
|
42
|
+
python-version: "3.12"
|
|
43
|
+
cache: pip
|
|
44
|
+
cache-dependency-path: pyproject.toml
|
|
45
|
+
|
|
46
|
+
- name: Install package and test deps
|
|
47
|
+
run: |
|
|
48
|
+
python -m pip install --upgrade pip
|
|
49
|
+
pip install -e ".[dev]"
|
|
50
|
+
|
|
51
|
+
- name: Run pytest
|
|
52
|
+
run: pytest -v
|
|
53
|
+
|
|
54
|
+
# ---------------------------------------------------------------------
|
|
55
|
+
# 2a. Build the sdist on Linux.
|
|
56
|
+
#
|
|
57
|
+
# The sdist is platform-independent; one job is enough. It contains
|
|
58
|
+
# the Cython .pyx, C++ sources, and the CMake build description, so
|
|
59
|
+
# users on unsupported platforms can still build from source.
|
|
60
|
+
# ---------------------------------------------------------------------
|
|
61
|
+
build_sdist:
|
|
62
|
+
name: Build sdist
|
|
63
|
+
runs-on: ubuntu-latest
|
|
64
|
+
needs: test
|
|
65
|
+
steps:
|
|
66
|
+
- uses: actions/checkout@v4
|
|
67
|
+
with:
|
|
68
|
+
fetch-depth: 0 # setuptools_scm
|
|
69
|
+
|
|
70
|
+
- name: Set up Python 3.12
|
|
71
|
+
uses: actions/setup-python@v5
|
|
72
|
+
with:
|
|
73
|
+
python-version: "3.12"
|
|
74
|
+
|
|
75
|
+
- name: Install build tooling
|
|
76
|
+
run: |
|
|
77
|
+
python -m pip install --upgrade pip
|
|
78
|
+
pip install build
|
|
79
|
+
|
|
80
|
+
- name: Build sdist
|
|
81
|
+
run: python -m build --sdist
|
|
82
|
+
|
|
83
|
+
- name: List built artifacts
|
|
84
|
+
run: ls -l dist/
|
|
85
|
+
|
|
86
|
+
- name: Upload sdist artifact
|
|
87
|
+
uses: actions/upload-artifact@v4
|
|
88
|
+
with:
|
|
89
|
+
name: dist-sdist
|
|
90
|
+
path: dist/*.tar.gz
|
|
91
|
+
|
|
92
|
+
# ---------------------------------------------------------------------
|
|
93
|
+
# 2b. Build per-platform wheels with cibuildwheel.
|
|
94
|
+
#
|
|
95
|
+
# colstore has a C++/Cython extension, so we can't ship a single
|
|
96
|
+
# py3-none-any wheel like a pure-Python package would. cibuildwheel
|
|
97
|
+
# runs the build inside manylinux containers for Linux and natively
|
|
98
|
+
# on macOS, producing wheels that pip can install on the same
|
|
99
|
+
# ABI/arch without a compiler on the user's machine.
|
|
100
|
+
# ---------------------------------------------------------------------
|
|
101
|
+
build_wheels:
|
|
102
|
+
name: Build wheels (${{ matrix.os }})
|
|
103
|
+
runs-on: ${{ matrix.os }}
|
|
104
|
+
needs: test
|
|
105
|
+
strategy:
|
|
106
|
+
fail-fast: false
|
|
107
|
+
matrix:
|
|
108
|
+
os: [ubuntu-latest, macos-latest]
|
|
109
|
+
|
|
110
|
+
steps:
|
|
111
|
+
- uses: actions/checkout@v4
|
|
112
|
+
with:
|
|
113
|
+
fetch-depth: 0 # setuptools_scm
|
|
114
|
+
|
|
115
|
+
- name: Build wheels
|
|
116
|
+
uses: pypa/cibuildwheel@v2.21
|
|
117
|
+
env:
|
|
118
|
+
# We support 3.10+; skip older interpreters, PyPy, 32-bit
|
|
119
|
+
# Linux, and musllinux (we don't test it).
|
|
120
|
+
CIBW_SKIP: "cp36-* cp37-* cp38-* cp39-* pp* *-manylinux_i686 *-musllinux*"
|
|
121
|
+
# The kernel's OpenMP path requires libomp on macOS;
|
|
122
|
+
# manylinux images already ship GCC's OpenMP runtime (libgomp).
|
|
123
|
+
CIBW_BEFORE_ALL_MACOS: brew install libomp
|
|
124
|
+
# Quick smoke test on the built wheel before declaring it good.
|
|
125
|
+
CIBW_TEST_REQUIRES: pytest pandas
|
|
126
|
+
CIBW_TEST_COMMAND: pytest -q {project}/tests/test_format.py {project}/tests/test_factory.py
|
|
127
|
+
|
|
128
|
+
- name: List built artifacts
|
|
129
|
+
run: ls -l wheelhouse/
|
|
130
|
+
|
|
131
|
+
- name: Upload wheels artifact
|
|
132
|
+
uses: actions/upload-artifact@v4
|
|
133
|
+
with:
|
|
134
|
+
name: dist-wheels-${{ matrix.os }}
|
|
135
|
+
path: ./wheelhouse/*.whl
|
|
136
|
+
|
|
137
|
+
# ---------------------------------------------------------------------
|
|
138
|
+
# 3. Publish to PyPI via Trusted Publishing (OIDC).
|
|
139
|
+
#
|
|
140
|
+
# This job ONLY runs on a real `release: published` event — the
|
|
141
|
+
# manual workflow_dispatch path stops after the build jobs so maintainers can
|
|
142
|
+
# download and inspect the artifacts without touching PyPI.
|
|
143
|
+
# ---------------------------------------------------------------------
|
|
144
|
+
publish:
|
|
145
|
+
name: Publish to PyPI
|
|
146
|
+
runs-on: ubuntu-latest
|
|
147
|
+
needs: [build_sdist, build_wheels]
|
|
148
|
+
if: github.event_name == 'release'
|
|
149
|
+
|
|
150
|
+
environment:
|
|
151
|
+
name: pypi
|
|
152
|
+
url: https://pypi.org/p/colstore
|
|
153
|
+
|
|
154
|
+
permissions:
|
|
155
|
+
id-token: write # required for OIDC token used by Trusted Publishing
|
|
156
|
+
|
|
157
|
+
steps:
|
|
158
|
+
- name: Download built distributions
|
|
159
|
+
uses: actions/download-artifact@v4
|
|
160
|
+
with:
|
|
161
|
+
path: dist/
|
|
162
|
+
# All three uploaded artifacts (sdist + 2 wheel sets) get
|
|
163
|
+
# flattened into dist/ for the upload step.
|
|
164
|
+
merge-multiple: true
|
|
165
|
+
|
|
166
|
+
- name: List artifacts before upload
|
|
167
|
+
run: ls -l dist/
|
|
168
|
+
|
|
169
|
+
- name: Publish to PyPI
|
|
170
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
171
|
+
# No user/password — Trusted Publishing handles auth via OIDC.
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
dist/
|
|
13
|
+
wheelhouse/
|
|
14
|
+
_skbuild/
|
|
15
|
+
.eggs/
|
|
16
|
+
*.egg-info/
|
|
17
|
+
*.egg
|
|
18
|
+
MANIFEST
|
|
19
|
+
|
|
20
|
+
# Cython
|
|
21
|
+
# The .pyx is committed; the generated .cpp is a build artifact. Ignore
|
|
22
|
+
# that but keep our hand-written C++ kernel under src/cpp/.
|
|
23
|
+
*.cpp
|
|
24
|
+
!src/cpp/*.cpp
|
|
25
|
+
cython_debug/
|
|
26
|
+
|
|
27
|
+
# setuptools_scm — auto-generated at build time from git tags; never check in.
|
|
28
|
+
src/colstore/_version.py
|
|
29
|
+
|
|
30
|
+
# Installer / publisher
|
|
31
|
+
pip-log.txt
|
|
32
|
+
pip-delete-this-directory.txt
|
|
33
|
+
.pypirc
|
|
34
|
+
|
|
35
|
+
# Tests & coverage
|
|
36
|
+
.pytest_cache/
|
|
37
|
+
.tox/
|
|
38
|
+
.nox/
|
|
39
|
+
.coverage
|
|
40
|
+
.coverage.*
|
|
41
|
+
.cache
|
|
42
|
+
coverage.xml
|
|
43
|
+
*.cover
|
|
44
|
+
cover/
|
|
45
|
+
.hypothesis/
|
|
46
|
+
htmlcov/
|
|
47
|
+
|
|
48
|
+
# Type checking
|
|
49
|
+
.mypy_cache/
|
|
50
|
+
.dmypy.json
|
|
51
|
+
dmypy.json
|
|
52
|
+
|
|
53
|
+
# Linting
|
|
54
|
+
.ruff_cache/
|
|
55
|
+
|
|
56
|
+
# Jupyter
|
|
57
|
+
.ipynb_checkpoints
|
|
58
|
+
profile_default/
|
|
59
|
+
|
|
60
|
+
# Virtual environments
|
|
61
|
+
.env
|
|
62
|
+
.envrc
|
|
63
|
+
.venv
|
|
64
|
+
env/
|
|
65
|
+
venv/
|
|
66
|
+
ENV/
|
|
67
|
+
|
|
68
|
+
# Editor
|
|
69
|
+
.vscode/
|
|
70
|
+
.idea/
|
|
71
|
+
*.swp
|
|
72
|
+
|
|
73
|
+
# OS
|
|
74
|
+
.DS_Store
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# CMake build for the colstore native extension.
|
|
2
|
+
#
|
|
3
|
+
# Builds a single Python extension module (``colstore._gather``) from a
|
|
4
|
+
# Cython binding plus a C++ implementation. OpenMP is linked in when the
|
|
5
|
+
# toolchain supports it (the kernel still works without OpenMP, just
|
|
6
|
+
# single-threaded inside the inner loop).
|
|
7
|
+
#
|
|
8
|
+
# This file is driven by scikit-build-core via pyproject.toml. To build
|
|
9
|
+
# manually without the Python packaging:
|
|
10
|
+
# cmake -S . -B build -G Ninja -DPython_EXECUTABLE=$(which python)
|
|
11
|
+
# cmake --build build
|
|
12
|
+
|
|
13
|
+
cmake_minimum_required(VERSION 3.18)
|
|
14
|
+
project(colstore LANGUAGES CXX)
|
|
15
|
+
|
|
16
|
+
set(CMAKE_CXX_STANDARD 17)
|
|
17
|
+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
|
18
|
+
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
|
|
19
|
+
|
|
20
|
+
if(NOT CMAKE_BUILD_TYPE)
|
|
21
|
+
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
|
|
22
|
+
endif()
|
|
23
|
+
|
|
24
|
+
# --- Dependencies --------------------------------------------------------
|
|
25
|
+
find_package(Python COMPONENTS Interpreter Development.Module NumPy REQUIRED)
|
|
26
|
+
find_package(OpenMP)
|
|
27
|
+
|
|
28
|
+
# --- Cython preprocess ---------------------------------------------------
|
|
29
|
+
# Invoke Cython at build time to translate _gather.pyx into _gather.cpp.
|
|
30
|
+
# Done with a custom command rather than a CMake module so the user only
|
|
31
|
+
# needs Cython installed via pip, not a CMake find module.
|
|
32
|
+
set(CYTHON_PYX "${CMAKE_CURRENT_SOURCE_DIR}/src/cython/_gather.pyx")
|
|
33
|
+
set(CYTHON_CPP "${CMAKE_CURRENT_BINARY_DIR}/_gather.cpp")
|
|
34
|
+
|
|
35
|
+
add_custom_command(
|
|
36
|
+
OUTPUT ${CYTHON_CPP}
|
|
37
|
+
COMMAND ${Python_EXECUTABLE} -m cython
|
|
38
|
+
--cplus -3
|
|
39
|
+
-I "${CMAKE_CURRENT_SOURCE_DIR}/include"
|
|
40
|
+
-o "${CYTHON_CPP}"
|
|
41
|
+
"${CYTHON_PYX}"
|
|
42
|
+
DEPENDS ${CYTHON_PYX}
|
|
43
|
+
"${CMAKE_CURRENT_SOURCE_DIR}/include/colstore/gather.hpp"
|
|
44
|
+
COMMENT "Cythonizing _gather.pyx -> _gather.cpp"
|
|
45
|
+
VERBATIM)
|
|
46
|
+
|
|
47
|
+
# --- Extension module ----------------------------------------------------
|
|
48
|
+
Python_add_library(_gather MODULE WITH_SOABI
|
|
49
|
+
${CYTHON_CPP}
|
|
50
|
+
src/cpp/gather.cpp
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
target_include_directories(_gather PRIVATE
|
|
54
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include
|
|
55
|
+
${Python_NumPy_INCLUDE_DIRS}
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
# Portable optimization flags. We deliberately do not pass -march=native so
|
|
59
|
+
# wheels stay portable across CPUs; users who build from source for a
|
|
60
|
+
# specific machine can override CMAKE_CXX_FLAGS_RELEASE.
|
|
61
|
+
target_compile_options(_gather PRIVATE
|
|
62
|
+
$<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:-O3 -ffast-math -funroll-loops>
|
|
63
|
+
$<$<CXX_COMPILER_ID:MSVC>:/O2 /fp:fast>
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
if(OpenMP_CXX_FOUND)
|
|
67
|
+
target_link_libraries(_gather PRIVATE OpenMP::OpenMP_CXX)
|
|
68
|
+
endif()
|
|
69
|
+
|
|
70
|
+
# Silence NumPy's deprecated-C-API warning, which fires harmlessly when the
|
|
71
|
+
# Cython-generated code is compiled without opting into the newer API.
|
|
72
|
+
target_compile_definitions(_gather PRIVATE
|
|
73
|
+
NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
# --- Install -------------------------------------------------------------
|
|
77
|
+
# scikit-build-core copies this into the wheel under wheel.packages.
|
|
78
|
+
install(TARGETS _gather DESTINATION colstore)
|
colstore-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: colstore
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Memory-mapped columnar binary format for fast random-access I/O on structured arrays.
|
|
5
|
+
Author-Email: Alkaid Cheng <alkaid.ccheng@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Development Status :: 4 - Beta
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Programming Language :: C++
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
20
|
+
Classifier: Operating System :: MacOS
|
|
21
|
+
Project-URL: Homepage, https://github.com/AlkaidCheng/colstore
|
|
22
|
+
Project-URL: Issues, https://github.com/AlkaidCheng/colstore/issues
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Requires-Dist: numpy>=1.25
|
|
25
|
+
Requires-Dist: psutil>=5.9
|
|
26
|
+
Provides-Extra: pandas
|
|
27
|
+
Requires-Dist: pandas>=1.5; extra == "pandas"
|
|
28
|
+
Provides-Extra: progress
|
|
29
|
+
Requires-Dist: tqdm>=4.60; extra == "progress"
|
|
30
|
+
Provides-Extra: numba
|
|
31
|
+
Requires-Dist: numba>=0.59; extra == "numba"
|
|
32
|
+
Provides-Extra: all
|
|
33
|
+
Requires-Dist: pandas>=1.5; extra == "all"
|
|
34
|
+
Requires-Dist: tqdm>=4.60; extra == "all"
|
|
35
|
+
Requires-Dist: numba>=0.59; extra == "all"
|
|
36
|
+
Provides-Extra: dev
|
|
37
|
+
Requires-Dist: pandas>=1.5; extra == "dev"
|
|
38
|
+
Requires-Dist: tqdm>=4.60; extra == "dev"
|
|
39
|
+
Requires-Dist: numba>=0.59; extra == "dev"
|
|
40
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
41
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
42
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
43
|
+
Requires-Dist: black>=24.0; extra == "dev"
|
|
44
|
+
Requires-Dist: mypy>=1.8; extra == "dev"
|
|
45
|
+
Description-Content-Type: text/markdown
|
|
46
|
+
|
|
47
|
+
# ColStore
|
|
48
|
+
|
|
49
|
+
A memory-mapped columnar binary format for fast, memory-efficient I/O on
|
|
50
|
+
structured arrays. `colstore` lets you write a tabular dataset to a single
|
|
51
|
+
`.cstore` file once and then load arbitrary row/column subsets without
|
|
52
|
+
materializing the rest. Internally, columns are stored back-to-back as raw
|
|
53
|
+
NumPy bytes, reads use `np.memmap`, and fancy-index gathers run through a
|
|
54
|
+
parallel C++ kernel (OpenMP + software prefetching) bound via Cython. Process
|
|
55
|
+
memory stays bounded by the size of the output you ask for; the source file
|
|
56
|
+
is never fully read into RAM.
|
|
57
|
+
|
|
58
|
+
## Install
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pip install colstore
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Building from source needs a C++17 compiler and CMake ≥ 3.18. On macOS install
|
|
65
|
+
`libomp` (`brew install libomp`) to get the parallel kernel; without it the
|
|
66
|
+
build still succeeds but the kernel runs single-threaded.
|
|
67
|
+
|
|
68
|
+
## Quick start
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from colstore import ColStore
|
|
72
|
+
|
|
73
|
+
# Write and open in one call. `.cstore` is the canonical extension.
|
|
74
|
+
ds = ColStore.from_dataframe(df, "data.cstore")
|
|
75
|
+
|
|
76
|
+
# Indexing returns lazy views; no data is read yet.
|
|
77
|
+
ds['price'] # ColumnView
|
|
78
|
+
ds[100:200] # TableView
|
|
79
|
+
ds[100:200, 'price'] # ColumnView
|
|
80
|
+
ds[100:200, ['price', 'qty']] # TableView
|
|
81
|
+
ds[[1, 5, 9], ['price', 'qty']] # TableView (fancy rows + cols)
|
|
82
|
+
|
|
83
|
+
# Materialize through one of the to_* methods.
|
|
84
|
+
ds['price'].to_array() # 1D ndarray
|
|
85
|
+
ds[indices, ['price', 'qty']].to_dict() # dict of 1D arrays
|
|
86
|
+
ds[indices, ['price', 'qty']].to_record() # structured ndarray
|
|
87
|
+
ds[indices, ['price', 'qty']].to_dataframe() # pandas DataFrame
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Writing from other sources
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from colstore import ColStore
|
|
94
|
+
import numpy as np
|
|
95
|
+
|
|
96
|
+
# From a dict of 1D arrays.
|
|
97
|
+
ColStore.from_dict(
|
|
98
|
+
{"x": np.arange(100, dtype=np.float32), "y": np.arange(100, dtype=np.int64)},
|
|
99
|
+
"data.cstore",
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
# From a structured (record) array.
|
|
103
|
+
records = np.empty(100, dtype=[("price", np.float32), ("qty", np.int32)])
|
|
104
|
+
ColStore.from_records(records, "data.cstore")
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Each factory returns an opened `ColStore` ready to read from.
|
|
108
|
+
|
|
109
|
+
## Configuration
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
from colstore import set_max_workers, set_default_madvise, set_default_backend
|
|
113
|
+
|
|
114
|
+
set_max_workers(8) # parallel gathers across columns
|
|
115
|
+
set_default_madvise("sequential") # OS read-ahead hint for sorted-index reads
|
|
116
|
+
set_default_backend("cpp") # gather kernel: cpp | numpy | numba
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## On-disk format
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
[magic 8B = b"CSTORE\x00\x01"]
|
|
123
|
+
[manifest_len 8B (u64 little-endian)]
|
|
124
|
+
[manifest_json]
|
|
125
|
+
[zero-padding to 64-byte alignment]
|
|
126
|
+
[column_0 raw bytes][column_1 raw bytes]...[column_n raw bytes]
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
The manifest is a small JSON object recording `format_version`, `n_rows`,
|
|
130
|
+
and per-column `{name, dtype}`. Column dtypes are preserved byte-for-byte;
|
|
131
|
+
columns are stored back-to-back with no per-row overhead.
|
|
132
|
+
|
|
133
|
+
## Supported dtypes
|
|
134
|
+
|
|
135
|
+
Fixed-size only: `float32`, `float64`, `int8/16/32/64`, `uint8/16/32/64`,
|
|
136
|
+
`bool`. Object dtype (strings, Python objects) is rejected at write time —
|
|
137
|
+
the design point is zero-copy random access, which requires a fixed stride.
|
|
138
|
+
|
|
139
|
+
## Layout
|
|
140
|
+
|
|
141
|
+
```
|
|
142
|
+
colstore/
|
|
143
|
+
├── pyproject.toml # scikit-build-core build
|
|
144
|
+
├── CMakeLists.txt # Cython + C++ build
|
|
145
|
+
├── include/colstore/
|
|
146
|
+
│ └── gather.hpp # public C++ header
|
|
147
|
+
├── src/
|
|
148
|
+
│ ├── cpp/gather.cpp # OpenMP + prefetch kernel
|
|
149
|
+
│ ├── cython/_gather.pyx # dtype-dispatched binding
|
|
150
|
+
│ └── colstore/ # Python package
|
|
151
|
+
│ ├── __init__.py
|
|
152
|
+
│ ├── config.py
|
|
153
|
+
│ ├── format.py
|
|
154
|
+
│ ├── kernels.py
|
|
155
|
+
│ ├── view.py # ColumnView + TableView
|
|
156
|
+
│ └── store.py
|
|
157
|
+
└── tests/ # pytest suite
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## License
|
|
161
|
+
|
|
162
|
+
MIT.
|
colstore-0.1.0/README.md
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# ColStore
|
|
2
|
+
|
|
3
|
+
A memory-mapped columnar binary format for fast, memory-efficient I/O on
|
|
4
|
+
structured arrays. `colstore` lets you write a tabular dataset to a single
|
|
5
|
+
`.cstore` file once and then load arbitrary row/column subsets without
|
|
6
|
+
materializing the rest. Internally, columns are stored back-to-back as raw
|
|
7
|
+
NumPy bytes, reads use `np.memmap`, and fancy-index gathers run through a
|
|
8
|
+
parallel C++ kernel (OpenMP + software prefetching) bound via Cython. Process
|
|
9
|
+
memory stays bounded by the size of the output you ask for; the source file
|
|
10
|
+
is never fully read into RAM.
|
|
11
|
+
|
|
12
|
+
## Install
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install colstore
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
Building from source needs a C++17 compiler and CMake ≥ 3.18. On macOS install
|
|
19
|
+
`libomp` (`brew install libomp`) to get the parallel kernel; without it the
|
|
20
|
+
build still succeeds but the kernel runs single-threaded.
|
|
21
|
+
|
|
22
|
+
## Quick start
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
from colstore import ColStore
|
|
26
|
+
|
|
27
|
+
# Write and open in one call. `.cstore` is the canonical extension.
|
|
28
|
+
ds = ColStore.from_dataframe(df, "data.cstore")
|
|
29
|
+
|
|
30
|
+
# Indexing returns lazy views; no data is read yet.
|
|
31
|
+
ds['price'] # ColumnView
|
|
32
|
+
ds[100:200] # TableView
|
|
33
|
+
ds[100:200, 'price'] # ColumnView
|
|
34
|
+
ds[100:200, ['price', 'qty']] # TableView
|
|
35
|
+
ds[[1, 5, 9], ['price', 'qty']] # TableView (fancy rows + cols)
|
|
36
|
+
|
|
37
|
+
# Materialize through one of the to_* methods.
|
|
38
|
+
ds['price'].to_array() # 1D ndarray
|
|
39
|
+
ds[indices, ['price', 'qty']].to_dict() # dict of 1D arrays
|
|
40
|
+
ds[indices, ['price', 'qty']].to_record() # structured ndarray
|
|
41
|
+
ds[indices, ['price', 'qty']].to_dataframe() # pandas DataFrame
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Writing from other sources
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from colstore import ColStore
|
|
48
|
+
import numpy as np
|
|
49
|
+
|
|
50
|
+
# From a dict of 1D arrays.
|
|
51
|
+
ColStore.from_dict(
|
|
52
|
+
{"x": np.arange(100, dtype=np.float32), "y": np.arange(100, dtype=np.int64)},
|
|
53
|
+
"data.cstore",
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
# From a structured (record) array.
|
|
57
|
+
records = np.empty(100, dtype=[("price", np.float32), ("qty", np.int32)])
|
|
58
|
+
ColStore.from_records(records, "data.cstore")
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Each factory returns an opened `ColStore` ready to read from.
|
|
62
|
+
|
|
63
|
+
## Configuration
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from colstore import set_max_workers, set_default_madvise, set_default_backend
|
|
67
|
+
|
|
68
|
+
set_max_workers(8) # parallel gathers across columns
|
|
69
|
+
set_default_madvise("sequential") # OS read-ahead hint for sorted-index reads
|
|
70
|
+
set_default_backend("cpp") # gather kernel: cpp | numpy | numba
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## On-disk format
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
[magic 8B = b"CSTORE\x00\x01"]
|
|
77
|
+
[manifest_len 8B (u64 little-endian)]
|
|
78
|
+
[manifest_json]
|
|
79
|
+
[zero-padding to 64-byte alignment]
|
|
80
|
+
[column_0 raw bytes][column_1 raw bytes]...[column_n raw bytes]
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
The manifest is a small JSON object recording `format_version`, `n_rows`,
|
|
84
|
+
and per-column `{name, dtype}`. Column dtypes are preserved byte-for-byte;
|
|
85
|
+
columns are stored back-to-back with no per-row overhead.
|
|
86
|
+
|
|
87
|
+
## Supported dtypes
|
|
88
|
+
|
|
89
|
+
Fixed-size only: `float32`, `float64`, `int8/16/32/64`, `uint8/16/32/64`,
|
|
90
|
+
`bool`. Object dtype (strings, Python objects) is rejected at write time —
|
|
91
|
+
the design point is zero-copy random access, which requires a fixed stride.
|
|
92
|
+
|
|
93
|
+
## Layout
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
colstore/
|
|
97
|
+
├── pyproject.toml # scikit-build-core build
|
|
98
|
+
├── CMakeLists.txt # Cython + C++ build
|
|
99
|
+
├── include/colstore/
|
|
100
|
+
│ └── gather.hpp # public C++ header
|
|
101
|
+
├── src/
|
|
102
|
+
│ ├── cpp/gather.cpp # OpenMP + prefetch kernel
|
|
103
|
+
│ ├── cython/_gather.pyx # dtype-dispatched binding
|
|
104
|
+
│ └── colstore/ # Python package
|
|
105
|
+
│ ├── __init__.py
|
|
106
|
+
│ ├── config.py
|
|
107
|
+
│ ├── format.py
|
|
108
|
+
│ ├── kernels.py
|
|
109
|
+
│ ├── view.py # ColumnView + TableView
|
|
110
|
+
│ └── store.py
|
|
111
|
+
└── tests/ # pytest suite
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## License
|
|
115
|
+
|
|
116
|
+
MIT.
|