colstore 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,93 @@
1
+ name: CI
2
+
3
+ # Run on every push to main and on every PR targeting main. Other
4
+ # branches don't trigger CI by default — open a PR to get feedback.
5
+ on:
6
+ push:
7
+ branches: [main]
8
+ pull_request:
9
+ branches: [main]
10
+
11
+ # Cancel in-progress runs for the same ref when a new push arrives.
12
+ # Saves CI minutes and gets you the "latest commit only" result.
13
+ concurrency:
14
+ group: ci-${{ github.ref }}
15
+ cancel-in-progress: true
16
+
17
+ # Default to read-only permissions; release.yml elevates as needed.
18
+ permissions:
19
+ contents: read
20
+
21
+ jobs:
22
+ lint:
23
+ name: Lint and type-check
24
+ runs-on: ubuntu-latest
25
+ steps:
26
+ - uses: actions/checkout@v4
27
+
28
+ - name: Set up Python 3.12
29
+ uses: actions/setup-python@v5
30
+ with:
31
+ python-version: "3.12"
32
+ cache: pip
33
+ cache-dependency-path: pyproject.toml
34
+
35
+ - name: Install dev dependencies
36
+ # Editable install also compiles the C++ extension via
37
+ # scikit-build-core; mypy needs the module importable to typecheck
38
+ # against it.
39
+ run: |
40
+ python -m pip install --upgrade pip
41
+ pip install -e ".[dev]"
42
+
43
+ - name: ruff (lint)
44
+ run: ruff check src tests
45
+
46
+ - name: black (format check)
47
+ run: black --check src tests
48
+
49
+ - name: mypy (strict type check)
50
+ run: mypy src
51
+
52
+ test:
53
+ name: Test (Python ${{ matrix.python-version }} on ${{ matrix.os }})
54
+ runs-on: ${{ matrix.os }}
55
+ strategy:
56
+ # Don't bail out on the first failing matrix cell — we want to
57
+ # see whether a failure is platform-specific or version-specific.
58
+ fail-fast: false
59
+ matrix:
60
+ # POSIX-only: the C++ kernel uses OpenMP via the system compiler,
61
+ # which is messy on Windows. Windows support is intentionally out
62
+ # of scope.
63
+ os: [ubuntu-latest, macos-latest]
64
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
65
+
66
+ steps:
67
+ - uses: actions/checkout@v4
68
+ with:
69
+ # setuptools_scm reads tag history to derive the version;
70
+ # shallow clones come back as 0.0.0.
71
+ fetch-depth: 0
72
+
73
+ - name: Set up Python ${{ matrix.python-version }}
74
+ uses: actions/setup-python@v5
75
+ with:
76
+ python-version: ${{ matrix.python-version }}
77
+ cache: pip
78
+ cache-dependency-path: pyproject.toml
79
+
80
+ - name: Install OpenMP runtime (macOS)
81
+ # AppleClang doesn't ship libomp, and Homebrew installs it keg-only
82
+ # (outside the default search paths), so export OpenMP_ROOT for CMake's
83
+ # find_package(OpenMP); otherwise the kernel builds single-threaded.
84
+ if: runner.os == 'macOS'
85
+ run: brew install libomp && echo "OpenMP_ROOT=$(brew --prefix libomp)" >> "$GITHUB_ENV"
86
+
87
+ - name: Install package and test deps
88
+ run: |
89
+ python -m pip install --upgrade pip
90
+ pip install -e ".[dev]"
91
+
92
+ - name: Run pytest
93
+ run: pytest -v --cov=colstore --cov-report=term
@@ -0,0 +1,171 @@
1
+ name: Release
2
+
3
+ # Triggered when a GitHub Release is published. We use `published`
4
+ # rather than `created` so drafting a release in the GitHub UI doesn't
5
+ # fire the publish step prematurely — the release has to actually go
6
+ # live.
7
+ #
8
+ # Also exposes a manual `workflow_dispatch` button so maintainers can
9
+ # build (but not publish) on demand for verification.
10
+ on:
11
+ release:
12
+ types: [published]
13
+ workflow_dispatch:
14
+
15
+ concurrency:
16
+ group: release-${{ github.ref }}
17
+ cancel-in-progress: false # never cancel an in-progress publish
18
+
19
+ permissions:
20
+ contents: read
21
+
22
+ jobs:
23
+ # ---------------------------------------------------------------------
24
+ # 1. Sanity-gate: re-run the test suite on the release commit.
25
+ #
26
+ # CI on `main` already covers this for the normal flow, but a release
27
+ # can technically be tagged on any commit, so we re-verify before
28
+ # touching PyPI. We only run a single Python version here (3.12)
28
+ # — the full matrix lives in ci.yml.
30
+ # ---------------------------------------------------------------------
31
+ test:
32
+ name: Sanity-test the release commit
33
+ runs-on: ubuntu-latest
34
+ steps:
35
+ - uses: actions/checkout@v4
36
+ with:
37
+ fetch-depth: 0 # tags needed for setuptools_scm
38
+
39
+ - name: Set up Python 3.12
40
+ uses: actions/setup-python@v5
41
+ with:
42
+ python-version: "3.12"
43
+ cache: pip
44
+ cache-dependency-path: pyproject.toml
45
+
46
+ - name: Install package and test deps
47
+ run: |
48
+ python -m pip install --upgrade pip
49
+ pip install -e ".[dev]"
50
+
51
+ - name: Run pytest
52
+ run: pytest -v
53
+
54
+ # ---------------------------------------------------------------------
55
+ # 2a. Build the sdist on Linux.
56
+ #
57
+ # The sdist is platform-independent; one job is enough. It contains
58
+ # the Cython .pyx, C++ sources, and the CMake build description, so
59
+ # users on unsupported platforms can still build from source.
60
+ # ---------------------------------------------------------------------
61
+ build_sdist:
62
+ name: Build sdist
63
+ runs-on: ubuntu-latest
64
+ needs: test
65
+ steps:
66
+ - uses: actions/checkout@v4
67
+ with:
68
+ fetch-depth: 0 # setuptools_scm
69
+
70
+ - name: Set up Python 3.12
71
+ uses: actions/setup-python@v5
72
+ with:
73
+ python-version: "3.12"
74
+
75
+ - name: Install build tooling
76
+ run: |
77
+ python -m pip install --upgrade pip
78
+ pip install build
79
+
80
+ - name: Build sdist
81
+ run: python -m build --sdist
82
+
83
+ - name: List built artifacts
84
+ run: ls -l dist/
85
+
86
+ - name: Upload sdist artifact
87
+ uses: actions/upload-artifact@v4
88
+ with:
89
+ name: dist-sdist
90
+ path: dist/*.tar.gz
91
+
92
+ # ---------------------------------------------------------------------
93
+ # 2b. Build per-platform wheels with cibuildwheel.
94
+ #
95
+ # colstore has a C++/Cython extension, so we can't ship a single
96
+ # py3-none-any wheel like a pure-Python package would. cibuildwheel
97
+ # runs the build inside manylinux containers for Linux and natively
98
+ # on macOS, producing wheels that pip can install on the same
99
+ # ABI/arch without a compiler on the user's machine.
100
+ # ---------------------------------------------------------------------
101
+ build_wheels:
102
+ name: Build wheels (${{ matrix.os }})
103
+ runs-on: ${{ matrix.os }}
104
+ needs: test
105
+ strategy:
106
+ fail-fast: false
107
+ matrix:
108
+ os: [ubuntu-latest, macos-latest]
109
+
110
+ steps:
111
+ - uses: actions/checkout@v4
112
+ with:
113
+ fetch-depth: 0 # setuptools_scm
114
+
115
+ - name: Build wheels
116
+ uses: pypa/cibuildwheel@v2.21
117
+ env:
118
+ # We support 3.10+; skip older interpreters, PyPy, 32-bit
119
+ # Linux, and musllinux (we don't test it).
120
+ CIBW_SKIP: "cp36-* cp37-* cp38-* cp39-* pp* *-manylinux_i686 *-musllinux*"
121
+ # The kernel's OpenMP path requires libomp on macOS;
122
+ # manylinux images ship GCC with its own OpenMP runtime (libgomp).
123
+ CIBW_BEFORE_ALL_MACOS: brew install libomp
124
+ # Quick smoke test on the built wheel before declaring it good.
125
+ CIBW_TEST_REQUIRES: pytest pandas
126
+ CIBW_TEST_COMMAND: pytest -q {project}/tests/test_format.py {project}/tests/test_factory.py
127
+
128
+ - name: List built artifacts
129
+ run: ls -l wheelhouse/
130
+
131
+ - name: Upload wheels artifact
132
+ uses: actions/upload-artifact@v4
133
+ with:
134
+ name: dist-wheels-${{ matrix.os }}
135
+ path: ./wheelhouse/*.whl
136
+
137
+ # ---------------------------------------------------------------------
138
+ # 3. Publish to PyPI via Trusted Publishing (OIDC).
139
+ #
140
+ # This job ONLY runs on a real `release: published` event — the
141
+ # manual workflow_dispatch path stops after the build jobs so maintainers can
142
+ # download and inspect the artifacts without touching PyPI.
143
+ # ---------------------------------------------------------------------
144
+ publish:
145
+ name: Publish to PyPI
146
+ runs-on: ubuntu-latest
147
+ needs: [build_sdist, build_wheels]
148
+ if: github.event_name == 'release'
149
+
150
+ environment:
151
+ name: pypi
152
+ url: https://pypi.org/p/colstore
153
+
154
+ permissions:
155
+ id-token: write # required for OIDC token used by Trusted Publishing
156
+
157
+ steps:
158
+ - name: Download built distributions
159
+ uses: actions/download-artifact@v4
160
+ with:
161
+ path: dist/
162
+ # All three uploaded artifacts (sdist + 2 wheel sets) get
163
+ # flattened into dist/ for the upload step.
164
+ merge-multiple: true
165
+
166
+ - name: List artifacts before upload
167
+ run: ls -l dist/
168
+
169
+ - name: Publish to PyPI
170
+ uses: pypa/gh-action-pypi-publish@release/v1
171
+ # No user/password — Trusted Publishing handles auth via OIDC.
@@ -0,0 +1,74 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ dist/
13
+ wheelhouse/
14
+ _skbuild/
15
+ .eggs/
16
+ *.egg-info/
17
+ *.egg
18
+ MANIFEST
19
+
20
+ # Cython
21
+ # The .pyx is committed; the generated .cpp is a build artifact. Ignore
22
+ # that but keep our hand-written C++ kernel under src/cpp/.
23
+ *.cpp
24
+ !src/cpp/*.cpp
25
+ cython_debug/
26
+
27
+ # setuptools_scm — auto-generated at build time from git tags; never check in.
28
+ src/colstore/_version.py
29
+
30
+ # Installer / publisher
31
+ pip-log.txt
32
+ pip-delete-this-directory.txt
33
+ .pypirc
34
+
35
+ # Tests & coverage
36
+ .pytest_cache/
37
+ .tox/
38
+ .nox/
39
+ .coverage
40
+ .coverage.*
41
+ .cache
42
+ coverage.xml
43
+ *.cover
44
+ cover/
45
+ .hypothesis/
46
+ htmlcov/
47
+
48
+ # Type checking
49
+ .mypy_cache/
50
+ .dmypy.json
51
+ dmypy.json
52
+
53
+ # Linting
54
+ .ruff_cache/
55
+
56
+ # Jupyter
57
+ .ipynb_checkpoints
58
+ profile_default/
59
+
60
+ # Virtual environments
61
+ .env
62
+ .envrc
63
+ .venv
64
+ env/
65
+ venv/
66
+ ENV/
67
+
68
+ # Editor
69
+ .vscode/
70
+ .idea/
71
+ *.swp
72
+
73
+ # OS
74
+ .DS_Store
@@ -0,0 +1,78 @@
1
+ # CMake build for the colstore native extension.
2
+ #
3
+ # Builds a single Python extension module (``colstore._gather``) from a
4
+ # Cython binding plus a C++ implementation. OpenMP is linked in when the
5
+ # toolchain supports it (the kernel still works without OpenMP, just
6
+ # single-threaded inside the inner loop).
7
+ #
8
+ # This file is driven by scikit-build-core via pyproject.toml. To build
9
+ # manually without the Python packaging:
10
+ # cmake -S . -B build -G Ninja -DPython_EXECUTABLE=$(which python)
11
+ # cmake --build build
12
+
13
+ cmake_minimum_required(VERSION 3.18)
14
+ project(colstore LANGUAGES CXX)
15
+
16
+ set(CMAKE_CXX_STANDARD 17)
17
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
18
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
19
+
20
+ if(NOT CMAKE_BUILD_TYPE)
21
+ set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
22
+ endif()
23
+
24
+ # --- Dependencies --------------------------------------------------------
25
+ find_package(Python COMPONENTS Interpreter Development.Module NumPy REQUIRED)
26
+ find_package(OpenMP)
27
+
28
+ # --- Cython preprocess ---------------------------------------------------
29
+ # Invoke Cython at build time to translate _gather.pyx into _gather.cpp.
30
+ # Done with a custom command rather than a CMake module so the user only
31
+ # needs Cython installed via pip, not a CMake find module.
32
+ set(CYTHON_PYX "${CMAKE_CURRENT_SOURCE_DIR}/src/cython/_gather.pyx")
33
+ set(CYTHON_CPP "${CMAKE_CURRENT_BINARY_DIR}/_gather.cpp")
34
+
35
+ add_custom_command(
36
+ OUTPUT ${CYTHON_CPP}
37
+ COMMAND ${Python_EXECUTABLE} -m cython
38
+ --cplus -3
39
+ -I "${CMAKE_CURRENT_SOURCE_DIR}/include"
40
+ -o "${CYTHON_CPP}"
41
+ "${CYTHON_PYX}"
42
+ DEPENDS ${CYTHON_PYX}
43
+ "${CMAKE_CURRENT_SOURCE_DIR}/include/colstore/gather.hpp"
44
+ COMMENT "Cythonizing _gather.pyx -> _gather.cpp"
45
+ VERBATIM)
46
+
47
+ # --- Extension module ----------------------------------------------------
48
+ Python_add_library(_gather MODULE WITH_SOABI
49
+ ${CYTHON_CPP}
50
+ src/cpp/gather.cpp
51
+ )
52
+
53
+ target_include_directories(_gather PRIVATE
54
+ ${CMAKE_CURRENT_SOURCE_DIR}/include
55
+ ${Python_NumPy_INCLUDE_DIRS}
56
+ )
57
+
58
+ # Portable optimization flags (no -march=native, so wheels stay portable;
59
+ # source builders can override CMAKE_CXX_FLAGS_RELEASE). Each generator
60
+ # expression is quoted and ';'-separated so it is not split on whitespace.
61
+ target_compile_options(_gather PRIVATE
62
+ "$<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:-O3;-ffast-math;-funroll-loops>"
63
+ "$<$<CXX_COMPILER_ID:MSVC>:/O2;/fp:fast>"
64
+ )
65
+
66
+ if(OpenMP_CXX_FOUND)
67
+ target_link_libraries(_gather PRIVATE OpenMP::OpenMP_CXX)
68
+ endif()
69
+
70
+ # Silence NumPy's "Using deprecated NumPy API" compile warning, which the
71
+ # Cython-generated code triggers unless an API version is pinned.
72
+ target_compile_definitions(_gather PRIVATE
73
+ NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION
74
+ )
75
+
76
+ # --- Install -------------------------------------------------------------
77
+ # scikit-build-core copies this into the wheel under wheel.packages.
78
+ install(TARGETS _gather DESTINATION colstore)
@@ -0,0 +1,162 @@
1
+ Metadata-Version: 2.1
2
+ Name: colstore
3
+ Version: 0.1.0
4
+ Summary: Memory-mapped columnar binary format for fast random-access I/O on structured arrays.
5
+ Author-Email: Alkaid Cheng <alkaid.ccheng@gmail.com>
6
+ License: MIT
7
+ Classifier: Development Status :: 4 - Beta
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Programming Language :: C++
17
+ Classifier: Topic :: Scientific/Engineering
18
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
+ Classifier: Operating System :: POSIX :: Linux
20
+ Classifier: Operating System :: MacOS
21
+ Project-URL: Homepage, https://github.com/AlkaidCheng/colstore
22
+ Project-URL: Issues, https://github.com/AlkaidCheng/colstore/issues
23
+ Requires-Python: >=3.10
24
+ Requires-Dist: numpy>=1.25
25
+ Requires-Dist: psutil>=5.9
26
+ Provides-Extra: pandas
27
+ Requires-Dist: pandas>=1.5; extra == "pandas"
28
+ Provides-Extra: progress
29
+ Requires-Dist: tqdm>=4.60; extra == "progress"
30
+ Provides-Extra: numba
31
+ Requires-Dist: numba>=0.59; extra == "numba"
32
+ Provides-Extra: all
33
+ Requires-Dist: pandas>=1.5; extra == "all"
34
+ Requires-Dist: tqdm>=4.60; extra == "all"
35
+ Requires-Dist: numba>=0.59; extra == "all"
36
+ Provides-Extra: dev
37
+ Requires-Dist: pandas>=1.5; extra == "dev"
38
+ Requires-Dist: tqdm>=4.60; extra == "dev"
39
+ Requires-Dist: numba>=0.59; extra == "dev"
40
+ Requires-Dist: pytest>=7.0; extra == "dev"
41
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
42
+ Requires-Dist: ruff>=0.4; extra == "dev"
43
+ Requires-Dist: black>=24.0; extra == "dev"
44
+ Requires-Dist: mypy>=1.8; extra == "dev"
45
+ Description-Content-Type: text/markdown
46
+
47
+ # ColStore
48
+
49
+ A memory-mapped columnar binary format for fast, memory-efficient I/O on
50
+ structured arrays. `colstore` lets you write a tabular dataset to a single
51
+ `.cstore` file once and then load arbitrary row/column subsets without
52
+ materializing the rest. Internally, columns are stored back-to-back as raw
53
+ NumPy bytes, reads use `np.memmap`, and fancy-index gathers run through a
54
+ parallel C++ kernel (OpenMP + software prefetching) bound via Cython. Process
55
+ memory stays bounded by the size of the output you ask for; the source file
56
+ is never fully read into RAM.
57
+
58
+ ## Install
59
+
60
+ ```bash
61
+ pip install colstore
62
+ ```
63
+
64
+ Building from source needs a C++17 compiler and CMake ≥ 3.18. On macOS install
65
+ `libomp` (`brew install libomp`) to get the parallel kernel; without it the
66
+ build still succeeds but the kernel runs single-threaded.
67
+
68
+ ## Quick start
69
+
70
+ ```python
71
+ from colstore import ColStore
72
+
73
+ # Write and open in one call. `.cstore` is the canonical extension.
74
+ ds = ColStore.from_dataframe(df, "data.cstore")
75
+
76
+ # Indexing returns lazy views; no data is read yet.
77
+ ds['price'] # ColumnView
78
+ ds[100:200] # TableView
79
+ ds[100:200, 'price'] # ColumnView
80
+ ds[100:200, ['price', 'qty']] # TableView
81
+ ds[[1, 5, 9], ['price', 'qty']] # TableView (fancy rows + cols)
82
+
83
+ # Materialize through one of the to_* methods.
84
+ ds['price'].to_array() # 1D ndarray
85
+ ds[indices, ['price', 'qty']].to_dict() # dict of 1D arrays
86
+ ds[indices, ['price', 'qty']].to_record() # structured ndarray
87
+ ds[indices, ['price', 'qty']].to_dataframe() # pandas DataFrame
88
+ ```
89
+
90
+ ## Writing from other sources
91
+
92
+ ```python
93
+ from colstore import ColStore
94
+ import numpy as np
95
+
96
+ # From a dict of 1D arrays.
97
+ ColStore.from_dict(
98
+ {"x": np.arange(100, dtype=np.float32), "y": np.arange(100, dtype=np.int64)},
99
+ "data.cstore",
100
+ )
101
+
102
+ # From a structured (record) array.
103
+ records = np.empty(100, dtype=[("price", np.float32), ("qty", np.int32)])
104
+ ColStore.from_records(records, "data.cstore")
105
+ ```
106
+
107
+ Each factory returns an opened `ColStore` ready to read from.
108
+
109
+ ## Configuration
110
+
111
+ ```python
112
+ from colstore import set_max_workers, set_default_madvise, set_default_backend
113
+
114
+ set_max_workers(8) # parallel gathers across columns
115
+ set_default_madvise("sequential") # OS read-ahead hint for sorted-index reads
116
+ set_default_backend("cpp") # gather kernel: cpp | numpy | numba
117
+ ```
118
+
119
+ ## On-disk format
120
+
121
+ ```
122
+ [magic 8B = b"CSTORE\x00\x01"]
123
+ [manifest_len 8B (u64 little-endian)]
124
+ [manifest_json]
125
+ [zero-padding to 64-byte alignment]
126
+ [column_0 raw bytes][column_1 raw bytes]...[column_n raw bytes]
127
+ ```
128
+
129
+ The manifest is a small JSON object recording `format_version`, `n_rows`,
130
+ and per-column `{name, dtype}`. Column dtypes are preserved byte-for-byte;
131
+ columns are stored back-to-back with no per-row overhead.
132
+
133
+ ## Supported dtypes
134
+
135
+ Fixed-size only: `float32`, `float64`, `int8/16/32/64`, `uint8/16/32/64`,
136
+ `bool`. Object dtype (strings, Python objects) is rejected at write time —
137
+ the design point is zero-copy random access, which requires a fixed stride.
138
+
139
+ ## Layout
140
+
141
+ ```
142
+ colstore/
143
+ ├── pyproject.toml # scikit-build-core build
144
+ ├── CMakeLists.txt # Cython + C++ build
145
+ ├── include/colstore/
146
+ │ └── gather.hpp # public C++ header
147
+ ├── src/
148
+ │ ├── cpp/gather.cpp # OpenMP + prefetch kernel
149
+ │ ├── cython/_gather.pyx # dtype-dispatched binding
150
+ │ └── colstore/ # Python package
151
+ │ ├── __init__.py
152
+ │ ├── config.py
153
+ │ ├── format.py
154
+ │ ├── kernels.py
155
+ │ ├── view.py # ColumnView + TableView
156
+ │ └── store.py
157
+ └── tests/ # pytest suite
158
+ ```
159
+
160
+ ## License
161
+
162
+ MIT.
@@ -0,0 +1,116 @@
1
+ # ColStore
2
+
3
+ A memory-mapped columnar binary format for fast, memory-efficient I/O on
4
+ structured arrays. `colstore` lets you write a tabular dataset to a single
5
+ `.cstore` file once and then load arbitrary row/column subsets without
6
+ materializing the rest. Internally, columns are stored back-to-back as raw
7
+ NumPy bytes, reads use `np.memmap`, and fancy-index gathers run through a
8
+ parallel C++ kernel (OpenMP + software prefetching) bound via Cython. Process
9
+ memory stays bounded by the size of the output you ask for; the source file
10
+ is never fully read into RAM.
11
+
12
+ ## Install
13
+
14
+ ```bash
15
+ pip install colstore
16
+ ```
17
+
18
+ Building from source needs a C++17 compiler and CMake ≥ 3.18. On macOS install
19
+ `libomp` (`brew install libomp`) to get the parallel kernel; without it the
20
+ build still succeeds but the kernel runs single-threaded.
21
+
22
+ ## Quick start
23
+
24
+ ```python
25
+ from colstore import ColStore
26
+
27
+ # Write and open in one call. `.cstore` is the canonical extension.
28
+ ds = ColStore.from_dataframe(df, "data.cstore")
29
+
30
+ # Indexing returns lazy views; no data is read yet.
31
+ ds['price'] # ColumnView
32
+ ds[100:200] # TableView
33
+ ds[100:200, 'price'] # ColumnView
34
+ ds[100:200, ['price', 'qty']] # TableView
35
+ ds[[1, 5, 9], ['price', 'qty']] # TableView (fancy rows + cols)
36
+
37
+ # Materialize through one of the to_* methods.
38
+ ds['price'].to_array() # 1D ndarray
39
+ ds[indices, ['price', 'qty']].to_dict() # dict of 1D arrays
40
+ ds[indices, ['price', 'qty']].to_record() # structured ndarray
41
+ ds[indices, ['price', 'qty']].to_dataframe() # pandas DataFrame
42
+ ```
43
+
44
+ ## Writing from other sources
45
+
46
+ ```python
47
+ from colstore import ColStore
48
+ import numpy as np
49
+
50
+ # From a dict of 1D arrays.
51
+ ColStore.from_dict(
52
+ {"x": np.arange(100, dtype=np.float32), "y": np.arange(100, dtype=np.int64)},
53
+ "data.cstore",
54
+ )
55
+
56
+ # From a structured (record) array.
57
+ records = np.empty(100, dtype=[("price", np.float32), ("qty", np.int32)])
58
+ ColStore.from_records(records, "data.cstore")
59
+ ```
60
+
61
+ Each factory returns an opened `ColStore` ready to read from.
62
+
63
+ ## Configuration
64
+
65
+ ```python
66
+ from colstore import set_max_workers, set_default_madvise, set_default_backend
67
+
68
+ set_max_workers(8) # parallel gathers across columns
69
+ set_default_madvise("sequential") # OS read-ahead hint for sorted-index reads
70
+ set_default_backend("cpp") # gather kernel: cpp | numpy | numba
71
+ ```
72
+
73
+ ## On-disk format
74
+
75
+ ```
76
+ [magic 8B = b"CSTORE\x00\x01"]
77
+ [manifest_len 8B (u64 little-endian)]
78
+ [manifest_json]
79
+ [zero-padding to 64-byte alignment]
80
+ [column_0 raw bytes][column_1 raw bytes]...[column_n raw bytes]
81
+ ```
82
+
83
+ The manifest is a small JSON object recording `format_version`, `n_rows`,
84
+ and per-column `{name, dtype}`. Column dtypes are preserved byte-for-byte;
85
+ columns are stored back-to-back with no per-row overhead.
86
+
87
+ ## Supported dtypes
88
+
89
+ Fixed-size only: `float32`, `float64`, `int8/16/32/64`, `uint8/16/32/64`,
90
+ `bool`. Object dtype (strings, Python objects) is rejected at write time —
91
+ the design point is zero-copy random access, which requires a fixed stride.
92
+
93
+ ## Layout
94
+
95
+ ```
96
+ colstore/
97
+ ├── pyproject.toml # scikit-build-core build
98
+ ├── CMakeLists.txt # Cython + C++ build
99
+ ├── include/colstore/
100
+ │ └── gather.hpp # public C++ header
101
+ ├── src/
102
+ │ ├── cpp/gather.cpp # OpenMP + prefetch kernel
103
+ │ ├── cython/_gather.pyx # dtype-dispatched binding
104
+ │ └── colstore/ # Python package
105
+ │ ├── __init__.py
106
+ │ ├── config.py
107
+ │ ├── format.py
108
+ │ ├── kernels.py
109
+ │ ├── view.py # ColumnView + TableView
110
+ │ └── store.py
111
+ └── tests/ # pytest suite
112
+ ```
113
+
114
+ ## License
115
+
116
+ MIT.