peclet-core 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- peclet_core-0.1.0/.clang-format +25 -0
- peclet_core-0.1.0/.github/workflows/ci.yml +53 -0
- peclet_core-0.1.0/.github/workflows/docs.yml +50 -0
- peclet_core-0.1.0/.github/workflows/quality.yml +40 -0
- peclet_core-0.1.0/.github/workflows/release.yml +41 -0
- peclet_core-0.1.0/.gitignore +14 -0
- peclet_core-0.1.0/CLAUDE.md +117 -0
- peclet_core-0.1.0/CMakeLists.txt +106 -0
- peclet_core-0.1.0/LICENSE +21 -0
- peclet_core-0.1.0/PKG-INFO +94 -0
- peclet_core-0.1.0/README.md +80 -0
- peclet_core-0.1.0/benchmarks/CMakeLists.txt +8 -0
- peclet_core-0.1.0/benchmarks/bench_amr_flow.cpp +354 -0
- peclet_core-0.1.0/benchmarks/bench_halo.cpp +88 -0
- peclet_core-0.1.0/docs/Doxyfile +66 -0
- peclet_core-0.1.0/docs/amr_collocated_projection.md +161 -0
- peclet_core-0.1.0/docs/amr_device_assembly_plan.md +95 -0
- peclet_core-0.1.0/docs/cuda-aware-mpi.md +71 -0
- peclet_core-0.1.0/include/peclet/core/amr/adapt.hpp +227 -0
- peclet_core-0.1.0/include/peclet/core/amr/advect_recon.hpp +40 -0
- peclet_core-0.1.0/include/peclet/core/amr/assembly.hpp +179 -0
- peclet_core-0.1.0/include/peclet/core/amr/barnes_hut.hpp +207 -0
- peclet_core-0.1.0/include/peclet/core/amr/block_octree.hpp +326 -0
- peclet_core-0.1.0/include/peclet/core/amr/block_octree_view.hpp +70 -0
- peclet_core-0.1.0/include/peclet/core/amr/csr.hpp +119 -0
- peclet_core-0.1.0/include/peclet/core/amr/cut_cell.hpp +607 -0
- peclet_core-0.1.0/include/peclet/core/amr/distributed_adapt.hpp +99 -0
- peclet_core-0.1.0/include/peclet/core/amr/distributed_fv.hpp +541 -0
- peclet_core-0.1.0/include/peclet/core/amr/distributed_octree.hpp +826 -0
- peclet_core-0.1.0/include/peclet/core/amr/distributed_poisson.hpp +239 -0
- peclet_core-0.1.0/include/peclet/core/amr/distributed_view.hpp +338 -0
- peclet_core-0.1.0/include/peclet/core/amr/face_csr.hpp +141 -0
- peclet_core-0.1.0/include/peclet/core/amr/face_geom.hpp +38 -0
- peclet_core-0.1.0/include/peclet/core/amr/facegeom_assembly.hpp +180 -0
- peclet_core-0.1.0/include/peclet/core/amr/flow.hpp +859 -0
- peclet_core-0.1.0/include/peclet/core/amr/flow_oracle.hpp +375 -0
- peclet_core-0.1.0/include/peclet/core/amr/fv_op.hpp +198 -0
- peclet_core-0.1.0/include/peclet/core/amr/indicators.hpp +91 -0
- peclet_core-0.1.0/include/peclet/core/amr/leaf_field.hpp +75 -0
- peclet_core-0.1.0/include/peclet/core/amr/momentum.hpp +584 -0
- peclet_core-0.1.0/include/peclet/core/amr/momentum_assembly.hpp +216 -0
- peclet_core-0.1.0/include/peclet/core/amr/multigrid.hpp +417 -0
- peclet_core-0.1.0/include/peclet/core/amr/pcg.hpp +234 -0
- peclet_core-0.1.0/include/peclet/core/amr/poisson.hpp +661 -0
- peclet_core-0.1.0/include/peclet/core/amr/refine.hpp +65 -0
- peclet_core-0.1.0/include/peclet/core/amr/scalar_transport.hpp +163 -0
- peclet_core-0.1.0/include/peclet/core/amr/velocity_mg.hpp +267 -0
- peclet_core-0.1.0/include/peclet/core/amr/vtu_io.hpp +126 -0
- peclet_core-0.1.0/include/peclet/core/common/mpi.hpp +10 -0
- peclet_core-0.1.0/include/peclet/core/common/mpi_stub.hpp +90 -0
- peclet_core-0.1.0/include/peclet/core/common/types.hpp +70 -0
- peclet_core-0.1.0/include/peclet/core/common/view.hpp +78 -0
- peclet_core-0.1.0/include/peclet/core/decomp/block_decomposer.hpp +227 -0
- peclet_core-0.1.0/include/peclet/core/decomp/block_indexer.hpp +121 -0
- peclet_core-0.1.0/include/peclet/core/decomp/morton_indexer.hpp +87 -0
- peclet_core-0.1.0/include/peclet/core/geom/grid_sdf.hpp +80 -0
- peclet_core-0.1.0/include/peclet/core/geom/sdf.hpp +94 -0
- peclet_core-0.1.0/include/peclet/core/geom/vti_io.hpp +149 -0
- peclet_core-0.1.0/include/peclet/core/halo/grid_halo.hpp +145 -0
- peclet_core-0.1.0/include/peclet/core/halo/grid_halo_topology.hpp +356 -0
- peclet_core-0.1.0/include/peclet/core/halo/nbx.hpp +101 -0
- peclet_core-0.1.0/include/peclet/core/halo/particle_halo.hpp +227 -0
- peclet_core-0.1.0/include/peclet/core/halo/particle_halo_topology.hpp +286 -0
- peclet_core-0.1.0/include/peclet/core/halo/particle_migrator.hpp +301 -0
- peclet_core-0.1.0/include/peclet/core/halo/particle_migrator_view.hpp +268 -0
- peclet_core-0.1.0/include/peclet/core/halo/particle_rebalance.hpp +67 -0
- peclet_core-0.1.0/include/peclet/core/python/ndarray_interop.hpp +172 -0
- peclet_core-0.1.0/pyproject.toml +47 -0
- peclet_core-0.1.0/python/CMakeLists.txt +70 -0
- peclet_core-0.1.0/python/amr_bindings.cpp +756 -0
- peclet_core-0.1.0/python/example_amr.py +80 -0
- peclet_core-0.1.0/python/mpi_bindings.cpp +279 -0
- peclet_core-0.1.0/python/packaging/core_amr.pyi +159 -0
- peclet_core-0.1.0/python/packaging/core_init.py +15 -0
- peclet_core-0.1.0/python/test_amr.py +303 -0
- peclet_core-0.1.0/python/test_mpi.py +89 -0
- peclet_core-0.1.0/tests/CMakeLists.txt +340 -0
- peclet_core-0.1.0/tests/python/CMakeLists.txt +38 -0
- peclet_core-0.1.0/tests/python/interop_test_module.cpp +64 -0
- peclet_core-0.1.0/tests/python/test_ndarray_interop.py +77 -0
- peclet_core-0.1.0/tests/test_amr_adapt.cpp +142 -0
- peclet_core-0.1.0/tests/test_amr_adapt_transport.cpp +152 -0
- peclet_core-0.1.0/tests/test_amr_assembly.cpp +141 -0
- peclet_core-0.1.0/tests/test_amr_barnes_hut.cpp +85 -0
- peclet_core-0.1.0/tests/test_amr_cf_quadratic.cpp +132 -0
- peclet_core-0.1.0/tests/test_amr_cut_cell.cpp +139 -0
- peclet_core-0.1.0/tests/test_amr_distributed_adapt_mpi.cpp +143 -0
- peclet_core-0.1.0/tests/test_amr_distributed_fv_mpi.cpp +169 -0
- peclet_core-0.1.0/tests/test_amr_distributed_graded_mg_mpi.cpp +136 -0
- peclet_core-0.1.0/tests/test_amr_distributed_mg_mpi.cpp +113 -0
- peclet_core-0.1.0/tests/test_amr_distributed_mpi.cpp +170 -0
- peclet_core-0.1.0/tests/test_amr_distributed_openness_mpi.cpp +161 -0
- peclet_core-0.1.0/tests/test_amr_distributed_poisson_mpi.cpp +109 -0
- peclet_core-0.1.0/tests/test_amr_distributed_rebalance_mpi.cpp +183 -0
- peclet_core-0.1.0/tests/test_amr_distributed_view_mpi.cpp +138 -0
- peclet_core-0.1.0/tests/test_amr_drag.cpp +124 -0
- peclet_core-0.1.0/tests/test_amr_face_field.cpp +80 -0
- peclet_core-0.1.0/tests/test_amr_facegeom.cpp +118 -0
- peclet_core-0.1.0/tests/test_amr_flow.cpp +312 -0
- peclet_core-0.1.0/tests/test_amr_flow_solver.cpp +603 -0
- peclet_core-0.1.0/tests/test_amr_fv_op.cpp +117 -0
- peclet_core-0.1.0/tests/test_amr_fv_openness.cpp +144 -0
- peclet_core-0.1.0/tests/test_amr_kappa_dirichlet.cpp +136 -0
- peclet_core-0.1.0/tests/test_amr_kappa_restrict.cpp +146 -0
- peclet_core-0.1.0/tests/test_amr_momentum.cpp +216 -0
- peclet_core-0.1.0/tests/test_amr_momentum_assembly.cpp +140 -0
- peclet_core-0.1.0/tests/test_amr_multigrid.cpp +262 -0
- peclet_core-0.1.0/tests/test_amr_openness.cpp +135 -0
- peclet_core-0.1.0/tests/test_amr_pcg.cpp +216 -0
- peclet_core-0.1.0/tests/test_amr_poisson.cpp +188 -0
- peclet_core-0.1.0/tests/test_amr_sdf.cpp +84 -0
- peclet_core-0.1.0/tests/test_amr_transfer.cpp +166 -0
- peclet_core-0.1.0/tests/test_amr_transport.cpp +173 -0
- peclet_core-0.1.0/tests/test_amr_vtu.cpp +105 -0
- peclet_core-0.1.0/tests/test_block_octree.cpp +199 -0
- peclet_core-0.1.0/tests/test_block_octree_view.cpp +107 -0
- peclet_core-0.1.0/tests/test_decomposition.cpp +181 -0
- peclet_core-0.1.0/tests/test_diffusion.cpp +154 -0
- peclet_core-0.1.0/tests/test_diffusion_sdf.cpp +139 -0
- peclet_core-0.1.0/tests/test_ghost_particles_mpi.cpp +139 -0
- peclet_core-0.1.0/tests/test_grid_halo.cpp +120 -0
- peclet_core-0.1.0/tests/test_grid_halo_exchange.cpp +114 -0
- peclet_core-0.1.0/tests/test_morton_indexer.cpp +86 -0
- peclet_core-0.1.0/tests/test_particle_halo_exchange.cpp +142 -0
- peclet_core-0.1.0/tests/test_particle_halo_mpi.cpp +217 -0
- peclet_core-0.1.0/tests/test_particle_migration.cpp +122 -0
- peclet_core-0.1.0/tests/test_particle_migrator_view_mpi.cpp +159 -0
- peclet_core-0.1.0/tests/test_particle_rebalance.cpp +157 -0
- peclet_core-0.1.0/tests/test_sdf.cpp +69 -0
- peclet_core-0.1.0/tests/test_util.hpp +42 -0
- peclet_core-0.1.0/tests/test_vti.cpp +69 -0
- peclet_core-0.1.0/tools/cuda_aware_mpi_check.cpp +59 -0
- peclet_core-0.1.0/tools/cudampi-env.sh +12 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Adopted suite-wide from voronoi_dynamics (Google C++ Style).
|
|
2
|
+
BasedOnStyle: Google
|
|
3
|
+
Language: Cpp
|
|
4
|
+
ColumnLimit: 100
|
|
5
|
+
IndentWidth: 2
|
|
6
|
+
TabWidth: 2
|
|
7
|
+
UseTab: Never
|
|
8
|
+
ContinuationIndentWidth: 4
|
|
9
|
+
BreakBeforeBraces: Attach
|
|
10
|
+
AlignAfterOpenBracket: Align
|
|
11
|
+
AlignTrailingComments: true
|
|
12
|
+
IncludeBlocks: Regroup
|
|
13
|
+
SortIncludes: CaseInsensitive
|
|
14
|
+
PointerAlignment: Left
|
|
15
|
+
ReferenceAlignment: Left
|
|
16
|
+
SpaceBeforeParens: ControlStatements
|
|
17
|
+
AllowShortFunctionsOnASingleLine: Inline
|
|
18
|
+
AllowShortIfStatementsOnASingleLine: false
|
|
19
|
+
AllowShortLoopsOnASingleLine: false
|
|
20
|
+
SpaceAfterTemplateKeyword: true
|
|
21
|
+
BreakConstructorInitializers: BeforeComma
|
|
22
|
+
AccessModifierOffset: -1
|
|
23
|
+
NamespaceIndentation: None
|
|
24
|
+
CompactNamespaces: false
|
|
25
|
+
ReflowComments: true
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
# Build the header-only library and run the full ctest suite (serial + MPI, np = 1,2,4).
|
|
4
|
+
# core is CPU-testable end-to-end: header-only C++20 + MPI, no GPU required. The optional
|
|
5
|
+
# Kokkos GPU-resident halo path is NOT exercised here (no GPU on the runners) — it is covered by the
|
|
6
|
+
# consumer codes' own CI.
|
|
7
|
+
|
|
8
|
+
on:
|
|
9
|
+
push:
|
|
10
|
+
branches: [main, "**"]
|
|
11
|
+
pull_request:
|
|
12
|
+
workflow_dispatch:
|
|
13
|
+
|
|
14
|
+
jobs:
|
|
15
|
+
cpp:
|
|
16
|
+
name: ubuntu / ${{ matrix.cc }} / MPI / ${{ matrix.build_type }}
|
|
17
|
+
runs-on: ubuntu-latest
|
|
18
|
+
strategy:
|
|
19
|
+
fail-fast: false
|
|
20
|
+
matrix:
|
|
21
|
+
build_type: [Debug, Release]
|
|
22
|
+
include:
|
|
23
|
+
- cc: gcc
|
|
24
|
+
cxx: g++
|
|
25
|
+
- cc: clang
|
|
26
|
+
cxx: clang++
|
|
27
|
+
env:
|
|
28
|
+
CC: ${{ matrix.cc }}
|
|
29
|
+
CXX: ${{ matrix.cxx }}
|
|
30
|
+
OMPI_ALLOW_RUN_AS_ROOT: 1
|
|
31
|
+
OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1
|
|
32
|
+
# GitHub runners have only 2 physical cores; allow OpenMPI to oversubscribe so the
|
|
33
|
+
# np=4 and np=8 correctness tests run (slower, but the logic is what we check).
|
|
34
|
+
OMPI_MCA_rmaps_base_oversubscribe: 1
|
|
35
|
+
PRTE_MCA_rmaps_default_mapping_policy: ":oversubscribe"
|
|
36
|
+
steps:
|
|
37
|
+
- uses: actions/checkout@v4
|
|
38
|
+
|
|
39
|
+
- name: Install MPI
|
|
40
|
+
run: |
|
|
41
|
+
sudo apt-get update
|
|
42
|
+
sudo apt-get install -y libopenmpi-dev openmpi-bin
|
|
43
|
+
# morton is an optional sibling (enables PECLET_CORE_HAVE_MORTON); it is a private repo, so CI
|
|
44
|
+
# builds without it — the halo/decomp tests do not depend on it.
|
|
45
|
+
|
|
46
|
+
- name: Configure
|
|
47
|
+
run: cmake -S . -B build -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
|
|
48
|
+
|
|
49
|
+
- name: Build
|
|
50
|
+
run: cmake --build build -j
|
|
51
|
+
|
|
52
|
+
- name: Test (serial + MPI np=1,2,4)
|
|
53
|
+
run: ctest --test-dir build --output-on-failure
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
name: Documentation
|
|
2
|
+
|
|
3
|
+
# Build the Doxygen API docs (header-only C++20 in include/, plus the Markdown design notes) and
|
|
4
|
+
# publish them to GitHub Pages. Requires Pages to be enabled for the repository with
|
|
5
|
+
# "Source: GitHub Actions" (Settings -> Pages). Runs on every push to main; can also be run manually.
|
|
6
|
+
|
|
7
|
+
on:
|
|
8
|
+
push:
|
|
9
|
+
branches: [main]
|
|
10
|
+
paths:
|
|
11
|
+
- 'include/**'
|
|
12
|
+
- 'docs/**'
|
|
13
|
+
- 'README.md'
|
|
14
|
+
- '.github/workflows/docs.yml'
|
|
15
|
+
workflow_dispatch:
|
|
16
|
+
|
|
17
|
+
permissions:
|
|
18
|
+
contents: read
|
|
19
|
+
pages: write
|
|
20
|
+
id-token: write
|
|
21
|
+
|
|
22
|
+
# allow one concurrent deployment; cancel an in-progress run for a newer push
|
|
23
|
+
concurrency:
|
|
24
|
+
group: pages
|
|
25
|
+
cancel-in-progress: true
|
|
26
|
+
|
|
27
|
+
jobs:
|
|
28
|
+
build:
|
|
29
|
+
runs-on: ubuntu-latest
|
|
30
|
+
steps:
|
|
31
|
+
- uses: actions/checkout@v4
|
|
32
|
+
- name: Install Doxygen + Graphviz
|
|
33
|
+
run: sudo apt-get update && sudo apt-get install -y doxygen graphviz
|
|
34
|
+
- name: Build documentation
|
|
35
|
+
run: doxygen docs/Doxyfile
|
|
36
|
+
- name: Upload Pages artifact
|
|
37
|
+
uses: actions/upload-pages-artifact@v3
|
|
38
|
+
with:
|
|
39
|
+
path: docs/html
|
|
40
|
+
|
|
41
|
+
deploy:
|
|
42
|
+
needs: build
|
|
43
|
+
runs-on: ubuntu-latest
|
|
44
|
+
environment:
|
|
45
|
+
name: github-pages
|
|
46
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
|
47
|
+
steps:
|
|
48
|
+
- name: Deploy to GitHub Pages
|
|
49
|
+
id: deployment
|
|
50
|
+
uses: actions/deploy-pages@v4
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
name: Quality
|
|
2
|
+
|
|
3
|
+
# Lightweight code-quality gates. Python hard-fails on critical errors only. The C++ clang-format
|
|
4
|
+
# check is informational for now (continue-on-error) so version skew between the runner's
|
|
5
|
+
# clang-format and the local one can't red the build; promote it to required once a clang-format
|
|
6
|
+
# version is pinned and the tree is reformatted against it.
|
|
7
|
+
|
|
8
|
+
on:
|
|
9
|
+
push:
|
|
10
|
+
branches: [main, "**"]
|
|
11
|
+
pull_request:
|
|
12
|
+
workflow_dispatch:
|
|
13
|
+
|
|
14
|
+
jobs:
|
|
15
|
+
python-lint:
|
|
16
|
+
name: ruff (critical errors)
|
|
17
|
+
runs-on: ubuntu-latest
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v4
|
|
20
|
+
- uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: "3.12"
|
|
23
|
+
- run: pip install ruff
|
|
24
|
+
- name: Ruff critical-error check
|
|
25
|
+
run: |
|
|
26
|
+
ruff check . --select E9,F63,F7,F82 \
|
|
27
|
+
--exclude ".venv,venv,build,build_*,__pycache__,_deps,extern,legacy,notebooks"
|
|
28
|
+
|
|
29
|
+
cpp-format:
|
|
30
|
+
name: clang-format (informational)
|
|
31
|
+
runs-on: ubuntu-latest
|
|
32
|
+
continue-on-error: true
|
|
33
|
+
steps:
|
|
34
|
+
- uses: actions/checkout@v4
|
|
35
|
+
- run: sudo apt-get update && sudo apt-get install -y clang-format
|
|
36
|
+
- name: clang-format dry-run (uses the repo .clang-format)
|
|
37
|
+
run: |
|
|
38
|
+
files=$(git ls-files 'include/*.hpp' 'tests/*.hpp' 'tests/*.cpp' 'benchmarks/*.cpp')
|
|
39
|
+
[ -z "$files" ] && { echo "no C++ sources"; exit 0; }
|
|
40
|
+
clang-format --dry-run --Werror $files
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
# Publish peclet-core to PyPI on a version tag via Trusted Publishing (OIDC) — configure the publisher
|
|
4
|
+
# on PyPI first; no API token secret needed. SDIST ONLY: peclet-core's Python surface is the MPI particle
|
|
5
|
+
# halo (peclet.core.mpi) + the Kokkos AMR octree (peclet.core.amr); both link MPI/Kokkos, whose ABI/arch
|
|
6
|
+
# make a portable binary wheel impossible. Consumers `pip install peclet-core` and build against their
|
|
7
|
+
# site MPI (+ optional Kokkos prefix), or use the suite containers. See ../docs/DEPLOYMENT.md.
|
|
8
|
+
|
|
9
|
+
on:
|
|
10
|
+
push:
|
|
11
|
+
tags: ["v*"]
|
|
12
|
+
workflow_dispatch:
|
|
13
|
+
|
|
14
|
+
jobs:
|
|
15
|
+
sdist:
|
|
16
|
+
runs-on: ubuntu-latest
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
- uses: actions/setup-python@v5
|
|
20
|
+
with:
|
|
21
|
+
python-version: "3.12"
|
|
22
|
+
- run: pip install build
|
|
23
|
+
- run: python -m build --sdist
|
|
24
|
+
- uses: actions/upload-artifact@v4
|
|
25
|
+
with:
|
|
26
|
+
name: sdist
|
|
27
|
+
path: dist/*.tar.gz
|
|
28
|
+
|
|
29
|
+
publish:
|
|
30
|
+
needs: [sdist]
|
|
31
|
+
runs-on: ubuntu-latest
|
|
32
|
+
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
|
|
33
|
+
environment: pypi
|
|
34
|
+
permissions:
|
|
35
|
+
id-token: write # required for trusted publishing
|
|
36
|
+
steps:
|
|
37
|
+
- uses: actions/download-artifact@v4
|
|
38
|
+
with:
|
|
39
|
+
path: dist
|
|
40
|
+
merge-multiple: true
|
|
41
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## What this is
|
|
6
|
+
|
|
7
|
+
`core` is the shared infrastructure library for the transport-phenomena simulation suite
|
|
8
|
+
(sibling repos under `../`: `flow`, `dem`, `voro`, `morton`). The suite-wide design contract lives in `../docs/` — read
|
|
9
|
+
`../docs/ARCHITECTURE.md`, `CONVENTIONS.md`, `STYLE.md`, `INTERFACES.md`, `ROADMAP.md` before
|
|
10
|
+
cross-cutting changes. Header-only C++20; the device side is compiled through Kokkos (CUDA / HIP /
|
|
11
|
+
OpenMP) and is also C++20 — only the `morton` dependency pins C++17 (see `../docs/STYLE.md`). The native-CUDA path is
|
|
12
|
+
retired; Kokkos is the canonical device path.
|
|
13
|
+
|
|
14
|
+
## Build / test / benchmark
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
# CPU library + tests (no device dependency):
|
|
18
|
+
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j
|
|
19
|
+
ctest --test-dir build --output-on-failure # serial + MPI halo + particle migration + diffusion
|
|
20
|
+
|
|
21
|
+
# Portable Kokkos device halo (CUDA / HIP / OpenMP) -- opt-in, find_package(Kokkos):
|
|
22
|
+
export PATH=/usr/local/cuda-13.2/bin:$PATH # if the Kokkos install targets the CUDA backend
|
|
23
|
+
cmake -S . -B build_kokkos -DPECLET_CORE_ENABLE_KOKKOS=ON \
|
|
24
|
+
-DCMAKE_PREFIX_PATH=../extern/install/nvidia-cuda
|
|
25
|
+
cmake --build build_kokkos -j && ctest --test-dir build_kokkos --output-on-failure # + GPU halo np=1,2,4
|
|
26
|
+
mpirun -np 4 ./build/benchmarks/bench_halo 48 1 300
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
The Kokkos halo path is provisioned via `find_package(Kokkos CONFIG)` against a cluster module or the
|
|
30
|
+
suite's local install prefix (`../tools/bootstrap_deps.sh`). The legacy native-CUDA halo was retired.
|
|
31
|
+
|
|
32
|
+
## Architecture
|
|
33
|
+
|
|
34
|
+
Header-only under `include/peclet/core/`:
|
|
35
|
+
|
|
36
|
+
- `common/types.hpp` — `Index` (int64), `Real` (double), `IVec<Dim>`/`Vec<Dim>`, `wrap()`,
|
|
37
|
+
compile-time `forEachInBox`. **Convention: x-fastest linear index** `I = x + y*nx + z*nx*ny`
|
|
38
|
+
(matches flow and `../docs/CONVENTIONS.md`). Keep this header C++17-clean (shared with `morton`,
|
|
39
|
+
which pins C++17).
|
|
40
|
+
- `decomp/block_decomposer.hpp` — ORB decomposition (ported & modernized from
|
|
41
|
+
`../block_decomposer/src/BlockDecomposer.hpp`). `ownerOf()` walks the implicit binary tree
|
|
42
|
+
(children at `2i+1`/`2i+2`, leaves carry the block index) and is the key primitive for halo
|
|
43
|
+
topology. `linearGlobal`/`multiGlobal` are x-fastest and mutually inverse. `init(numBlocks,
|
|
44
|
+
globalSize, weights)` is the **weighted ORB** for dynamic load balancing: it bisects at the cell
|
|
45
|
+
boundary whose cumulative weight reaches the sub-block target fraction (vs equal cell count);
|
|
46
|
+
equal weights reduce to the unweighted `init()` bit-for-bit.
|
|
47
|
+
- `decomp/block_indexer.hpp` — local↔global indexing for an extended (inner+ghost) block.
|
|
48
|
+
- `decomp/morton_indexer.hpp` — `MortonIndexer<Dim>`: Z-order (Morton) cell indexing via the `morton`
|
|
49
|
+
primitive (`morton::Morton<Dim,Bits>`), guarded by `PECLET_CORE_HAVE_MORTON`. The cache-friendly alternative
|
|
50
|
+
to the x-fastest order (which stays the convention): `codeOf`/`multiIndex` map global multi-index ↔
|
|
51
|
+
Z-order code, `neighborCode` steps one cell along an axis directly in Morton space. Methods carry
|
|
52
|
+
morton's `MORTON_HD`, so they are device-callable under a Kokkos build (the Kokkos build defines
|
|
53
|
+
`MORTON_ENABLE_KOKKOS` ⇒ `MORTON_HD` is `KOKKOS_FUNCTION`).
|
|
54
|
+
- `halo/nbx.hpp` — `NbxEngine`: canonical NBX (Issend + Ibarrier consensus). Reimplements the engine
|
|
55
|
+
from `../block_decomposer/src/MPISync.hpp`. Use for dynamic/sparse exchange.
|
|
56
|
+
- `halo/grid_halo_topology.hpp` — `GridHaloTopology<Dim>`: the ghost-layer exchange. **Topology** (who
|
|
57
|
+
owns each ghost cell, established via one NBX round so owners learn what to send) is built once in
|
|
58
|
+
`buildTopology()`; **exchange** runs every step. Field-agnostic: any type with
|
|
59
|
+
`bytesPerElem()`/`pack(localIdx,dst)`/`unpack(localIdx,src)` works (`GridFieldView<T>` is the
|
|
60
|
+
contiguous-array adapter). Two engines give identical results — `exchangeNbx`/`start`+`wait`
|
|
61
|
+
(overlap-capable) and `exchangePersistent` (`MPI_Neighbor_alltoallv`, faster for static grids).
|
|
62
|
+
`flatten()` exposes a device-friendly topology consumed by the device `GridHalo`.
|
|
63
|
+
- `halo/particle_migrator.hpp` — `ParticleMigrator<Dim>`: Lagrangian counterpart. Reassigns particles
|
|
64
|
+
(positions + opaque fixed-stride payload) to their owning rank via the NBX engine, with periodic wrap.
|
|
65
|
+
`cellOf()` exposes the global binning cell (`ownerOf == dec.ownerOf(cellOf(x))`).
|
|
66
|
+
- `halo/particle_rebalance.hpp` — `rebalanceByParticleCount(dec, mig, pos, payload, …)`: Lagrangian load
|
|
67
|
+
balancing. Bins particles onto the grid, re-inits `dec` in place with the **weighted ORB** (so a
|
|
68
|
+
migrator/halo holding a pointer to it sees the new partition), and migrates. Pure redistribution
|
|
69
|
+
(count/payload preserved). The dem distributed step is the consumer; also bound in `python/mpi_bindings.cpp`.
|
|
70
|
+
- `halo/grid_halo.hpp` — `GridHalo<T>`: portable GPU-resident halo (Kokkos; CUDA / HIP / OpenMP
|
|
71
|
+
backends). pack/unpack/self-copy run as `parallel_for` over the device `peclet::core::View<T>` field; only the
|
|
72
|
+
compact halo buffers are host-staged for MPI by default (the field stays on the device), with an
|
|
73
|
+
opt-in GPU-aware path (env `PECLET_CORE_GPU_AWARE_MPI`, legacy `PECLET_CORE_CUDA_AWARE_MPI` still honoured). Built
|
|
74
|
+
from a host `GridHaloTopology<Dim>::flatten()` via `init()`. Bit-for-bit matches the CPU exchange.
|
|
75
|
+
(The legacy native-CUDA `grid_halo_cuda.cuh` / `DeviceGridExchange<T>` was retired when Kokkos became
|
|
76
|
+
the canonical device path; see `docs/cuda-aware-mpi.md` for the historical
|
|
77
|
+
host-staging-vs-GPU-aware analysis.)
|
|
78
|
+
- `halo/particle_halo_topology.hpp` — `ParticleHaloTopology<Dim>`: persistent Lagrangian ghost halo
|
|
79
|
+
(host topology + field-agnostic exchange). `build()` establishes the owner↔ghost correspondence from
|
|
80
|
+
particle proximity; `forward` (owner→ghost), `reverse` (ghost→owner, accumulate) and
|
|
81
|
+
`forwardPositions` (periodic image shift) are the cheap per-step exchanges. The standard distributed
|
|
82
|
+
particle schemes (frozen/replicate, Newton-on, force-accumulate) are compositions of these.
|
|
83
|
+
- `halo/particle_halo.hpp` — `ParticleHalo<Dim>`: the Kokkos GPU-resident driver for
|
|
84
|
+
`ParticleHaloTopology` (on-device forward gather + reverse atomic-accumulate; host-staged or
|
|
85
|
+
GPU-aware MPI). Built from `ParticleHaloTopology::flatten()`; consumed by dem's distributed step.
|
|
86
|
+
- `geom/` — shared SDF solids. `geom/sdf.hpp` is the `Sdf` concept + analytic primitives;
|
|
87
|
+
`geom/grid_sdf.hpp` is the trilinearly-sampled `GridSdf`; `geom/vti_io.hpp` reads/writes scalar &
|
|
88
|
+
vector VTI (`.vti`). The shared geometry representation behind flow's and dem's cut-cell IBM.
|
|
89
|
+
- `amr/` — block-local-Morton **AMR octree** flow subsystem (`peclet::core::amr`, guarded by `PECLET_CORE_HAVE_MORTON`).
|
|
90
|
+
`amr/block_octree.hpp` is the per-block octree; `amr/flow.hpp` is the canonical device `AmrFlow`
|
|
91
|
+
(collocated-projection Navier–Stokes with `maskSolid` and a div-free face field), with
|
|
92
|
+
`amr/flow_oracle.hpp` an unexposed serial host reference. Device + distributed multigrid live in
|
|
93
|
+
`amr/pcg.hpp`, `amr/multigrid.hpp`, `amr/velocity_mg.hpp`, `amr/momentum.hpp` and the
|
|
94
|
+
`amr/distributed_*.hpp` set (`distributed_octree.hpp::rebalance` is the Eulerian leaf/field load
|
|
95
|
+
balancer). Cut-cell openness is `amr/cut_cell.hpp`; solution-adaptive refinement is `amr/adapt.hpp` /
|
|
96
|
+
`amr/indicators.hpp` / `amr/refine.hpp`. Design notes: `docs/amr_collocated_projection.md`,
|
|
97
|
+
`docs/amr_device_assembly_plan.md`.
|
|
98
|
+
- `python/` + `include/peclet/core/python/ndarray_interop.hpp` — **nanobind** Python bindings over a
|
|
99
|
+
shared **zero-copy `peclet::core::View`↔ndarray bridge** (`include/peclet/core/python/ndarray_interop.hpp`).
|
|
100
|
+
`python/mpi_bindings.cpp` is host-only (no Kokkos): exposes `ParticleMigrator` / `ParticleHaloTopology` /
|
|
101
|
+
`rebalanceByParticleCount` for an mpi4py driver. `python/amr_bindings.cpp` exposes the device `AmrFlow`
|
|
102
|
+
(needs the `morton` sibling + a Kokkos backend). Both are built via `include(SuiteNanobind)` +
|
|
103
|
+
`suite_require_nanobind()` from `../cmake/SuiteNanobind.cmake` (suite-root).
|
|
104
|
+
|
|
105
|
+
## Gotchas
|
|
106
|
+
|
|
107
|
+
- `GridHalo` caches a distributed-graph `MPI_Comm`. Its destructor guards `MPI_Comm_free` with
|
|
108
|
+
`MPI_Finalized` so an instance that outlives `MPI_Finalize` (e.g. on `main`'s stack) does not abort.
|
|
109
|
+
Don't remove that guard. The class is non-copyable (it owns the comm).
|
|
110
|
+
- The halo is owner-based, not adjacency-based: a ghost cell maps to whichever rank owns its wrapped
|
|
111
|
+
global cell, so it is correct for ORB's irregular block neighbours and any ghost width — no
|
|
112
|
+
Cartesian-grid assumption.
|
|
113
|
+
- Tests are dependency-free (`tests/test_util.hpp`, non-zero exit on failure). MPI tests run under
|
|
114
|
+
`mpirun` at several rank counts via ctest.
|
|
115
|
+
- `../cmake/SuiteNanobind.cmake` MUST be a CMake **macro**, not a `function()`: it sets/propagates
|
|
116
|
+
variables (the located nanobind, the interpreter) into the including scope, which a function's nested
|
|
117
|
+
scope would swallow. Keep `suite_require_nanobind` defined as a macro.
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
cmake_minimum_required(VERSION 3.24)
|
|
2
|
+
project(transport_core LANGUAGES CXX)
|
|
3
|
+
|
|
4
|
+
# --- Standard: C++20 for host and Kokkos device code (see suite/docs/STYLE.md); only morton pins C++17. ---
|
|
5
|
+
set(CMAKE_CXX_STANDARD 20)
|
|
6
|
+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
|
7
|
+
set(CMAKE_CXX_EXTENSIONS OFF)
|
|
8
|
+
|
|
9
|
+
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
|
|
10
|
+
set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE)
|
|
11
|
+
endif()
|
|
12
|
+
|
|
13
|
+
option(PECLET_CORE_BUILD_TESTS "Build tests" ON)
|
|
14
|
+
option(PECLET_CORE_BUILD_BENCHMARKS "Build benchmarks" ON)
|
|
15
|
+
option(PECLET_CORE_ENABLE_MPI "Build the halo layer against MPI (OFF = single-rank no-MPI stub)" ON)
|
|
16
|
+
|
|
17
|
+
# --- Header-only core library ---
|
|
18
|
+
add_library(tpx_core INTERFACE)
|
|
19
|
+
add_library(tpx::core ALIAS tpx_core)
|
|
20
|
+
target_include_directories(tpx_core INTERFACE
|
|
21
|
+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
|
|
22
|
+
$<INSTALL_INTERFACE:include>)
|
|
23
|
+
target_compile_features(tpx_core INTERFACE cxx_std_20)
|
|
24
|
+
|
|
25
|
+
# Optional: the morton spatial-index primitive if present as a sibling checkout.
|
|
26
|
+
set(_morton_inc "${CMAKE_CURRENT_SOURCE_DIR}/../morton/include")
|
|
27
|
+
if(EXISTS "${_morton_inc}/morton/morton.hpp")
|
|
28
|
+
target_include_directories(tpx_core INTERFACE $<BUILD_INTERFACE:${_morton_inc}>)
|
|
29
|
+
target_compile_definitions(tpx_core INTERFACE PECLET_CORE_HAVE_MORTON=1)
|
|
30
|
+
message(STATUS "transport-core: morton found at ${_morton_inc}")
|
|
31
|
+
endif()
|
|
32
|
+
|
|
33
|
+
# --- Halo layer: against MPI (default), or a single-rank no-MPI stub (PECLET_CORE_ENABLE_MPI=OFF). One code
|
|
34
|
+
# path either way -- see peclet/core/common/mpi.hpp + mpi_stub.hpp. PECLET_CORE_HALO_OK marks the layer available
|
|
35
|
+
# (MPI tests/benchmarks still guard on MPI_FOUND; the no-MPI build runs single-rank). ---
|
|
36
|
+
set(PECLET_CORE_HALO_OK OFF)
|
|
37
|
+
if(PECLET_CORE_ENABLE_MPI)
|
|
38
|
+
find_package(MPI COMPONENTS CXX)
|
|
39
|
+
if(MPI_FOUND)
|
|
40
|
+
# Pin the launcher to the MPI compiler's own prefix. FindMPI locates the compiler
|
|
41
|
+
# wrapper (mpicxx) deterministically, but searches MPIEXEC_EXECUTABLE on PATH — so a
|
|
42
|
+
# foreign launcher earlier on PATH (e.g. ParaView's bundled mpiexec) gets picked up
|
|
43
|
+
# while the binary is built against the system MPI. That mismatch is silent and nasty:
|
|
44
|
+
# every rank inits as a singleton (MPI_Comm_size==1), so `mpirun -n N` runs N
|
|
45
|
+
# independent serial processes and multi-rank tests "pass" without ever communicating.
|
|
46
|
+
# The launcher next to mpicxx always belongs to the same MPI, on any system.
|
|
47
|
+
# Scheduler-launcher clusters (srun/aprun) pass -DPECLET_CORE_PIN_MPIEXEC=OFF and set
|
|
48
|
+
# MPIEXEC_EXECUTABLE themselves.
|
|
49
|
+
option(PECLET_CORE_PIN_MPIEXEC "Pin MPIEXEC_EXECUTABLE to the MPI compiler's prefix" ON)
|
|
50
|
+
if(PECLET_CORE_PIN_MPIEXEC AND MPI_CXX_COMPILER)
|
|
51
|
+
get_filename_component(_mpi_bin "${MPI_CXX_COMPILER}" DIRECTORY)
|
|
52
|
+
unset(_mpi_launcher CACHE)
|
|
53
|
+
find_program(_mpi_launcher NAMES mpirun mpiexec HINTS "${_mpi_bin}" NO_DEFAULT_PATH)
|
|
54
|
+
if(_mpi_launcher AND NOT _mpi_launcher STREQUAL "${MPIEXEC_EXECUTABLE}")
|
|
55
|
+
message(STATUS "transport-core: MPIEXEC_EXECUTABLE was '${MPIEXEC_EXECUTABLE}' — "
|
|
56
|
+
"pinning to '${_mpi_launcher}' (matches ${MPI_CXX_COMPILER})")
|
|
57
|
+
set(MPIEXEC_EXECUTABLE "${_mpi_launcher}" CACHE FILEPATH "MPI launcher" FORCE)
|
|
58
|
+
endif()
|
|
59
|
+
unset(_mpi_launcher CACHE)
|
|
60
|
+
endif()
|
|
61
|
+
add_library(tpx_halo INTERFACE)
|
|
62
|
+
add_library(tpx::halo ALIAS tpx_halo)
|
|
63
|
+
target_link_libraries(tpx_halo INTERFACE tpx_core MPI::MPI_CXX)
|
|
64
|
+
set(PECLET_CORE_HALO_OK ON)
|
|
65
|
+
message(STATUS "transport-core: MPI found (${MPI_CXX_COMPILER}) — halo layer enabled")
|
|
66
|
+
else()
|
|
67
|
+
message(WARNING "transport-core: MPI not found — halo layer and its tests are disabled")
|
|
68
|
+
endif()
|
|
69
|
+
else()
|
|
70
|
+
add_library(tpx_halo INTERFACE)
|
|
71
|
+
add_library(tpx::halo ALIAS tpx_halo)
|
|
72
|
+
target_link_libraries(tpx_halo INTERFACE tpx_core)
|
|
73
|
+
target_compile_definitions(tpx_halo INTERFACE PECLET_CORE_NO_MPI=1)
|
|
74
|
+
set(PECLET_CORE_HALO_OK ON)
|
|
75
|
+
message(STATUS "transport-core: MPI disabled — halo uses the single-rank no-MPI stub")
|
|
76
|
+
endif()
|
|
77
|
+
|
|
78
|
+
# --- Optional Kokkos: portable (CUDA/HIP/OpenMP) GPU-resident halo exchange ---
|
|
79
|
+
# (The legacy native-CUDA halo, grid_halo_cuda.cuh, was retired in favour of the Kokkos path below.)
|
|
80
|
+
# Consumed via find_package(Kokkos CONFIG); provide it with a cluster module or the suite's local
|
|
81
|
+
# install prefix (suite/tools/bootstrap_deps.sh) on CMAKE_PREFIX_PATH. The legacy
|
|
82
|
+
# native-CUDA path no longer exists, so Kokkos is the only device halo path.
|
|
83
|
+
option(PECLET_CORE_ENABLE_KOKKOS "Build the portable Kokkos halo path (find_package(Kokkos))" OFF)
|
|
84
|
+
set(PECLET_CORE_HAVE_KOKKOS OFF)
|
|
85
|
+
if(PECLET_CORE_ENABLE_KOKKOS AND PECLET_CORE_HALO_OK)
|
|
86
|
+
find_package(Kokkos CONFIG REQUIRED)
|
|
87
|
+
set(PECLET_CORE_HAVE_KOKKOS ON)
|
|
88
|
+
message(STATUS "transport-core: Kokkos ${Kokkos_VERSION} found (${Kokkos_DEVICES}) — "
|
|
89
|
+
"portable halo path enabled")
|
|
90
|
+
# When morton is present, build it in Kokkos mode so MORTON_HD resolves to
|
|
91
|
+
# KOKKOS_FUNCTION — the MortonIndexer (peclet/core/decomp/morton_indexer.hpp) is then
|
|
92
|
+
# callable from device kernels on any backend.
|
|
93
|
+
if(TARGET Kokkos::kokkos AND EXISTS "${_morton_inc}/morton/morton.hpp")
|
|
94
|
+
target_link_libraries(tpx_core INTERFACE Kokkos::kokkos)
|
|
95
|
+
target_compile_definitions(tpx_core INTERFACE MORTON_ENABLE_KOKKOS=1)
|
|
96
|
+
endif()
|
|
97
|
+
endif()
|
|
98
|
+
|
|
99
|
+
if(PECLET_CORE_BUILD_TESTS)
|
|
100
|
+
enable_testing()
|
|
101
|
+
add_subdirectory(tests)
|
|
102
|
+
endif()
|
|
103
|
+
|
|
104
|
+
if(PECLET_CORE_BUILD_BENCHMARKS AND MPI_FOUND)
|
|
105
|
+
add_subdirectory(benchmarks)
|
|
106
|
+
endif()
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Frank Peters
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: peclet-core
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: peclet.core — shared transport-core infrastructure: Lagrangian particle halo (MPI) + AMR octree
|
|
5
|
+
Author-Email: Frank Peters <e.a.j.f.peters@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Project-URL: Homepage, https://github.com/computational-chemical-engineering/peclet
|
|
9
|
+
Project-URL: Documentation, https://github.com/computational-chemical-engineering/peclet
|
|
10
|
+
Project-URL: Source, https://github.com/computational-chemical-engineering/peclet-core
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Requires-Dist: numpy>=1.20
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# core
|
|
16
|
+
|
|
17
|
+
Shared infrastructure for the transport-phenomena simulation suite (see `../docs/` for the suite-wide
|
|
18
|
+
[architecture](../docs/ARCHITECTURE.md), [conventions](../docs/CONVENTIONS.md),
|
|
19
|
+
[style](../docs/STYLE.md), [interfaces](../docs/INTERFACES.md) and [roadmap](../docs/ROADMAP.md)).
|
|
20
|
+
|
|
21
|
+
It provides the pieces every method code (`flow`, `dem`, `voro`, …) should
|
|
22
|
+
share: a common MPI **block domain decomposition**, an efficient **asynchronous ghost-layer
|
|
23
|
+
exchange** (CPU + portable Kokkos GPU), **particle migration**, **dynamic load balancing**, unified
|
|
24
|
+
**SDF geometry** (`peclet::core::geom`), an **AMR octree** flow subsystem (`peclet::core::amr`), and **nanobind Python
|
|
25
|
+
bindings**. Header-only C++20 (the device side, compiled through Kokkos, is also C++20; only the
|
|
26
|
+
`morton` dependency pins C++17 — see `../docs/STYLE.md`). Cut-cell IBM is not a standalone shared
|
|
27
|
+
module: it currently lives inside the AMR flow solver (`peclet::core::amr`) and in `flow`.
|
|
28
|
+
|
|
29
|
+
## What works today
|
|
30
|
+
|
|
31
|
+
- `peclet::core::decomp::BlockDecomposer<Dim>` — orthogonal recursive bisection of a global cell grid into
|
|
32
|
+
rank-owned blocks; `ownerOf()` tree-walk; x-fastest global/local linear indexing.
|
|
33
|
+
- `peclet::core::decomp::BlockIndexer<Dim>` — local↔global indexing for a block with a ghost layer.
|
|
34
|
+
- `peclet::core::halo::NbxEngine` — nonblocking-consensus sparse exchange (Issend + Ibarrier), for dynamic
|
|
35
|
+
patterns.
|
|
36
|
+
- `peclet::core::halo::GridHaloTopology<Dim>` (`grid_halo_topology.hpp`) — asynchronous ghost-layer exchange
|
|
37
|
+
with **topology separated from exchange** and a **field-agnostic** pack/unpack interface.
|
|
38
|
+
`buildTopology()` runs once; two interchangeable exchange engines give identical results:
|
|
39
|
+
- `exchangeNbx()` / `start()`+`wait()` — NBX, supports compute/comm overlap.
|
|
40
|
+
- `exchangePersistent()` — `MPI_Neighbor_alltoallv` on a cached distributed-graph communicator;
|
|
41
|
+
fastest for the static neighbour pattern of a fixed grid.
|
|
42
|
+
- `peclet::core::halo::GridFieldView<T>` — wraps a contiguous local array as an exchangeable field.
|
|
43
|
+
- `peclet::core::halo::GridHalo<T>` (`grid_halo.hpp`) — portable **GPU-resident** ghost-layer exchange (Kokkos:
|
|
44
|
+
CUDA / HIP / OpenMP). Built once from a host `GridHaloTopology<Dim>::flatten()`; pack / unpack /
|
|
45
|
+
periodic self-copy run as `Kokkos::parallel_for` over the device `peclet::core::View<T>` field, so the full
|
|
46
|
+
field never crosses the bus — only the compact halo buffers are host-staged for MPI by default, with
|
|
47
|
+
an opt-in GPU-aware path (env `PECLET_CORE_GPU_AWARE_MPI`). Bit-for-bit identical to the CPU exchange.
|
|
48
|
+
- `peclet::core::halo::ParticleMigrator<Dim>` — Lagrangian particle migration to owning ranks (NBX), the
|
|
49
|
+
dynamic counterpart to the Eulerian grid halo.
|
|
50
|
+
- `peclet::core::halo::ParticleHaloTopology<Dim>` (`particle_halo_topology.hpp`) — persistent Lagrangian ghost
|
|
51
|
+
halo: `forward` (owner→ghost), `reverse` (ghost→owner, accumulate) and `forwardPositions` (periodic
|
|
52
|
+
image shift). `peclet::core::halo::ParticleHalo<Dim>` (`particle_halo.hpp`) is its GPU-resident Kokkos driver
|
|
53
|
+
(on-device gather/scatter, host-staged or GPU-aware MPI), consumed by dem's distributed step.
|
|
54
|
+
- `peclet::core::halo::rebalanceByParticleCount(...)` (`particle_rebalance.hpp`) — **dynamic load balancing**
|
|
55
|
+
for the Lagrangian path: re-inits the decomposition in place with the **weighted ORB**
|
|
56
|
+
(`BlockDecomposer::init(numBlocks, globalSize, weights)`) and migrates. The Eulerian/AMR counterpart
|
|
57
|
+
is `peclet::core::amr::DistributedOctree::rebalance`.
|
|
58
|
+
- `peclet::core::geom` (`sdf.hpp`, `grid_sdf.hpp`, `vti_io.hpp`) — shared SDF solids: analytic primitives +
|
|
59
|
+
trilinear `GridSdf` behind one `Sdf` concept, with VTI (.vti) read/write.
|
|
60
|
+
- `peclet::core::amr` (`include/peclet/core/amr/`) — block-local-Morton **AMR octree** flow subsystem: `peclet::core::amr::AmrFlow`
|
|
61
|
+
(collocated projection Navier–Stokes), device + distributed multigrid (`pcg.hpp`, `multigrid.hpp`,
|
|
62
|
+
`velocity_mg.hpp`, `distributed_*.hpp`), cut-cell IBM (`cut_cell.hpp`) and solution-adaptive refinement
|
|
63
|
+
(`adapt.hpp`, `indicators.hpp`). See `docs/amr_collocated_projection.md`.
|
|
64
|
+
- **Python bindings** (`python/tpx_mpi.cpp`, `python/tpx_amr.cpp`) — **nanobind** modules over the
|
|
65
|
+
shared zero-copy `View`↔ndarray bridge (`include/peclet/core/python/ndarray_interop.hpp`). `peclet.core.mpi` exposes
|
|
66
|
+
the host Lagrangian halo / migration / rebalance; `peclet.core.amr` exposes the device AMR flow.
|
|
67
|
+
|
|
68
|
+
Validated end-to-end by distributed explicit heat-diffusion solvers (plain, and **around an SDF solid
|
|
69
|
+
obstacle**) matching a serial reference cell-for-cell across ranks, and consumed by the validated
|
|
70
|
+
`flow` and `dem` distributed solvers. 26 ctests pass (`np` 1–8 CPU, 1–4 GPU).
|
|
71
|
+
|
|
72
|
+
## Build / test / benchmark
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
|
|
76
|
+
cmake --build build -j
|
|
77
|
+
ctest --test-dir build --output-on-failure # 26 ctests: serial + MPI (np=1,2,4,8)
|
|
78
|
+
|
|
79
|
+
# halo microbenchmark: weak scaling, NBX vs persistent
|
|
80
|
+
mpirun -np 4 ./build/benchmarks/bench_halo 48 1 300 # cells/rank/axis, ghost, iters
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Requires a C++20 compiler; MPI (OpenMPI/MPICH) is needed for multi-rank runs (with MPI disabled the halo uses a single-rank no-MPI stub). `morton` is picked up automatically if
|
|
84
|
+
checked out as a sibling directory (enables `PECLET_CORE_HAVE_MORTON`).
|
|
85
|
+
|
|
86
|
+
## Status
|
|
87
|
+
|
|
88
|
+
Complete and in production. The block decomposition, the async ghost-layer exchange (CPU + portable
|
|
89
|
+
Kokkos GPU, host-staged and opt-in GPU-aware), particle migration, dynamic load balancing (weighted
|
|
90
|
+
ORB + AMR/Lagrangian rebalancing), SDF geometry, the AMR octree flow subsystem (device + distributed
|
|
91
|
+
multigrid, collocated projection), and the nanobind Python bindings are all shipped and tested
|
|
92
|
+
(26 ctests, `np` 1–8 CPU / 1–4 GPU). `flow` (distributed cut-cell IBM Navier–Stokes) and `dem`
|
|
93
|
+
(distributed XPBD with load rebalancing) are validated consumers. The native-CUDA path is retired; Kokkos
|
|
94
|
+
(CUDA / HIP / OpenMP) is the canonical device path. Remaining work is at-scale multi-GPU tuning.
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# core
|
|
2
|
+
|
|
3
|
+
Shared infrastructure for the transport-phenomena simulation suite (see `../docs/` for the suite-wide
|
|
4
|
+
[architecture](../docs/ARCHITECTURE.md), [conventions](../docs/CONVENTIONS.md),
|
|
5
|
+
[style](../docs/STYLE.md), [interfaces](../docs/INTERFACES.md) and [roadmap](../docs/ROADMAP.md)).
|
|
6
|
+
|
|
7
|
+
It provides the pieces every method code (`flow`, `dem`, `voro`, …) should
|
|
8
|
+
share: a common MPI **block domain decomposition**, an efficient **asynchronous ghost-layer
|
|
9
|
+
exchange** (CPU + portable Kokkos GPU), **particle migration**, **dynamic load balancing**, unified
|
|
10
|
+
**SDF geometry** (`peclet::core::geom`), an **AMR octree** flow subsystem (`peclet::core::amr`), and **nanobind Python
|
|
11
|
+
bindings**. Header-only C++20 (the device side, compiled through Kokkos, is also C++20; only the
|
|
12
|
+
`morton` dependency pins C++17 — see `../docs/STYLE.md`). Cut-cell IBM is not a standalone shared
|
|
13
|
+
module: it currently lives inside the AMR flow solver (`peclet::core::amr`) and in `flow`.
|
|
14
|
+
|
|
15
|
+
## What works today
|
|
16
|
+
|
|
17
|
+
- `peclet::core::decomp::BlockDecomposer<Dim>` — orthogonal recursive bisection of a global cell grid into
|
|
18
|
+
rank-owned blocks; `ownerOf()` tree-walk; x-fastest global/local linear indexing.
|
|
19
|
+
- `peclet::core::decomp::BlockIndexer<Dim>` — local↔global indexing for a block with a ghost layer.
|
|
20
|
+
- `peclet::core::halo::NbxEngine` — nonblocking-consensus sparse exchange (Issend + Ibarrier), for dynamic
|
|
21
|
+
patterns.
|
|
22
|
+
- `peclet::core::halo::GridHaloTopology<Dim>` (`grid_halo_topology.hpp`) — asynchronous ghost-layer exchange
|
|
23
|
+
with **topology separated from exchange** and a **field-agnostic** pack/unpack interface.
|
|
24
|
+
`buildTopology()` runs once; two interchangeable exchange engines give identical results:
|
|
25
|
+
- `exchangeNbx()` / `start()`+`wait()` — NBX, supports compute/comm overlap.
|
|
26
|
+
- `exchangePersistent()` — `MPI_Neighbor_alltoallv` on a cached distributed-graph communicator;
|
|
27
|
+
fastest for the static neighbour pattern of a fixed grid.
|
|
28
|
+
- `peclet::core::halo::GridFieldView<T>` — wraps a contiguous local array as an exchangeable field.
|
|
29
|
+
- `peclet::core::halo::GridHalo<T>` (`grid_halo.hpp`) — portable **GPU-resident** ghost-layer exchange (Kokkos:
|
|
30
|
+
CUDA / HIP / OpenMP). Built once from a host `GridHaloTopology<Dim>::flatten()`; pack / unpack /
|
|
31
|
+
periodic self-copy run as `Kokkos::parallel_for` over the device `peclet::core::View<T>` field, so the full
|
|
32
|
+
field never crosses the bus — only the compact halo buffers are host-staged for MPI by default, with
|
|
33
|
+
an opt-in GPU-aware path (env `PECLET_CORE_GPU_AWARE_MPI`). Bit-for-bit identical to the CPU exchange.
|
|
34
|
+
- `peclet::core::halo::ParticleMigrator<Dim>` — Lagrangian particle migration to owning ranks (NBX), the
|
|
35
|
+
dynamic counterpart to the Eulerian grid halo.
|
|
36
|
+
- `peclet::core::halo::ParticleHaloTopology<Dim>` (`particle_halo_topology.hpp`) — persistent Lagrangian ghost
|
|
37
|
+
halo: `forward` (owner→ghost), `reverse` (ghost→owner, accumulate) and `forwardPositions` (periodic
|
|
38
|
+
image shift). `peclet::core::halo::ParticleHalo<Dim>` (`particle_halo.hpp`) is its GPU-resident Kokkos driver
|
|
39
|
+
(on-device gather/scatter, host-staged or GPU-aware MPI), consumed by dem's distributed step.
|
|
40
|
+
- `peclet::core::halo::rebalanceByParticleCount(...)` (`particle_rebalance.hpp`) — **dynamic load balancing**
|
|
41
|
+
for the Lagrangian path: re-inits the decomposition in place with the **weighted ORB**
|
|
42
|
+
(`BlockDecomposer::init(numBlocks, globalSize, weights)`) and migrates. The Eulerian/AMR counterpart
|
|
43
|
+
is `peclet::core::amr::DistributedOctree::rebalance`.
|
|
44
|
+
- `peclet::core::geom` (`sdf.hpp`, `grid_sdf.hpp`, `vti_io.hpp`) — shared SDF solids: analytic primitives +
|
|
45
|
+
trilinear `GridSdf` behind one `Sdf` concept, with VTI (.vti) read/write.
|
|
46
|
+
- `peclet::core::amr` (`include/peclet/core/amr/`) — block-local-Morton **AMR octree** flow subsystem: `peclet::core::amr::AmrFlow`
|
|
47
|
+
(collocated projection Navier–Stokes), device + distributed multigrid (`pcg.hpp`, `multigrid.hpp`,
|
|
48
|
+
`velocity_mg.hpp`, `distributed_*.hpp`), cut-cell IBM (`cut_cell.hpp`) and solution-adaptive refinement
|
|
49
|
+
(`adapt.hpp`, `indicators.hpp`). See `docs/amr_collocated_projection.md`.
|
|
50
|
+
- **Python bindings** (`python/tpx_mpi.cpp`, `python/tpx_amr.cpp`) — **nanobind** modules over the
|
|
51
|
+
shared zero-copy `View`↔ndarray bridge (`include/peclet/core/python/ndarray_interop.hpp`). `peclet.core.mpi` exposes
|
|
52
|
+
the host Lagrangian halo / migration / rebalance; `peclet.core.amr` exposes the device AMR flow.
|
|
53
|
+
|
|
54
|
+
Validated end-to-end by distributed explicit heat-diffusion solvers (plain, and **around an SDF solid
|
|
55
|
+
obstacle**) matching a serial reference cell-for-cell across ranks, and consumed by the validated
|
|
56
|
+
`flow` and `dem` distributed solvers. 26 ctests pass (`np` 1–8 CPU, 1–4 GPU).
|
|
57
|
+
|
|
58
|
+
## Build / test / benchmark
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
|
|
62
|
+
cmake --build build -j
|
|
63
|
+
ctest --test-dir build --output-on-failure # 26 ctests: serial + MPI (np=1,2,4,8)
|
|
64
|
+
|
|
65
|
+
# halo microbenchmark: weak scaling, NBX vs persistent
|
|
66
|
+
mpirun -np 4 ./build/benchmarks/bench_halo 48 1 300 # cells/rank/axis, ghost, iters
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Requires a C++20 compiler; MPI (OpenMPI/MPICH) is needed for multi-rank runs (with MPI disabled the halo uses a single-rank no-MPI stub). `morton` is picked up automatically if
|
|
70
|
+
checked out as a sibling directory (enables `PECLET_CORE_HAVE_MORTON`).
|
|
71
|
+
|
|
72
|
+
## Status
|
|
73
|
+
|
|
74
|
+
Complete and in production. The block decomposition, the async ghost-layer exchange (CPU + portable
|
|
75
|
+
Kokkos GPU, host-staged and opt-in GPU-aware), particle migration, dynamic load balancing (weighted
|
|
76
|
+
ORB + AMR/Lagrangian rebalancing), SDF geometry, the AMR octree flow subsystem (device + distributed
|
|
77
|
+
multigrid, collocated projection), and the nanobind Python bindings are all shipped and tested
|
|
78
|
+
(26 ctests, `np` 1–8 CPU / 1–4 GPU). `flow` (distributed cut-cell IBM Navier–Stokes) and `dem`
|
|
79
|
+
(distributed XPBD with load rebalancing) are validated consumers. The native-CUDA path is retired; Kokkos
|
|
80
|
+
(CUDA / HIP / OpenMP) is the canonical device path. Remaining work is at-scale multi-GPU tuning.
|