peclet-core 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133) hide show
  1. peclet_core-0.1.0/.clang-format +25 -0
  2. peclet_core-0.1.0/.github/workflows/ci.yml +53 -0
  3. peclet_core-0.1.0/.github/workflows/docs.yml +50 -0
  4. peclet_core-0.1.0/.github/workflows/quality.yml +40 -0
  5. peclet_core-0.1.0/.github/workflows/release.yml +41 -0
  6. peclet_core-0.1.0/.gitignore +14 -0
  7. peclet_core-0.1.0/CLAUDE.md +117 -0
  8. peclet_core-0.1.0/CMakeLists.txt +106 -0
  9. peclet_core-0.1.0/LICENSE +21 -0
  10. peclet_core-0.1.0/PKG-INFO +94 -0
  11. peclet_core-0.1.0/README.md +80 -0
  12. peclet_core-0.1.0/benchmarks/CMakeLists.txt +8 -0
  13. peclet_core-0.1.0/benchmarks/bench_amr_flow.cpp +354 -0
  14. peclet_core-0.1.0/benchmarks/bench_halo.cpp +88 -0
  15. peclet_core-0.1.0/docs/Doxyfile +66 -0
  16. peclet_core-0.1.0/docs/amr_collocated_projection.md +161 -0
  17. peclet_core-0.1.0/docs/amr_device_assembly_plan.md +95 -0
  18. peclet_core-0.1.0/docs/cuda-aware-mpi.md +71 -0
  19. peclet_core-0.1.0/include/peclet/core/amr/adapt.hpp +227 -0
  20. peclet_core-0.1.0/include/peclet/core/amr/advect_recon.hpp +40 -0
  21. peclet_core-0.1.0/include/peclet/core/amr/assembly.hpp +179 -0
  22. peclet_core-0.1.0/include/peclet/core/amr/barnes_hut.hpp +207 -0
  23. peclet_core-0.1.0/include/peclet/core/amr/block_octree.hpp +326 -0
  24. peclet_core-0.1.0/include/peclet/core/amr/block_octree_view.hpp +70 -0
  25. peclet_core-0.1.0/include/peclet/core/amr/csr.hpp +119 -0
  26. peclet_core-0.1.0/include/peclet/core/amr/cut_cell.hpp +607 -0
  27. peclet_core-0.1.0/include/peclet/core/amr/distributed_adapt.hpp +99 -0
  28. peclet_core-0.1.0/include/peclet/core/amr/distributed_fv.hpp +541 -0
  29. peclet_core-0.1.0/include/peclet/core/amr/distributed_octree.hpp +826 -0
  30. peclet_core-0.1.0/include/peclet/core/amr/distributed_poisson.hpp +239 -0
  31. peclet_core-0.1.0/include/peclet/core/amr/distributed_view.hpp +338 -0
  32. peclet_core-0.1.0/include/peclet/core/amr/face_csr.hpp +141 -0
  33. peclet_core-0.1.0/include/peclet/core/amr/face_geom.hpp +38 -0
  34. peclet_core-0.1.0/include/peclet/core/amr/facegeom_assembly.hpp +180 -0
  35. peclet_core-0.1.0/include/peclet/core/amr/flow.hpp +859 -0
  36. peclet_core-0.1.0/include/peclet/core/amr/flow_oracle.hpp +375 -0
  37. peclet_core-0.1.0/include/peclet/core/amr/fv_op.hpp +198 -0
  38. peclet_core-0.1.0/include/peclet/core/amr/indicators.hpp +91 -0
  39. peclet_core-0.1.0/include/peclet/core/amr/leaf_field.hpp +75 -0
  40. peclet_core-0.1.0/include/peclet/core/amr/momentum.hpp +584 -0
  41. peclet_core-0.1.0/include/peclet/core/amr/momentum_assembly.hpp +216 -0
  42. peclet_core-0.1.0/include/peclet/core/amr/multigrid.hpp +417 -0
  43. peclet_core-0.1.0/include/peclet/core/amr/pcg.hpp +234 -0
  44. peclet_core-0.1.0/include/peclet/core/amr/poisson.hpp +661 -0
  45. peclet_core-0.1.0/include/peclet/core/amr/refine.hpp +65 -0
  46. peclet_core-0.1.0/include/peclet/core/amr/scalar_transport.hpp +163 -0
  47. peclet_core-0.1.0/include/peclet/core/amr/velocity_mg.hpp +267 -0
  48. peclet_core-0.1.0/include/peclet/core/amr/vtu_io.hpp +126 -0
  49. peclet_core-0.1.0/include/peclet/core/common/mpi.hpp +10 -0
  50. peclet_core-0.1.0/include/peclet/core/common/mpi_stub.hpp +90 -0
  51. peclet_core-0.1.0/include/peclet/core/common/types.hpp +70 -0
  52. peclet_core-0.1.0/include/peclet/core/common/view.hpp +78 -0
  53. peclet_core-0.1.0/include/peclet/core/decomp/block_decomposer.hpp +227 -0
  54. peclet_core-0.1.0/include/peclet/core/decomp/block_indexer.hpp +121 -0
  55. peclet_core-0.1.0/include/peclet/core/decomp/morton_indexer.hpp +87 -0
  56. peclet_core-0.1.0/include/peclet/core/geom/grid_sdf.hpp +80 -0
  57. peclet_core-0.1.0/include/peclet/core/geom/sdf.hpp +94 -0
  58. peclet_core-0.1.0/include/peclet/core/geom/vti_io.hpp +149 -0
  59. peclet_core-0.1.0/include/peclet/core/halo/grid_halo.hpp +145 -0
  60. peclet_core-0.1.0/include/peclet/core/halo/grid_halo_topology.hpp +356 -0
  61. peclet_core-0.1.0/include/peclet/core/halo/nbx.hpp +101 -0
  62. peclet_core-0.1.0/include/peclet/core/halo/particle_halo.hpp +227 -0
  63. peclet_core-0.1.0/include/peclet/core/halo/particle_halo_topology.hpp +286 -0
  64. peclet_core-0.1.0/include/peclet/core/halo/particle_migrator.hpp +301 -0
  65. peclet_core-0.1.0/include/peclet/core/halo/particle_migrator_view.hpp +268 -0
  66. peclet_core-0.1.0/include/peclet/core/halo/particle_rebalance.hpp +67 -0
  67. peclet_core-0.1.0/include/peclet/core/python/ndarray_interop.hpp +172 -0
  68. peclet_core-0.1.0/pyproject.toml +47 -0
  69. peclet_core-0.1.0/python/CMakeLists.txt +70 -0
  70. peclet_core-0.1.0/python/amr_bindings.cpp +756 -0
  71. peclet_core-0.1.0/python/example_amr.py +80 -0
  72. peclet_core-0.1.0/python/mpi_bindings.cpp +279 -0
  73. peclet_core-0.1.0/python/packaging/core_amr.pyi +159 -0
  74. peclet_core-0.1.0/python/packaging/core_init.py +15 -0
  75. peclet_core-0.1.0/python/test_amr.py +303 -0
  76. peclet_core-0.1.0/python/test_mpi.py +89 -0
  77. peclet_core-0.1.0/tests/CMakeLists.txt +340 -0
  78. peclet_core-0.1.0/tests/python/CMakeLists.txt +38 -0
  79. peclet_core-0.1.0/tests/python/interop_test_module.cpp +64 -0
  80. peclet_core-0.1.0/tests/python/test_ndarray_interop.py +77 -0
  81. peclet_core-0.1.0/tests/test_amr_adapt.cpp +142 -0
  82. peclet_core-0.1.0/tests/test_amr_adapt_transport.cpp +152 -0
  83. peclet_core-0.1.0/tests/test_amr_assembly.cpp +141 -0
  84. peclet_core-0.1.0/tests/test_amr_barnes_hut.cpp +85 -0
  85. peclet_core-0.1.0/tests/test_amr_cf_quadratic.cpp +132 -0
  86. peclet_core-0.1.0/tests/test_amr_cut_cell.cpp +139 -0
  87. peclet_core-0.1.0/tests/test_amr_distributed_adapt_mpi.cpp +143 -0
  88. peclet_core-0.1.0/tests/test_amr_distributed_fv_mpi.cpp +169 -0
  89. peclet_core-0.1.0/tests/test_amr_distributed_graded_mg_mpi.cpp +136 -0
  90. peclet_core-0.1.0/tests/test_amr_distributed_mg_mpi.cpp +113 -0
  91. peclet_core-0.1.0/tests/test_amr_distributed_mpi.cpp +170 -0
  92. peclet_core-0.1.0/tests/test_amr_distributed_openness_mpi.cpp +161 -0
  93. peclet_core-0.1.0/tests/test_amr_distributed_poisson_mpi.cpp +109 -0
  94. peclet_core-0.1.0/tests/test_amr_distributed_rebalance_mpi.cpp +183 -0
  95. peclet_core-0.1.0/tests/test_amr_distributed_view_mpi.cpp +138 -0
  96. peclet_core-0.1.0/tests/test_amr_drag.cpp +124 -0
  97. peclet_core-0.1.0/tests/test_amr_face_field.cpp +80 -0
  98. peclet_core-0.1.0/tests/test_amr_facegeom.cpp +118 -0
  99. peclet_core-0.1.0/tests/test_amr_flow.cpp +312 -0
  100. peclet_core-0.1.0/tests/test_amr_flow_solver.cpp +603 -0
  101. peclet_core-0.1.0/tests/test_amr_fv_op.cpp +117 -0
  102. peclet_core-0.1.0/tests/test_amr_fv_openness.cpp +144 -0
  103. peclet_core-0.1.0/tests/test_amr_kappa_dirichlet.cpp +136 -0
  104. peclet_core-0.1.0/tests/test_amr_kappa_restrict.cpp +146 -0
  105. peclet_core-0.1.0/tests/test_amr_momentum.cpp +216 -0
  106. peclet_core-0.1.0/tests/test_amr_momentum_assembly.cpp +140 -0
  107. peclet_core-0.1.0/tests/test_amr_multigrid.cpp +262 -0
  108. peclet_core-0.1.0/tests/test_amr_openness.cpp +135 -0
  109. peclet_core-0.1.0/tests/test_amr_pcg.cpp +216 -0
  110. peclet_core-0.1.0/tests/test_amr_poisson.cpp +188 -0
  111. peclet_core-0.1.0/tests/test_amr_sdf.cpp +84 -0
  112. peclet_core-0.1.0/tests/test_amr_transfer.cpp +166 -0
  113. peclet_core-0.1.0/tests/test_amr_transport.cpp +173 -0
  114. peclet_core-0.1.0/tests/test_amr_vtu.cpp +105 -0
  115. peclet_core-0.1.0/tests/test_block_octree.cpp +199 -0
  116. peclet_core-0.1.0/tests/test_block_octree_view.cpp +107 -0
  117. peclet_core-0.1.0/tests/test_decomposition.cpp +181 -0
  118. peclet_core-0.1.0/tests/test_diffusion.cpp +154 -0
  119. peclet_core-0.1.0/tests/test_diffusion_sdf.cpp +139 -0
  120. peclet_core-0.1.0/tests/test_ghost_particles_mpi.cpp +139 -0
  121. peclet_core-0.1.0/tests/test_grid_halo.cpp +120 -0
  122. peclet_core-0.1.0/tests/test_grid_halo_exchange.cpp +114 -0
  123. peclet_core-0.1.0/tests/test_morton_indexer.cpp +86 -0
  124. peclet_core-0.1.0/tests/test_particle_halo_exchange.cpp +142 -0
  125. peclet_core-0.1.0/tests/test_particle_halo_mpi.cpp +217 -0
  126. peclet_core-0.1.0/tests/test_particle_migration.cpp +122 -0
  127. peclet_core-0.1.0/tests/test_particle_migrator_view_mpi.cpp +159 -0
  128. peclet_core-0.1.0/tests/test_particle_rebalance.cpp +157 -0
  129. peclet_core-0.1.0/tests/test_sdf.cpp +69 -0
  130. peclet_core-0.1.0/tests/test_util.hpp +42 -0
  131. peclet_core-0.1.0/tests/test_vti.cpp +69 -0
  132. peclet_core-0.1.0/tools/cuda_aware_mpi_check.cpp +59 -0
  133. peclet_core-0.1.0/tools/cudampi-env.sh +12 -0
@@ -0,0 +1,25 @@
1
+ # Adopted suite-wide from voronoi_dynamics (Google C++ Style).
2
+ BasedOnStyle: Google
3
+ Language: Cpp
4
+ ColumnLimit: 100
5
+ IndentWidth: 2
6
+ TabWidth: 2
7
+ UseTab: Never
8
+ ContinuationIndentWidth: 4
9
+ BreakBeforeBraces: Attach
10
+ AlignAfterOpenBracket: Align
11
+ AlignTrailingComments: true
12
+ IncludeBlocks: Regroup
13
+ SortIncludes: CaseInsensitive
14
+ PointerAlignment: Left
15
+ ReferenceAlignment: Left
16
+ SpaceBeforeParens: ControlStatements
17
+ AllowShortFunctionsOnASingleLine: Inline
18
+ AllowShortIfStatementsOnASingleLine: false
19
+ AllowShortLoopsOnASingleLine: false
20
+ SpaceAfterTemplateKeyword: true
21
+ BreakConstructorInitializers: BeforeComma
22
+ AccessModifierOffset: -1
23
+ NamespaceIndentation: None
24
+ CompactNamespaces: false
25
+ ReflowComments: true
@@ -0,0 +1,53 @@
1
+ name: CI
2
+
3
+ # Build the header-only library and run the full ctest suite (serial + MPI, np = 1,2,4).
4
+ # core is CPU-testable end-to-end: header-only C++20 + MPI, no GPU required. The optional
5
+ # Kokkos GPU-resident halo path is NOT exercised here (no GPU on the runners) — it is covered by the
6
+ # consumer codes' own CI.
7
+
8
+ on:
9
+ push:
10
+ branches: [main, "**"]
11
+ pull_request:
12
+ workflow_dispatch:
13
+
14
+ jobs:
15
+ cpp:
16
+ name: ubuntu / ${{ matrix.cc }} / MPI / ${{ matrix.build_type }}
17
+ runs-on: ubuntu-latest
18
+ strategy:
19
+ fail-fast: false
20
+ matrix:
21
+ build_type: [Debug, Release]
22
+ include:
23
+ - cc: gcc
24
+ cxx: g++
25
+ - cc: clang
26
+ cxx: clang++
27
+ env:
28
+ CC: ${{ matrix.cc }}
29
+ CXX: ${{ matrix.cxx }}
30
+ OMPI_ALLOW_RUN_AS_ROOT: 1
31
+ OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1
32
+ # GitHub runners have only 2 physical cores; allow OpenMPI to oversubscribe so the
33
+ # np=4 and np=8 correctness tests run (slower, but the logic is what we check).
34
+ OMPI_MCA_rmaps_base_oversubscribe: 1
35
+ PRTE_MCA_rmaps_default_mapping_policy: ":oversubscribe"
36
+ steps:
37
+ - uses: actions/checkout@v4
38
+
39
+ - name: Install MPI
40
+ run: |
41
+ sudo apt-get update
42
+ sudo apt-get install -y libopenmpi-dev openmpi-bin
43
+ # morton is an optional sibling (enables PECLET_CORE_HAVE_MORTON); it is a private repo, so CI
44
+ # builds without it — the halo/decomp tests do not depend on it.
45
+
46
+ - name: Configure
47
+ run: cmake -S . -B build -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
48
+
49
+ - name: Build
50
+ run: cmake --build build -j
51
+
52
+ - name: Test (serial + MPI np=1,2,4)
53
+ run: ctest --test-dir build --output-on-failure
@@ -0,0 +1,50 @@
1
+ name: Documentation
2
+
3
+ # Build the Doxygen API docs (header-only C++20 in include/, plus the Markdown design notes) and
4
+ # publish them to GitHub Pages. Requires Pages to be enabled for the repository with
5
+ # "Source: GitHub Actions" (Settings -> Pages). Runs on every push to main; can also be run manually.
6
+
7
+ on:
8
+ push:
9
+ branches: [main]
10
+ paths:
11
+ - 'include/**'
12
+ - 'docs/**'
13
+ - 'README.md'
14
+ - '.github/workflows/docs.yml'
15
+ workflow_dispatch:
16
+
17
+ permissions:
18
+ contents: read
19
+ pages: write
20
+ id-token: write
21
+
22
+ # allow one concurrent deployment; cancel an in-progress run for a newer push
23
+ concurrency:
24
+ group: pages
25
+ cancel-in-progress: true
26
+
27
+ jobs:
28
+ build:
29
+ runs-on: ubuntu-latest
30
+ steps:
31
+ - uses: actions/checkout@v4
32
+ - name: Install Doxygen + Graphviz
33
+ run: sudo apt-get update && sudo apt-get install -y doxygen graphviz
34
+ - name: Build documentation
35
+ run: doxygen docs/Doxyfile
36
+ - name: Upload Pages artifact
37
+ uses: actions/upload-pages-artifact@v3
38
+ with:
39
+ path: docs/html
40
+
41
+ deploy:
42
+ needs: build
43
+ runs-on: ubuntu-latest
44
+ environment:
45
+ name: github-pages
46
+ url: ${{ steps.deployment.outputs.page_url }}
47
+ steps:
48
+ - name: Deploy to GitHub Pages
49
+ id: deployment
50
+ uses: actions/deploy-pages@v4
@@ -0,0 +1,40 @@
1
+ name: Quality
2
+
3
+ # Lightweight code-quality gates. Python hard-fails on critical errors only. The C++ clang-format
4
+ # check is informational for now (continue-on-error) so version skew between the runner's
5
+ # clang-format and the local one can't red the build; promote it to required once a clang-format
6
+ # version is pinned and the tree is reformatted against it.
7
+
8
+ on:
9
+ push:
10
+ branches: [main, "**"]
11
+ pull_request:
12
+ workflow_dispatch:
13
+
14
+ jobs:
15
+ python-lint:
16
+ name: ruff (critical errors)
17
+ runs-on: ubuntu-latest
18
+ steps:
19
+ - uses: actions/checkout@v4
20
+ - uses: actions/setup-python@v5
21
+ with:
22
+ python-version: "3.12"
23
+ - run: pip install ruff
24
+ - name: Ruff critical-error check
25
+ run: |
26
+ ruff check . --select E9,F63,F7,F82 \
27
+ --exclude ".venv,venv,build,build_*,__pycache__,_deps,extern,legacy,notebooks"
28
+
29
+ cpp-format:
30
+ name: clang-format (informational)
31
+ runs-on: ubuntu-latest
32
+ continue-on-error: true
33
+ steps:
34
+ - uses: actions/checkout@v4
35
+ - run: sudo apt-get update && sudo apt-get install -y clang-format
36
+ - name: clang-format dry-run (uses the repo .clang-format)
37
+ run: |
38
+ files=$(git ls-files 'include/*.hpp' 'tests/*.hpp' 'tests/*.cpp' 'benchmarks/*.cpp')
39
+ [ -z "$files" ] && { echo "no C++ sources"; exit 0; }
40
+ clang-format --dry-run --Werror $files
@@ -0,0 +1,41 @@
1
+ name: Release
2
+
3
+ # Publish peclet-core to PyPI on a version tag via Trusted Publishing (OIDC) — configure the publisher
4
+ # on PyPI first; no API token secret needed. SDIST ONLY: peclet-core's Python surface is the MPI particle
5
+ # halo (peclet.core.mpi) + the Kokkos AMR octree (peclet.core.amr); both link MPI/Kokkos, whose ABI/arch
6
+ # make a portable binary wheel impossible. Consumers `pip install peclet-core` and build against their
7
+ # site MPI (+ optional Kokkos prefix), or use the suite containers. See ../docs/DEPLOYMENT.md.
8
+
9
+ on:
10
+ push:
11
+ tags: ["v*"]
12
+ workflow_dispatch:
13
+
14
+ jobs:
15
+ sdist:
16
+ runs-on: ubuntu-latest
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+ - uses: actions/setup-python@v5
20
+ with:
21
+ python-version: "3.12"
22
+ - run: pip install build
23
+ - run: python -m build --sdist
24
+ - uses: actions/upload-artifact@v4
25
+ with:
26
+ name: sdist
27
+ path: dist/*.tar.gz
28
+
29
+ publish:
30
+ needs: [sdist]
31
+ runs-on: ubuntu-latest
32
+ if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
33
+ environment: pypi
34
+ permissions:
35
+ id-token: write # required for trusted publishing
36
+ steps:
37
+ - uses: actions/download-artifact@v4
38
+ with:
39
+ path: dist
40
+ merge-multiple: true
41
+ - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,14 @@
1
+ build/
2
+ build_*/
3
+ build-*/
4
+ *.o
5
+ *.so
6
+ .cache/
7
+ compile_commands.json
8
+
9
+ # Generated Doxygen output (built in CI, published to Pages)
10
+ docs/html/
11
+
12
+ # Python build/test artifacts
13
+ __pycache__/
14
+ *.pyc
@@ -0,0 +1,117 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## What this is
6
+
7
+ `core` is the shared infrastructure library for the transport-phenomena simulation suite
8
+ (sibling repos under `../`: `flow`, `dem`, `voro`, `morton`). The suite-wide design contract lives in `../docs/` — read
9
+ `../docs/ARCHITECTURE.md`, `CONVENTIONS.md`, `STYLE.md`, `INTERFACES.md`, `ROADMAP.md` before
10
+ cross-cutting changes. Header-only C++20; the device side is compiled through Kokkos (CUDA / HIP /
11
+ OpenMP) and is also C++20 — only the `morton` dependency pins C++17 (see `../docs/STYLE.md`). The legacy
12
+ native-CUDA halo is retired; Kokkos (including its CUDA backend) is the canonical device path.
13
+
14
+ ## Build / test / benchmark
15
+
16
+ ```bash
17
+ # CPU library + tests (no device dependency):
18
+ cmake -S . -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j
19
+ ctest --test-dir build --output-on-failure # serial + MPI halo + particle migration + diffusion
20
+
21
+ # Portable Kokkos device halo (CUDA / HIP / OpenMP) -- opt-in, find_package(Kokkos):
22
+ export PATH=/usr/local/cuda-13.2/bin:$PATH # if the Kokkos install targets the CUDA backend
23
+ cmake -S . -B build_kokkos -DPECLET_CORE_ENABLE_KOKKOS=ON \
24
+ -DCMAKE_PREFIX_PATH=../extern/install/nvidia-cuda
25
+ cmake --build build_kokkos -j && ctest --test-dir build_kokkos --output-on-failure # + GPU halo np=1,2,4
26
+ mpirun -np 4 ./build/benchmarks/bench_halo 48 1 300
27
+ ```
28
+
29
+ The Kokkos halo path is provisioned via `find_package(Kokkos CONFIG)` against a cluster module or the
30
+ suite's local install prefix (`../tools/bootstrap_deps.sh`). The legacy native-CUDA halo was retired.
31
+
32
+ ## Architecture
33
+
34
+ Header-only under `include/peclet/core/`:
35
+
36
+ - `common/types.hpp` — `Index` (int64), `Real` (double), `IVec<Dim>`/`Vec<Dim>`, `wrap()`,
37
+ compile-time `forEachInBox`. **Convention: x-fastest linear index** `I = x + y*nx + z*nx*ny`
38
+ (matches flow and `../docs/CONVENTIONS.md`). Keep this header C++17-clean (shared with `morton`,
39
+ which pins C++17).
40
+ - `decomp/block_decomposer.hpp` — ORB decomposition (ported & modernized from
41
+ `../block_decomposer/src/BlockDecomposer.hpp`). `ownerOf()` walks the implicit binary tree
42
+ (children at `2i+1`/`2i+2`, leaves carry the block index) and is the key primitive for halo
43
+ topology. `linearGlobal`/`multiGlobal` are x-fastest and mutually inverse. `init(numBlocks,
44
+ globalSize, weights)` is the **weighted ORB** for dynamic load balancing: it bisects at the cell
45
+ boundary whose cumulative weight reaches the sub-block target fraction (vs equal cell count);
46
+ equal weights reduce to the unweighted `init()` bit-for-bit.
47
+ - `decomp/block_indexer.hpp` — local↔global indexing for an extended (inner+ghost) block.
48
+ - `decomp/morton_indexer.hpp` — `MortonIndexer<Dim>`: Z-order (Morton) cell indexing via the `morton`
49
+ primitive (`morton::Morton<Dim,Bits>`), guarded by `PECLET_CORE_HAVE_MORTON`. The cache-friendly alternative
50
+ to the x-fastest order (which stays the convention): `codeOf`/`multiIndex` map global multi-index ↔
51
+ Z-order code, `neighborCode` steps one cell along an axis directly in Morton space. Methods carry
52
+ morton's `MORTON_HD`, so they are device-callable under a Kokkos build (the Kokkos build defines
53
+ `MORTON_ENABLE_KOKKOS` ⇒ `MORTON_HD` is `KOKKOS_FUNCTION`).
54
+ - `halo/nbx.hpp` — `NbxEngine`: canonical NBX (Issend + Ibarrier consensus). Reimplements the engine
55
+ from `../block_decomposer/src/MPISync.hpp`. Use for dynamic/sparse exchange.
56
+ - `halo/grid_halo_topology.hpp` — `GridHaloTopology<Dim>`: the ghost-layer exchange. **Topology** (who
57
+ owns each ghost cell, established via one NBX round so owners learn what to send) is built once in
58
+ `buildTopology()`; **exchange** runs every step. Field-agnostic: any type with
59
+ `bytesPerElem()`/`pack(localIdx,dst)`/`unpack(localIdx,src)` works (`GridFieldView<T>` is the
60
+ contiguous-array adapter). Two engines give identical results — `exchangeNbx`/`start`+`wait`
61
+ (overlap-capable) and `exchangePersistent` (`MPI_Neighbor_alltoallv`, faster for static grids).
62
+ `flatten()` exposes a device-friendly topology consumed by the device `GridHalo`.
63
+ - `halo/particle_migrator.hpp` — `ParticleMigrator<Dim>`: Lagrangian counterpart. Reassigns particles
64
+ (positions + opaque fixed-stride payload) to their owning rank via the NBX engine, with periodic wrap.
65
+ `cellOf()` exposes the global binning cell (`ownerOf == dec.ownerOf(cellOf(x))`).
66
+ - `halo/particle_rebalance.hpp` — `rebalanceByParticleCount(dec, mig, pos, payload, …)`: Lagrangian load
67
+ balancing. Bins particles onto the grid, re-inits `dec` in place with the **weighted ORB** (so a
68
+ migrator/halo holding a pointer to it sees the new partition), and migrates. Pure redistribution
69
+ (count/payload preserved). The dem distributed step is the consumer; also bound in `python/mpi_bindings.cpp`.
70
+ - `halo/grid_halo.hpp` — `GridHalo<T>`: portable GPU-resident halo (Kokkos; CUDA / HIP / OpenMP
71
+ backends). pack/unpack/self-copy run as `parallel_for` over the device `peclet::core::View<T>` field; only the
72
+ compact halo buffers are host-staged for MPI by default (the field stays on the device), with an
73
+ opt-in GPU-aware path (env `PECLET_CORE_GPU_AWARE_MPI`, legacy `PECLET_CORE_CUDA_AWARE_MPI` still honoured). Built
74
+ from a host `GridHaloTopology<Dim>::flatten()` via `init()`. Bit-for-bit matches the CPU exchange.
75
+ (The legacy native-CUDA `grid_halo_cuda.cuh` / `DeviceGridExchange<T>` was retired when Kokkos became
76
+ the canonical device path; see `docs/cuda-aware-mpi.md` for the historical
77
+ host-staging-vs-GPU-aware analysis.)
78
+ - `halo/particle_halo_topology.hpp` — `ParticleHaloTopology<Dim>`: persistent Lagrangian ghost halo
79
+ (host topology + field-agnostic exchange). `build()` establishes the owner↔ghost correspondence from
80
+ particle proximity; `forward` (owner→ghost), `reverse` (ghost→owner, accumulate) and
81
+ `forwardPositions` (periodic image shift) are the cheap per-step exchanges. The standard distributed
82
+ particle schemes (frozen/replicate, Newton-on, force-accumulate) are compositions of these.
83
+ - `halo/particle_halo.hpp` — `ParticleHalo<Dim>`: the Kokkos GPU-resident driver for
84
+ `ParticleHaloTopology` (on-device forward gather + reverse atomic-accumulate; host-staged or
85
+ GPU-aware MPI). Built from `ParticleHaloTopology::flatten()`; consumed by dem's distributed step.
86
+ - `geom/` — shared SDF solids. `geom/sdf.hpp` is the `Sdf` concept + analytic primitives;
87
+ `geom/grid_sdf.hpp` is the trilinearly-sampled `GridSdf`; `geom/vti_io.hpp` reads/writes scalar &
88
+ vector VTI (`.vti`). The shared geometry representation behind flow's and dem's cut-cell IBM.
89
+ - `amr/` — block-local-Morton **AMR octree** flow subsystem (`peclet::core::amr`, guarded by `PECLET_CORE_HAVE_MORTON`).
90
+ `amr/block_octree.hpp` is the per-block octree; `amr/flow.hpp` is the canonical device `AmrFlow`
91
+ (collocated-projection Navier–Stokes with `maskSolid` and a div-free face field), with
92
+ `amr/flow_oracle.hpp` an unexposed serial host reference. Device + distributed multigrid live in
93
+ `amr/pcg.hpp`, `amr/multigrid.hpp`, `amr/velocity_mg.hpp`, `amr/momentum.hpp` and the
94
+ `amr/distributed_*.hpp` set (`distributed_octree.hpp::rebalance` is the Eulerian leaf/field load
95
+ balancer). Cut-cell openness is `amr/cut_cell.hpp`; solution-adaptive refinement is `amr/adapt.hpp` /
96
+ `amr/indicators.hpp` / `amr/refine.hpp`. Design notes: `docs/amr_collocated_projection.md`,
97
+ `docs/amr_device_assembly_plan.md`.
98
+ - `python/` + `include/peclet/core/python/ndarray_interop.hpp` — **nanobind** Python bindings over a
99
+ shared **zero-copy `peclet::core::View`↔ndarray bridge** (`include/peclet/core/python/ndarray_interop.hpp`).
100
+ `python/mpi_bindings.cpp` is host-only (no Kokkos): exposes `ParticleMigrator` / `ParticleHaloTopology` /
101
+ `rebalanceByParticleCount` for an mpi4py driver. `python/amr_bindings.cpp` exposes the device `AmrFlow`
102
+ (needs the `morton` sibling + a Kokkos backend). Both are built via `include(SuiteNanobind)` +
103
+ `suite_require_nanobind()` from `../cmake/SuiteNanobind.cmake` (suite-root).
104
+
105
+ ## Gotchas
106
+
107
+ - `GridHalo` caches a distributed-graph `MPI_Comm`. Its destructor guards `MPI_Comm_free` with
108
+ `MPI_Finalized` so an instance that outlives `MPI_Finalize` (e.g. on `main`'s stack) does not abort.
109
+ Don't remove that guard. The class is non-copyable (it owns the comm).
110
+ - The halo is owner-based, not adjacency-based: a ghost cell maps to whichever rank owns its wrapped
111
+ global cell, so it is correct for ORB's irregular block neighbours and any ghost width — no
112
+ Cartesian-grid assumption.
113
+ - Tests are dependency-free (`tests/test_util.hpp`, non-zero exit on failure). MPI tests run under
114
+ `mpirun` at several rank counts via ctest.
115
+ - `../cmake/SuiteNanobind.cmake` MUST be a CMake **macro**, not a `function()`: it sets/propagates
116
+ variables (the located nanobind, the interpreter) into the including scope, which a function's nested
117
+ scope would swallow. Keep `suite_require_nanobind` defined as a macro.
@@ -0,0 +1,106 @@
1
+ cmake_minimum_required(VERSION 3.24)
2
+ project(transport_core LANGUAGES CXX)
3
+
4
+ # --- Standard: C++20 for host and Kokkos device code (see suite/docs/STYLE.md); only the morton dependency pins C++17. ---
5
+ set(CMAKE_CXX_STANDARD 20)
6
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
7
+ set(CMAKE_CXX_EXTENSIONS OFF)
8
+
9
+ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
10
+ set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE)
11
+ endif()
12
+
13
+ option(PECLET_CORE_BUILD_TESTS "Build tests" ON)
14
+ option(PECLET_CORE_BUILD_BENCHMARKS "Build benchmarks" ON)
15
+ option(PECLET_CORE_ENABLE_MPI "Build the halo layer against MPI (OFF = single-rank no-MPI stub)" ON)
16
+
17
+ # --- Header-only core library ---
18
+ add_library(tpx_core INTERFACE)
19
+ add_library(tpx::core ALIAS tpx_core)
20
+ target_include_directories(tpx_core INTERFACE
21
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
22
+ $<INSTALL_INTERFACE:include>)
23
+ target_compile_features(tpx_core INTERFACE cxx_std_20)
24
+
25
+ # Optional: the morton spatial-index primitive if present as a sibling checkout.
26
+ set(_morton_inc "${CMAKE_CURRENT_SOURCE_DIR}/../morton/include")
27
+ if(EXISTS "${_morton_inc}/morton/morton.hpp")
28
+ target_include_directories(tpx_core INTERFACE $<BUILD_INTERFACE:${_morton_inc}>)
29
+ target_compile_definitions(tpx_core INTERFACE PECLET_CORE_HAVE_MORTON=1)
30
+ message(STATUS "transport-core: morton found at ${_morton_inc}")
31
+ endif()
32
+
33
+ # --- Halo layer: against MPI (default), or a single-rank no-MPI stub (PECLET_CORE_ENABLE_MPI=OFF). One code
34
+ # path either way -- see peclet/core/common/mpi.hpp + mpi_stub.hpp. PECLET_CORE_HALO_OK marks the layer available
35
+ # (MPI tests/benchmarks still guard on MPI_FOUND; the no-MPI build runs single-rank). ---
36
+ set(PECLET_CORE_HALO_OK OFF)
37
+ if(PECLET_CORE_ENABLE_MPI)
38
+ find_package(MPI COMPONENTS CXX)
39
+ if(MPI_FOUND)
40
+ # Pin the launcher to the MPI compiler's own prefix. FindMPI locates the compiler
41
+ # wrapper (mpicxx) deterministically, but searches MPIEXEC_EXECUTABLE on PATH — so a
42
+ # foreign launcher earlier on PATH (e.g. ParaView's bundled mpiexec) gets picked up
43
+ # while the binary is built against the system MPI. That mismatch is silent and nasty:
44
+ # every rank inits as a singleton (MPI_Comm_size==1), so `mpirun -n N` runs N
45
+ # independent serial processes and multi-rank tests "pass" without ever communicating.
46
+ # The launcher next to mpicxx always belongs to the same MPI, on any system.
47
+ # Scheduler-launcher clusters (srun/aprun) pass -DPECLET_CORE_PIN_MPIEXEC=OFF and set
48
+ # MPIEXEC_EXECUTABLE themselves.
49
+ option(PECLET_CORE_PIN_MPIEXEC "Pin MPIEXEC_EXECUTABLE to the MPI compiler's prefix" ON)
50
+ if(PECLET_CORE_PIN_MPIEXEC AND MPI_CXX_COMPILER)
51
+ get_filename_component(_mpi_bin "${MPI_CXX_COMPILER}" DIRECTORY)
52
+ unset(_mpi_launcher CACHE)
53
+ find_program(_mpi_launcher NAMES mpirun mpiexec HINTS "${_mpi_bin}" NO_DEFAULT_PATH)
54
+ if(_mpi_launcher AND NOT _mpi_launcher STREQUAL "${MPIEXEC_EXECUTABLE}")
55
+ message(STATUS "transport-core: MPIEXEC_EXECUTABLE was '${MPIEXEC_EXECUTABLE}' — "
56
+ "pinning to '${_mpi_launcher}' (matches ${MPI_CXX_COMPILER})")
57
+ set(MPIEXEC_EXECUTABLE "${_mpi_launcher}" CACHE FILEPATH "MPI launcher" FORCE)
58
+ endif()
59
+ unset(_mpi_launcher CACHE)
60
+ endif()
61
+ add_library(tpx_halo INTERFACE)
62
+ add_library(tpx::halo ALIAS tpx_halo)
63
+ target_link_libraries(tpx_halo INTERFACE tpx_core MPI::MPI_CXX)
64
+ set(PECLET_CORE_HALO_OK ON)
65
+ message(STATUS "transport-core: MPI found (${MPI_CXX_COMPILER}) — halo layer enabled")
66
+ else()
67
+ message(WARNING "transport-core: MPI not found — halo layer and its tests are disabled")
68
+ endif()
69
+ else()
70
+ add_library(tpx_halo INTERFACE)
71
+ add_library(tpx::halo ALIAS tpx_halo)
72
+ target_link_libraries(tpx_halo INTERFACE tpx_core)
73
+ target_compile_definitions(tpx_halo INTERFACE PECLET_CORE_NO_MPI=1)
74
+ set(PECLET_CORE_HALO_OK ON)
75
+ message(STATUS "transport-core: MPI disabled — halo uses the single-rank no-MPI stub")
76
+ endif()
77
+
78
+ # --- Optional Kokkos: portable (CUDA/HIP/OpenMP) GPU-resident halo exchange ---
79
+ # (The legacy native-CUDA halo, grid_halo_cuda.cuh, was retired in favour of the Kokkos path below.)
80
+ # Consumed via find_package(Kokkos CONFIG); provide it with a cluster module or the suite's local
81
+ # install prefix (suite/tools/bootstrap_deps.sh) on CMAKE_PREFIX_PATH. Opt-in (OFF by default);
82
+ # with the native-CUDA path retired, this is the only device halo path.
83
+ option(PECLET_CORE_ENABLE_KOKKOS "Build the portable Kokkos halo path (find_package(Kokkos))" OFF)
84
+ set(PECLET_CORE_HAVE_KOKKOS OFF)
85
+ if(PECLET_CORE_ENABLE_KOKKOS AND PECLET_CORE_HALO_OK)
86
+ find_package(Kokkos CONFIG REQUIRED)
87
+ set(PECLET_CORE_HAVE_KOKKOS ON)
88
+ message(STATUS "transport-core: Kokkos ${Kokkos_VERSION} found (${Kokkos_DEVICES}) — "
89
+ "portable halo path enabled")
90
+ # When morton is present, build it in Kokkos mode so MORTON_HD resolves to
91
+ # KOKKOS_FUNCTION — the MortonIndexer (peclet/core/decomp/morton_indexer.hpp) is then
92
+ # callable from device kernels on any backend.
93
+ if(TARGET Kokkos::kokkos AND EXISTS "${_morton_inc}/morton/morton.hpp")
94
+ target_link_libraries(tpx_core INTERFACE Kokkos::kokkos)
95
+ target_compile_definitions(tpx_core INTERFACE MORTON_ENABLE_KOKKOS=1)
96
+ endif()
97
+ endif()
98
+
99
+ if(PECLET_CORE_BUILD_TESTS)
100
+ enable_testing()
101
+ add_subdirectory(tests)
102
+ endif()
103
+
104
+ if(PECLET_CORE_BUILD_BENCHMARKS AND MPI_FOUND)
105
+ add_subdirectory(benchmarks)
106
+ endif()
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Frank Peters
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,94 @@
1
+ Metadata-Version: 2.4
2
+ Name: peclet-core
3
+ Version: 0.1.0
4
+ Summary: peclet.core — shared transport-core infrastructure: Lagrangian particle halo (MPI) + AMR octree
5
+ Author-Email: Frank Peters <e.a.j.f.peters@gmail.com>
6
+ License-Expression: MIT
7
+ License-File: LICENSE
8
+ Project-URL: Homepage, https://github.com/computational-chemical-engineering/peclet
9
+ Project-URL: Documentation, https://github.com/computational-chemical-engineering/peclet
10
+ Project-URL: Source, https://github.com/computational-chemical-engineering/peclet-core
11
+ Requires-Python: >=3.10
12
+ Requires-Dist: numpy>=1.20
13
+ Description-Content-Type: text/markdown
14
+
15
+ # core
16
+
17
+ Shared infrastructure for the transport-phenomena simulation suite (see `../docs/` for the suite-wide
18
+ [architecture](../docs/ARCHITECTURE.md), [conventions](../docs/CONVENTIONS.md),
19
+ [style](../docs/STYLE.md), [interfaces](../docs/INTERFACES.md) and [roadmap](../docs/ROADMAP.md)).
20
+
21
+ It provides the pieces every method code (`flow`, `dem`, `voro`, …) should
22
+ share: a common MPI **block domain decomposition**, an efficient **asynchronous ghost-layer
23
+ exchange** (CPU + portable Kokkos GPU), **particle migration**, **dynamic load balancing**, unified
24
+ **SDF geometry** (`peclet::core::geom`), an **AMR octree** flow subsystem (`peclet::core::amr`), and **nanobind Python
25
+ bindings**. Header-only C++20 (the device side, compiled through Kokkos, is also C++20; only the
26
+ `morton` dependency pins C++17 — see `../docs/STYLE.md`). Cut-cell IBM (immersed boundary method) is not a standalone shared
27
+ module: it currently lives inside the AMR flow solver (`peclet::core::amr`) and in `flow`.
28
+
29
+ ## What works today
30
+
31
+ - `peclet::core::decomp::BlockDecomposer<Dim>` — orthogonal recursive bisection (ORB) of a global cell grid into
32
+ rank-owned blocks; `ownerOf()` tree-walk; x-fastest global/local linear indexing.
33
+ - `peclet::core::decomp::BlockIndexer<Dim>` — local↔global indexing for a block with a ghost layer.
34
+ - `peclet::core::halo::NbxEngine` — nonblocking-consensus sparse exchange (NBX; `MPI_Issend` + `MPI_Ibarrier`), for dynamic
35
+ patterns.
36
+ - `peclet::core::halo::GridHaloTopology<Dim>` (`grid_halo_topology.hpp`) — asynchronous ghost-layer exchange
37
+ with **topology separated from exchange** and a **field-agnostic** pack/unpack interface.
38
+ `buildTopology()` runs once; two interchangeable exchange engines give identical results:
39
+ - `exchangeNbx()` / `start()`+`wait()` — NBX, supports compute/comm overlap.
40
+ - `exchangePersistent()` — `MPI_Neighbor_alltoallv` on a cached distributed-graph communicator;
41
+ fastest for the static neighbour pattern of a fixed grid.
42
+ - `peclet::core::halo::GridFieldView<T>` — wraps a contiguous local array as an exchangeable field.
43
+ - `peclet::core::halo::GridHalo<T>` (`grid_halo.hpp`) — portable **GPU-resident** ghost-layer exchange (Kokkos:
44
+ CUDA / HIP / OpenMP). Built once from a host `GridHaloTopology<Dim>::flatten()`; pack / unpack /
45
+ periodic self-copy run as `Kokkos::parallel_for` over the device `peclet::core::View<T>` field, so the full
46
+ field never crosses the bus — only the compact halo buffers are host-staged for MPI by default, with
47
+ an opt-in GPU-aware path (env `PECLET_CORE_GPU_AWARE_MPI`). Bit-for-bit identical to the CPU exchange.
48
+ - `peclet::core::halo::ParticleMigrator<Dim>` — Lagrangian particle migration to owning ranks (NBX), the
49
+ dynamic counterpart to the Eulerian grid halo.
50
+ - `peclet::core::halo::ParticleHaloTopology<Dim>` (`particle_halo_topology.hpp`) — persistent Lagrangian ghost
51
+ halo: `forward` (owner→ghost), `reverse` (ghost→owner, accumulate) and `forwardPositions` (periodic
52
+ image shift). `peclet::core::halo::ParticleHalo<Dim>` (`particle_halo.hpp`) is its GPU-resident Kokkos driver
53
+ (on-device gather/scatter, host-staged or GPU-aware MPI), consumed by dem's distributed step.
54
+ - `peclet::core::halo::rebalanceByParticleCount(...)` (`particle_rebalance.hpp`) — **dynamic load balancing**
55
+ for the Lagrangian path: re-inits the decomposition in place with the **weighted ORB**
56
+ (`BlockDecomposer::init(numBlocks, globalSize, weights)`) and migrates. The Eulerian/AMR counterpart
57
+ is `peclet::core::amr::DistributedOctree::rebalance`.
58
+ - `peclet::core::geom` (`sdf.hpp`, `grid_sdf.hpp`, `vti_io.hpp`) — shared SDF solids: analytic primitives +
59
+ trilinear `GridSdf` behind one `Sdf` concept, with VTI (.vti) read/write.
60
+ - `peclet::core::amr` (`include/peclet/core/amr/`) — block-local-Morton **AMR octree** flow subsystem: `peclet::core::amr::AmrFlow`
61
+ (collocated projection Navier–Stokes), device + distributed multigrid (`pcg.hpp`, `multigrid.hpp`,
62
+ `velocity_mg.hpp`, `distributed_*.hpp`), cut-cell IBM (`cut_cell.hpp`) and solution-adaptive refinement
63
+ (`adapt.hpp`, `indicators.hpp`). See `docs/amr_collocated_projection.md`.
64
+ - **Python bindings** (`python/tpx_mpi.cpp`, `python/tpx_amr.cpp`) — **nanobind** modules over the
65
+ shared zero-copy `View`↔ndarray bridge (`include/peclet/core/python/ndarray_interop.hpp`). `peclet.core.mpi` exposes
66
+ the host Lagrangian halo / migration / rebalance; `peclet.core.amr` exposes the device AMR flow.
67
+
68
+ Validated end-to-end by distributed explicit heat-diffusion solvers (plain, and **around an SDF solid
69
+ obstacle**) matching a serial reference cell-for-cell across ranks, and consumed by the validated
70
+ `flow` and `dem` distributed solvers. 26 ctests pass (`np` 1–8 CPU, 1–4 GPU).
71
+
72
+ ## Build / test / benchmark
73
+
74
+ ```bash
75
+ cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
76
+ cmake --build build -j
77
+ ctest --test-dir build --output-on-failure # 26 ctests: serial + MPI (np=1,2,4,8)
78
+
79
+ # halo microbenchmark: weak scaling, NBX vs persistent
80
+ mpirun -np 4 ./build/benchmarks/bench_halo 48 1 300 # cells/rank/axis, ghost, iters
81
+ ```
82
+
83
+ Requires MPI (OpenMPI/MPICH) and a C++20 compiler. `morton` is picked up automatically if
84
+ checked out as a sibling directory (enables `PECLET_CORE_HAVE_MORTON`).
85
+
86
+ ## Status
87
+
88
+ Complete and in production. The block decomposition, the async ghost-layer exchange (CPU + portable
89
+ Kokkos GPU, host-staged and opt-in GPU-aware), particle migration, dynamic load balancing (weighted
90
+ ORB + AMR/Lagrangian rebalancing), SDF geometry, the AMR octree flow subsystem (device + distributed
91
+ multigrid, collocated projection), and the nanobind Python bindings are all shipped and tested
92
+ (26 ctests, `np` 1–8 CPU / 1–4 GPU). `flow` (distributed cut-cell IBM Navier–Stokes) and `dem`
93
+ (distributed XPBD with load rebalancing) are validated consumers. The standalone (non-Kokkos) CUDA path is retired; Kokkos
94
+ (CUDA / HIP / OpenMP) is the canonical device path. Remaining work is at-scale multi-GPU tuning.
@@ -0,0 +1,80 @@
1
+ # core
2
+
3
+ Shared infrastructure for the transport-phenomena simulation suite (see `../docs/` for the suite-wide
4
+ [architecture](../docs/ARCHITECTURE.md), [conventions](../docs/CONVENTIONS.md),
5
+ [style](../docs/STYLE.md), [interfaces](../docs/INTERFACES.md) and [roadmap](../docs/ROADMAP.md)).
6
+
7
+ It provides the pieces every method code (`flow`, `dem`, `voro`, …) should
8
+ share: a common MPI **block domain decomposition**, an efficient **asynchronous ghost-layer
9
+ exchange** (CPU + portable Kokkos GPU), **particle migration**, **dynamic load balancing**, unified
10
+ **SDF geometry** (`peclet::core::geom`), an **AMR octree** flow subsystem (`peclet::core::amr`), and **nanobind Python
11
+ bindings**. Header-only C++20 (the device side, compiled through Kokkos, is also C++20; only the
12
+ `morton` dependency pins C++17 — see `../docs/STYLE.md`). Cut-cell IBM (immersed boundary method) is not a standalone shared
13
+ module: it currently lives inside the AMR flow solver (`peclet::core::amr`) and in `flow`.
14
+
15
+ ## What works today
16
+
17
+ - `peclet::core::decomp::BlockDecomposer<Dim>` — orthogonal recursive bisection (ORB) of a global cell grid into
18
+ rank-owned blocks; `ownerOf()` tree-walk; x-fastest global/local linear indexing.
19
+ - `peclet::core::decomp::BlockIndexer<Dim>` — local↔global indexing for a block with a ghost layer.
20
+ - `peclet::core::halo::NbxEngine` — nonblocking-consensus sparse exchange (NBX; `MPI_Issend` + `MPI_Ibarrier`), for dynamic
21
+ patterns.
22
+ - `peclet::core::halo::GridHaloTopology<Dim>` (`grid_halo_topology.hpp`) — asynchronous ghost-layer exchange
23
+ with **topology separated from exchange** and a **field-agnostic** pack/unpack interface.
24
+ `buildTopology()` runs once; two interchangeable exchange engines give identical results:
25
+ - `exchangeNbx()` / `start()`+`wait()` — NBX, supports compute/comm overlap.
26
+ - `exchangePersistent()` — `MPI_Neighbor_alltoallv` on a cached distributed-graph communicator;
27
+ fastest for the static neighbour pattern of a fixed grid.
28
+ - `peclet::core::halo::GridFieldView<T>` — wraps a contiguous local array as an exchangeable field.
29
+ - `peclet::core::halo::GridHalo<T>` (`grid_halo.hpp`) — portable **GPU-resident** ghost-layer exchange (Kokkos:
30
+ CUDA / HIP / OpenMP). Built once from a host `GridHaloTopology<Dim>::flatten()`; pack / unpack /
31
+ periodic self-copy run as `Kokkos::parallel_for` over the device `peclet::core::View<T>` field, so the full
32
+ field never crosses the bus — only the compact halo buffers are host-staged for MPI by default, with
33
+ an opt-in GPU-aware path (env `PECLET_CORE_GPU_AWARE_MPI`). Bit-for-bit identical to the CPU exchange.
34
+ - `peclet::core::halo::ParticleMigrator<Dim>` — Lagrangian particle migration to owning ranks (NBX), the
35
+ dynamic counterpart to the Eulerian grid halo.
36
+ - `peclet::core::halo::ParticleHaloTopology<Dim>` (`particle_halo_topology.hpp`) — persistent Lagrangian ghost
37
+ halo: `forward` (owner→ghost), `reverse` (ghost→owner, accumulate) and `forwardPositions` (periodic
38
+ image shift). `peclet::core::halo::ParticleHalo<Dim>` (`particle_halo.hpp`) is its GPU-resident Kokkos driver
39
+ (on-device gather/scatter, host-staged or GPU-aware MPI), consumed by dem's distributed step.
40
+ - `peclet::core::halo::rebalanceByParticleCount(...)` (`particle_rebalance.hpp`) — **dynamic load balancing**
41
+ for the Lagrangian path: re-inits the decomposition in place with the **weighted ORB**
42
+ (`BlockDecomposer::init(numBlocks, globalSize, weights)`) and migrates. The Eulerian/AMR counterpart
43
+ is `peclet::core::amr::DistributedOctree::rebalance`.
44
+ - `peclet::core::geom` (`sdf.hpp`, `grid_sdf.hpp`, `vti_io.hpp`) — shared SDF solids: analytic primitives +
45
+ trilinear `GridSdf` behind one `Sdf` concept, with VTI (.vti) read/write.
46
+ - `peclet::core::amr` (`include/peclet/core/amr/`) — block-local-Morton **AMR octree** flow subsystem: `peclet::core::amr::AmrFlow`
47
+ (collocated projection Navier–Stokes), device + distributed multigrid (`pcg.hpp`, `multigrid.hpp`,
48
+ `velocity_mg.hpp`, `distributed_*.hpp`), cut-cell IBM (`cut_cell.hpp`) and solution-adaptive refinement
49
+ (`adapt.hpp`, `indicators.hpp`). See `docs/amr_collocated_projection.md`.
50
+ - **Python bindings** (`python/tpx_mpi.cpp`, `python/tpx_amr.cpp`) — **nanobind** modules over the
51
+ shared zero-copy `View`↔ndarray bridge (`include/peclet/core/python/ndarray_interop.hpp`). `peclet.core.mpi` exposes
52
+ the host Lagrangian halo / migration / rebalance; `peclet.core.amr` exposes the device AMR flow.
53
+
54
+ Validated end-to-end by distributed explicit heat-diffusion solvers (plain, and **around an SDF solid
55
+ obstacle**) matching a serial reference cell-for-cell across ranks, and consumed by the validated
56
+ `flow` and `dem` distributed solvers. 26 ctests pass (`np` 1–8 CPU, 1–4 GPU).
57
+
58
+ ## Build / test / benchmark
59
+
60
+ ```bash
61
+ cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
62
+ cmake --build build -j
63
+ ctest --test-dir build --output-on-failure # 26 ctests: serial + MPI (np=1,2,4,8)
64
+
65
+ # halo microbenchmark: weak scaling, NBX vs persistent
66
+ mpirun -np 4 ./build/benchmarks/bench_halo 48 1 300 # cells/rank/axis, ghost, iters
67
+ ```
68
+
69
+ Requires MPI (OpenMPI/MPICH) and a C++20 compiler. `morton` is picked up automatically if
70
+ checked out as a sibling directory (enables `PECLET_CORE_HAVE_MORTON`).
71
+
72
+ ## Status
73
+
74
+ Complete and in production. The block decomposition, the async ghost-layer exchange (CPU + portable
75
+ Kokkos GPU, host-staged and opt-in GPU-aware), particle migration, dynamic load balancing (weighted
76
+ ORB + AMR/Lagrangian rebalancing), SDF geometry, the AMR octree flow subsystem (device + distributed
77
+ multigrid, collocated projection), and the nanobind Python bindings are all shipped and tested
78
+ (26 ctests, `np` 1–8 CPU / 1–4 GPU). `flow` (distributed cut-cell IBM Navier–Stokes) and `dem`
79
+ (distributed XPBD with load rebalancing) are validated consumers. The standalone (non-Kokkos) CUDA path is retired; Kokkos
80
+ (CUDA / HIP / OpenMP) is the canonical device path. Remaining work is at-scale multi-GPU tuning.