galform-analysis 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. galform_analysis-0.1.0/LICENSE +21 -0
  2. galform_analysis-0.1.0/PKG-INFO +133 -0
  3. galform_analysis-0.1.0/README.md +82 -0
  4. galform_analysis-0.1.0/galform_analysis/__init__.py +86 -0
  5. galform_analysis-0.1.0/galform_analysis/analysis/__init__.py +47 -0
  6. galform_analysis-0.1.0/galform_analysis/analysis/aggregation.py +314 -0
  7. galform_analysis-0.1.0/galform_analysis/analysis/correlation/__init__.py +68 -0
  8. galform_analysis-0.1.0/galform_analysis/analysis/correlation/correlation.py +508 -0
  9. galform_analysis-0.1.0/galform_analysis/analysis/correlation/dm_correlation.py +681 -0
  10. galform_analysis-0.1.0/galform_analysis/analysis/correlation/galaxy_bias.py +55 -0
  11. galform_analysis-0.1.0/galform_analysis/analysis/correlation/n_point_bruteforce.py +119 -0
  12. galform_analysis-0.1.0/galform_analysis/analysis/correlation/satellite_cross_correlation.py +255 -0
  13. galform_analysis-0.1.0/galform_analysis/analysis/correlation/scope_wrapper.py +21 -0
  14. galform_analysis-0.1.0/galform_analysis/analysis/correlation/subvol_weighted_correction.py +693 -0
  15. galform_analysis-0.1.0/galform_analysis/analysis/correlation/three_point_bruteforce.py +92 -0
  16. galform_analysis-0.1.0/galform_analysis/analysis/correlation/three_point_reference.py +95 -0
  17. galform_analysis-0.1.0/galform_analysis/analysis/correlation/three_point_scope.py +156 -0
  18. galform_analysis-0.1.0/galform_analysis/analysis/mass_functions/__init__.py +53 -0
  19. galform_analysis-0.1.0/galform_analysis/analysis/mass_functions/hmf.py +373 -0
  20. galform_analysis-0.1.0/galform_analysis/analysis/mass_functions/hod.py +570 -0
  21. galform_analysis-0.1.0/galform_analysis/analysis/mass_functions/smf.py +295 -0
  22. galform_analysis-0.1.0/galform_analysis/analysis/mass_functions/theoretical_hmf.py +566 -0
  23. galform_analysis-0.1.0/galform_analysis/analysis/redshift_space_distortions/__init__.py +1 -0
  24. galform_analysis-0.1.0/galform_analysis/analysis/redshift_space_distortions/subvol_weighted_multipoles.py +429 -0
  25. galform_analysis-0.1.0/galform_analysis/config.py +230 -0
  26. galform_analysis-0.1.0/galform_analysis/readers/__init__.py +17 -0
  27. galform_analysis-0.1.0/galform_analysis/readers/loaders.py +289 -0
  28. galform_analysis-0.1.0/galform_analysis/redshift_lists/COLIBRE-L100m6.txt +3 -0
  29. galform_analysis-0.1.0/galform_analysis/redshift_lists/FLAMINGO-L1000N1800.txt +78 -0
  30. galform_analysis-0.1.0/galform_analysis/redshift_lists/L800.txt +253 -0
  31. galform_analysis-0.1.0/galform_analysis/redshift_lists/Mill1.txt +2 -0
  32. galform_analysis-0.1.0/galform_analysis/redshift_lists/Mill2.txt +2 -0
  33. galform_analysis-0.1.0/galform_analysis/sim_configs/COLIBRE.json +14 -0
  34. galform_analysis-0.1.0/galform_analysis/sim_configs/FLAMINGO.json +14 -0
  35. galform_analysis-0.1.0/galform_analysis/sim_configs/L800.json +14 -0
  36. galform_analysis-0.1.0/galform_analysis/sim_configs/Mill1.json +14 -0
  37. galform_analysis-0.1.0/galform_analysis/sim_configs/Mill2.json +14 -0
  38. galform_analysis-0.1.0/galform_analysis/utils/__init__.py +17 -0
  39. galform_analysis-0.1.0/galform_analysis/utils/matplotlib_config.py +115 -0
  40. galform_analysis-0.1.0/galform_analysis/utils/read_galaxies.py +357 -0
  41. galform_analysis-0.1.0/galform_analysis/utils/stats.py +77 -0
  42. galform_analysis-0.1.0/galform_analysis.egg-info/PKG-INFO +133 -0
  43. galform_analysis-0.1.0/galform_analysis.egg-info/SOURCES.txt +46 -0
  44. galform_analysis-0.1.0/galform_analysis.egg-info/dependency_links.txt +1 -0
  45. galform_analysis-0.1.0/galform_analysis.egg-info/requires.txt +19 -0
  46. galform_analysis-0.1.0/galform_analysis.egg-info/top_level.txt +1 -0
  47. galform_analysis-0.1.0/pyproject.toml +58 -0
  48. galform_analysis-0.1.0/setup.cfg +4 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Oscar Hickman
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,133 @@
1
+ Metadata-Version: 2.4
2
+ Name: galform_analysis
3
+ Version: 0.1.0
4
+ Summary: A modular Python framework for reading and analyzing GALFORM HDF5 simulation outputs.
5
+ Author-email: Oscar Hickman <oscar.hickman17@alumni.imperial.ac.uk>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Oscar Hickman
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+ Project-URL: Homepage, https://github.com/OscarHickman/galform_analysis
28
+ Project-URL: Repository, https://github.com/OscarHickman/galform_analysis
29
+ Project-URL: Issues, https://github.com/OscarHickman/galform_analysis/issues
30
+ Requires-Python: >=3.12
31
+ Description-Content-Type: text/markdown
32
+ License-File: LICENSE
33
+ Requires-Dist: numpy>=1.23.0
34
+ Requires-Dist: scipy>=1.7.0
35
+ Requires-Dist: matplotlib>=3.3.0
36
+ Requires-Dist: polars>=1.0.0
37
+ Requires-Dist: h5py>=3.0.0
38
+ Requires-Dist: seaborn>=0.11.0
39
+ Requires-Dist: Corrfunc>=2.3.0
40
+ Provides-Extra: dev
41
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
42
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
43
+ Requires-Dist: build; extra == "dev"
44
+ Provides-Extra: science
45
+ Requires-Dist: astropy>=4.0; extra == "science"
46
+ Requires-Dist: hmf>=3.0; extra == "science"
47
+ Requires-Dist: packaging>=20.0; extra == "science"
48
+ Requires-Dist: deprecation>=2.0; extra == "science"
49
+ Requires-Dist: halotools>=0.7.0; extra == "science"
50
+ Dynamic: license-file
51
+
52
+ # galform_analysis
53
+
54
+ [![CI](https://github.com/OscarHickman/galform_analysis/actions/workflows/ci.yml/badge.svg)](https://github.com/OscarHickman/galform_analysis/actions/workflows/ci.yml)
55
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
56
+
57
+
58
+ A modular Python framework designed for the efficient reading and analysis of GALFORM HDF5 simulation outputs. This library provides standardized tools for processing large-scale galaxy formation data, from low-level HDF5 I/O to high-level astronomical probes.
59
+
60
+ ## Core Features
61
+
62
+ - **Standardized I/O**: Robust loaders for GALFORM `galaxies.hdf5` files with support for different output versions.
63
+ - **Data Aggregation**: Tools to scan simulation directories and aggregate data across subvolumes using high-performance `polars` dataframes.
64
+ - **Mass Functions**: Computation of Stellar Mass Functions (SMF), Halo Mass Functions (HMF), and Halo Occupation Distribution (HOD).
65
+ - **Correlation Functions**: Estimators for 2-point, 3-point, and N-point correlation functions (2PCF/NPCF) including subvolume-weighted corrections for convergence analysis.
66
+ - **Redshift-Space Distortions**: Estimators for anisotropic clustering ($\xi(s, \mu)$) and multipoles ($\xi_0, \xi_2, \xi_4$).
67
+ - **Simulation Management**: Built-in configurations for major N-body simulations including L800, Millennium I/II, COLIBRE, and FLAMINGO.
68
+
69
+ ## Installation
70
+
71
+ Install the package in your Python environment:
72
+
73
+ ```bash
74
+ uv pip install -e .
75
+ ```
76
+
77
+ ### Dependencies
78
+ The library requires `numpy`, `scipy`, `matplotlib`, `polars`, `h5py`, `seaborn`, and `Corrfunc`. These are automatically managed during installation.
79
+
80
+ ## Quick Start
81
+
82
+ The following example demonstrates how to load a simulation configuration and read galaxy data:
83
+
84
+ ```python
85
+ from galform_analysis import SimulationConfig, config
86
+ from galform_analysis.readers.loaders import read_snapshot_data
87
+
88
+ # 1. Access simulation-specific constants (box size, cosmology, etc.)
89
+ sim = SimulationConfig('L800')
90
+ print(f"Simulation: {sim.name}, Box Size: {sim.box_size} Mpc/h")
91
+
92
+ # 2. Configure the data location
93
+ config.set_base_dir('/path/to/Galform_Out/L800/model_name')
94
+
95
+ # 3. Load snapshot data for a specific subvolume
96
+ data = read_snapshot_data('iz271', ivol=0)
97
+ mstar = data['mstar'] # Stellar masses in M_sun/h
98
+ ```
99
+
100
+ ## Simulation Metadata
101
+
102
+ Configurations for supported simulations are stored centrally in `galform_analysis/sim_configs/`. This allows for dynamic access to cosmological parameters and volume metadata:
103
+
104
+ ```python
105
+ from galform_analysis import SimulationConfig
106
+
107
+ flamingo = SimulationConfig('FLAMINGO')
108
+ omega_m = flamingo.omega_m
109
+ h0 = flamingo.h0
110
+ ```
111
+
112
+ ## Documentation & Examples
113
+
114
+ Refer to the `examples/` directory for interactive Jupyter notebooks:
115
+ - `examples/readers/load_snapshot.ipynb`: Introduction to data loading.
116
+ - `examples/analysis/mass_functions/smf_example.ipynb`: Plotting Stellar Mass Functions.
117
+ - `examples/analysis/correlation/correlation_example.ipynb`: Computing clustering statistics.
118
+
119
+ ## Testing & Quality Standards
120
+
121
+ The project maintains high code quality through automated linting and comprehensive testing:
122
+
123
+ ```bash
124
+ # Run the test suite
125
+ pytest tests
126
+
127
+ # Check code style
128
+ ruff check galform_analysis
129
+ ```
130
+
131
+ ## Author
132
+
133
+ Oscar Hickman
@@ -0,0 +1,82 @@
1
+ # galform_analysis
2
+
3
+ [![CI](https://github.com/OscarHickman/galform_analysis/actions/workflows/ci.yml/badge.svg)](https://github.com/OscarHickman/galform_analysis/actions/workflows/ci.yml)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
+
6
+
7
+ A modular Python framework designed for the efficient reading and analysis of GALFORM HDF5 simulation outputs. This library provides standardized tools for processing large-scale galaxy formation data, from low-level HDF5 I/O to high-level astronomical probes.
8
+
9
+ ## Core Features
10
+
11
+ - **Standardized I/O**: Robust loaders for GALFORM `galaxies.hdf5` files with support for different output versions.
12
+ - **Data Aggregation**: Tools to scan simulation directories and aggregate data across subvolumes using high-performance `polars` dataframes.
13
+ - **Mass Functions**: Computation of Stellar Mass Functions (SMF), Halo Mass Functions (HMF), and Halo Occupation Distribution (HOD).
14
+ - **Correlation Functions**: Estimators for 2-point, 3-point, and N-point correlation functions (2PCF/NPCF) including subvolume-weighted corrections for convergence analysis.
15
+ - **Redshift-Space Distortions**: Estimators for anisotropic clustering ($\xi(s, \mu)$) and multipoles ($\xi_0, \xi_2, \xi_4$).
16
+ - **Simulation Management**: Built-in configurations for major N-body simulations including L800, Millennium I/II, COLIBRE, and FLAMINGO.
17
+
18
+ ## Installation
19
+
20
+ Install the package in your Python environment:
21
+
22
+ ```bash
23
+ uv pip install -e .
24
+ ```
25
+
26
+ ### Dependencies
27
+ The library requires `numpy`, `scipy`, `matplotlib`, `polars`, `h5py`, `seaborn`, and `Corrfunc`. These are automatically managed during installation.
28
+
29
+ ## Quick Start
30
+
31
+ The following example demonstrates how to load a simulation configuration and read galaxy data:
32
+
33
+ ```python
34
+ from galform_analysis import SimulationConfig, config
35
+ from galform_analysis.readers.loaders import read_snapshot_data
36
+
37
+ # 1. Access simulation-specific constants (box size, cosmology, etc.)
38
+ sim = SimulationConfig('L800')
39
+ print(f"Simulation: {sim.name}, Box Size: {sim.box_size} Mpc/h")
40
+
41
+ # 2. Configure the data location
42
+ config.set_base_dir('/path/to/Galform_Out/L800/model_name')
43
+
44
+ # 3. Load snapshot data for a specific subvolume
45
+ data = read_snapshot_data('iz271', ivol=0)
46
+ mstar = data['mstar'] # Stellar masses in M_sun/h
47
+ ```
48
+
49
+ ## Simulation Metadata
50
+
51
+ Configurations for supported simulations are stored centrally in `galform_analysis/sim_configs/`. This allows for dynamic access to cosmological parameters and volume metadata:
52
+
53
+ ```python
54
+ from galform_analysis import SimulationConfig
55
+
56
+ flamingo = SimulationConfig('FLAMINGO')
57
+ omega_m = flamingo.omega_m
58
+ h0 = flamingo.h0
59
+ ```
60
+
61
+ ## Documentation & Examples
62
+
63
+ Refer to the `examples/` directory for interactive Jupyter notebooks:
64
+ - `examples/readers/load_snapshot.ipynb`: Introduction to data loading.
65
+ - `examples/analysis/mass_functions/smf_example.ipynb`: Plotting Stellar Mass Functions.
66
+ - `examples/analysis/correlation/correlation_example.ipynb`: Computing clustering statistics.
67
+
68
+ ## Testing & Quality Standards
69
+
70
+ The project maintains high code quality through automated linting and comprehensive testing:
71
+
72
+ ```bash
73
+ # Run the test suite
74
+ pytest tests
75
+
76
+ # Check code style
77
+ ruff check galform_analysis
78
+ ```
79
+
80
+ ## Author
81
+
82
+ Oscar Hickman
@@ -0,0 +1,86 @@
1
+ """galform_analysis - A Python library for GALFORM simulation analysis.
2
+
3
+ This library provides tools for analyzing GALFORM galaxy formation simulation outputs,
4
+ including:
5
+ - Reading HDF5 snapshot data
6
+ - Computing mass functions (stellar and halo)
7
+ - Aggregating data across subvolumes
8
+
9
+ Quick Start:
10
+ >>> from galform_analysis.config import set_base_dir
11
+ >>> from galform_analysis.analysis import avg_hmf_given_redshift_and_subvolumes
12
+ >>> from galform_analysis.analysis import avg_smf_given_redshift_and_subvolumes
13
+ >>>
14
+ >>> # Set your GALFORM output directory
15
+ >>> set_base_dir('/path/to/galform/output')
16
+ >>>
17
+ >>> # Compute stellar mass function
18
+ >>> smf = avg_smf_given_redshift_and_subvolumes(iz_num=99, ivols=[0, 1, 2])
19
+
20
+ Configuration:
21
+ Set the BASE_DIR for your GALFORM outputs:
22
+ - Via Python: config.set_base_dir('/path')
23
+ - Via environment: export GALFORM_BASE_DIR=/path
24
+ - Edit config.py directly
25
+ """
26
+
27
+ __version__ = "0.1.0"
28
+
29
+ # Import key modules for convenience
30
+ from galform_analysis import analysis, config
31
+ from galform_analysis.analysis import (
32
+ aggregate_snapshot,
33
+ avg_hmf_given_redshift_and_subvolumes,
34
+ avg_hmf_given_redshifts_and_subvolume,
35
+ avg_smf_given_redshift_and_subvolumes,
36
+ avg_smf_given_redshifts_and_subvolume,
37
+ # HMF functions
38
+ hmf_given_redshift_and_subvolume,
39
+ hmfs_given_redshifts_and_subvolume,
40
+ # SMF functions
41
+ smf_given_redshift_and_subvolume,
42
+ smfs_given_redshifts_and_subvolume,
43
+ )
44
+
45
+ # Expose commonly used functions at package level
46
+ from galform_analysis.config import (
47
+ Cosmology,
48
+ SimulationConfig,
49
+ find_snapshot_at_redshift,
50
+ get_base_dir,
51
+ get_snapshot_redshift,
52
+ load_redshift_mapping,
53
+ load_sim_config,
54
+ set_base_dir,
55
+ )
56
+ from galform_analysis.readers import close_snapshot, read_snapshot_data
57
+
58
# Names exported by ``from galform_analysis import *``.
# Every entry must be bound in this module (or be an importable submodule),
# otherwise the wildcard import raises AttributeError. The package ships a
# ``readers`` subpackage (bound here by the ``from galform_analysis.readers
# import ...`` statement above); there is no ``io`` submodule.
__all__ = [
    "__version__",
    # Submodules
    "config",
    "readers",
    "analysis",
    # Common functions
    "set_base_dir",
    "get_base_dir",
    "Cosmology",
    "load_sim_config",
    "SimulationConfig",
    "load_redshift_mapping",
    "get_snapshot_redshift",
    "find_snapshot_at_redshift",
    "read_snapshot_data",
    "close_snapshot",
    "aggregate_snapshot",
    # HMF functions
    "hmf_given_redshift_and_subvolume",
    "hmfs_given_redshifts_and_subvolume",
    "avg_hmf_given_redshift_and_subvolumes",
    "avg_hmf_given_redshifts_and_subvolume",
    # SMF functions
    "smf_given_redshift_and_subvolume",
    "smfs_given_redshifts_and_subvolume",
    "avg_smf_given_redshift_and_subvolumes",
    "avg_smf_given_redshifts_and_subvolume",
]
@@ -0,0 +1,47 @@
1
+ """Analysis subpackage for GALFORM data processing."""
2
+
3
+ from .aggregation import aggregate_snapshot, completed_galaxies, incomplete_subvolumes
4
+ from .correlation import (
5
+ avg_correlation_given_redshift_and_subvolumes,
6
+ compute_xi_corrfunc,
7
+ correlation_given_redshift_and_subvolume,
8
+ )
9
+ from .mass_functions import (
10
+ avg_hmf_given_redshift_and_subvolumes,
11
+ avg_hmf_given_redshifts_and_subvolume,
12
+ avg_hod_given_redshift_and_subvolumes,
13
+ avg_hod_given_redshifts_and_subvolume,
14
+ avg_smf_given_redshift_and_subvolumes,
15
+ avg_smf_given_redshifts_and_subvolume,
16
+ hmf_given_redshift_and_subvolume,
17
+ hmfs_given_redshifts_and_subvolume,
18
+ hod_given_redshift_and_subvolume,
19
+ hods_given_redshifts_and_subvolume,
20
+ smf_given_redshift_and_subvolume,
21
+ smfs_given_redshifts_and_subvolume,
22
+ )
23
+
24
+ __all__ = [
25
+ "aggregate_snapshot",
26
+ "completed_galaxies",
27
+ "incomplete_subvolumes",
28
+ # HMF functions
29
+ "hmf_given_redshift_and_subvolume",
30
+ "hmfs_given_redshifts_and_subvolume",
31
+ "avg_hmf_given_redshift_and_subvolumes",
32
+ "avg_hmf_given_redshifts_and_subvolume",
33
+ # SMF functions
34
+ "smf_given_redshift_and_subvolume",
35
+ "smfs_given_redshifts_and_subvolume",
36
+ "avg_smf_given_redshift_and_subvolumes",
37
+ "avg_smf_given_redshifts_and_subvolume",
38
+ # HOD functions
39
+ "hod_given_redshift_and_subvolume",
40
+ "hods_given_redshifts_and_subvolume",
41
+ "avg_hod_given_redshift_and_subvolumes",
42
+ "avg_hod_given_redshifts_and_subvolume",
43
+ # Correlation functions
44
+ "compute_xi_corrfunc",
45
+ "correlation_given_redshift_and_subvolume",
46
+ "avg_correlation_given_redshift_and_subvolumes",
47
+ ]
@@ -0,0 +1,314 @@
1
+ """Analysis functions for aggregating GALFORM data across subvolumes."""
2
+
3
+ import glob
4
+ import os
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ import h5py
9
+ import numpy as np
10
+ import polars as pl
11
+
12
+ from galform_analysis.config import get_base_dir
13
+ from galform_analysis.readers.loaders import close_snapshot, read_snapshot_data
14
+
15
+
16
def completed_galaxies(
    basedir: Optional[str] = None, iz_snapshots: Optional[List[int]] = None
) -> pl.DataFrame:
    """Scan a GALFORM output tree and report which galaxy files are readable.

    Walks every ``iz*/ivol*`` directory below ``basedir`` and inspects the
    ``galaxies.hdf5`` file inside it. A file is marked as completed when it is
    at least 1000 bytes long and can be opened read-only with h5py. The
    contents of the file (e.g. any completion flag dataset) are not inspected.

    Subvolume directories without a ``galaxies.hdf5`` file, directories whose
    names do not end in an integer, and files whose size cannot be read are
    skipped and produce no row.

    Args:
        basedir: Base directory containing iz* snapshot folders. When None
            (the default), ``get_base_dir()`` is called at call time.
        iz_snapshots: Optional list of snapshot numbers (e.g., [82, 100, 105]).
            If provided, only these snapshots are scanned (those whose
            directory does not exist are ignored). If None, all iz*
            directories are scanned.

    Returns:
        DataFrame sorted by (iz_num, ivol) with columns:
            - iz: Snapshot name (e.g., 'iz100')
            - iz_num: Numeric iz value (e.g., 100)
            - ivol: Subvolume number
            - path: Full path to the galaxies.hdf5 file
            - completed: Whether the file passed the size and open checks
        The columns are present even when no files are found.
    """
    if basedir is None:
        # Resolve at call time: a ``basedir=get_base_dir()`` default would be
        # evaluated once at import and ignore later set_base_dir() calls.
        basedir = get_base_dir()
    basedir = os.fspath(basedir)

    if iz_snapshots is not None:
        # Restrict the scan to the requested snapshots that exist on disk.
        requested = (os.path.join(basedir, f"iz{iz}") for iz in iz_snapshots)
        iz_dirs = sorted(d for d in requested if os.path.isdir(d))
    else:
        iz_dirs = sorted(glob.glob(os.path.join(basedir, "iz*")))

    records = []

    for iz_dir in iz_dirs:
        iz_name = Path(iz_dir).name
        try:
            iz_num = int(iz_name.replace("iz", ""))
        except ValueError:
            # Not an iz<number> directory (e.g. a stray "iz_old").
            continue

        for ivol_dir in sorted(glob.glob(os.path.join(iz_dir, "ivol*"))):
            try:
                ivol_num = int(Path(ivol_dir).name.replace("ivol", ""))
            except ValueError:
                continue

            gal_file = os.path.join(ivol_dir, "galaxies.hdf5")
            if not os.path.exists(gal_file):
                continue

            try:
                file_size = os.path.getsize(gal_file)
            except OSError:
                continue

            completed = False
            # Files under 1000 bytes are treated as incomplete without
            # attempting to open them.
            if file_size >= 1000:
                try:
                    with h5py.File(gal_file, "r", swmr=True):
                        completed = True
                except (OSError, KeyError, RuntimeError):
                    # Any failure to open (truncated header, bad object
                    # header, transient I/O error) counts as incomplete.
                    completed = False

            records.append(
                {
                    "iz": iz_name,
                    "iz_num": iz_num,
                    "ivol": ivol_num,
                    "path": gal_file,
                    "completed": completed,
                }
            )

    # Explicit schema so an empty scan still yields the documented columns.
    df = pl.DataFrame(
        records,
        schema={
            "iz": pl.String,
            "iz_num": pl.Int64,
            "ivol": pl.Int64,
            "path": pl.String,
            "completed": pl.Boolean,
        },
    )

    if not df.is_empty():
        df = df.sort(["iz_num", "ivol"])

    return df
130
+
131
+
132
def incomplete_subvolumes(
    basedir: Optional[str] = None, iz_snapshots: Optional[List[int]] = None
) -> pl.DataFrame:
    """Scan a GALFORM output tree and report missing or unreadable galaxy files.

    Walks every ``iz*/ivol*`` directory below ``basedir`` and emits one row
    for each subvolume whose ``galaxies.hdf5`` is absent, too small, cannot be
    stat'ed, or cannot be opened with h5py. Subvolumes whose file opens
    cleanly produce no row. Only subvolume directories that exist on disk are
    considered; directories whose names do not end in an integer are skipped.

    Args:
        basedir: Base directory containing iz* snapshot folders. When None
            (the default), ``get_base_dir()`` is called at call time.
        iz_snapshots: Optional list of snapshot numbers (e.g., [82, 100, 105]).
            If provided, only these snapshots are scanned (those whose
            directory does not exist are ignored). If None, all iz*
            directories are scanned.

    Returns:
        DataFrame sorted by (iz_num, ivol) with columns:
            - iz: Snapshot name (e.g., 'iz100')
            - iz_num: Numeric iz value (e.g., 100)
            - ivol: Subvolume number
            - path: Path to the expected galaxies.hdf5 file (may not exist)
            - reason: One of 'missing' (no file), 'incomplete' (file smaller
              than 1000 bytes), 'inaccessible' (size could not be read) or
              'corrupted' (h5py could not open the file)
        The columns are present even when nothing is reported.
    """
    if basedir is None:
        # Resolve at call time: a ``basedir=get_base_dir()`` default would be
        # evaluated once at import and ignore later set_base_dir() calls.
        basedir = get_base_dir()
    basedir = os.fspath(basedir)

    if iz_snapshots is not None:
        # Restrict the scan to the requested snapshots that exist on disk.
        requested = (os.path.join(basedir, f"iz{iz}") for iz in iz_snapshots)
        iz_dirs = sorted(d for d in requested if os.path.isdir(d))
    else:
        iz_dirs = sorted(glob.glob(os.path.join(basedir, "iz*")))

    records = []

    for iz_dir in iz_dirs:
        iz_name = Path(iz_dir).name
        try:
            iz_num = int(iz_name.replace("iz", ""))
        except ValueError:
            continue

        for ivol_dir in sorted(glob.glob(os.path.join(iz_dir, "ivol*"))):
            try:
                ivol_num = int(Path(ivol_dir).name.replace("ivol", ""))
            except ValueError:
                continue

            gal_file = os.path.join(ivol_dir, "galaxies.hdf5")

            # Classify the file; ``reason`` stays None when it is healthy so
            # that a readable file can never inherit a reason from an
            # earlier iteration.
            reason = None
            if not os.path.exists(gal_file):
                reason = "missing"
            else:
                try:
                    file_size = os.path.getsize(gal_file)
                except OSError:
                    reason = "inaccessible"
                else:
                    if file_size < 1000:
                        # Under 1000 bytes is treated as incomplete without
                        # attempting to open the file.
                        reason = "incomplete"
                    else:
                        try:
                            with h5py.File(gal_file, "r", swmr=True):
                                pass  # File opens cleanly: nothing to report.
                        except (OSError, KeyError, RuntimeError):
                            reason = "corrupted"

            if reason is None:
                continue

            records.append(
                {
                    "iz": iz_name,
                    "iz_num": iz_num,
                    "ivol": ivol_num,
                    "path": gal_file,
                    "reason": reason,
                }
            )

    # Explicit schema so an empty result still yields the documented columns.
    df = pl.DataFrame(
        records,
        schema={
            "iz": pl.String,
            "iz_num": pl.Int64,
            "ivol": pl.Int64,
            "path": pl.String,
            "reason": pl.String,
        },
    )

    if not df.is_empty():
        df = df.sort(["iz_num", "ivol"])

    return df
260
+
261
+
262
def aggregate_snapshot(iz_path: str) -> Optional[Dict[str, Any]]:
    """Aggregate mstar, mhalo, and volume from all ivols in a snapshot.

    Each ``ivol<N>`` directory under ``iz_path`` is read with
    ``read_snapshot_data(iz_path, ivol=N)``. Subvolumes that fail to load are
    skipped (best effort), as are directories whose name does not end in an
    integer (e.g. ``ivol_backup``).

    Args:
        iz_path: Path to the snapshot directory

    Returns:
        Dictionary with keys:
            - 'iz': Name of the snapshot directory
            - 'z': The 'z' entry of the first subvolume that loaded
              (may be None)
            - 'volume': Sum of the positive 'V_ivol' entries
            - 'mstar', 'mhalo': Concatenated arrays (empty array when no
              subvolume provided that quantity)
        Returns None if there are no ivol directories or no subvolume
        yielded mstar or mhalo data.
    """
    ivol_paths = sorted(glob.glob(os.path.join(iz_path, "ivol*")))
    if not ivol_paths:
        return None

    all_mstar, all_mhalo = [], []
    total_vol = 0
    z = None

    for ivp in ivol_paths:
        try:
            iv = int(Path(ivp).name.replace("ivol", ""))
        except ValueError:
            # Stray directory that merely starts with "ivol"; skip it rather
            # than abort the whole aggregation.
            continue

        try:
            data = read_snapshot_data(iz_path, ivol=iv)
            try:
                if data.get("V_ivol") and data["V_ivol"] > 0:
                    total_vol += data["V_ivol"]
                if z is None:
                    z = data.get("z")

                mstar = data.get("mstar")
                mhalo = data.get("mhalo")
                if mstar is not None:
                    all_mstar.append(mstar)
                if mhalo is not None:
                    all_mhalo.append(mhalo)
            finally:
                # Release the snapshot even if processing above failed.
                close_snapshot(data)
        except Exception:
            # Deliberate best effort: an unreadable subvolume must not
            # prevent aggregation of the remaining ones.
            continue

    if not all_mstar and not all_mhalo:
        return None

    return {
        "iz": Path(iz_path).name,
        "z": z,
        "volume": total_vol,
        "mstar": np.concatenate(all_mstar) if all_mstar else np.array([]),
        "mhalo": np.concatenate(all_mhalo) if all_mhalo else np.array([]),
    }
310
+
311
+
312
if __name__ == "__main__":
    # Manual run: scan the configured base directory for galaxy files.
    # The resulting DataFrame is bound to ``df`` but not printed or saved.
    base_dir = get_base_dir()
    df = completed_galaxies(str(base_dir))