manifold-microscope 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. manifold_microscope-0.0.1/LICENSE +29 -0
  2. manifold_microscope-0.0.1/PKG-INFO +182 -0
  3. manifold_microscope-0.0.1/README.md +133 -0
  4. manifold_microscope-0.0.1/experiment_scripts/__init__.py +0 -0
  5. manifold_microscope-0.0.1/experiment_scripts/manifold_analysis/__init__.py +0 -0
  6. manifold_microscope-0.0.1/experiment_scripts/manifold_analysis/analysis.py +107 -0
  7. manifold_microscope-0.0.1/experiment_scripts/manifold_fitting/__init__.py +0 -0
  8. manifold_microscope-0.0.1/experiment_scripts/manifold_fitting/analysis.py +106 -0
  9. manifold_microscope-0.0.1/experiment_scripts/manifold_fitting/inference.py +147 -0
  10. manifold_microscope-0.0.1/experiment_scripts/manifold_fitting/mmls.py +63 -0
  11. manifold_microscope-0.0.1/experiment_scripts/manifold_fitting/training.py +220 -0
  12. manifold_microscope-0.0.1/experiment_scripts/model_configs.py +102 -0
  13. manifold_microscope-0.0.1/experiment_scripts/toy_manifolds_experiment/__init__.py +0 -0
  14. manifold_microscope-0.0.1/experiment_scripts/toy_manifolds_experiment/fit_and_get_measures.py +218 -0
  15. manifold_microscope-0.0.1/experiment_scripts/toy_manifolds_experiment/manifold_fitting_denoising_autoencoder.py +156 -0
  16. manifold_microscope-0.0.1/experiment_scripts/toy_manifolds_experiment/manifold_fitting_no_noise.py +87 -0
  17. manifold_microscope-0.0.1/manifold_microscope.egg-info/PKG-INFO +182 -0
  18. manifold_microscope-0.0.1/manifold_microscope.egg-info/SOURCES.txt +61 -0
  19. manifold_microscope-0.0.1/manifold_microscope.egg-info/dependency_links.txt +1 -0
  20. manifold_microscope-0.0.1/manifold_microscope.egg-info/requires.txt +23 -0
  21. manifold_microscope-0.0.1/manifold_microscope.egg-info/top_level.txt +3 -0
  22. manifold_microscope-0.0.1/microscope/__init__.py +0 -0
  23. manifold_microscope-0.0.1/microscope/computations_grid/__init__.py +0 -0
  24. manifold_microscope-0.0.1/microscope/computations_grid/basic.py +166 -0
  25. manifold_microscope-0.0.1/microscope/computations_grid/curvature.py +240 -0
  26. manifold_microscope-0.0.1/microscope/computations_grid/data_analysis/__init__.py +0 -0
  27. manifold_microscope-0.0.1/microscope/computations_grid/data_analysis/data_analysis.py +630 -0
  28. manifold_microscope-0.0.1/microscope/computations_grid/data_analysis/merge_analysis_outputs.py +314 -0
  29. manifold_microscope-0.0.1/microscope/computations_grid/data_analysis/run_data_analysis.py +229 -0
  30. manifold_microscope-0.0.1/microscope/computations_grid/reach.py +148 -0
  31. manifold_microscope-0.0.1/microscope/computations_grid/volume.py +100 -0
  32. manifold_microscope-0.0.1/microscope/cyclic_dimensions.py +57 -0
  33. manifold_microscope-0.0.1/microscope/datasets/__init__.py +0 -0
  34. manifold_microscope-0.0.1/microscope/datasets/coil20.py +171 -0
  35. manifold_microscope-0.0.1/microscope/datasets/custom_dsprites.py +392 -0
  36. manifold_microscope-0.0.1/microscope/datasets/dataset_split.py +120 -0
  37. manifold_microscope-0.0.1/microscope/datasets/generic_dataset_loader.py +476 -0
  38. manifold_microscope-0.0.1/microscope/datasets/image_transforms.py +156 -0
  39. manifold_microscope-0.0.1/microscope/datasets/noise_adding.py +103 -0
  40. manifold_microscope-0.0.1/microscope/datasets/original_dsprites.py +58 -0
  41. manifold_microscope-0.0.1/microscope/datasets/toy_manifolds.py +686 -0
  42. manifold_microscope-0.0.1/microscope/manifold_examples/__init__.py +0 -0
  43. manifold_microscope-0.0.1/microscope/manifold_examples/ellipsoid.py +77 -0
  44. manifold_microscope-0.0.1/microscope/manifold_examples/hyperboloid.py +47 -0
  45. manifold_microscope-0.0.1/microscope/manifold_examples/plotting.py +74 -0
  46. manifold_microscope-0.0.1/microscope/manifold_examples/sampling_grid.py +103 -0
  47. manifold_microscope-0.0.1/microscope/manifold_examples/sampling_uniform.py +273 -0
  48. manifold_microscope-0.0.1/microscope/manifold_examples/sphere.py +41 -0
  49. manifold_microscope-0.0.1/microscope/manifold_examples/symbolic_computations.py +332 -0
  50. manifold_microscope-0.0.1/microscope/manifold_examples/utils.py +58 -0
  51. manifold_microscope-0.0.1/microscope/patches.py +120 -0
  52. manifold_microscope-0.0.1/pyproject.toml +74 -0
  53. manifold_microscope-0.0.1/representation_learning/__init__.py +0 -0
  54. manifold_microscope-0.0.1/representation_learning/beta_vae/LICENSE +21 -0
  55. manifold_microscope-0.0.1/representation_learning/beta_vae/NOTICE.md +17 -0
  56. manifold_microscope-0.0.1/representation_learning/beta_vae/__init__.py +0 -0
  57. manifold_microscope-0.0.1/representation_learning/beta_vae/dataset.py +106 -0
  58. manifold_microscope-0.0.1/representation_learning/beta_vae/inference_intermediate_layers.py +185 -0
  59. manifold_microscope-0.0.1/representation_learning/beta_vae/main.py +79 -0
  60. manifold_microscope-0.0.1/representation_learning/beta_vae/model.py +172 -0
  61. manifold_microscope-0.0.1/representation_learning/beta_vae/solver.py +432 -0
  62. manifold_microscope-0.0.1/representation_learning/beta_vae/utils.py +50 -0
  63. manifold_microscope-0.0.1/setup.cfg +4 -0
@@ -0,0 +1,29 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2026, Marios Koulakis
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ 1. Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ 3. Neither the name of the copyright holder nor the names of its contributors
17
+ may be used to endorse or promote products derived from this software
18
+ without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,182 @@
1
+ Metadata-Version: 2.4
2
+ Name: manifold-microscope
3
+ Version: 0.0.1
4
+ Summary: The data manifold under the microscope.
5
+ Author: Marios Koulakis
6
+ License-Expression: BSD-3-Clause
7
+ Project-URL: Homepage, https://github.com/koulakis/manifold-microscope
8
+ Project-URL: Repository, https://github.com/koulakis/manifold-microscope
9
+ Project-URL: Issues, https://github.com/koulakis/manifold-microscope/issues
10
+ Keywords: manifold learning,differential geometry,geometric deep learning,datasets,curvature
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3 :: Only
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Topic :: Scientific/Engineering
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Requires-Python: >=3.10
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ License-File: representation_learning/beta_vae/LICENSE
25
+ License-File: representation_learning/beta_vae/NOTICE.md
26
+ Requires-Dist: torch>=2.2.0
27
+ Requires-Dist: numpy>=1.22.0
28
+ Requires-Dist: tqdm
29
+ Requires-Dist: opencv-python
30
+ Requires-Dist: Pillow
31
+ Requires-Dist: scipy
32
+ Requires-Dist: geomloss
33
+ Requires-Dist: joblib
34
+ Requires-Dist: sympy
35
+ Requires-Dist: scikit-learn
36
+ Requires-Dist: seaborn
37
+ Requires-Dist: einops>=0.8.0
38
+ Provides-Extra: benchmark
39
+ Requires-Dist: torchvision; extra == "benchmark"
40
+ Requires-Dist: typer; extra == "benchmark"
41
+ Requires-Dist: pynndescent>=0.5.13; extra == "benchmark"
42
+ Requires-Dist: pytorch-lightning; extra == "benchmark"
43
+ Requires-Dist: PyYAML; extra == "benchmark"
44
+ Requires-Dist: matplotlib; extra == "benchmark"
45
+ Requires-Dist: pandas; extra == "benchmark"
46
+ Requires-Dist: timm>=1.0.14; extra == "benchmark"
47
+ Requires-Dist: lightly>=1.5.18; extra == "benchmark"
48
+ Dynamic: license-file
49
+
50
+ > [!CAUTION]
51
+ > **This project is under active construction.** The code runs and can be used to explore the framework, but it still
52
+ > needs more updates before it can be relied on to reproduce the original paper results or to support new research
53
+ > projects.
54
+
55
+ # The Data Manifold Under the Microscope
56
+
57
+ ## Library transition status
58
+
59
+ ---
60
+ This repository is being transitioned from a paper-review artifact into a reusable Python library for working with
61
+ grid-sampled data manifolds and geometric measurements. The current library-facing core is the dataset generation and
62
+ finite-difference geometry code. The manifold fitting and bounds evaluation code is still included as reference material
63
+ from the accompanying paper and as example usage of the framework, but it is not yet a polished or stable public API.
64
+
65
+ Here is a map of the main components:
66
+
67
+ - Datasets
68
+ - There are two notebooks under `notebooks/datasets_and_measures` which contain examples of loading and visualizing the toy and image datasets.
69
+ - The toy datasets code is in `microscope/datasets/toy_manifolds.py`
70
+ - The image datasets code is in `microscope/datasets/custom_dsprites.py` and `microscope/datasets/coil20.py` respectively.
71
+ - Geometric measures
72
+ - The notebook `notebooks/datasets_and_measures/toy_manifold_datasets.ipynb` contains example computation and visualization of the measures on the image datasets.
73
+ - The finite-difference computations of the measures are under `microscope/computations_grid`. `basic.py` contains basic computations such as partial derivatives and the Riemannian metric; `volume.py`, `curvature.py` and `reach.py` are built on top of it.
74
+ - The functions are thoroughly tested; see `microscope/computations_grid/tests`. The tests rely on a separate part of the codebase, `microscope/manifold_examples`, where symbolic computations of the measures on simple manifolds are performed.
75
+ - Manifold fitting and bounds
76
+ - The MMLS fitting method is in `experiment_scripts/manifold_fitting/mmls.py`.
77
+ - The denoising autoencoder used for the toy datasets is in `experiment_scripts/toy_manifolds_experiment/manifold_fitting_denoising_autoencoder.py`.
78
+ - The beta-VAE components are under `representation_learning/beta_vae`.
79
+ - The scripts running the main experiments are `experiment_scripts/manifold_fitting/training.py` for the image datasets and `experiment_scripts/toy_manifolds_experiment/fit_and_get_measures.py` for the toy datasets.
80
+ - The three notebooks under `notebooks/manifold_fitting` show how to generate some of the results of the paper, namely the bound curves for all datasets on MMLS, the curves on dSprites with MMLS on different dimensions and the curves for all methods on dSprites. Please note that the code and plots are not very polished there. The final plots were generated separately using the curves exported from the notebook.
81
+
82
+ ---
83
+
84
+ `manifold-microscope` provides a framework for studying and benchmarking data manifolds through densely sampled
85
+ grid-based datasets and finite-difference geometric computations. The goal of the library is to make it practical to
86
+ construct measurable reference manifolds, compute geometric quantities such as curvature, reach, and volume, and use
87
+ those quantities when evaluating manifold fitting, generalization bounds, and geometric estimation methods.
88
+
89
+ ## Why Use the Microscope?
90
+
91
+ In most research settings, one must choose between idealized mathematical manifolds (e.g., spheres, ellipsoids) with unrealistic simplicity, or real-world datasets where true geometric quantities are unknown or hard to measure accurately for testing. This framework bridges that gap by offering datasets that are both structured and realistic, yet fully measurable.
92
+
93
+ For instance, if you derive a new generalization or manifold fitting bound involving curvature or volume, you can directly test how tight it is under controlled geometric conditions. Similarly, if you develop a curvature estimation algorithm, you can benchmark its performance on datasets where the true curvature is exactly known. Randomly sampling points from the provided grids lets you simulate realistic sparse sampling scenarios and directly compare estimates to ground truth.
94
+
95
+ ## Setup and requirements
96
+
97
+ To install the current development version of the library, run `pip install .` at the top level of the project. It is
98
+ recommended to use a computer with a GPU of at least 5 GB memory and 30 GB of RAM, especially for dense image-dataset
99
+ experiments.
100
+
101
+ To run the unit tests of the project, run `pytest` at the top level of the project.
102
+
103
+ ## Datasets
104
+
105
+ The two main image datasets included are:
106
+
107
+ - **dSprites (grid generator)** – generates synthetic images with controllable grid density, image size, and transformations.
108
+ - **COIL-20 (augmented generator)** – extends the original dataset (to be downloaded separately) with controlled xy-rotations and rescaling.
109
+
110
+ Additionally, four toy datasets are provided for smaller experiments:
111
+
112
+ - **Circle** - A circle embedded in 2D.
113
+ - **Moons** - Two semicircles close to each other. It is practically the same as the moons dataset from sklearn.
114
+ - **Sphere** – A sphere embedded in 3D.
115
+ - **Torus** – A torus in 3D. Has slightly more complex topology and nonuniform geometric measures.
116
+
117
+
118
+ All datasets are densely sampled on a grid, where each grid direction corresponds to a transformation axis. This limits practical dimensionality to about 4–5 directions but provides highly accurate geometric values which can be used as ground truth. Datasets can be loaded as full grids or sampled uniformly with respect to the local volume element. For the toy datasets, the geometric measures are computed directly using the corresponding closed-form formulas.
119
+
120
+ For example usages and visualizations of the datasets, see the notebooks in `notebooks/datasets_and_measures`.
121
+
122
+ ## Geometric Measures
123
+
124
+ Finite-difference operators are used to compute geometric quantities directly on the grid—making the framework accurate, stable, and differentiable.
125
+
126
+ Available measures include:
127
+
128
+ - Volume and volume element
129
+ - Tangent spaces and the Riemannian metric tensor
130
+ - Scalar curvature
131
+ - Reach along with a pointwise version of it.
132
+
133
+ All computations can be executed on GPU, allowing fast analysis even for dense grids.
134
+
135
+ Example computations of the measures can be found in the following notebook: `notebooks/datasets_and_measures/toy_manifold_datasets.ipynb`.
136
+
137
+ Long term, the goal is to extend this module to include geodesic distances, exponential maps, and other advanced differential quantities.
138
+
139
+ ## Manifold Fitting Bounds
140
+
141
+ This section provides reference experiments used in the accompanying paper to validate theoretical manifold fitting
142
+ bounds. These scripts and notebooks are kept in this repository for now so that the original experiments remain visible,
143
+ but this part of the codebase is still being reorganized and may move to a separate public reproducibility repository.
144
+
145
+ Two types of models are used to approximate the reference manifolds:
146
+
147
+ - Manifold Moving Least Squares (MMLS): A classical local manifold fitting algorithm used to recover smooth embeddings from sampled data.
148
+ - $\beta$-VAE: A deep generative model trained to learn a low-dimensional latent manifold consistent with the data geometry, which is then used to reconstruct the full data manifold.
149
+
150
+ The $\beta$-VAE implementation in `representation_learning/beta_vae` is copied and adapted from
151
+ [1Konny/Beta-VAE](https://github.com/1Konny/Beta-VAE), a PyTorch reproduction of the $\beta$-VAE models from Higgins
152
+ et al. (2017) and Burgess et al. (2018). The upstream project is distributed under the MIT License; the original
153
+ copyright and license text are included in `representation_learning/beta_vae/LICENSE`, and a package-level attribution
154
+ notice is included in `representation_learning/beta_vae/NOTICE.md`.
155
+
156
+ The results are compared to theoretical bounds proposed by Fefferman, Narayanan & Mitter (2016) and Genovese et al. (2012), assessing their tightness and dependence on curvature, reach, and sample density.
157
+
158
+ To reproduce the fitting of the manifolds:
159
+
160
+ - For the toy datasets run:
161
+ ```
162
+ python experiment_scripts/toy_manifolds_experiment/fit_and_get_measures.py \
163
+ --output-path <path to output dir>/toy_datasets_fitting_mmls \
164
+ --n-range 25 505 5 \
165
+ --n-examples-per-size 20 \
166
+ --n-ground-truth 1_000 \
167
+ --max-workers 5 \
168
+ --fitting-method MMLS
169
+
170
+ python experiment_scripts/toy_manifolds_experiment/fit_and_get_measures.py \
171
+ --output-path <path to output dir>/toy_dataset_fitting_denoising_autoencoder \
172
+ --n-examples-per-size 5 \
173
+ --n-ground-truth 1_000 \
174
+ --max-workers 5 \
175
+ --fitting-method denoising_autoencoder_random_noise
176
+ ```
177
+ - For the image datasets run:
178
+ ```
179
+ COIL20_PATH=<path where you extracted coil-20-proc> python experiment_scripts/manifold_fitting/training.py --output-path <output path>
180
+ ```
181
+
182
+ Examples of generated results can be found in the three notebooks under `notebooks/manifold_fitting`.
@@ -0,0 +1,133 @@
1
+ > [!CAUTION]
2
+ > **This project is under active construction.** The code runs and can be used to explore the framework, but it still
3
+ > needs more updates before it can be relied on to reproduce the original paper results or to support new research
4
+ > projects.
5
+
6
+ # The Data Manifold Under the Microscope
7
+
8
+ ## Library transition status
9
+
10
+ ---
11
+ This repository is being transitioned from a paper-review artifact into a reusable Python library for working with
12
+ grid-sampled data manifolds and geometric measurements. The current library-facing core is the dataset generation and
13
+ finite-difference geometry code. The manifold fitting and bounds evaluation code is still included as reference material
14
+ from the accompanying paper and as example usage of the framework, but it is not yet a polished or stable public API.
15
+
16
+ Here is a map of the main components:
17
+
18
+ - Datasets
19
+ - There are two notebooks under `notebooks/datasets_and_measures` which contain examples of loading and visualizing the toy and image datasets.
20
+ - The toy datasets code is in `microscope/datasets/toy_manifolds.py`
21
+ - The image datasets code is in `microscope/datasets/custom_dsprites.py` and `microscope/datasets/coil20.py` respectively.
22
+ - Geometric measures
23
+ - The notebook `notebooks/datasets_and_measures/toy_manifold_datasets.ipynb` contains example computation and visualization of the measures on the image datasets.
24
+ - The finite-difference computations of the measures are under `microscope/computations_grid`. `basic.py` contains basic computations such as partial derivatives and the Riemannian metric; `volume.py`, `curvature.py` and `reach.py` are built on top of it.
25
+ - The functions are thoroughly tested; see `microscope/computations_grid/tests`. The tests rely on a separate part of the codebase, `microscope/manifold_examples`, where symbolic computations of the measures on simple manifolds are performed.
26
+ - Manifold fitting and bounds
27
+ - The MMLS fitting method is in `experiment_scripts/manifold_fitting/mmls.py`.
28
+ - The denoising autoencoder used for the toy datasets is in `experiment_scripts/toy_manifolds_experiment/manifold_fitting_denoising_autoencoder.py`.
29
+ - The beta-VAE components are under `representation_learning/beta_vae`.
30
+ - The scripts running the main experiments are `experiment_scripts/manifold_fitting/training.py` for the image datasets and `experiment_scripts/toy_manifolds_experiment/fit_and_get_measures.py` for the toy datasets.
31
+ - The three notebooks under `notebooks/manifold_fitting` show how to generate some of the results of the paper, namely the bound curves for all datasets on MMLS, the curves on dSprites with MMLS on different dimensions and the curves for all methods on dSprites. Please note that the code and plots are not very polished there. The final plots were generated separately using the curves exported from the notebook.
32
+
33
+ ---
34
+
35
+ `manifold-microscope` provides a framework for studying and benchmarking data manifolds through densely sampled
36
+ grid-based datasets and finite-difference geometric computations. The goal of the library is to make it practical to
37
+ construct measurable reference manifolds, compute geometric quantities such as curvature, reach, and volume, and use
38
+ those quantities when evaluating manifold fitting, generalization bounds, and geometric estimation methods.
39
+
40
+ ## Why Use the Microscope?
41
+
42
+ In most research settings, one must choose between idealized mathematical manifolds (e.g., spheres, ellipsoids) with unrealistic simplicity, or real-world datasets where true geometric quantities are unknown or hard to measure accurately for testing. This framework bridges that gap by offering datasets that are both structured and realistic, yet fully measurable.
43
+
44
+ For instance, if you derive a new generalization or manifold fitting bound involving curvature or volume, you can directly test how tight it is under controlled geometric conditions. Similarly, if you develop a curvature estimation algorithm, you can benchmark its performance on datasets where the true curvature is exactly known. Randomly sampling points from the provided grids lets you simulate realistic sparse sampling scenarios and directly compare estimates to ground truth.
45
+
46
+ ## Setup and requirements
47
+
48
+ To install the current development version of the library, run `pip install .` at the top level of the project. It is
49
+ recommended to use a computer with a GPU of at least 5 GB memory and 30 GB of RAM, especially for dense image-dataset
50
+ experiments.
51
+
52
+ To run the unit tests of the project, run `pytest` at the top level of the project.
53
+
54
+ ## Datasets
55
+
56
+ The two main image datasets included are:
57
+
58
+ - **dSprites (grid generator)** – generates synthetic images with controllable grid density, image size, and transformations.
59
+ - **COIL-20 (augmented generator)** – extends the original dataset (to be downloaded separately) with controlled xy-rotations and rescaling.
60
+
61
+ Additionally, four toy datasets are provided for smaller experiments:
62
+
63
+ - **Circle** - A circle embedded in 2D.
64
+ - **Moons** - Two semicircles close to each other. It is practically the same as the moons dataset from sklearn.
65
+ - **Sphere** – A sphere embedded in 3D.
66
+ - **Torus** – A torus in 3D. Has slightly more complex topology and nonuniform geometric measures.
67
+
68
+
69
+ All datasets are densely sampled on a grid, where each grid direction corresponds to a transformation axis. This limits practical dimensionality to about 4–5 directions but provides highly accurate geometric values which can be used as ground truth. Datasets can be loaded as full grids or sampled uniformly with respect to the local volume element. For the toy datasets, the geometric measures are computed directly using the corresponding closed-form formulas.
70
+
71
+ For example usages and visualizations of the datasets, see the notebooks in `notebooks/datasets_and_measures`.
72
+
73
+ ## Geometric Measures
74
+
75
+ Finite-difference operators are used to compute geometric quantities directly on the grid—making the framework accurate, stable, and differentiable.
76
+
77
+ Available measures include:
78
+
79
+ - Volume and volume element
80
+ - Tangent spaces and the Riemannian metric tensor
81
+ - Scalar curvature
82
+ - Reach along with a pointwise version of it.
83
+
84
+ All computations can be executed on GPU, allowing fast analysis even for dense grids.
85
+
86
+ Example computations of the measures can be found in the following notebook: `notebooks/datasets_and_measures/toy_manifold_datasets.ipynb`.
87
+
88
+ Long term, the goal is to extend this module to include geodesic distances, exponential maps, and other advanced differential quantities.
89
+
90
+ ## Manifold Fitting Bounds
91
+
92
+ This section provides reference experiments used in the accompanying paper to validate theoretical manifold fitting
93
+ bounds. These scripts and notebooks are kept in this repository for now so that the original experiments remain visible,
94
+ but this part of the codebase is still being reorganized and may move to a separate public reproducibility repository.
95
+
96
+ Two types of models are used to approximate the reference manifolds:
97
+
98
+ - Manifold Moving Least Squares (MMLS): A classical local manifold fitting algorithm used to recover smooth embeddings from sampled data.
99
+ - $\beta$-VAE: A deep generative model trained to learn a low-dimensional latent manifold consistent with the data geometry, which is then used to reconstruct the full data manifold.
100
+
101
+ The $\beta$-VAE implementation in `representation_learning/beta_vae` is copied and adapted from
102
+ [1Konny/Beta-VAE](https://github.com/1Konny/Beta-VAE), a PyTorch reproduction of the $\beta$-VAE models from Higgins
103
+ et al. (2017) and Burgess et al. (2018). The upstream project is distributed under the MIT License; the original
104
+ copyright and license text are included in `representation_learning/beta_vae/LICENSE`, and a package-level attribution
105
+ notice is included in `representation_learning/beta_vae/NOTICE.md`.
106
+
107
+ The results are compared to theoretical bounds proposed by Fefferman, Narayanan & Mitter (2016) and Genovese et al. (2012), assessing their tightness and dependence on curvature, reach, and sample density.
108
+
109
+ To reproduce the fitting of the manifolds:
110
+
111
+ - For the toy datasets run:
112
+ ```
113
+ python experiment_scripts/toy_manifolds_experiment/fit_and_get_measures.py \
114
+ --output-path <path to output dir>/toy_datasets_fitting_mmls \
115
+ --n-range 25 505 5 \
116
+ --n-examples-per-size 20 \
117
+ --n-ground-truth 1_000 \
118
+ --max-workers 5 \
119
+ --fitting-method MMLS
120
+
121
+ python experiment_scripts/toy_manifolds_experiment/fit_and_get_measures.py \
122
+ --output-path <path to output dir>/toy_dataset_fitting_denoising_autoencoder \
123
+ --n-examples-per-size 5 \
124
+ --n-ground-truth 1_000 \
125
+ --max-workers 5 \
126
+ --fitting-method denoising_autoencoder_random_noise
127
+ ```
128
+ - For the image datasets run:
129
+ ```
130
+ COIL20_PATH=<path where you extracted coil-20-proc> python experiment_scripts/manifold_fitting/training.py --output-path <output path>
131
+ ```
132
+
133
+ Examples of generated results can be found in the three notebooks under `notebooks/manifold_fitting`.
@@ -0,0 +1,107 @@
1
+ import itertools
2
+ from pathlib import Path
3
+
4
+ import typer
5
+ from tqdm import tqdm
6
+
7
+ from microscope.datasets.generic_dataset_loader import DatasetName
8
+ from microscope.computations_grid.data_analysis.run_data_analysis import main as run_analysis
9
+
10
# Typer application object; `run_analyses` below is registered on it as a command.
app = typer.Typer(pretty_exceptions_enable=False)


def analysis_on_model(
    inference_path: Path,
    output_path: Path,
    dataset_name: DatasetName,
    model_type: str,
    number_of_dims: int,
    only_evolution: bool,
    normalize_for_volume: bool,
    skip_done: bool,
    n_samples_for_plots: int = 50_000,
) -> None:
    """Run the data analysis for a single model, optionally skipping finished runs.

    When ``output_path`` already exists and ``skip_done`` is true, a notice is
    printed and nothing else happens. Otherwise every remaining argument is
    forwarded to ``run_analysis`` (the ``main`` function imported from
    ``microscope.computations_grid.data_analysis.run_data_analysis``).

    Args:
        inference_path: Location of the inference results to analyse.
        output_path: Location where the analysis is written; its existence is
            what marks a run as already done.
        dataset_name: Dataset identifier, forwarded as ``dataset``.
        model_type: Model identifier, forwarded unchanged.
        number_of_dims: Forwarded unchanged.
        only_evolution: Forwarded unchanged.
        normalize_for_volume: Forwarded unchanged.
        skip_done: Whether an existing ``output_path`` short-circuits the run.
        n_samples_for_plots: Forwarded unchanged.
    """
    run_label = output_path.name
    output_already_present = output_path.exists()

    if output_already_present and skip_done:
        print(f"Skipping {run_label} as it exists.")
        return

    print(f"Analysis on {run_label}.")
    analysis_kwargs = dict(
        inference_path=inference_path,
        output_path=output_path,
        dataset=dataset_name,
        model_type=model_type,
        number_of_dims=number_of_dims,
        only_evolution=only_evolution,
        normalize_for_volume=normalize_for_volume,
        n_samples_for_plots=n_samples_for_plots,
    )
    run_analysis(**analysis_kwargs)
39
+
40
+
41
# CLI command: run `analysis_on_model` for every combination of dataset, model
# type, training ratio, number of dimensions and noise sigma listed below.
# Each combination is read from `inference_path / <model_dir>` and written to
# `output_path / <model_dir>`, where <model_dir> is the five values joined by
# double underscores.
# NOTE: documented with comments on purpose -- Typer would surface a docstring
# as the command's help text.
@app.command()
def run_analyses(
    inference_path: Path = typer.Option(...),
    output_path: Path = typer.Option(...),
    only_evolution: bool = True,
    normalize_for_volume: bool = True,
    skip_done: bool = True,
    n_samples_for_plots: int = 50_000,
) -> None:
    dataset_names = ("custom_dsprites_balanced", "extended_coil20")
    model_types = ("beta_vae", "mae")
    training_ratios = (1.0,)
    # Dimensions 1 and 4 are currently disabled for this experiment.
    dims_options = (2, 3)
    # No noise.
    noise_sigmas = (0,)

    # Materialised as a list so that tqdm knows the total number of runs.
    configurations = list(
        itertools.product(
            dataset_names,
            model_types,
            training_ratios,
            dims_options,
            noise_sigmas,
        )
    )

    for dataset_name, model_type, ratio, n_dims, sigma in tqdm(configurations):
        # Dimension 4 is skipped for COIL20 and for the "mae" model type.
        if n_dims == 4 and (dataset_name == "extended_coil20" or model_type == "mae"):
            continue

        model_dir = f"{dataset_name}__{model_type}__{ratio}__{n_dims}__{sigma}"
        analysis_on_model(
            inference_path=inference_path / model_dir,
            output_path=output_path / model_dir,
            dataset_name=dataset_name,
            model_type=model_type,
            number_of_dims=n_dims,
            only_evolution=only_evolution,
            normalize_for_volume=normalize_for_volume,
            skip_done=skip_done,
            n_samples_for_plots=n_samples_for_plots,
        )


if __name__ == "__main__":
    app()
@@ -0,0 +1,106 @@
1
+ import itertools
2
+ from pathlib import Path
3
+
4
+ import typer
5
+ from tqdm import tqdm
6
+
7
+ from microscope.datasets.generic_dataset_loader import DatasetName
8
+ from microscope.computations_grid.data_analysis.run_data_analysis import main as run_analysis
9
+
10
# Typer application object; `run_analyses` below is registered on it as a command.
app = typer.Typer(pretty_exceptions_enable=False)


def analysis_on_model(
    inference_path: Path,
    output_path: Path,
    dataset_name: DatasetName,
    model_type: str,
    number_of_dims: int,
    only_evolution: bool,
    normalize_for_volume: bool,
    skip_done: bool,
) -> None:
    """Run the data analysis for a single model, optionally skipping finished runs.

    When ``output_path`` already exists and ``skip_done`` is true, a notice is
    printed and nothing else happens. Otherwise the remaining arguments are
    forwarded to ``run_analysis`` (the ``main`` function imported from
    ``microscope.computations_grid.data_analysis.run_data_analysis``).

    Args:
        inference_path: Location of the inference results to analyse.
        output_path: Location where the analysis is written; its existence is
            what marks a run as already done.
        dataset_name: Dataset identifier, forwarded as ``dataset``.
        model_type: Model identifier, forwarded unchanged.
        number_of_dims: Forwarded unchanged.
        only_evolution: Forwarded unchanged.
        normalize_for_volume: Forwarded unchanged.
        skip_done: Whether an existing ``output_path`` short-circuits the run.
    """
    run_label = output_path.name
    output_already_present = output_path.exists()

    if output_already_present and skip_done:
        print(f"Skipping {run_label} as it exists.")
        return

    print(f"Analysis on {run_label}.")
    analysis_kwargs = dict(
        inference_path=inference_path,
        output_path=output_path,
        dataset=dataset_name,
        model_type=model_type,
        number_of_dims=number_of_dims,
        only_evolution=only_evolution,
        normalize_for_volume=normalize_for_volume,
    )
    run_analysis(**analysis_kwargs)
37
+
38
+
39
# CLI command: run `analysis_on_model` for every combination of dataset, model
# type, training ratio, number of dimensions and noise sigma listed below.
# Each combination is read from `inference_path / <model_dir>` and written to
# `output_path / <model_dir>`, where <model_dir> is the five values joined by
# double underscores.
# NOTE: documented with comments on purpose -- Typer would surface a docstring
# as the command's help text.
@app.command()
def run_analyses(
    inference_path: Path = typer.Option(...),
    output_path: Path = typer.Option(...),
    only_evolution: bool = True,
    normalize_for_volume: bool = True,
    skip_done: bool = True,
) -> None:
    dataset_names = ("custom_dsprites_balanced", "extended_coil20")
    model_types = ("beta_vae", "mae")
    training_ratios = (0.4, 0.5, 0.6, 1.0)
    dims_options = (1, 2, 3, 4)
    # No noise.
    noise_sigmas = (0,)

    # Materialised as a list so that tqdm knows the total number of runs.
    configurations = list(
        itertools.product(
            dataset_names,
            model_types,
            training_ratios,
            dims_options,
            noise_sigmas,
        )
    )

    for dataset_name, model_type, ratio, n_dims, sigma in tqdm(configurations):
        # Dimension 4 is skipped for COIL20 and for the "mae" model type.
        if n_dims == 4 and (dataset_name == "extended_coil20" or model_type == "mae"):
            continue

        model_dir = f"{dataset_name}__{model_type}__{ratio}__{n_dims}__{sigma}"
        analysis_on_model(
            inference_path=inference_path / model_dir,
            output_path=output_path / model_dir,
            dataset_name=dataset_name,
            model_type=model_type,
            number_of_dims=n_dims,
            only_evolution=only_evolution,
            normalize_for_volume=normalize_for_volume,
            skip_done=skip_done,
        )


if __name__ == "__main__":
    app()