manifold-microscope 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- manifold_microscope-0.0.1/LICENSE +29 -0
- manifold_microscope-0.0.1/PKG-INFO +182 -0
- manifold_microscope-0.0.1/README.md +133 -0
- manifold_microscope-0.0.1/experiment_scripts/__init__.py +0 -0
- manifold_microscope-0.0.1/experiment_scripts/manifold_analysis/__init__.py +0 -0
- manifold_microscope-0.0.1/experiment_scripts/manifold_analysis/analysis.py +107 -0
- manifold_microscope-0.0.1/experiment_scripts/manifold_fitting/__init__.py +0 -0
- manifold_microscope-0.0.1/experiment_scripts/manifold_fitting/analysis.py +106 -0
- manifold_microscope-0.0.1/experiment_scripts/manifold_fitting/inference.py +147 -0
- manifold_microscope-0.0.1/experiment_scripts/manifold_fitting/mmls.py +63 -0
- manifold_microscope-0.0.1/experiment_scripts/manifold_fitting/training.py +220 -0
- manifold_microscope-0.0.1/experiment_scripts/model_configs.py +102 -0
- manifold_microscope-0.0.1/experiment_scripts/toy_manifolds_experiment/__init__.py +0 -0
- manifold_microscope-0.0.1/experiment_scripts/toy_manifolds_experiment/fit_and_get_measures.py +218 -0
- manifold_microscope-0.0.1/experiment_scripts/toy_manifolds_experiment/manifold_fitting_denoising_autoencoder.py +156 -0
- manifold_microscope-0.0.1/experiment_scripts/toy_manifolds_experiment/manifold_fitting_no_noise.py +87 -0
- manifold_microscope-0.0.1/manifold_microscope.egg-info/PKG-INFO +182 -0
- manifold_microscope-0.0.1/manifold_microscope.egg-info/SOURCES.txt +61 -0
- manifold_microscope-0.0.1/manifold_microscope.egg-info/dependency_links.txt +1 -0
- manifold_microscope-0.0.1/manifold_microscope.egg-info/requires.txt +23 -0
- manifold_microscope-0.0.1/manifold_microscope.egg-info/top_level.txt +3 -0
- manifold_microscope-0.0.1/microscope/__init__.py +0 -0
- manifold_microscope-0.0.1/microscope/computations_grid/__init__.py +0 -0
- manifold_microscope-0.0.1/microscope/computations_grid/basic.py +166 -0
- manifold_microscope-0.0.1/microscope/computations_grid/curvature.py +240 -0
- manifold_microscope-0.0.1/microscope/computations_grid/data_analysis/__init__.py +0 -0
- manifold_microscope-0.0.1/microscope/computations_grid/data_analysis/data_analysis.py +630 -0
- manifold_microscope-0.0.1/microscope/computations_grid/data_analysis/merge_analysis_outputs.py +314 -0
- manifold_microscope-0.0.1/microscope/computations_grid/data_analysis/run_data_analysis.py +229 -0
- manifold_microscope-0.0.1/microscope/computations_grid/reach.py +148 -0
- manifold_microscope-0.0.1/microscope/computations_grid/volume.py +100 -0
- manifold_microscope-0.0.1/microscope/cyclic_dimensions.py +57 -0
- manifold_microscope-0.0.1/microscope/datasets/__init__.py +0 -0
- manifold_microscope-0.0.1/microscope/datasets/coil20.py +171 -0
- manifold_microscope-0.0.1/microscope/datasets/custom_dsprites.py +392 -0
- manifold_microscope-0.0.1/microscope/datasets/dataset_split.py +120 -0
- manifold_microscope-0.0.1/microscope/datasets/generic_dataset_loader.py +476 -0
- manifold_microscope-0.0.1/microscope/datasets/image_transforms.py +156 -0
- manifold_microscope-0.0.1/microscope/datasets/noise_adding.py +103 -0
- manifold_microscope-0.0.1/microscope/datasets/original_dsprites.py +58 -0
- manifold_microscope-0.0.1/microscope/datasets/toy_manifolds.py +686 -0
- manifold_microscope-0.0.1/microscope/manifold_examples/__init__.py +0 -0
- manifold_microscope-0.0.1/microscope/manifold_examples/ellipsoid.py +77 -0
- manifold_microscope-0.0.1/microscope/manifold_examples/hyperboloid.py +47 -0
- manifold_microscope-0.0.1/microscope/manifold_examples/plotting.py +74 -0
- manifold_microscope-0.0.1/microscope/manifold_examples/sampling_grid.py +103 -0
- manifold_microscope-0.0.1/microscope/manifold_examples/sampling_uniform.py +273 -0
- manifold_microscope-0.0.1/microscope/manifold_examples/sphere.py +41 -0
- manifold_microscope-0.0.1/microscope/manifold_examples/symbolic_computations.py +332 -0
- manifold_microscope-0.0.1/microscope/manifold_examples/utils.py +58 -0
- manifold_microscope-0.0.1/microscope/patches.py +120 -0
- manifold_microscope-0.0.1/pyproject.toml +74 -0
- manifold_microscope-0.0.1/representation_learning/__init__.py +0 -0
- manifold_microscope-0.0.1/representation_learning/beta_vae/LICENSE +21 -0
- manifold_microscope-0.0.1/representation_learning/beta_vae/NOTICE.md +17 -0
- manifold_microscope-0.0.1/representation_learning/beta_vae/__init__.py +0 -0
- manifold_microscope-0.0.1/representation_learning/beta_vae/dataset.py +106 -0
- manifold_microscope-0.0.1/representation_learning/beta_vae/inference_intermediate_layers.py +185 -0
- manifold_microscope-0.0.1/representation_learning/beta_vae/main.py +79 -0
- manifold_microscope-0.0.1/representation_learning/beta_vae/model.py +172 -0
- manifold_microscope-0.0.1/representation_learning/beta_vae/solver.py +432 -0
- manifold_microscope-0.0.1/representation_learning/beta_vae/utils.py +50 -0
- manifold_microscope-0.0.1/setup.cfg +4 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
BSD 3-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026, Marios Koulakis
|
|
4
|
+
All rights reserved.
|
|
5
|
+
|
|
6
|
+
Redistribution and use in source and binary forms, with or without
|
|
7
|
+
modification, are permitted provided that the following conditions are met:
|
|
8
|
+
|
|
9
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
10
|
+
list of conditions and the following disclaimer.
|
|
11
|
+
|
|
12
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
13
|
+
this list of conditions and the following disclaimer in the documentation
|
|
14
|
+
and/or other materials provided with the distribution.
|
|
15
|
+
|
|
16
|
+
3. Neither the name of the copyright holder nor the names of its contributors
|
|
17
|
+
may be used to endorse or promote products derived from this software
|
|
18
|
+
without specific prior written permission.
|
|
19
|
+
|
|
20
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
21
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
22
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
23
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
24
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
25
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
26
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
27
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
28
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
29
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: manifold-microscope
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: The data manifold under the microscope.
|
|
5
|
+
Author: Marios Koulakis
|
|
6
|
+
License-Expression: BSD-3-Clause
|
|
7
|
+
Project-URL: Homepage, https://github.com/koulakis/manifold-microscope
|
|
8
|
+
Project-URL: Repository, https://github.com/koulakis/manifold-microscope
|
|
9
|
+
Project-URL: Issues, https://github.com/koulakis/manifold-microscope/issues
|
|
10
|
+
Keywords: manifold learning,differential geometry,geometric deep learning,datasets,curvature
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
License-File: representation_learning/beta_vae/LICENSE
|
|
25
|
+
License-File: representation_learning/beta_vae/NOTICE.md
|
|
26
|
+
Requires-Dist: torch>=2.2.0
|
|
27
|
+
Requires-Dist: numpy>=1.22.0
|
|
28
|
+
Requires-Dist: tqdm
|
|
29
|
+
Requires-Dist: opencv-python
|
|
30
|
+
Requires-Dist: Pillow
|
|
31
|
+
Requires-Dist: scipy
|
|
32
|
+
Requires-Dist: geomloss
|
|
33
|
+
Requires-Dist: joblib
|
|
34
|
+
Requires-Dist: sympy
|
|
35
|
+
Requires-Dist: scikit-learn
|
|
36
|
+
Requires-Dist: seaborn
|
|
37
|
+
Requires-Dist: einops>=0.8.0
|
|
38
|
+
Provides-Extra: benchmark
|
|
39
|
+
Requires-Dist: torchvision; extra == "benchmark"
|
|
40
|
+
Requires-Dist: typer; extra == "benchmark"
|
|
41
|
+
Requires-Dist: pynndescent>=0.5.13; extra == "benchmark"
|
|
42
|
+
Requires-Dist: pytorch-lightning; extra == "benchmark"
|
|
43
|
+
Requires-Dist: PyYAML; extra == "benchmark"
|
|
44
|
+
Requires-Dist: matplotlib; extra == "benchmark"
|
|
45
|
+
Requires-Dist: pandas; extra == "benchmark"
|
|
46
|
+
Requires-Dist: timm>=1.0.14; extra == "benchmark"
|
|
47
|
+
Requires-Dist: lightly>=1.5.18; extra == "benchmark"
|
|
48
|
+
Dynamic: license-file
|
|
49
|
+
|
|
50
|
+
> [!CAUTION]
|
|
51
|
+
> **This project is under active construction.** The code runs and can be used to explore the framework, but it still
|
|
52
|
+
> needs more updates before it can be relied on to reproduce the original paper results or to support new research
|
|
53
|
+
> projects.
|
|
54
|
+
|
|
55
|
+
# The Data Manifold Under the Microscope
|
|
56
|
+
|
|
57
|
+
## Library transition status
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
This repository is being transitioned from a paper-review artifact into a reusable Python library for working with
|
|
61
|
+
grid-sampled data manifolds and geometric measurements. The current library-facing core is the dataset generation and
|
|
62
|
+
finite-difference geometry code. The manifold fitting and bounds evaluation code is still included as reference material
|
|
63
|
+
from the accompanying paper and as example usage of the framework, but it is not yet a polished or stable public API.
|
|
64
|
+
|
|
65
|
+
Here is a map of the main components:
|
|
66
|
+
|
|
67
|
+
- Datasets
|
|
68
|
+
- There are two notebooks under `notebooks/datasets_and_measures` which contain examples of loading and visualizing the toy and image datasets.
|
|
69
|
+
- The toy datasets code is in `microscope/datasets/toy_manifolds.py`
|
|
70
|
+
- The image datasets code is in `microscope/datasets/custom_dsprites.py` (dSprites) and `microscope/datasets/coil20.py` (COIL-20).
|
|
71
|
+
- Geometric measures
|
|
72
|
+
- The notebook `notebooks/datasets_and_measures/toy_manifold_datasets.ipynb` contains example computation and visualization of the measures on the image datasets.
|
|
73
|
+
- The finite-difference computations of the measures are under `microscope/computations_grid`. `basic.py` contains basic computations such as partial derivatives and the Riemannian metric; `volume.py`, `curvature.py` and `reach.py` are built on top of it.
|
|
74
|
+
- The functions are thoroughly tested; see `microscope/computations_grid/tests`. The tests rely on a separate part of the codebase, `microscope/manifold_examples`, where symbolic computations of the measures on simple manifolds are performed.
|
|
75
|
+
- Manifold fitting and bounds
|
|
76
|
+
- The MMLS fitting method is in `experiment_scripts/manifold_fitting/mmls.py`.
|
|
77
|
+
- The denoising autoencoder used for the toy datasets is in `experiment_scripts/toy_manifolds_experiment/manifold_fitting_denoising_autoencoder.py`.
|
|
78
|
+
- The beta-VAE components are under `representation_learning/beta_vae`.
|
|
79
|
+
- The scripts running the main experiments are `experiment_scripts/manifold_fitting/training.py` for the image datasets and `experiment_scripts/toy_manifolds_experiment/fit_and_get_measures.py` for the toy datasets.
|
|
80
|
+
- The three notebooks under `notebooks/manifold_fitting` show how to generate some of the results of the paper, namely the bound curves for all datasets on MMLS, the curves on dSprites with MMLS on different dimensions and the curves for all methods on dSprites. Please note that the code and plots are not very polished there. The final plots were generated separately using the curves exported from the notebook.
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
`manifold-microscope` provides a framework for studying and benchmarking data manifolds through densely sampled
|
|
85
|
+
grid-based datasets and finite-difference geometric computations. The goal of the library is to make it practical to
|
|
86
|
+
construct measurable reference manifolds, compute geometric quantities such as curvature, reach, and volume, and use
|
|
87
|
+
those quantities when evaluating manifold fitting, generalization bounds, and geometric estimation methods.
|
|
88
|
+
|
|
89
|
+
## Why Use the Microscope?
|
|
90
|
+
|
|
91
|
+
In most research settings, one must choose between idealized mathematical manifolds (e.g., spheres, ellipsoids) with unrealistic simplicity, or real-world datasets where true geometric quantities are unknown or hard to measure accurately for testing. This framework bridges that gap by offering datasets that are both structured and realistic, yet fully measurable.
|
|
92
|
+
|
|
93
|
+
For instance, if you derive a new generalization or manifold fitting bound involving curvature or volume, you can directly test how tight it is under controlled geometric conditions. Similarly, if you develop a curvature estimation algorithm, you can benchmark its performance on datasets where the true curvature is exactly known. Randomly sampling points from the provided grids lets you simulate realistic sparse sampling scenarios and directly compare estimates to ground truth.
|
|
94
|
+
|
|
95
|
+
## Setup and requirements
|
|
96
|
+
|
|
97
|
+
To install the current development version of the library, run `pip install .` at the top level of the project. It is
|
|
98
|
+
recommended to use a computer with a GPU of at least 5 GB memory and 30 GB of RAM, especially for dense image-dataset
|
|
99
|
+
experiments.
|
|
100
|
+
|
|
101
|
+
To run the unit tests of the project, run `pytest` on the top level of the project.
|
|
102
|
+
|
|
103
|
+
## Datasets
|
|
104
|
+
|
|
105
|
+
The two main image datasets included are:
|
|
106
|
+
|
|
107
|
+
- **dSprites (grid generator)** – generates synthetic images with controllable grid density, image size, and transformations.
|
|
108
|
+
- **COIL-20 (augmented generator)** – extends the original dataset (to be downloaded separately) with controlled xy-rotations and rescaling.
|
|
109
|
+
|
|
110
|
+
Additionally, four toy datasets are provided for smaller experiments:
|
|
111
|
+
|
|
112
|
+
- **Circle** - A circle embedded in 2D.
|
|
113
|
+
- **Moons** - Two semicircles close to each other. It is practically the same as the moons dataset from sklearn.
|
|
114
|
+
- **Sphere** – A sphere embedded in 3D.
|
|
115
|
+
- **Torus** – A torus in 3D. Has slightly more complex topology and nonuniform geometric measures.
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
All datasets are densely sampled on a grid, where each grid direction corresponds to a transformation axis. This limits practical dimensionality to about 4–5 directions but provides highly accurate geometric values which can be used as ground truth. Datasets can be loaded as full grids or sampled uniformly with respect to the local volume element. For the toy datasets, the geometric measures are computed directly using the corresponding closed-form formulas.
|
|
119
|
+
|
|
120
|
+
For example usage and visualizations of the datasets, see the notebooks in `notebooks/datasets_and_measures`.
|
|
121
|
+
|
|
122
|
+
## Geometric Measures
|
|
123
|
+
|
|
124
|
+
Finite-difference operators are used to compute geometric quantities directly on the grid—making the framework accurate, stable, and differentiable.
|
|
125
|
+
|
|
126
|
+
Available measures include:
|
|
127
|
+
|
|
128
|
+
- Volume and volume element
|
|
129
|
+
- Tangent spaces and the Riemannian metric tensor
|
|
130
|
+
- Scalar curvature
|
|
131
|
+
- Reach along with a pointwise version of it.
|
|
132
|
+
|
|
133
|
+
All computations can be executed on GPU, allowing fast analysis even for dense grids.
|
|
134
|
+
|
|
135
|
+
Example computations of the measures can be found in the following notebook: `notebooks/datasets_and_measures/toy_manifold_datasets.ipynb`.
|
|
136
|
+
|
|
137
|
+
Long term, the goal is to extend this module to include geodesic distances, exponential maps, and other advanced differential quantities.
|
|
138
|
+
|
|
139
|
+
## Manifold Fitting Bounds
|
|
140
|
+
|
|
141
|
+
This section provides reference experiments used in the accompanying paper to validate theoretical manifold fitting
|
|
142
|
+
bounds. These scripts and notebooks are kept in this repository for now so that the original experiments remain visible,
|
|
143
|
+
but this part of the codebase is still being reorganized and may move to a separate public reproducibility repository.
|
|
144
|
+
|
|
145
|
+
Two types of models are used to approximate the reference manifolds:
|
|
146
|
+
|
|
147
|
+
- Manifold Moving Least Squares (MMLS): A classical local manifold fitting algorithm used to recover smooth embeddings from sampled data.
|
|
148
|
+
- $\beta$-VAE: A deep generative model trained to learn a low-dimensional latent manifold consistent with the data geometry, which is then used to reconstruct the full data manifold.
|
|
149
|
+
|
|
150
|
+
The $\beta$-VAE implementation in `representation_learning/beta_vae` is copied and adapted from
|
|
151
|
+
[1Konny/Beta-VAE](https://github.com/1Konny/Beta-VAE), a PyTorch reproduction of the $\beta$-VAE models from Higgins
|
|
152
|
+
et al. (2017) and Burgess et al. (2018). The upstream project is distributed under the MIT License; the original
|
|
153
|
+
copyright and license text are included in `representation_learning/beta_vae/LICENSE`, and a package-level attribution
|
|
154
|
+
notice is included in `representation_learning/beta_vae/NOTICE.md`.
|
|
155
|
+
|
|
156
|
+
The results are compared to theoretical bounds proposed by Fefferman, Narayanan & Mitter (2016) and Genovese et al. (2012), assessing their tightness and dependence on curvature, reach, and sample density.
|
|
157
|
+
|
|
158
|
+
To reproduce the fitting of the manifolds:
|
|
159
|
+
|
|
160
|
+
- For the toy datasets run:
|
|
161
|
+
```
|
|
162
|
+
python experiment_scripts/toy_manifolds_experiment/fit_and_get_measures.py \
|
|
163
|
+
--output-path <path to output dir>/toy_datasets_fitting_mmls \
|
|
164
|
+
--n-range 25 505 5 \
|
|
165
|
+
--n-examples-per-size 20 \
|
|
166
|
+
--n-ground-truth 1_000 \
|
|
167
|
+
--max-workers 5 \
|
|
168
|
+
--fitting-method MMLS
|
|
169
|
+
|
|
170
|
+
python experiment_scripts/toy_manifolds_experiment/fit_and_get_measures.py \
|
|
171
|
+
--output-path <path to output dir>/toy_dataset_fitting_denoising_autoencoder \
|
|
172
|
+
--n-examples-per-size 5 \
|
|
173
|
+
--n-ground-truth 1_000 \
|
|
174
|
+
--max-workers 5 \
|
|
175
|
+
--fitting-method denoising_autoencoder_random_noise
|
|
176
|
+
```
|
|
177
|
+
- For the image datasets run:
|
|
178
|
+
```
|
|
179
|
+
COIL20_PATH=<path where you extracted coil-20-proc> python experiment_scripts/manifold_fitting/training.py --output-path <output path>
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
Examples of generated results can be found in the three notebooks under `notebooks/manifold_fitting`.
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
> [!CAUTION]
|
|
2
|
+
> **This project is under active construction.** The code runs and can be used to explore the framework, but it still
|
|
3
|
+
> needs more updates before it can be relied on to reproduce the original paper results or to support new research
|
|
4
|
+
> projects.
|
|
5
|
+
|
|
6
|
+
# The Data Manifold Under the Microscope
|
|
7
|
+
|
|
8
|
+
## Library transition status
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
This repository is being transitioned from a paper-review artifact into a reusable Python library for working with
|
|
12
|
+
grid-sampled data manifolds and geometric measurements. The current library-facing core is the dataset generation and
|
|
13
|
+
finite-difference geometry code. The manifold fitting and bounds evaluation code is still included as reference material
|
|
14
|
+
from the accompanying paper and as example usage of the framework, but it is not yet a polished or stable public API.
|
|
15
|
+
|
|
16
|
+
Here is a map of the main components:
|
|
17
|
+
|
|
18
|
+
- Datasets
|
|
19
|
+
- There are two notebooks under `notebooks/datasets_and_measures` which contain examples of loading and visualizing the toy and image datasets.
|
|
20
|
+
- The toy datasets code is in `microscope/datasets/toy_manifolds.py`
|
|
21
|
+
- The image datasets code is in `microscope/datasets/custom_dsprites.py` (dSprites) and `microscope/datasets/coil20.py` (COIL-20).
|
|
22
|
+
- Geometric measures
|
|
23
|
+
- The notebook `notebooks/datasets_and_measures/toy_manifold_datasets.ipynb` contains example computation and visualization of the measures on the image datasets.
|
|
24
|
+
- The finite-difference computations of the measures are under `microscope/computations_grid`. `basic.py` contains basic computations such as partial derivatives and the Riemannian metric; `volume.py`, `curvature.py` and `reach.py` are built on top of it.
|
|
25
|
+
- The functions are thoroughly tested; see `microscope/computations_grid/tests`. The tests rely on a separate part of the codebase, `microscope/manifold_examples`, where symbolic computations of the measures on simple manifolds are performed.
|
|
26
|
+
- Manifold fitting and bounds
|
|
27
|
+
- The MMLS fitting method is in `experiment_scripts/manifold_fitting/mmls.py`.
|
|
28
|
+
- The denoising autoencoder used for the toy datasets is in `experiment_scripts/toy_manifolds_experiment/manifold_fitting_denoising_autoencoder.py`.
|
|
29
|
+
- The beta-VAE components are under `representation_learning/beta_vae`.
|
|
30
|
+
- The scripts running the main experiments are `experiment_scripts/manifold_fitting/training.py` for the image datasets and `experiment_scripts/toy_manifolds_experiment/fit_and_get_measures.py` for the toy datasets.
|
|
31
|
+
- The three notebooks under `notebooks/manifold_fitting` show how to generate some of the results of the paper, namely the bound curves for all datasets on MMLS, the curves on dSprites with MMLS on different dimensions and the curves for all methods on dSprites. Please note that the code and plots are not very polished there. The final plots were generated separately using the curves exported from the notebook.
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
`manifold-microscope` provides a framework for studying and benchmarking data manifolds through densely sampled
|
|
36
|
+
grid-based datasets and finite-difference geometric computations. The goal of the library is to make it practical to
|
|
37
|
+
construct measurable reference manifolds, compute geometric quantities such as curvature, reach, and volume, and use
|
|
38
|
+
those quantities when evaluating manifold fitting, generalization bounds, and geometric estimation methods.
|
|
39
|
+
|
|
40
|
+
## Why Use the Microscope?
|
|
41
|
+
|
|
42
|
+
In most research settings, one must choose between idealized mathematical manifolds (e.g., spheres, ellipsoids) with unrealistic simplicity, or real-world datasets where true geometric quantities are unknown or hard to measure accurately for testing. This framework bridges that gap by offering datasets that are both structured and realistic, yet fully measurable.
|
|
43
|
+
|
|
44
|
+
For instance, if you derive a new generalization or manifold fitting bound involving curvature or volume, you can directly test how tight it is under controlled geometric conditions. Similarly, if you develop a curvature estimation algorithm, you can benchmark its performance on datasets where the true curvature is exactly known. Randomly sampling points from the provided grids lets you simulate realistic sparse sampling scenarios and directly compare estimates to ground truth.
|
|
45
|
+
|
|
46
|
+
## Setup and requirements
|
|
47
|
+
|
|
48
|
+
To install the current development version of the library, run `pip install .` at the top level of the project. It is
|
|
49
|
+
recommended to use a computer with a GPU of at least 5 GB memory and 30 GB of RAM, especially for dense image-dataset
|
|
50
|
+
experiments.
|
|
51
|
+
|
|
52
|
+
To run the unit tests of the project, run `pytest` on the top level of the project.
|
|
53
|
+
|
|
54
|
+
## Datasets
|
|
55
|
+
|
|
56
|
+
The two main image datasets included are:
|
|
57
|
+
|
|
58
|
+
- **dSprites (grid generator)** – generates synthetic images with controllable grid density, image size, and transformations.
|
|
59
|
+
- **COIL-20 (augmented generator)** – extends the original dataset (to be downloaded separately) with controlled xy-rotations and rescaling.
|
|
60
|
+
|
|
61
|
+
Additionally, four toy datasets are provided for smaller experiments:
|
|
62
|
+
|
|
63
|
+
- **Circle** - A circle embedded in 2D.
|
|
64
|
+
- **Moons** - Two semicircles close to each other. It is practically the same as the moons dataset from sklearn.
|
|
65
|
+
- **Sphere** – A sphere embedded in 3D.
|
|
66
|
+
- **Torus** – A torus in 3D. Has slightly more complex topology and nonuniform geometric measures.
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
All datasets are densely sampled on a grid, where each grid direction corresponds to a transformation axis. This limits practical dimensionality to about 4–5 directions but provides highly accurate geometric values which can be used as ground truth. Datasets can be loaded as full grids or sampled uniformly with respect to the local volume element. For the toy datasets, the geometric measures are computed directly using the corresponding closed-form formulas.
|
|
70
|
+
|
|
71
|
+
For example usage and visualizations of the datasets, see the notebooks in `notebooks/datasets_and_measures`.
|
|
72
|
+
|
|
73
|
+
## Geometric Measures
|
|
74
|
+
|
|
75
|
+
Finite-difference operators are used to compute geometric quantities directly on the grid—making the framework accurate, stable, and differentiable.
|
|
76
|
+
|
|
77
|
+
Available measures include:
|
|
78
|
+
|
|
79
|
+
- Volume and volume element
|
|
80
|
+
- Tangent spaces and the Riemannian metric tensor
|
|
81
|
+
- Scalar curvature
|
|
82
|
+
- Reach along with a pointwise version of it.
|
|
83
|
+
|
|
84
|
+
All computations can be executed on GPU, allowing fast analysis even for dense grids.
|
|
85
|
+
|
|
86
|
+
Example computations of the measures can be found in the following notebook: `notebooks/datasets_and_measures/toy_manifold_datasets.ipynb`.
|
|
87
|
+
|
|
88
|
+
Long term, the goal is to extend this module to include geodesic distances, exponential maps, and other advanced differential quantities.
|
|
89
|
+
|
|
90
|
+
## Manifold Fitting Bounds
|
|
91
|
+
|
|
92
|
+
This section provides reference experiments used in the accompanying paper to validate theoretical manifold fitting
|
|
93
|
+
bounds. These scripts and notebooks are kept in this repository for now so that the original experiments remain visible,
|
|
94
|
+
but this part of the codebase is still being reorganized and may move to a separate public reproducibility repository.
|
|
95
|
+
|
|
96
|
+
Two types of models are used to approximate the reference manifolds:
|
|
97
|
+
|
|
98
|
+
- Manifold Moving Least Squares (MMLS): A classical local manifold fitting algorithm used to recover smooth embeddings from sampled data.
|
|
99
|
+
- $\beta$-VAE: A deep generative model trained to learn a low-dimensional latent manifold consistent with the data geometry, which is then used to reconstruct the full data manifold.
|
|
100
|
+
|
|
101
|
+
The $\beta$-VAE implementation in `representation_learning/beta_vae` is copied and adapted from
|
|
102
|
+
[1Konny/Beta-VAE](https://github.com/1Konny/Beta-VAE), a PyTorch reproduction of the $\beta$-VAE models from Higgins
|
|
103
|
+
et al. (2017) and Burgess et al. (2018). The upstream project is distributed under the MIT License; the original
|
|
104
|
+
copyright and license text are included in `representation_learning/beta_vae/LICENSE`, and a package-level attribution
|
|
105
|
+
notice is included in `representation_learning/beta_vae/NOTICE.md`.
|
|
106
|
+
|
|
107
|
+
The results are compared to theoretical bounds proposed by Fefferman, Narayanan & Mitter (2016) and Genovese et al. (2012), assessing their tightness and dependence on curvature, reach, and sample density.
|
|
108
|
+
|
|
109
|
+
To reproduce the fitting of the manifolds:
|
|
110
|
+
|
|
111
|
+
- For the toy datasets run:
|
|
112
|
+
```
|
|
113
|
+
python experiment_scripts/toy_manifolds_experiment/fit_and_get_measures.py \
|
|
114
|
+
--output-path <path to output dir>/toy_datasets_fitting_mmls \
|
|
115
|
+
--n-range 25 505 5 \
|
|
116
|
+
--n-examples-per-size 20 \
|
|
117
|
+
--n-ground-truth 1_000 \
|
|
118
|
+
--max-workers 5 \
|
|
119
|
+
--fitting-method MMLS
|
|
120
|
+
|
|
121
|
+
python experiment_scripts/toy_manifolds_experiment/fit_and_get_measures.py \
|
|
122
|
+
--output-path <path to output dir>/toy_dataset_fitting_denoising_autoencoder \
|
|
123
|
+
--n-examples-per-size 5 \
|
|
124
|
+
--n-ground-truth 1_000 \
|
|
125
|
+
--max-workers 5 \
|
|
126
|
+
--fitting-method denoising_autoencoder_random_noise
|
|
127
|
+
```
|
|
128
|
+
- For the image datasets run:
|
|
129
|
+
```
|
|
130
|
+
COIL20_PATH=<path where you extracted coil-20-proc> python experiment_scripts/manifold_fitting/training.py --output-path <output path>
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
Examples of generated results can be found in the three notebooks under `notebooks/manifold_fitting`.
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import typer
|
|
5
|
+
from tqdm import tqdm
|
|
6
|
+
|
|
7
|
+
from microscope.datasets.generic_dataset_loader import DatasetName
|
|
8
|
+
from microscope.computations_grid.data_analysis.run_data_analysis import main as run_analysis
|
|
9
|
+
|
|
10
|
+
app = typer.Typer(pretty_exceptions_enable=False)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def analysis_on_model(
    inference_path: Path,
    output_path: Path,
    dataset_name: DatasetName,
    model_type: str,
    number_of_dims: int,
    only_evolution: bool,
    normalize_for_volume: bool,
    skip_done: bool,
    n_samples_for_plots: int = 50_000
) -> None:
    """Run the data analysis for one model, optionally skipping finished runs.

    When ``output_path`` already exists and ``skip_done`` is set, a notice is
    printed and nothing else happens. Otherwise the call is delegated to
    ``run_analysis`` with the remaining arguments passed through unchanged.

    Args:
        inference_path: Forwarded to ``run_analysis`` as ``inference_path``.
        output_path: Location whose existence marks the run as done; also
            forwarded to ``run_analysis`` as ``output_path``.
        dataset_name: Forwarded to ``run_analysis`` as ``dataset``.
        model_type: Forwarded to ``run_analysis`` unchanged.
        number_of_dims: Forwarded to ``run_analysis`` unchanged.
        only_evolution: Forwarded to ``run_analysis`` unchanged.
        normalize_for_volume: Forwarded to ``run_analysis`` unchanged.
        skip_done: If true, an existing ``output_path`` short-circuits the run.
        n_samples_for_plots: Forwarded to ``run_analysis`` unchanged.
    """
    run_label = output_path.name
    already_done = output_path.exists()

    # Guard clause: leave existing results alone when skipping is requested.
    if already_done and skip_done:
        print(f"Skipping {run_label} as it exists.")
        return None

    print(f"Analysis on {run_label}.")
    analysis_kwargs = {
        "inference_path": inference_path,
        "output_path": output_path,
        "dataset": dataset_name,
        "model_type": model_type,
        "number_of_dims": number_of_dims,
        "only_evolution": only_evolution,
        "normalize_for_volume": normalize_for_volume,
        "n_samples_for_plots": n_samples_for_plots,
    }
    run_analysis(**analysis_kwargs)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@app.command()
def run_analyses(
    inference_path: Path = typer.Option(...),
    output_path: Path = typer.Option(...),
    only_evolution: bool = True,
    normalize_for_volume: bool = True,
    skip_done: bool = True,
    n_samples_for_plots: int = 50_000
) -> None:
    # CLI command: call `analysis_on_model` once per configuration of a fixed
    # grid (dataset x model type x training ratio x number of dimensions x
    # noise sigma). The "__"-joined configuration values form a directory name
    # that is appended to both `inference_path` and `output_path`; the other
    # options are forwarded unchanged.
    datasets = ("custom_dsprites_balanced", "extended_coil20")
    model_types = ("beta_vae", "mae")
    training_ratios = (1.0,)
    # Dimensions 1 and 4 are currently left out of this grid.
    dims = (2, 3)
    # No noise.
    noise_sigmas = (0,)

    grid = list(
        itertools.product(datasets, model_types, training_ratios, dims, noise_sigmas)
    )

    for dataset_name, model_type, training_ratio_per_dim, number_of_dims, noise_sigma in tqdm(grid):
        # Dimension 4 is skipped for COIL20 and for the "mae" model type.
        if number_of_dims == 4 and (
            dataset_name == "extended_coil20" or model_type == "mae"
        ):
            continue

        model_dir = (
            f"{dataset_name}__{model_type}__{training_ratio_per_dim}"
            f"__{number_of_dims}__{noise_sigma}"
        )
        analysis_on_model(
            inference_path=inference_path / model_dir,
            output_path=output_path / model_dir,
            dataset_name=dataset_name,
            model_type=model_type,
            number_of_dims=number_of_dims,
            only_evolution=only_evolution,
            normalize_for_volume=normalize_for_volume,
            skip_done=skip_done,
            n_samples_for_plots=n_samples_for_plots
        )
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# Invoke the Typer application when this file is executed as a script.
if __name__ == "__main__":
    app()
|
|
File without changes
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import typer
|
|
5
|
+
from tqdm import tqdm
|
|
6
|
+
|
|
7
|
+
from microscope.datasets.generic_dataset_loader import DatasetName
|
|
8
|
+
from microscope.computations_grid.data_analysis.run_data_analysis import main as run_analysis
|
|
9
|
+
|
|
10
|
+
# Typer application object on which the CLI command of this script is
# registered; constructed with pretty exception output disabled.
app = typer.Typer(pretty_exceptions_enable=False)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def analysis_on_model(
    inference_path: Path,
    output_path: Path,
    dataset_name: DatasetName,
    model_type: str,
    number_of_dims: int,
    only_evolution: bool,
    normalize_for_volume: bool,
    skip_done: bool
) -> None:
    """Run the data analysis for a single model configuration.

    When ``output_path`` already exists and ``skip_done`` is true, a message
    is printed and nothing else happens. Otherwise every argument except
    ``skip_done`` is forwarded to ``run_analysis`` (``dataset_name`` is passed
    under the keyword ``dataset``).

    Args:
        inference_path: Forwarded to ``run_analysis`` as ``inference_path``.
        output_path: Forwarded to ``run_analysis`` as ``output_path``; its
            existence is also what triggers the skip.
        dataset_name: Forwarded to ``run_analysis`` as ``dataset``.
        model_type: Forwarded to ``run_analysis``.
        number_of_dims: Forwarded to ``run_analysis``.
        only_evolution: Forwarded to ``run_analysis``.
        normalize_for_volume: Forwarded to ``run_analysis``.
        skip_done: Skip the analysis if ``output_path`` already exists.
    """
    run_name = output_path.name

    if not (output_path.exists() and skip_done):
        print(f"Analysis on {run_name}.")
        forwarded = {
            "inference_path": inference_path,
            "output_path": output_path,
            "dataset": dataset_name,
            "model_type": model_type,
            "number_of_dims": number_of_dims,
            "only_evolution": only_evolution,
            "normalize_for_volume": normalize_for_volume,
        }
        run_analysis(**forwarded)
        return None

    print(f"Skipping {run_name} as it exists.")
    return None
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@app.command()
def run_analyses(
    inference_path: Path = typer.Option(...),
    output_path: Path = typer.Option(...),
    only_evolution: bool = True,
    normalize_for_volume: bool = True,
    skip_done: bool = True
) -> None:
    # CLI command: call `analysis_on_model` once per configuration of a fixed
    # grid (dataset x model type x training ratio x number of dimensions x
    # noise sigma). The "__"-joined configuration values form a directory name
    # that is appended to both `inference_path` and `output_path`; the other
    # options are forwarded unchanged.
    datasets = ("custom_dsprites_balanced", "extended_coil20")
    model_types = ("beta_vae", "mae")
    training_ratios = (0.4, 0.5, 0.6, 1.0)
    dims = (1, 2, 3, 4)
    # No noise.
    noise_sigmas = (0,)

    grid = list(
        itertools.product(datasets, model_types, training_ratios, dims, noise_sigmas)
    )

    for dataset_name, model_type, training_ratio_per_dim, number_of_dims, noise_sigma in tqdm(grid):
        # Dimension 4 is skipped for COIL20 and for the "mae" model type.
        if number_of_dims == 4 and (
            dataset_name == "extended_coil20" or model_type == "mae"
        ):
            continue

        model_dir = (
            f"{dataset_name}__{model_type}__{training_ratio_per_dim}"
            f"__{number_of_dims}__{noise_sigma}"
        )
        analysis_on_model(
            inference_path=inference_path / model_dir,
            output_path=output_path / model_dir,
            dataset_name=dataset_name,
            model_type=model_type,
            number_of_dims=number_of_dims,
            only_evolution=only_evolution,
            normalize_for_volume=normalize_for_volume,
            skip_done=skip_done
        )
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# Invoke the Typer application when this file is executed as a script.
if __name__ == "__main__":
    app()
|