topogeoml 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. topogeoml-0.0.3/.gitignore +68 -0
  2. topogeoml-0.0.3/CHANGELOG.md +145 -0
  3. topogeoml-0.0.3/LICENSE +21 -0
  4. topogeoml-0.0.3/PKG-INFO +449 -0
  5. topogeoml-0.0.3/README.md +377 -0
  6. topogeoml-0.0.3/examples/circles_vs_lines.py +93 -0
  7. topogeoml-0.0.3/examples/configs/synthetic_shapes.yaml +38 -0
  8. topogeoml-0.0.3/examples/outputs/shape_divergence_evidence.json +3841 -0
  9. topogeoml-0.0.3/examples/outputs/shape_divergence_plot.png +0 -0
  10. topogeoml-0.0.3/examples/outputs/synthetic_shapes_v001.json +85 -0
  11. topogeoml-0.0.3/examples/run_experiment.py +164 -0
  12. topogeoml-0.0.3/examples/shape_divergence_demo.py +365 -0
  13. topogeoml-0.0.3/pyproject.toml +190 -0
  14. topogeoml-0.0.3/tests/__init__.py +0 -0
  15. topogeoml-0.0.3/tests/conftest.py +42 -0
  16. topogeoml-0.0.3/tests/test_benchmarks.py +296 -0
  17. topogeoml-0.0.3/tests/test_benchmarks_cli.py +197 -0
  18. topogeoml-0.0.3/tests/test_benchmarks_coverage.py +605 -0
  19. topogeoml-0.0.3/tests/test_benchmarks_gudhi_hofer.py +287 -0
  20. topogeoml-0.0.3/tests/test_benchmarks_mnist.py +235 -0
  21. topogeoml-0.0.3/tests/test_benchmarks_report.py +261 -0
  22. topogeoml-0.0.3/tests/test_benchmarks_speed.py +96 -0
  23. topogeoml-0.0.3/tests/test_benchmarks_stats_bca_block.py +398 -0
  24. topogeoml-0.0.3/tests/test_benchmarks_stats_extras.py +213 -0
  25. topogeoml-0.0.3/tests/test_complexes.py +191 -0
  26. topogeoml-0.0.3/tests/test_configs.py +152 -0
  27. topogeoml-0.0.3/tests/test_cubical.py +109 -0
  28. topogeoml-0.0.3/tests/test_cubical_diff_ph.py +425 -0
  29. topogeoml-0.0.3/tests/test_diagrams.py +97 -0
  30. topogeoml-0.0.3/tests/test_diff_ph.py +219 -0
  31. topogeoml-0.0.3/tests/test_drive_pipeline.py +104 -0
  32. topogeoml-0.0.3/tests/test_embedding_audit.py +97 -0
  33. topogeoml-0.0.3/tests/test_feature_pipeline.py +184 -0
  34. topogeoml-0.0.3/tests/test_filtrations.py +106 -0
  35. topogeoml-0.0.3/tests/test_graph_lift.py +120 -0
  36. topogeoml-0.0.3/tests/test_hodge.py +126 -0
  37. topogeoml-0.0.3/tests/test_hodge_bench.py +1077 -0
  38. topogeoml-0.0.3/tests/test_library_coverage.py +779 -0
  39. topogeoml-0.0.3/tests/test_notebooks.py +79 -0
  40. topogeoml-0.0.3/tests/test_shape_of_learning.py +202 -0
  41. topogeoml-0.0.3/tests/test_signal.py +259 -0
  42. topogeoml-0.0.3/tests/test_topology_predicts_divergence.py +289 -0
  43. topogeoml-0.0.3/tests/test_vectorizers.py +122 -0
  44. topogeoml-0.0.3/topogeoml/__init__.py +91 -0
  45. topogeoml-0.0.3/topogeoml/_version.py +3 -0
  46. topogeoml-0.0.3/topogeoml/audits/__init__.py +13 -0
  47. topogeoml-0.0.3/topogeoml/audits/embedding_audit.py +200 -0
  48. topogeoml-0.0.3/topogeoml/core/__init__.py +41 -0
  49. topogeoml-0.0.3/topogeoml/core/complexes.py +287 -0
  50. topogeoml-0.0.3/topogeoml/core/cubical.py +140 -0
  51. topogeoml-0.0.3/topogeoml/core/diagrams.py +148 -0
  52. topogeoml-0.0.3/topogeoml/core/filtrations.py +135 -0
  53. topogeoml-0.0.3/topogeoml/core/vectorizers.py +222 -0
  54. topogeoml-0.0.3/topogeoml/data/__init__.py +5 -0
  55. topogeoml-0.0.3/topogeoml/data/graph_to_complex.py +108 -0
  56. topogeoml-0.0.3/topogeoml/experiments/__init__.py +28 -0
  57. topogeoml-0.0.3/topogeoml/experiments/configs.py +208 -0
  58. topogeoml-0.0.3/topogeoml/nn/__init__.py +11 -0
  59. topogeoml-0.0.3/topogeoml/nn/cubical_diff_ph.py +470 -0
  60. topogeoml-0.0.3/topogeoml/nn/diff_ph.py +483 -0
  61. topogeoml-0.0.3/topogeoml/nn/hodge.py +239 -0
  62. topogeoml-0.0.3/topogeoml/pipelines/__init__.py +5 -0
  63. topogeoml-0.0.3/topogeoml/pipelines/feature_pipeline.py +283 -0
  64. topogeoml-0.0.3/topogeoml/py.typed +0 -0
  65. topogeoml-0.0.3/topogeoml/services/__init__.py +3 -0
  66. topogeoml-0.0.3/topogeoml/signal/__init__.py +50 -0
  67. topogeoml-0.0.3/topogeoml/signal/delay_embedding.py +177 -0
  68. topogeoml-0.0.3/topogeoml/signal/sliding_window.py +302 -0
  69. topogeoml-0.0.3/topogeoml/training/__init__.py +15 -0
  70. topogeoml-0.0.3/topogeoml/training/callbacks.py +405 -0
  71. topogeoml-0.0.3/topogeoml/training/snapshot.py +100 -0
@@ -0,0 +1,68 @@
1
+ # Byte-compiled / optimized
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+
7
+ # Distribution / packaging
8
+ .Python
9
+ build/
10
+ dist/
11
+ *.egg-info/
12
+ *.egg
13
+ wheels/
14
+ .eggs/
15
+
16
+ # Virtual environments
17
+ .venv/
18
+ venv/
19
+ env/
20
+ ENV/
21
+
22
+ # Testing / coverage
23
+ .pytest_cache/
24
+ .coverage
25
+ .coverage.*
26
+ htmlcov/
27
+ .tox/
28
+ .hypothesis/
29
+ coverage.xml
30
+ *.cover
31
+
32
+ # Type checking
33
+ .mypy_cache/
34
+ .ruff_cache/
35
+ .pyre/
36
+ .pytype/
37
+
38
+ # Jupyter
39
+ .ipynb_checkpoints/
40
+ *.ipynb_checkpoints
41
+
42
+ # IDE
43
+ .vscode/
44
+ .idea/
45
+ *.swp
46
+ *~
47
+ .DS_Store
48
+
49
+ # Project
50
+ *.log
51
+ /data/
52
+ !topogeoml/data/
53
+ checkpoints/
54
+ runs/
55
+ wandb/
56
+ mlruns/
57
+ artifacts/
58
+
59
+ # Submission / scratch
60
+ scratch/
61
+ submissions/
62
+ *.parquet
63
+ *.pkl
64
+ *.npz
65
+
66
+ # OS
67
+ Thumbs.db
68
+ .DS_Store
@@ -0,0 +1,145 @@
1
+ # Changelog
2
+
3
+ All notable changes to TopoGeoML will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.0.3] — 2026-06-05
9
+
10
+ Packaging/distribution release — no library or result changes.
11
+
12
+ ### Added
13
+
14
+ - **PyPI distribution.** `topogeoml` is now published to PyPI via GitHub Actions Trusted Publishing (OIDC, no stored token), with a CycloneDX SBOM, build-provenance + SBOM attestations, and Sigstore signing — the same release pipeline used across the author's other packages. Install with `pip install topogeoml` (add the `[torch]` extra for the differentiable layers in `topogeoml.nn`).
15
+
16
+ ## [0.0.2] — 2026-05-24
17
+
18
+ Headline: **the framework has its first strict positive-difference real-data claim** (hypothesis 003, NCI1). The v0.0.2 release gate set in `docs/hypotheses/HYPOTHESIS-002-hodge-proteins.md` §5 ("strictly beats MLP at p_BH < 0.01") is met by the `hodge-mp-residual` arm on NCI1 (p_BH = 4.83 × 10⁻³, +8.6 pp). A preregistered hypothesis series (H001–H007, 27 falsifiable sub-predictions) investigates the mechanism through systematic elimination.
19
+
20
+ ### Added — empirical results (all 30-seed, BCa CIs, paired Wilcoxon + BH-FDR; per-seed reports in `notebooks/results/`)
21
+
22
+ - **PR #11 — Topology-divergence callback validated.** `ShapeOfLearningCallback.divergence_score` fires no later than a textbook val-loss-ratio watchdog on a controlled overfitting regime (200-sample `sklearn.load_digits`, p_raw = 5.77 × 10⁻⁴, r = +1.0).
23
+ - **PR #15 (hypothesis 001) — MUTAG ablation, five-arm matched-capacity.** Symmetric Laplacian normalisation is sufficient to make a one-layer Hodge MP match an MLP baseline on MUTAG (p_BH = 0.714). The combinatorial Laplacian underperforms by 9 pp (p_BH = 5.66 × 10⁻⁴). The residual variant *underperforms* MLP at this scale (p_BH = 0.019).
24
+ - **PR #16 (hypothesis 002) — PROTEINS replication.** Two-dataset equality holds for the symm-normalised arm (p_BH = 0.548). The MUTAG combinatorial-L harm does not replicate (Δ shrinks by ~10×). Strong "topology beats MLP" claim refuted on PROTEINS.
25
+ - **PR #19 (hypothesis 003) — NCI1, the headline.** On 4110 chemical-compound graphs, the symm-normalised + residual variant **strictly beats** MLP at p_BH = 4.83 × 10⁻³ (median Δ = +0.086, BCa 95% CI: [0.581, 0.625] vs MLP's [0.513, 0.566]). The residual variant *inverts* its verdict from MUTAG to NCI1 — residuals scale with dataset size for this architectural class.
26
+
27
+ ### Added — public API surface
28
+
29
+ **Neural-network layers** (requires torch)
30
+ - `topogeoml.nn.diff_ph` — differentiable Vietoris-Rips persistent homology via critical-edge indexing (Hofer 2017 / Carrière 2021). Public surface: `rips_diagram_torch`, `finite_lifetimes`, `total_persistence_loss`, `persistence_entropy_loss`, `betti_matching_loss`.
31
+ - `topogeoml.nn.cubical_diff_ph` — differentiable lower-star cubical persistent homology on 2-D/3-D images, with `CubicalTopologyLoss(nn.Module)` for image-segmentation training in the Clough et al. 2020 style.
32
+ - `topogeoml.training.ShapeOfLearningCallback` — empirically validated topology-divergence watchdog for PyTorch training loops (see PR #11 row above).
33
+
34
+ **Benchmark framework** (`benchmarks/`)
35
+ - 4 backends × 4 measurement axes with statistically defensible reporting; `python -m benchmarks` CLI with `--quick` smoke tier.
36
+ - `benchmarks.stats` — bootstrap CI (percentile, BCa, block), Mann-Whitney U + Cliff's δ, Wilcoxon signed-rank + rank-biserial, Benjamini-Hochberg FDR. All citations in module docstring.
37
+ - `benchmarks.hodge` — graph-classification subsystem with five matched-capacity classifier arms (combinatorial L, symm L̃, +residual, +2-stacked+residual, MLP control) and three dataset adapters (MUTAG, PROTEINS, NCI1).
38
+
39
+ **Documentation + discipline**
40
+ - `LEADERBOARD.md` — single navigable record of every empirical claim with reproducibility instructions and discipline rules.
41
+ - `docs/hypotheses/HYPOTHESIS-001-hodge-mutag.md` through `HYPOTHESIS-003-hodge-nci1.md` — preregistered hypothesis docs with falsifiable sub-predictions and post-hoc resolved outcomes.
42
+
43
+ ### Changed
44
+
45
+ - The README's roadmap is narrower and honest: no "drift-tensor", no "TOPOLOGICA proprietary", no "GPU-batched", no peer-review or DOI promises. v0.0.2 is gated on the NCI1 positive claim; v1.0 is conditional on a deeper empirical record and a methods paper.
46
+ - The "Hodge MP layer: minimal, not state-of-the-art" caveat in `LIMITATIONS.md` §1.8 is preserved; the framework now ships *five* Hodge arms in the benchmark, and the architectural element that produces the NCI1 win (residual + normalisation) is named explicitly.
47
+
48
+ ### Fixed
49
+
50
+ - **Critical bug in `benchmarks/hodge/models.py` (PR #12).** The original `HodgeMessagePassing` layer was instantiated inside `forward_one()` per graph, so its weights were never registered with the optimizer and were re-randomised on every forward call. The pre-PR-#12 MUTAG numbers measured a 2-layer MLP through a random topology filter, not the Hodge architecture. Two regression tests prevent recurrence.
51
+ - **`benchmarks/cli.py` exit-code logic** — `SkippedNonDifferentiable` and `UnavailableBackend` cells no longer count as failures (PR #16, #17). Real cell failures now surface on stderr.
52
+ - **Bench workflow torch/torchvision ABI mismatch** — install both from the CPU wheel index so `torchvision::nms` resolves (PR #17).
53
+
54
+ ### Deferred indefinitely (no implementation timeline)
55
+ - PH metric cascade (Euclidean → Spectral → Fermat)
56
+ - TopoNetX integration for non-simplicial complexes
57
+ - GPU-batched Rips
58
+ - MLflow / W&B tracking
59
+ - Multi-rank simplicial neural network (full SCN)
60
+ - Real DRIVE numbers — pipeline shipped (PR #9), user-side GPU run pending
61
+
62
+ ### Added — academic infrastructure
63
+
64
+ - `docs/RESEARCH_REPORT.md` — structured technical report documenting the full preregistered hypothesis series (H001–H007) with results, discussion, and bounded claims
65
+ - `CITATION.cff` — CFF 1.2.0 machine-readable citation (GitHub renders "Cite this repository")
66
+ - `.zenodo.json` — Zenodo deposit metadata for DOI minting
67
+ - `CONTRIBUTING.md` — academic collaboration guidelines (preregistration pattern, code standards, statistical discipline)
68
+ - `REPRODUCING.md` — per-hypothesis reproduction guide with commands, wall-clock estimates, and expected outputs
69
+
70
+ ### Changed
71
+
72
+ - `pyproject.toml`: version bumped to 0.0.2, classifier updated to "Development Status :: 4 - Beta", URLs fixed to `smaniches/TopoGeoML`
73
+ - `README.md`: badges updated, status section rewritten to reflect positive results, mechanism-investigation section added, roadmap updated, citation section updated with CITATION.cff reference
74
+
75
+ ### Quality gates for v0.0.2
76
+ - `ruff check topogeoml tests benchmarks scripts notebooks`: clean
77
+ - `pytest`: 497 passed (up from 118 at v0.0.1)
78
+ - `coverage(topogeoml/ + benchmarks/)`: 100%
79
+ - `mypy topogeoml`: 0 errors (CI enforcement deferred to a separate PR pending constrained-env reproduction)
80
+ - 6 CI workflows on main, all green
81
+
82
+ ## [0.0.1] — 2026-05-20
83
+
84
+ Initial pre-stable release. The eleven-item v0.0.1 scope lock is fully implemented.
85
+
86
+ ### Added
87
+
88
+ **Core mathematical objects**
89
+ - `PersistenceDiagram` frozen dataclass with mandatory `DiagramProvenance`
90
+ - `RipsFiltration` via ripser with float64 enforcement and provenance recording
91
+ - `PersistenceImageVectorizer` (Adams et al. 2017) via persim
92
+ - `BettiCurveVectorizer` (vectorized sampling on uniform grid)
93
+ - `SimplicialComplex` with lexicographic simplex ordering and automatic face closure
94
+ - `boundary_matrix(k)` — signed sparse boundary operator ∂_k over R
95
+ - `is_chain_complex` — verifies ∂_{k-1} ∂_k = 0 within numerical tolerance
96
+ - `hodge_laplacian(k)` — symmetric PSD sparse Laplacian L_k = ∂_k^T ∂_k + ∂_{k+1} ∂_{k+1}^T
97
+ - `betti_numbers` via dense eigendecomposition (discrete Hodge theorem)
98
+ - `cubical_mask_diagnostic` — β_0/β_1 + Euler characteristic for binary 2D masks (3D β_0 only)
99
+
100
+ **Data adapters**
101
+ - `graph_to_clique_complex` — NetworkX graph or adjacency matrix → SimplicialComplex with bounded max_dim
102
+
103
+ **Pipelines**
104
+ - `TopologyFeaturePipeline` — sklearn `BaseEstimator + TransformerMixin`, supports list-of-arrays and 3D ndarray inputs, captures `FitProvenance`
105
+
106
+ **Audits**
107
+ - `audit_embedding` — Rips-based topology audit of embedding matrices with NN-distance threshold heuristic
108
+ - `EmbeddingTopologyAudit` dataclass with β_0/β_1 estimates, total persistence, longest H_1 lifetime
109
+
110
+ **Neural-network layers (requires torch)**
111
+ - `HodgeMessagePassing` — minimal one-round propagation x' = σ(L̃_k @ x @ W + b)
112
+ - `normalize_hodge_laplacian` — symmetric normalization D^{-1/2} L D^{-1/2}
113
+ - `sparse_scipy_to_torch` — sparse format converter
114
+ - `build_hodge_layer_from_complex` — convenience constructor
115
+
116
+ **Experiments**
117
+ - `load_experiment_config` — YAML loader with dataclass validation
118
+ - `write_results` — JSON writer with config echo, environment snapshot, UTC timestamp, numpy-aware serialization
119
+ - `ExperimentConfig`, `DatasetConfig`, `PipelineConfig`, `ValidationConfig`, `OutputConfig` dataclasses
120
+ - `examples/run_experiment.py` — end-to-end YAML → JSON runner
121
+ - `examples/configs/synthetic_shapes.yaml` — first benchmark config
122
+
123
+ **Documentation**
124
+ - `LIMITATIONS.md` — explicit scope cuts, failure modes, and unvalidated claims
125
+ - `README.md` — quick-start for each of the 11 items, architecture diagram, contracts, citation block
126
+
127
+ **Infrastructure**
128
+ - `pyproject.toml` (hatchling backend, MIT license, optional extras: `torch`, `tda`, `higher-order`, `api`, `dev`, `all`)
129
+ - GitHub Actions CI matrix (Python 3.11/3.12 × Ubuntu/macOS)
130
+ - PEP 561 `py.typed` marker
131
+
132
+ ### Verified
133
+ - Boundary identity ∂² = 0 on triangle, tetrahedron, two-triangle complexes
134
+ - Hodge β recovery: D² (1,0,0); S¹ (1,1); S² (1,0,1); disjoint vertices; disjoint triangles
135
+ - L_0 on path graph reduces to standard combinatorial graph Laplacian
136
+ - Cubical β on disk (1,0), annulus (1,1), two disks (2,0), disk-with-two-holes (1,2)
137
+ - Clique complex topology on K_3, K_4, C_4, K_4-boundary
138
+ - Hodge MP forward/backward, gradient flow, shape contracts, layer stacking
139
+ - Embedding audit on single-circle (β_1=1), two-circle (β_1=2) layouts
140
+ - YAML round-trip, JSON output schema with mandatory fields, numpy serialization
141
+ - End-to-end synthetic-shapes benchmark: 5-fold CV accuracy 1.0000 ± 0.0000
142
+
143
+ ### Test suite
144
+
145
+ 118 tests passing, 3 skipped (torch-gated: Hodge MP layer, differentiable PH, ShapeOfLearning callback — collected and run only when the `[torch]` extra is installed). Verified on Python 3.11 and 3.12, Ubuntu and macOS, via `.github/workflows/ci.yml`.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Santiago Maniches / TOPOLOGICA LLC
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.