plsdo 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- plsdo-0.0.1/.github/ISSUE_TEMPLATE/bug_report.yml +48 -0
- plsdo-0.0.1/.github/ISSUE_TEMPLATE/config.yml +1 -0
- plsdo-0.0.1/.github/ISSUE_TEMPLATE/docs_issue.yml +30 -0
- plsdo-0.0.1/.github/ISSUE_TEMPLATE/feature_request.yml +29 -0
- plsdo-0.0.1/.github/workflows/ci.yml +52 -0
- plsdo-0.0.1/.gitignore +22 -0
- plsdo-0.0.1/CLAUDE.md +68 -0
- plsdo-0.0.1/LICENSE +28 -0
- plsdo-0.0.1/PKG-INFO +145 -0
- plsdo-0.0.1/README.md +110 -0
- plsdo-0.0.1/docs/input-format.md +60 -0
- plsdo-0.0.1/docs/interpreting-output.md +70 -0
- plsdo-0.0.1/docs/missing-data.md +42 -0
- plsdo-0.0.1/docs/usage.md +88 -0
- plsdo-0.0.1/plsdo/__init__.py +3 -0
- plsdo-0.0.1/plsdo/cli.py +237 -0
- plsdo-0.0.1/plsdo/core.py +179 -0
- plsdo-0.0.1/plsdo/cross_validate.py +157 -0
- plsdo-0.0.1/plsdo/io.py +447 -0
- plsdo-0.0.1/plsdo/pipeline.py +735 -0
- plsdo-0.0.1/plsdo/plotting.py +707 -0
- plsdo-0.0.1/pyproject.toml +60 -0
- plsdo-0.0.1/tests/conftest.py +62 -0
- plsdo-0.0.1/tests/data/behaviour.csv +13 -0
- plsdo-0.0.1/tests/data/behaviour_meta.csv +5 -0
- plsdo-0.0.1/tests/data/brain.csv +13 -0
- plsdo-0.0.1/tests/data/brain_meta.csv +6 -0
- plsdo-0.0.1/tests/data/demographics.csv +13 -0
- plsdo-0.0.1/tests/data/groups.yaml +8 -0
- plsdo-0.0.1/tests/test_cli.py +242 -0
- plsdo-0.0.1/tests/test_core.py +271 -0
- plsdo-0.0.1/tests/test_cross_validate.py +73 -0
- plsdo-0.0.1/tests/test_io.py +337 -0
- plsdo-0.0.1/tests/test_plotting.py +352 -0
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
name: Bug report
|
|
2
|
+
description: Report something that is not working correctly
|
|
3
|
+
labels: ["type: bug"]
|
|
4
|
+
body:
|
|
5
|
+
- type: textarea
|
|
6
|
+
id: command
|
|
7
|
+
attributes:
|
|
8
|
+
label: Command run
|
|
9
|
+
description: The exact `plsdo` invocation, including all flags
|
|
10
|
+
placeholder: "plsdo run --method c --x X.csv --y Y.csv --demographics demo.csv --output results/"
|
|
11
|
+
validations:
|
|
12
|
+
required: true
|
|
13
|
+
|
|
14
|
+
- type: textarea
|
|
15
|
+
id: expected
|
|
16
|
+
attributes:
|
|
17
|
+
label: Expected behaviour
|
|
18
|
+
description: What should have happened
|
|
19
|
+
validations:
|
|
20
|
+
required: true
|
|
21
|
+
|
|
22
|
+
- type: textarea
|
|
23
|
+
id: actual
|
|
24
|
+
attributes:
|
|
25
|
+
label: Actual behaviour
|
|
26
|
+
description: What happened instead — include the full error message and traceback if applicable
|
|
27
|
+
validations:
|
|
28
|
+
required: true
|
|
29
|
+
|
|
30
|
+
- type: textarea
|
|
31
|
+
id: input
|
|
32
|
+
attributes:
|
|
33
|
+
label: Input description
|
|
34
|
+
description: >
|
|
35
|
+
Describe the shape and content of your input files (number of subjects, features, groups).
|
|
36
|
+
Do not attach real data — synthetic or anonymised descriptions are fine.
|
|
37
|
+
placeholder: "X: 42 subjects × 120 features, Y: 42 subjects × 8 scores, 3 groups"
|
|
38
|
+
validations:
|
|
39
|
+
required: false
|
|
40
|
+
|
|
41
|
+
- type: textarea
|
|
42
|
+
id: environment
|
|
43
|
+
attributes:
|
|
44
|
+
label: Environment
|
|
45
|
+
description: Python version, operating system, and plsdo version (found in `log.txt` under `version:`)
|
|
46
|
+
placeholder: "Python 3.11, macOS 14.4, plsdo 0.1.0"
|
|
47
|
+
validations:
|
|
48
|
+
required: true
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
blank_issues_enabled: true
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
name: Documentation issue
|
|
2
|
+
description: Report missing, incorrect, or unclear documentation — including missing references
|
|
3
|
+
labels: ["type: docs"]
|
|
4
|
+
body:
|
|
5
|
+
- type: textarea
|
|
6
|
+
id: description
|
|
7
|
+
attributes:
|
|
8
|
+
label: What is missing or incorrect
|
|
9
|
+
description: Describe the gap or error
|
|
10
|
+
validations:
|
|
11
|
+
required: true
|
|
12
|
+
|
|
13
|
+
- type: input
|
|
14
|
+
id: location
|
|
15
|
+
attributes:
|
|
16
|
+
label: Location
|
|
17
|
+
description: Which doc page, CLI help text, log output, or plot label is affected?
|
|
18
|
+
placeholder: "docs/interpreting-output.md, section on bootstrap ratios"
|
|
19
|
+
validations:
|
|
20
|
+
required: false
|
|
21
|
+
|
|
22
|
+
- type: textarea
|
|
23
|
+
id: reference
|
|
24
|
+
attributes:
|
|
25
|
+
label: Reference or source
|
|
26
|
+
description: >
|
|
27
|
+
Optional. Cite the paper, source, or example that should be referenced or used to correct the docs.
|
|
28
|
+
placeholder: "McIntosh & Lobaugh (2004), Partial least squares analysis of neuroimaging data, doi:10.1016/j.neuroimage.2004.07.020"
|
|
29
|
+
validations:
|
|
30
|
+
required: false
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
name: Feature request
|
|
2
|
+
description: Propose a new capability or enhancement
|
|
3
|
+
labels: ["type: feature"]
|
|
4
|
+
body:
|
|
5
|
+
- type: textarea
|
|
6
|
+
id: motivation
|
|
7
|
+
attributes:
|
|
8
|
+
label: Use case and motivation
|
|
9
|
+
description: What problem does this solve, and for whom?
|
|
10
|
+
validations:
|
|
11
|
+
required: true
|
|
12
|
+
|
|
13
|
+
- type: textarea
|
|
14
|
+
id: proposal
|
|
15
|
+
attributes:
|
|
16
|
+
label: Proposed behaviour
|
|
17
|
+
description: How should the feature work? Include proposed CLI flags, output changes, or API behaviour as applicable.
|
|
18
|
+
validations:
|
|
19
|
+
required: true
|
|
20
|
+
|
|
21
|
+
- type: textarea
|
|
22
|
+
id: reference
|
|
23
|
+
attributes:
|
|
24
|
+
label: Literature reference
|
|
25
|
+
description: >
|
|
26
|
+
Optional. If the request relates to a specific PLS variant, statistical method, or published analysis,
|
|
27
|
+
cite the paper here (author, year, DOI or URL if available).
|
|
28
|
+
validations:
|
|
29
|
+
required: false
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
name: Test (Python ${{ matrix.python-version }})
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
strategy:
|
|
14
|
+
fail-fast: false
|
|
15
|
+
matrix:
|
|
16
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
17
|
+
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v4
|
|
20
|
+
|
|
21
|
+
- uses: actions/setup-python@v5
|
|
22
|
+
with:
|
|
23
|
+
python-version: ${{ matrix.python-version }}
|
|
24
|
+
|
|
25
|
+
- name: Install uv
|
|
26
|
+
run: pip install uv
|
|
27
|
+
|
|
28
|
+
- name: Install package and dev dependencies
|
|
29
|
+
run: uv pip install --system -e ".[dev]"
|
|
30
|
+
|
|
31
|
+
- name: Run tests
|
|
32
|
+
run: pytest -v
|
|
33
|
+
|
|
34
|
+
lint:
|
|
35
|
+
name: Lint
|
|
36
|
+
runs-on: ubuntu-latest
|
|
37
|
+
|
|
38
|
+
steps:
|
|
39
|
+
- uses: actions/checkout@v4
|
|
40
|
+
|
|
41
|
+
- uses: actions/setup-python@v5
|
|
42
|
+
with:
|
|
43
|
+
python-version: "3.12"
|
|
44
|
+
|
|
45
|
+
- name: Install uv
|
|
46
|
+
run: pip install uv
|
|
47
|
+
|
|
48
|
+
- name: Install ruff
|
|
49
|
+
run: uv pip install --system ruff
|
|
50
|
+
|
|
51
|
+
- name: Run ruff
|
|
52
|
+
run: ruff check plsdo/ tests/
|
plsdo-0.0.1/.gitignore
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
dist/
|
|
6
|
+
build/
|
|
7
|
+
|
|
8
|
+
# Virtual environments
|
|
9
|
+
.venv/
|
|
10
|
+
|
|
11
|
+
# IDE
|
|
12
|
+
.idea/
|
|
13
|
+
.vscode/
|
|
14
|
+
|
|
15
|
+
# OS
|
|
16
|
+
.DS_Store
|
|
17
|
+
|
|
18
|
+
# uv
|
|
19
|
+
uv.lock
|
|
20
|
+
|
|
21
|
+
# Local dev / agentic reference files (not for distribution)
|
|
22
|
+
.dev/
|
plsdo-0.0.1/CLAUDE.md
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Commands
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Install for development (from repo root, with .venv active)
|
|
9
|
+
uv pip install -e ".[dev]"
|
|
10
|
+
|
|
11
|
+
# Run all tests
|
|
12
|
+
.venv/bin/pytest tests/
|
|
13
|
+
|
|
14
|
+
# Run a single test file
|
|
15
|
+
.venv/bin/pytest tests/test_core.py
|
|
16
|
+
|
|
17
|
+
# Run a single test by name
|
|
18
|
+
.venv/bin/pytest tests/test_core.py::TestBootstrap::test_seed_reproducibility
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Architecture
|
|
22
|
+
|
|
23
|
+
The package has a strict separation of concerns across six modules:
|
|
24
|
+
|
|
25
|
+
- **`io.py`** — everything that touches files or validates inputs: loading CSVs, detecting subject IDs, aligning subjects, checking missing values and variance, z-scoring, parsing YAML group configs, loading feature metadata, and building the dummy-coded design matrix.
|
|
26
|
+
- **`core.py`** — the `PLS` class. Stateful: takes z-scored arrays, runs `fit()` → `permutation_test()` → `bootstrap()` → `filter_lvs()` in sequence. Stores results as instance attributes.
|
|
27
|
+
- **`cross_validate.py`** — `run_cv()` and `permutation_test_cv()`. Uses `sklearn.cross_decomposition.PLSRegression` (not the SVD-based `PLS` class) because prediction requires `predict()`. Entirely independent of `core.py`.
|
|
28
|
+
- **`plotting.py`** — stateless functions. All take data arrays and an `out_path`, save the figure, return nothing. `meta_colours()` is here too (not in pipeline).
|
|
29
|
+
- **`pipeline.py`** — orchestration only. Calls `io` → `core` → `plotting` in sequence, writes CSVs and `log.txt`. No computation here.
|
|
30
|
+
- **`cli.py`** — argument parsing and validation only. Dispatches to `pipeline.run_pipeline()` or `pipeline.cross_validate_pipeline()`.
|
|
31
|
+
|
|
32
|
+
### Key design decisions
|
|
33
|
+
|
|
34
|
+
**One SVD engine for both PLS variants.** Correlational PLS z-scores both X and Y; discriminatory PLS uses a dummy-coded X (not z-scored) and z-scores Y only. The `PLS` class handles both — `pipeline.py` builds the right inputs before calling it.
|
|
35
|
+
|
|
36
|
+
**Bootstrap uses Procrustes + sign correction.** Each bootstrap SVD is aligned to the reference via `scipy.linalg.orthogonal_procrustes` on Vt, then signs are corrected by dot product with the reference Vt loadings. Both U and Vt loadings are aligned together.
|
|
37
|
+
|
|
38
|
+
**LV filtering is two-stage.** `filter_lvs()` keeps LVs that (1) are significant by permutation (p < 0.05) and (2) have at least one feature with |bootstrap ratio| > 1.96 on *both* the X and Y sides. Result is a boolean `final_lvs` mask.
|
|
39
|
+
|
|
40
|
+
**CV flips X and Y.** `cross_validate.py` uses Y (continuous data) as the predictor and dummy-coded groups as the target, so `pls.predict()` gives predicted group scores. This is the opposite convention from `plsdo run`.
|
|
41
|
+
|
|
42
|
+
## Design philosophy
|
|
43
|
+
|
|
44
|
+
Four principles, in priority order:
|
|
45
|
+
|
|
46
|
+
1. **Mathematical validity** — correctness is non-negotiable.
|
|
47
|
+
2. **Lightweight** — no unnecessary dependencies. Every dependency must earn its place.
|
|
48
|
+
3. **Scientific Python standards** — follow community conventions so the package is citable, installable, and maintainable.
|
|
49
|
+
4. **Glass-box and FAIR** — output everything needed to reproduce a result; keep the implementation transparent.
|
|
50
|
+
|
|
51
|
+
Practical consequences: prefer stdlib over third-party where reasonable (argparse over click, logging over print). Do not add inference or statistical tests beyond PLS itself — plotting scores by group factors is in scope; pairwise post-hoc tests are not. When in doubt, do less.
|
|
52
|
+
|
|
53
|
+
**Efficiency** is part of correctness. Prefer vectorised NumPy operations over Python loops wherever the maths permits — not for micro-optimisation, but because this code runs on HPCs and environmental cost is real. If a loop can be replaced by array operations without adding complexity or obscuring intent, replace it.
|
|
54
|
+
|
|
55
|
+
**Robustness** sits inside principle 1, not alongside it. Validate aggressively anywhere a silent failure could propagate — at file boundaries and wherever mathematical assumptions could be violated (zero variance before z-scoring, empty group levels before dummy coding, etc.). Fail loudly with informative errors. Trust internal transformations between already-validated states; defensive checks between modules add noise without catching anything real.
|
|
56
|
+
|
|
57
|
+
## Conventions
|
|
58
|
+
|
|
59
|
+
- British English in all prose: docs, commit messages, user-facing strings, comments.
|
|
60
|
+
- Commit messages use conventional prefixes: `feat`, `fix`, `enh`, `ref`, `test`, `docs`, `chore`. The user commits with a GPG key, so stage files and provide the message text only (with attribution for Claude); do not run `git commit`.
|
|
61
|
+
- `plsdo/` contains no data. Test data lives in `tests/data/` (synthetic, small).
|
|
62
|
+
- Reference notebooks (`.dev/correlational_pls.ipynb`, `.dev/discriminatory_pls.ipynb`, `.dev/claude_cross_validation.py`) are the source of truth for computational steps and plot styling. Deviations require discussion. These files are gitignored and live only in your local working copy.
|
|
63
|
+
- `.dev/superpowers/specs/` and `.dev/superpowers/plans/` contain the design spec and implementation plan. Consult them before making structural changes. These files are gitignored.
|
|
64
|
+
|
|
65
|
+
## Before public release / PyPI submission
|
|
66
|
+
- Claim the `plsdo` package name on PyPI before announcing the package publicly — squatting is a real risk once there is any visibility.
|
|
67
|
+
- Update `README.md` and `docs/usage.md` installation instructions from `git clone` to `pip install plsdo` once the package is published.
|
|
68
|
+
- Bump version to `1.0.0` and update the `Development Status` classifier to `4 - Beta` or `5 - Production/Stable` as appropriate.
|
plsdo-0.0.1/LICENSE
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
BSD 3-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026, Eilidh MacNicol
|
|
4
|
+
|
|
5
|
+
Redistribution and use in source and binary forms, with or without
|
|
6
|
+
modification, are permitted provided that the following conditions are met:
|
|
7
|
+
|
|
8
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
9
|
+
list of conditions and the following disclaimer.
|
|
10
|
+
|
|
11
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
12
|
+
this list of conditions and the following disclaimer in the documentation
|
|
13
|
+
and/or other materials provided with the distribution.
|
|
14
|
+
|
|
15
|
+
3. Neither the name of the copyright holder nor the names of its
|
|
16
|
+
contributors may be used to endorse or promote products derived from
|
|
17
|
+
this software without specific prior written permission.
|
|
18
|
+
|
|
19
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
20
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
21
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
22
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
23
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
24
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
25
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
26
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
27
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
28
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
plsdo-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: plsdo
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: PLS covariance analysis with statistical testing and visualisation
|
|
5
|
+
Project-URL: Repository, https://github.com/braincentrekcl/plsdo
|
|
6
|
+
Project-URL: Issues, https://github.com/braincentrekcl/plsdo/issues
|
|
7
|
+
Author: Eilidh MacNicol
|
|
8
|
+
Maintainer: Eilidh MacNicol
|
|
9
|
+
License-Expression: BSD-3-Clause
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: PLS,analysis,multivariate,statistics
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Requires-Dist: matplotlib>=3.7
|
|
23
|
+
Requires-Dist: numpy>=1.24
|
|
24
|
+
Requires-Dist: pandas>=2.0
|
|
25
|
+
Requires-Dist: pyyaml>=6.0
|
|
26
|
+
Requires-Dist: scipy>=1.10
|
|
27
|
+
Requires-Dist: seaborn>=0.13
|
|
28
|
+
Provides-Extra: cv
|
|
29
|
+
Requires-Dist: scikit-learn>=1.2; extra == 'cv'
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
32
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
33
|
+
Requires-Dist: scikit-learn; extra == 'dev'
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
35
|
+
|
|
36
|
+
# plsdo
|
|
37
|
+
|
|
38
|
+
Partial Least Squares (PLS) covariance analysis with permutation testing, bootstrap reliability, and publication-ready visualisation — from the command line.
|
|
39
|
+
|
|
40
|
+
(Pronounced: "please do")
|
|
41
|
+
|
|
42
|
+
`plsdo` was built out of necessity for project-specific neuroscience and neuroimaging pipelines, then generalised to handle flexible, diverse datasets beyond its origins. It implements two PLS variants used in neuroimaging and cognitive neuroscience research:
|
|
43
|
+
|
|
44
|
+
- **Correlational PLS** — finds latent variables that maximise covariance between two continuous data matrices (e.g. brain measures and behaviour scores).
|
|
45
|
+
- **Discriminatory PLS** — finds latent variables that maximise covariance between a continuous data matrix and a dummy-coded group matrix (i.e. group differences).
|
|
46
|
+
|
|
47
|
+
Statistical validity is built in: every analysis runs a permutation test on singular values and bootstraps loading stability. Only latent variables that pass both tests appear in the output.
|
|
48
|
+
|
|
49
|
+
> **Early alpha.** The API and output format may change before the first stable release. Feedback and bug reports are very welcome — please open an issue.
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Installation
|
|
54
|
+
|
|
55
|
+
Requires Python ≥ 3.10.
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
git clone https://github.com/braincentrekcl/plsdo.git
|
|
59
|
+
cd plsdo
|
|
60
|
+
uv venv .venv && source .venv/bin/activate
|
|
61
|
+
uv pip install -e .
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
For discriminatory PLS with cross-validation (requires scikit-learn):
|
|
65
|
+
```bash
|
|
66
|
+
uv pip install -e ".[cv]"
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Quick start
|
|
72
|
+
|
|
73
|
+
### Correlational PLS
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
plsdo run --method c \
|
|
77
|
+
--x brain_measures.csv \
|
|
78
|
+
--y behaviour_scores.csv \
|
|
79
|
+
--demographics participants.csv \
|
|
80
|
+
--group-col treatment \
|
|
81
|
+
--subject-id participant_id \
|
|
82
|
+
--output results/
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Discriminatory PLS
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
plsdo run --method d \
|
|
89
|
+
--y mri_features.csv \
|
|
90
|
+
--demographics participants.csv \
|
|
91
|
+
--group-col drug_group \
|
|
92
|
+
--subject-id participant_id \
|
|
93
|
+
--output results/
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Cross-validation (discriminatory only)
|
|
97
|
+
|
|
98
|
+
Requires `plsdo[cv]` — see Installation above.
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
plsdo cross-validate \
|
|
102
|
+
--y mri_features.csv \
|
|
103
|
+
--demographics participants.csv \
|
|
104
|
+
--group-col drug_group \
|
|
105
|
+
--subject-id participant_id \
|
|
106
|
+
--output cv_results/
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Output
|
|
112
|
+
|
|
113
|
+
Each run writes to the output directory:
|
|
114
|
+
|
|
115
|
+
```
|
|
116
|
+
results/
|
|
117
|
+
figures/ cross-correlation heatmap, permutation test, loading bar plots, score plots
|
|
118
|
+
data/ singular values, p-values, loadings, bootstrap ratios, subject scores (CSV)
|
|
119
|
+
log.txt parameters and version stamp
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## Documentation
|
|
125
|
+
|
|
126
|
+
| Page | Contents |
|
|
127
|
+
|------|----------|
|
|
128
|
+
| [Usage guide](docs/usage.md) | Full CLI options, multiple grouping variables, all flags |
|
|
129
|
+
| [Input format](docs/input-format.md) | How to structure X, Y, demographics, and metadata files |
|
|
130
|
+
| [Interpreting output](docs/interpreting-output.md) | What each plot and CSV means |
|
|
131
|
+
| [Missing data](docs/missing-data.md) | Why plsdo does not impute, and what to do instead |
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## Contributing
|
|
136
|
+
|
|
137
|
+
Issues and pull requests are welcome. Please open an issue before starting significant work.
|
|
138
|
+
|
|
139
|
+
Contact: eilidh [dot] macnicol [at] kcl [dot] ac [dot] uk
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## Licence
|
|
144
|
+
|
|
145
|
+
BSD 3-Clause. See [LICENSE](LICENSE).
|
plsdo-0.0.1/README.md
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# plsdo
|
|
2
|
+
|
|
3
|
+
Partial Least Squares (PLS) covariance analysis with permutation testing, bootstrap reliability, and publication-ready visualisation — from the command line.
|
|
4
|
+
|
|
5
|
+
(Pronounced: "please do")
|
|
6
|
+
|
|
7
|
+
`plsdo` was built out of necessity for project-specific neuroscience and neuroimaging pipelines, then generalised to handle flexible, diverse datasets beyond its origins. It implements two PLS variants used in neuroimaging and cognitive neuroscience research:
|
|
8
|
+
|
|
9
|
+
- **Correlational PLS** — finds latent variables that maximise covariance between two continuous data matrices (e.g. brain measures and behaviour scores).
|
|
10
|
+
- **Discriminatory PLS** — finds latent variables that maximise covariance between a continuous data matrix and a dummy-coded group matrix (i.e. group differences).
|
|
11
|
+
|
|
12
|
+
Statistical validity is built in: every analysis runs a permutation test on singular values and bootstraps loading stability. Only latent variables that pass both tests appear in the output.
|
|
13
|
+
|
|
14
|
+
> **Early alpha.** The API and output format may change before the first stable release. Feedback and bug reports are very welcome — please open an issue.
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
Requires Python ≥ 3.10.
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
git clone https://github.com/braincentrekcl/plsdo.git
|
|
24
|
+
cd plsdo
|
|
25
|
+
uv venv .venv && source .venv/bin/activate
|
|
26
|
+
uv pip install -e .
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
For discriminatory PLS with cross-validation (requires scikit-learn):
|
|
30
|
+
```bash
|
|
31
|
+
uv pip install -e ".[cv]"
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Quick start
|
|
37
|
+
|
|
38
|
+
### Correlational PLS
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
plsdo run --method c \
|
|
42
|
+
--x brain_measures.csv \
|
|
43
|
+
--y behaviour_scores.csv \
|
|
44
|
+
--demographics participants.csv \
|
|
45
|
+
--group-col treatment \
|
|
46
|
+
--subject-id participant_id \
|
|
47
|
+
--output results/
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Discriminatory PLS
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
plsdo run --method d \
|
|
54
|
+
--y mri_features.csv \
|
|
55
|
+
--demographics participants.csv \
|
|
56
|
+
--group-col drug_group \
|
|
57
|
+
--subject-id participant_id \
|
|
58
|
+
--output results/
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Cross-validation (discriminatory only)
|
|
62
|
+
|
|
63
|
+
Requires `plsdo[cv]` — see Installation above.
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
plsdo cross-validate \
|
|
67
|
+
--y mri_features.csv \
|
|
68
|
+
--demographics participants.csv \
|
|
69
|
+
--group-col drug_group \
|
|
70
|
+
--subject-id participant_id \
|
|
71
|
+
--output cv_results/
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Output
|
|
77
|
+
|
|
78
|
+
Each run writes to the output directory:
|
|
79
|
+
|
|
80
|
+
```
|
|
81
|
+
results/
|
|
82
|
+
figures/ cross-correlation heatmap, permutation test, loading bar plots, score plots
|
|
83
|
+
data/ singular values, p-values, loadings, bootstrap ratios, subject scores (CSV)
|
|
84
|
+
log.txt parameters and version stamp
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## Documentation
|
|
90
|
+
|
|
91
|
+
| Page | Contents |
|
|
92
|
+
|------|----------|
|
|
93
|
+
| [Usage guide](docs/usage.md) | Full CLI options, multiple grouping variables, all flags |
|
|
94
|
+
| [Input format](docs/input-format.md) | How to structure X, Y, demographics, and metadata files |
|
|
95
|
+
| [Interpreting output](docs/interpreting-output.md) | What each plot and CSV means |
|
|
96
|
+
| [Missing data](docs/missing-data.md) | Why plsdo does not impute, and what to do instead |
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## Contributing
|
|
101
|
+
|
|
102
|
+
Issues and pull requests are welcome. Please open an issue before starting significant work.
|
|
103
|
+
|
|
104
|
+
Contact: eilidh [dot] macnicol [at] kcl [dot] ac [dot] uk
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## Licence
|
|
109
|
+
|
|
110
|
+
BSD 3-Clause. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Input Format
|
|
2
|
+
|
|
3
|
+
## Required Files
|
|
4
|
+
|
|
5
|
+
### X Matrix (correlational PLS only)
|
|
6
|
+
|
|
7
|
+
CSV with subjects as rows and features as columns. The first column must be
|
|
8
|
+
the subject identifier.
|
|
9
|
+
|
|
10
|
+
```csv
|
|
11
|
+
subject_id,region_A,region_B,region_C
|
|
12
|
+
sub01,1.23,4.56,7.89
|
|
13
|
+
sub02,2.34,5.67,8.90
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
### Y Matrix
|
|
17
|
+
|
|
18
|
+
Same format as X. Subject IDs must match across files (order does not matter
|
|
19
|
+
— the pipeline will align them).
|
|
20
|
+
|
|
21
|
+
### Demographics
|
|
22
|
+
|
|
23
|
+
CSV with a subject ID column and at least one grouping column.
|
|
24
|
+
|
|
25
|
+
```csv
|
|
26
|
+
subject_id,group,sex,age
|
|
27
|
+
sub01,control,F,25
|
|
28
|
+
sub02,treatment,M,30
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Optional Files
|
|
32
|
+
|
|
33
|
+
### Feature Metadata
|
|
34
|
+
|
|
35
|
+
CSV with a `feature` column matching data column headers, plus category
|
|
36
|
+
columns for plot colour-coding.
|
|
37
|
+
|
|
38
|
+
```csv
|
|
39
|
+
feature,category
|
|
40
|
+
region_A,frontal
|
|
41
|
+
region_B,frontal
|
|
42
|
+
region_C,temporal
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Groups Configuration
|
|
46
|
+
|
|
47
|
+
YAML file for multiple grouping variables. See `docs/usage.md` for examples.
|
|
48
|
+
|
|
49
|
+
## Subject Alignment
|
|
50
|
+
|
|
51
|
+
The pipeline finds the intersection of subject IDs across all input files.
|
|
52
|
+
Subjects present in some files but not others are excluded with a warning.
|
|
53
|
+
If no subjects are shared, the pipeline errors.
|
|
54
|
+
|
|
55
|
+
## Missing Data
|
|
56
|
+
|
|
57
|
+
The pipeline does **not** handle missing data. If any value in X or Y is
|
|
58
|
+
NaN, the pipeline errors and lists which subjects and features are affected.
|
|
59
|
+
|
|
60
|
+
See `docs/missing-data.md` for guidance on how to address this.
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# Interpreting PLS Output
|
|
2
|
+
|
|
3
|
+
## Cross-Correlation Heatmap
|
|
4
|
+
|
|
5
|
+
This matrix shows the Pearson correlation between every feature in X and
|
|
6
|
+
every feature in Y, computed across all subjects. It is the raw input to
|
|
7
|
+
the SVD. Strong positive or negative values indicate features that co-vary
|
|
8
|
+
across subjects.
|
|
9
|
+
|
|
10
|
+
## Singular Values and Permutation Test
|
|
11
|
+
|
|
12
|
+
The SVD breaks the cross-correlation matrix into latent variables (LVs),
|
|
13
|
+
ordered by how much covariance they explain. The singular value for each LV
|
|
14
|
+
quantifies its strength.
|
|
15
|
+
|
|
16
|
+
The permutation test asks: is this singular value larger than we would expect
|
|
17
|
+
if X and Y were unrelated? It shuffles the subject pairing between X and Y
|
|
18
|
+
10,000 times and compares the observed singular value to this null
|
|
19
|
+
distribution.
|
|
20
|
+
|
|
21
|
+
**How to read the plot:** A red line to the right of the grey histogram
|
|
22
|
+
indicates a singular value that exceeds the null distribution — that LV
|
|
23
|
+
captures real covariance, not noise.
|
|
24
|
+
|
|
25
|
+
## Loading Bar Plots
|
|
26
|
+
|
|
27
|
+
For each significant and reliable LV, the loading plots show which features
|
|
28
|
+
contribute most to the pattern. Bars are sorted by absolute loading. The
|
|
29
|
+
red error bars show the bootstrap standard error — they indicate how stable
|
|
30
|
+
each loading is across resampled versions of the data.
|
|
31
|
+
|
|
32
|
+
**Large bars with small error bars** are the features driving the pattern
|
|
33
|
+
reliably. **Large bars with large error bars** may be driven by a few
|
|
34
|
+
outlier subjects.
|
|
35
|
+
|
|
36
|
+
## Bootstrap Ratios
|
|
37
|
+
|
|
38
|
+
The bootstrap ratio is the loading divided by its standard error. It can be
|
|
39
|
+
interpreted like a z-score: absolute values above 1.96 indicate that a feature's
|
|
40
|
+
contribution is reliable at the 95% confidence level.
|
|
41
|
+
|
|
42
|
+
## Subject Scores
|
|
43
|
+
|
|
44
|
+
Subject scores show how strongly each subject expresses a given LV pattern.
|
|
45
|
+
The X scores (XU) project each subject onto the X-side pattern; the Y
|
|
46
|
+
scores (YV') project onto the Y-side pattern.
|
|
47
|
+
|
|
48
|
+
**Box/strip plots** show how scores distribute across groups. If an LV
|
|
49
|
+
captures a group difference, the boxes will separate.
|
|
50
|
+
|
|
51
|
+
**Score scatter plots** (correlational PLS only) show the relationship
|
|
52
|
+
between X and Y scores. If the PLS pattern is strong, subjects should fall
|
|
53
|
+
along a diagonal. Group-specific linear fits reveal whether the X–Y
|
|
54
|
+
relationship differs by group.
|
|
55
|
+
|
|
56
|
+
## Cross-Validation (Discriminatory PLS)
|
|
57
|
+
|
|
58
|
+
Cross-validation tests whether the group discrimination holds on unseen
|
|
59
|
+
subjects. The fold accuracy histogram shows per-fold classification
|
|
60
|
+
accuracy, while the confusion matrix shows which groups are well-separated
|
|
61
|
+
and which are confused.
|
|
62
|
+
|
|
63
|
+
The permutation test of CV accuracy answers: is the observed accuracy
|
|
64
|
+
significantly better than chance? A p-value below 0.05 indicates that
|
|
65
|
+
the model generalises beyond the training data.
|
|
66
|
+
|
|
67
|
+
**Important:** do not select the number of components based on `plsdo run`
|
|
68
|
+
results and then feed that into cross-validation. This introduces
|
|
69
|
+
circularity. Use all components (the default) or use nested
|
|
70
|
+
cross-validation.
|