datamanifestpy 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ # These are supported funding model platforms
2
+
3
+ github: perrette # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
4
+ patreon: # Replace with a single Patreon username
5
+ open_collective: # Replace with a single Open Collective username
6
+ ko_fi: # Replace with a single Ko-fi username
7
+ tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8
+ community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9
+ liberapay: # Replace with a single Liberapay username
10
+ issuehunt: # Replace with a single IssueHunt username
11
+ lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
12
+ polar: # Replace with a single Polar username
13
+ buy_me_a_coffee: # Replace with a single Buy Me a Coffee username
14
+ thanks_dev: # Replace with a single thanks.dev username
15
+ custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
@@ -0,0 +1,91 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ tags: ["v*"]
7
+ pull_request:
8
+ branches: [main]
9
+
10
+ jobs:
11
+ test:
12
+ runs-on: ubuntu-latest
13
+ strategy:
14
+ matrix:
15
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install dependencies
25
+ run: pip install -e ".[dev]"
26
+
27
+ - name: Run tests
28
+ run: pytest -x -q
29
+
30
+ - name: Smoke-test CLI
31
+ run: datamanifest --version
32
+
33
+ build:
34
+ # Build the distribution once on a version tag, after the full test matrix passes.
35
+ needs: test
36
+ if: startsWith(github.ref, 'refs/tags/v')
37
+ runs-on: ubuntu-latest
38
+ steps:
39
+ - uses: actions/checkout@v4
40
+ with:
41
+ fetch-depth: 0 # setuptools_scm needs full history/tags for the version
42
+
43
+ - name: Set up Python
44
+ uses: actions/setup-python@v5
45
+ with:
46
+ python-version: "3.12"
47
+
48
+ - name: Build sdist and wheel
49
+ run: |
50
+ pip install build
51
+ python -m build
52
+
53
+ - name: Upload distribution artifacts
54
+ uses: actions/upload-artifact@v4
55
+ with:
56
+ name: dist
57
+ path: dist/
58
+
59
+ publish:
60
+ needs: build
61
+ runs-on: ubuntu-latest
62
+ environment: pypi
63
+ permissions:
64
+ id-token: write # required for PyPI Trusted Publishing (OIDC)
65
+ steps:
66
+ - name: Download distribution artifacts
67
+ uses: actions/download-artifact@v4
68
+ with:
69
+ name: dist
70
+ path: dist/
71
+
72
+ - name: Publish to PyPI
73
+ uses: pypa/gh-action-pypi-publish@release/v1
74
+
75
+ github-release:
76
+ needs: publish
77
+ runs-on: ubuntu-latest
78
+ permissions:
79
+ contents: write # required to create a GitHub Release
80
+ steps:
81
+ - name: Download distribution artifacts
82
+ uses: actions/download-artifact@v4
83
+ with:
84
+ name: dist
85
+ path: dist/
86
+
87
+ - name: Create GitHub Release
88
+ uses: softprops/action-gh-release@v2
89
+ with:
90
+ files: dist/*
91
+ generate_release_notes: true
@@ -0,0 +1,213 @@
1
+ datamanifest/_version.py
2
+
3
+ # Autonomous roadmap workflows (local coordination artifacts; never committed)
4
+ /workflows/
5
+ /.worktrees/
6
+
7
+ # Byte-compiled / optimized / DLL files
8
+ __pycache__/
9
+ *.py[codz]
10
+ *$py.class
11
+
12
+ # C extensions
13
+ *.so
14
+
15
+ # Distribution / packaging
16
+ .Python
17
+ build/
18
+ develop-eggs/
19
+ dist/
20
+ downloads/
21
+ eggs/
22
+ .eggs/
23
+ lib/
24
+ lib64/
25
+ parts/
26
+ sdist/
27
+ var/
28
+ wheels/
29
+ share/python-wheels/
30
+ *.egg-info/
31
+ .installed.cfg
32
+ *.egg
33
+ MANIFEST
34
+
35
+ # PyInstaller
36
+ # Usually these files are written by a python script from a template
37
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
38
+ *.manifest
39
+ *.spec
40
+
41
+ # Installer logs
42
+ pip-log.txt
43
+ pip-delete-this-directory.txt
44
+
45
+ # Unit test / coverage reports
46
+ htmlcov/
47
+ .tox/
48
+ .nox/
49
+ .coverage
50
+ .coverage.*
51
+ .cache
52
+ nosetests.xml
53
+ coverage.xml
54
+ *.cover
55
+ *.py.cover
56
+ .hypothesis/
57
+ .pytest_cache/
58
+ cover/
59
+
60
+ # Translations
61
+ *.mo
62
+ *.pot
63
+
64
+ # Django stuff:
65
+ *.log
66
+ local_settings.py
67
+ db.sqlite3
68
+ db.sqlite3-journal
69
+
70
+ # Flask stuff:
71
+ instance/
72
+ .webassets-cache
73
+
74
+ # Scrapy stuff:
75
+ .scrapy
76
+
77
+ # Sphinx documentation
78
+ docs/_build/
79
+
80
+ # PyBuilder
81
+ .pybuilder/
82
+ target/
83
+
84
+ # Jupyter Notebook
85
+ .ipynb_checkpoints
86
+
87
+ # IPython
88
+ profile_default/
89
+ ipython_config.py
90
+
91
+ # pyenv
92
+ # For a library or package, you might want to ignore these files since the code is
93
+ # intended to run in multiple environments; otherwise, check them in:
94
+ # .python-version
95
+
96
+ # pipenv
97
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
98
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
99
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
100
+ # install all needed dependencies.
101
+ #Pipfile.lock
102
+
103
+ # UV
104
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ #uv.lock
108
+
109
+ # poetry
110
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
111
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
112
+ # commonly ignored for libraries.
113
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
114
+ #poetry.lock
115
+ #poetry.toml
116
+
117
+ # pdm
118
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
119
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
120
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
121
+ #pdm.lock
122
+ #pdm.toml
123
+ .pdm-python
124
+ .pdm-build/
125
+
126
+ # pixi
127
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
128
+ #pixi.lock
129
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
130
+ # in the .venv directory. It is recommended not to include this directory in version control.
131
+ .pixi
132
+
133
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
134
+ __pypackages__/
135
+
136
+ # Celery stuff
137
+ celerybeat-schedule
138
+ celerybeat.pid
139
+
140
+ # SageMath parsed files
141
+ *.sage.py
142
+
143
+ # Environments
144
+ .env
145
+ .envrc
146
+ .venv
147
+ env/
148
+ venv/
149
+ ENV/
150
+ env.bak/
151
+ venv.bak/
152
+
153
+ # Spyder project settings
154
+ .spyderproject
155
+ .spyproject
156
+
157
+ # Rope project settings
158
+ .ropeproject
159
+
160
+ # mkdocs documentation
161
+ /site
162
+
163
+ # mypy
164
+ .mypy_cache/
165
+ .dmypy.json
166
+ dmypy.json
167
+
168
+ # Pyre type checker
169
+ .pyre/
170
+
171
+ # pytype static type analyzer
172
+ .pytype/
173
+
174
+ # Cython debug symbols
175
+ cython_debug/
176
+
177
+ # PyCharm
178
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
179
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
180
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
181
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
182
+ #.idea/
183
+
184
+ # Abstra
185
+ # Abstra is an AI-powered process automation framework.
186
+ # Ignore directories containing user credentials, local state, and settings.
187
+ # Learn more at https://abstra.io/docs
188
+ .abstra/
189
+
190
+ # Visual Studio Code
191
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
192
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
193
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
194
+ # you could uncomment the following to ignore the entire vscode folder
195
+ # .vscode/
196
+
197
+ # Ruff stuff:
198
+ .ruff_cache/
199
+
200
+ # PyPI configuration file
201
+ .pypirc
202
+
203
+ # Cursor
204
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
205
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
206
+ # refer to https://docs.cursor.com/context/ignore-files
207
+ .cursorignore
208
+ .cursorindexingignore
209
+
210
+ # Marimo
211
+ marimo/_static/
212
+ marimo/_lsp/
213
+ __marimo__/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Mahé Perrette
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,179 @@
1
+ Metadata-Version: 2.4
2
+ Name: datamanifestpy
3
+ Version: 0.1.0
4
+ Summary: Python port of DataManifest.jl — declare and manage data dependencies for scientific projects
5
+ Author-email: Mahé Perrette <mahe.perrette@gmail.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Mahé Perrette
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Requires-Python: >=3.10
29
+ Description-Content-Type: text/markdown
30
+ License-File: LICENSE
31
+ Requires-Dist: httpx
32
+ Requires-Dist: tqdm
33
+ Requires-Dist: tomli_w
34
+ Requires-Dist: tomli; python_version < "3.11"
35
+ Provides-Extra: csv
36
+ Requires-Dist: pandas; extra == "csv"
37
+ Provides-Extra: parquet
38
+ Requires-Dist: pandas; extra == "parquet"
39
+ Requires-Dist: pyarrow; extra == "parquet"
40
+ Provides-Extra: nc
41
+ Requires-Dist: xarray; extra == "nc"
42
+ Requires-Dist: netcdf4; extra == "nc"
43
+ Provides-Extra: yaml
44
+ Requires-Dist: pyyaml; extra == "yaml"
45
+ Provides-Extra: all
46
+ Requires-Dist: pandas; extra == "all"
47
+ Requires-Dist: pyarrow; extra == "all"
48
+ Requires-Dist: xarray; extra == "all"
49
+ Requires-Dist: netcdf4; extra == "all"
50
+ Requires-Dist: pyyaml; extra == "all"
51
+ Provides-Extra: dev
52
+ Requires-Dist: pytest; extra == "dev"
53
+ Dynamic: license-file
54
+
55
+ # datamanifest
56
+
57
+ [![CI](https://github.com/perrette/datamanifest/actions/workflows/ci.yaml/badge.svg)](https://github.com/perrette/datamanifest/actions/workflows/ci.yaml)
58
+
59
+ Keep track of datasets used in a scientific project.
60
+
61
+ `datamanifest` provides a simple way to declare data dependencies — URLs, git repositories, checksums, formats — in a `datasets.toml` file, and handles download, verification, extraction, and loading. It is a Python port of [`DataManifest.jl`](https://github.com/awi-esc/DataManifest.jl) (same author), with the same manifest format and feature surface.
62
+
63
+ ## Installation
64
+
65
+ ```bash
66
+ pip install datamanifestpy
67
+ ```
68
+
69
+ With optional loader backends:
70
+
71
+ ```bash
72
+ pip install "datamanifestpy[csv]" # pandas CSV
73
+ pip install "datamanifestpy[parquet]" # pandas + pyarrow
74
+ pip install "datamanifestpy[nc]" # xarray + netcdf4
75
+ pip install "datamanifestpy[yaml]" # pyyaml
76
+ pip install "datamanifestpy[all]" # all of the above
77
+ ```
78
+
79
+ ## API quickstart
80
+
81
+ ```python
82
+ import datamanifest
83
+
84
+ # Add a dataset (registers + downloads + auto-fills sha256)
85
+ datamanifest.add(
86
+ "https://github.com/jesstierney/lgmDA/archive/refs/tags/v2.1.zip",
87
+ name="jesstierney/lgmDA",
88
+ extract=True,
89
+ )
90
+
91
+ # Resolve the on-disk path
92
+ path = datamanifest.get_dataset_path("jesstierney/lgmDA")
93
+
94
+ # Download and load in one step
95
+ ds = datamanifest.load_dataset("my_nc_entry") # returns xarray.Dataset for nc format
96
+
97
+ # Explicit database (no pyproject.toml / env-var lookup)
98
+ db = datamanifest.Database("datasets.toml", "my-data-folder")
99
+ datamanifest.add(db, "https://zenodo.org/record/.../file.csv")
100
+ path = datamanifest.get_dataset_path(db, "file")
101
+ ```
102
+
103
+ The module-level functions (`add`, `download_dataset`, `load_dataset`, `get_dataset_path`, …) look up a process-wide default `Database` via `pyproject.toml` discovery, the `DATAMANIFEST_TOML` / `DATASETS_TOML` environment variables, or a `datasets.toml` / `datamanifest.toml` file in the working tree. Pass an explicit `db` as the first argument to bypass auto-discovery.
104
+
105
+ ## CLI usage
106
+
107
+ ```
108
+ datamanifest COMMAND [OPTIONS]
109
+ ```
110
+
111
+ | Command | Description |
112
+ |---|---|
113
+ | `list [--present\|--missing\|--all]` | List datasets; default shows present first, then missing |
114
+ | `download [NAME ...] [--all] [--overwrite]` | Download specific datasets or all of them |
115
+ | `path NAME` | Print the resolved on-disk path (composable in shell) |
116
+ | `add URI [--name N] [--no-download] [--extract]` | Register and (by default) download a dataset |
117
+ | `remove NAME [--keep-cache]` | Delete an entry, optionally preserving cached files |
118
+ | `show NAME` | Print full entry detail in TOML style |
119
+ | `verify [NAME ...]` | Re-check sha256 checksums; exits nonzero on any mismatch |
120
+ | `init [--folder PATH] [--force]` | Create a fresh `datasets.toml` in the current directory |
121
+ | `where` | Print active `datasets_toml` and `datasets_folder` paths |
122
+
123
+ Examples:
124
+
125
+ ```bash
126
+ # Set up a new project
127
+ datamanifest init
128
+
129
+ # Add and download a dataset
130
+ datamanifest add "https://zenodo.org/record/.../file.zip" --extract
131
+
132
+ # Use the path in a shell pipeline
133
+ python analysis.py --data "$(datamanifest path file)"
134
+
135
+ # Verify all checksums before a paper submission
136
+ datamanifest verify
137
+
138
+ # Where is the active manifest?
139
+ datamanifest where
140
+ ```
141
+
142
+ ## Features
143
+
144
+ | Feature | Supported |
145
+ |---|---|
146
+ | HTTP / HTTPS download with progress | yes |
147
+ | Partial-download resume (Range header) | yes |
148
+ | git clone (`git://`, `ssh+git://`, `*.git`) | yes |
149
+ | SSH / rsync (`ssh://`, `sshfs://`, `rsync://`) | yes |
150
+ | Local file copy (`file://`) | yes |
151
+ | Multi-URI batch entries (`uris=`) | yes |
152
+ | SHA-256 checksum verification + auto-fill | yes |
153
+ | ZIP / tar / tar.gz extraction | yes |
154
+ | `requires=` dependency graph (topological order) | yes |
155
+ | Shell template hook (`shell=`) | yes |
156
+ | Python entry-point hook (`python=`) | yes |
157
+ | Named + default loaders (csv, parquet, nc, json, yaml, toml, zip, tar) | yes |
158
+ | TOML manifest round-trip (read `tomllib`, write `tomli_w`) | yes |
159
+ | Project-root auto-discovery (`pyproject.toml` walk, env vars) | yes |
160
+ | CLI (`datamanifest list/download/path/add/remove/show/verify/init/where`) | yes |
161
+
162
+ ## Python adaptations
163
+
164
+ The Python port uses the same `datasets.toml` format as `DataManifest.jl`. Three fields differ:
165
+
166
+ - **`python=`** replaces `julia=`: an entry-point reference (`"pkg.mod:func"`) resolved via `importlib`. The callable receives keyword arguments `(download_path, project_root, entry, uri, key, version, doi, format, branch, requires_paths)`. No inline code execution (`exec`/`eval`) anywhere.
167
+ - **`callable=`** is an alias for `python=` accepted on read and normalized to `python=` on write. Intended for single-language projects that want a language-agnostic key.
168
+ - **`python_includes=`** is a list of directory paths prepended to `sys.path` during loader resolution (replaces `julia_modules`).
169
+
170
+ A single `datasets.toml` can be consumed by both tools: each reads the common fields and ignores the other's extension keys. The shared schema is documented at [perrette/datamanifest.toml](https://github.com/perrette/datamanifest.toml).
171
+
172
+ ## Related projects
173
+
174
+ - [`awi-esc/DataManifest.jl`](https://github.com/awi-esc/DataManifest.jl) — the Julia implementation this port is based on.
175
+ - [`perrette/datamanifest.toml`](https://github.com/perrette/datamanifest.toml) — the shared TOML schema spec consumed by both implementations.
176
+
177
+ ## Acknowledgments
178
+
179
+ `datamanifest` is a Python port of [`awi-esc/DataManifest.jl`](https://github.com/awi-esc/DataManifest.jl), written by the same author (Mahé Perrette). The Python port was implemented with assistance from [Anthropic's Claude](https://www.anthropic.com/claude).
@@ -0,0 +1,125 @@
1
+ # datamanifest
2
+
3
+ [![CI](https://github.com/perrette/datamanifest/actions/workflows/ci.yaml/badge.svg)](https://github.com/perrette/datamanifest/actions/workflows/ci.yaml)
4
+
5
+ Keep track of datasets used in a scientific project.
6
+
7
+ `datamanifest` provides a simple way to declare data dependencies — URLs, git repositories, checksums, formats — in a `datasets.toml` file, and handles download, verification, extraction, and loading. It is a Python port of [`DataManifest.jl`](https://github.com/awi-esc/DataManifest.jl) (same author), with the same manifest format and feature surface.
8
+
9
+ ## Installation
10
+
11
+ ```bash
12
+ pip install datamanifestpy
13
+ ```
14
+
15
+ With optional loader backends:
16
+
17
+ ```bash
18
+ pip install "datamanifestpy[csv]" # pandas CSV
19
+ pip install "datamanifestpy[parquet]" # pandas + pyarrow
20
+ pip install "datamanifestpy[nc]" # xarray + netcdf4
21
+ pip install "datamanifestpy[yaml]" # pyyaml
22
+ pip install "datamanifestpy[all]" # all of the above
23
+ ```
24
+
25
+ ## API quickstart
26
+
27
+ ```python
28
+ import datamanifest
29
+
30
+ # Add a dataset (registers + downloads + auto-fills sha256)
31
+ datamanifest.add(
32
+ "https://github.com/jesstierney/lgmDA/archive/refs/tags/v2.1.zip",
33
+ name="jesstierney/lgmDA",
34
+ extract=True,
35
+ )
36
+
37
+ # Resolve the on-disk path
38
+ path = datamanifest.get_dataset_path("jesstierney/lgmDA")
39
+
40
+ # Download and load in one step
41
+ ds = datamanifest.load_dataset("my_nc_entry") # returns xarray.Dataset for nc format
42
+
43
+ # Explicit database (no pyproject.toml / env-var lookup)
44
+ db = datamanifest.Database("datasets.toml", "my-data-folder")
45
+ datamanifest.add(db, "https://zenodo.org/record/.../file.csv")
46
+ path = datamanifest.get_dataset_path(db, "file")
47
+ ```
48
+
49
+ The module-level functions (`add`, `download_dataset`, `load_dataset`, `get_dataset_path`, …) look up a process-wide default `Database` via `pyproject.toml` discovery, the `DATAMANIFEST_TOML` / `DATASETS_TOML` environment variables, or a `datasets.toml` / `datamanifest.toml` file in the working tree. Pass an explicit `db` as the first argument to bypass auto-discovery.
50
+
51
+ ## CLI usage
52
+
53
+ ```
54
+ datamanifest COMMAND [OPTIONS]
55
+ ```
56
+
57
+ | Command | Description |
58
+ |---|---|
59
+ | `list [--present\|--missing\|--all]` | List datasets; default shows present first, then missing |
60
+ | `download [NAME ...] [--all] [--overwrite]` | Download specific datasets or all of them |
61
+ | `path NAME` | Print the resolved on-disk path (composable in shell) |
62
+ | `add URI [--name N] [--no-download] [--extract]` | Register and (by default) download a dataset |
63
+ | `remove NAME [--keep-cache]` | Delete an entry, optionally preserving cached files |
64
+ | `show NAME` | Print full entry detail in TOML style |
65
+ | `verify [NAME ...]` | Re-check sha256 checksums; exits nonzero on any mismatch |
66
+ | `init [--folder PATH] [--force]` | Create a fresh `datasets.toml` in the current directory |
67
+ | `where` | Print active `datasets_toml` and `datasets_folder` paths |
68
+
69
+ Examples:
70
+
71
+ ```bash
72
+ # Set up a new project
73
+ datamanifest init
74
+
75
+ # Add and download a dataset
76
+ datamanifest add "https://zenodo.org/record/.../file.zip" --extract
77
+
78
+ # Use the path in a shell pipeline
79
+ python analysis.py --data "$(datamanifest path file)"
80
+
81
+ # Verify all checksums before a paper submission
82
+ datamanifest verify
83
+
84
+ # Where is the active manifest?
85
+ datamanifest where
86
+ ```
87
+
88
+ ## Features
89
+
90
+ | Feature | Supported |
91
+ |---|---|
92
+ | HTTP / HTTPS download with progress | yes |
93
+ | Partial-download resume (Range header) | yes |
94
+ | git clone (`git://`, `ssh+git://`, `*.git`) | yes |
95
+ | SSH / rsync (`ssh://`, `sshfs://`, `rsync://`) | yes |
96
+ | Local file copy (`file://`) | yes |
97
+ | Multi-URI batch entries (`uris=`) | yes |
98
+ | SHA-256 checksum verification + auto-fill | yes |
99
+ | ZIP / tar / tar.gz extraction | yes |
100
+ | `requires=` dependency graph (topological order) | yes |
101
+ | Shell template hook (`shell=`) | yes |
102
+ | Python entry-point hook (`python=`) | yes |
103
+ | Named + default loaders (csv, parquet, nc, json, yaml, toml, zip, tar) | yes |
104
+ | TOML manifest round-trip (read `tomllib`, write `tomli_w`) | yes |
105
+ | Project-root auto-discovery (`pyproject.toml` walk, env vars) | yes |
106
+ | CLI (`datamanifest list/download/path/add/remove/show/verify/init/where`) | yes |
107
+
108
+ ## Python adaptations
109
+
110
+ The Python port uses the same `datasets.toml` format as `DataManifest.jl`. Three fields differ:
111
+
112
+ - **`python=`** replaces `julia=`: an entry-point reference (`"pkg.mod:func"`) resolved via `importlib`. The callable receives keyword arguments `(download_path, project_root, entry, uri, key, version, doi, format, branch, requires_paths)`. No inline code execution (`exec`/`eval`) anywhere.
113
+ - **`callable=`** is an alias for `python=` accepted on read and normalized to `python=` on write. Intended for single-language projects that want a language-agnostic key.
114
+ - **`python_includes=`** is a list of directory paths prepended to `sys.path` during loader resolution (replaces `julia_modules`).
115
+
116
+ A single `datasets.toml` can be consumed by both tools: each reads the common fields and ignores the other's extension keys. The shared schema is documented at [perrette/datamanifest.toml](https://github.com/perrette/datamanifest.toml).
117
+
118
+ ## Related projects
119
+
120
+ - [`awi-esc/DataManifest.jl`](https://github.com/awi-esc/DataManifest.jl) — the Julia implementation this port is based on.
121
+ - [`perrette/datamanifest.toml`](https://github.com/perrette/datamanifest.toml) — the shared TOML schema spec consumed by both implementations.
122
+
123
+ ## Acknowledgments
124
+
125
+ `datamanifest` is a Python port of [`awi-esc/DataManifest.jl`](https://github.com/awi-esc/DataManifest.jl), written by the same author (Mahé Perrette). The Python port was implemented with assistance from [Anthropic's Claude](https://www.anthropic.com/claude).
@@ -0,0 +1,38 @@
1
+ try:
2
+ from ._version import __version__
3
+ except ImportError:
4
+ __version__ = "unknown"
5
+
6
+ from .database import (
7
+ Database,
8
+ DatasetEntry,
9
+ delete_dataset as _delete_dataset_db,
10
+ get_default_database,
11
+ validate_loader,
12
+ validate_loaders,
13
+ )
14
+ from .pipelines import (
15
+ _module_add as add,
16
+ _module_delete_dataset as delete_dataset,
17
+ _module_download_dataset as download_dataset,
18
+ _module_download_datasets as download_datasets,
19
+ _module_get_dataset_path as get_dataset_path,
20
+ _module_load_dataset as load_dataset,
21
+ _module_register_dataset as register_dataset,
22
+ )
23
+
24
+ __all__ = [
25
+ "__version__",
26
+ "Database",
27
+ "DatasetEntry",
28
+ "add",
29
+ "delete_dataset",
30
+ "download_dataset",
31
+ "download_datasets",
32
+ "get_dataset_path",
33
+ "get_default_database",
34
+ "load_dataset",
35
+ "register_dataset",
36
+ "validate_loader",
37
+ "validate_loaders",
38
+ ]