datamanifestpy 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamanifestpy-0.1.0/.github/FUNDING.yml +15 -0
- datamanifestpy-0.1.0/.github/workflows/ci.yaml +91 -0
- datamanifestpy-0.1.0/.gitignore +213 -0
- datamanifestpy-0.1.0/LICENSE +21 -0
- datamanifestpy-0.1.0/PKG-INFO +179 -0
- datamanifestpy-0.1.0/README.md +125 -0
- datamanifestpy-0.1.0/datamanifest/__init__.py +38 -0
- datamanifestpy-0.1.0/datamanifest/_version.py +24 -0
- datamanifestpy-0.1.0/datamanifest/cli.py +296 -0
- datamanifestpy-0.1.0/datamanifest/config.py +105 -0
- datamanifestpy-0.1.0/datamanifest/database.py +806 -0
- datamanifestpy-0.1.0/datamanifest/default_loaders.py +138 -0
- datamanifestpy-0.1.0/datamanifest/pipelines.py +714 -0
- datamanifestpy-0.1.0/datamanifestpy.egg-info/PKG-INFO +179 -0
- datamanifestpy-0.1.0/datamanifestpy.egg-info/SOURCES.txt +25 -0
- datamanifestpy-0.1.0/datamanifestpy.egg-info/dependency_links.txt +1 -0
- datamanifestpy-0.1.0/datamanifestpy.egg-info/entry_points.txt +2 -0
- datamanifestpy-0.1.0/datamanifestpy.egg-info/requires.txt +30 -0
- datamanifestpy-0.1.0/datamanifestpy.egg-info/top_level.txt +1 -0
- datamanifestpy-0.1.0/datasets.toml +20 -0
- datamanifestpy-0.1.0/docs/datamanifest-toml.md +19 -0
- datamanifestpy-0.1.0/pyproject.toml +42 -0
- datamanifestpy-0.1.0/setup.cfg +4 -0
- datamanifestpy-0.1.0/tests/helpers/__init__.py +0 -0
- datamanifestpy-0.1.0/tests/helpers/loaders.py +17 -0
- datamanifestpy-0.1.0/tests/test_basic.py +1108 -0
- datamanifestpy-0.1.0/tests/test_cli.py +97 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# These are supported funding model platforms
|
|
2
|
+
|
|
3
|
+
github: perrette # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
|
|
4
|
+
patreon: # Replace with a single Patreon username
|
|
5
|
+
open_collective: # Replace with a single Open Collective username
|
|
6
|
+
ko_fi: # Replace with a single Ko-fi username
|
|
7
|
+
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
|
|
8
|
+
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
|
|
9
|
+
liberapay: # Replace with a single Liberapay username
|
|
10
|
+
issuehunt: # Replace with a single IssueHunt username
|
|
11
|
+
lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
|
|
12
|
+
polar: # Replace with a single Polar username
|
|
13
|
+
buy_me_a_coffee: # Replace with a single Buy Me a Coffee username
|
|
14
|
+
thanks_dev: # Replace with a single thanks.dev username
|
|
15
|
+
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
tags: ["v*"]
|
|
7
|
+
pull_request:
|
|
8
|
+
branches: [main]
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
test:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
strategy:
|
|
14
|
+
matrix:
|
|
15
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Install dependencies
|
|
25
|
+
run: pip install -e ".[dev]"
|
|
26
|
+
|
|
27
|
+
- name: Run tests
|
|
28
|
+
run: pytest -x -q
|
|
29
|
+
|
|
30
|
+
- name: Smoke-test CLI
|
|
31
|
+
run: datamanifest --version
|
|
32
|
+
|
|
33
|
+
build:
|
|
34
|
+
# Build the distribution once on a version tag, after the full test matrix passes.
|
|
35
|
+
needs: test
|
|
36
|
+
if: startsWith(github.ref, 'refs/tags/v')
|
|
37
|
+
runs-on: ubuntu-latest
|
|
38
|
+
steps:
|
|
39
|
+
- uses: actions/checkout@v4
|
|
40
|
+
with:
|
|
41
|
+
fetch-depth: 0 # setuptools_scm needs full history/tags for the version
|
|
42
|
+
|
|
43
|
+
- name: Set up Python
|
|
44
|
+
uses: actions/setup-python@v5
|
|
45
|
+
with:
|
|
46
|
+
python-version: "3.12"
|
|
47
|
+
|
|
48
|
+
- name: Build sdist and wheel
|
|
49
|
+
run: |
|
|
50
|
+
pip install build
|
|
51
|
+
python -m build
|
|
52
|
+
|
|
53
|
+
- name: Upload distribution artifacts
|
|
54
|
+
uses: actions/upload-artifact@v4
|
|
55
|
+
with:
|
|
56
|
+
name: dist
|
|
57
|
+
path: dist/
|
|
58
|
+
|
|
59
|
+
publish:
|
|
60
|
+
needs: build
|
|
61
|
+
runs-on: ubuntu-latest
|
|
62
|
+
environment: pypi
|
|
63
|
+
permissions:
|
|
64
|
+
id-token: write # required for PyPI Trusted Publishing (OIDC)
|
|
65
|
+
steps:
|
|
66
|
+
- name: Download distribution artifacts
|
|
67
|
+
uses: actions/download-artifact@v4
|
|
68
|
+
with:
|
|
69
|
+
name: dist
|
|
70
|
+
path: dist/
|
|
71
|
+
|
|
72
|
+
- name: Publish to PyPI
|
|
73
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
74
|
+
|
|
75
|
+
github-release:
|
|
76
|
+
needs: publish
|
|
77
|
+
runs-on: ubuntu-latest
|
|
78
|
+
permissions:
|
|
79
|
+
contents: write # required to create a GitHub Release
|
|
80
|
+
steps:
|
|
81
|
+
- name: Download distribution artifacts
|
|
82
|
+
uses: actions/download-artifact@v4
|
|
83
|
+
with:
|
|
84
|
+
name: dist
|
|
85
|
+
path: dist/
|
|
86
|
+
|
|
87
|
+
- name: Create GitHub Release
|
|
88
|
+
uses: softprops/action-gh-release@v2
|
|
89
|
+
with:
|
|
90
|
+
files: dist/*
|
|
91
|
+
generate_release_notes: true
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
datamanifest/_version.py
|
|
2
|
+
|
|
3
|
+
# Autonomous roadmap workflows (local coordination artifacts; never committed)
|
|
4
|
+
/workflows/
|
|
5
|
+
/.worktrees/
|
|
6
|
+
|
|
7
|
+
# Byte-compiled / optimized / DLL files
|
|
8
|
+
__pycache__/
|
|
9
|
+
*.py[codz]
|
|
10
|
+
*$py.class
|
|
11
|
+
|
|
12
|
+
# C extensions
|
|
13
|
+
*.so
|
|
14
|
+
|
|
15
|
+
# Distribution / packaging
|
|
16
|
+
.Python
|
|
17
|
+
build/
|
|
18
|
+
develop-eggs/
|
|
19
|
+
dist/
|
|
20
|
+
downloads/
|
|
21
|
+
eggs/
|
|
22
|
+
.eggs/
|
|
23
|
+
lib/
|
|
24
|
+
lib64/
|
|
25
|
+
parts/
|
|
26
|
+
sdist/
|
|
27
|
+
var/
|
|
28
|
+
wheels/
|
|
29
|
+
share/python-wheels/
|
|
30
|
+
*.egg-info/
|
|
31
|
+
.installed.cfg
|
|
32
|
+
*.egg
|
|
33
|
+
MANIFEST
|
|
34
|
+
|
|
35
|
+
# PyInstaller
|
|
36
|
+
# Usually these files are written by a python script from a template
|
|
37
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
38
|
+
*.manifest
|
|
39
|
+
*.spec
|
|
40
|
+
|
|
41
|
+
# Installer logs
|
|
42
|
+
pip-log.txt
|
|
43
|
+
pip-delete-this-directory.txt
|
|
44
|
+
|
|
45
|
+
# Unit test / coverage reports
|
|
46
|
+
htmlcov/
|
|
47
|
+
.tox/
|
|
48
|
+
.nox/
|
|
49
|
+
.coverage
|
|
50
|
+
.coverage.*
|
|
51
|
+
.cache
|
|
52
|
+
nosetests.xml
|
|
53
|
+
coverage.xml
|
|
54
|
+
*.cover
|
|
55
|
+
*.py.cover
|
|
56
|
+
.hypothesis/
|
|
57
|
+
.pytest_cache/
|
|
58
|
+
cover/
|
|
59
|
+
|
|
60
|
+
# Translations
|
|
61
|
+
*.mo
|
|
62
|
+
*.pot
|
|
63
|
+
|
|
64
|
+
# Django stuff:
|
|
65
|
+
*.log
|
|
66
|
+
local_settings.py
|
|
67
|
+
db.sqlite3
|
|
68
|
+
db.sqlite3-journal
|
|
69
|
+
|
|
70
|
+
# Flask stuff:
|
|
71
|
+
instance/
|
|
72
|
+
.webassets-cache
|
|
73
|
+
|
|
74
|
+
# Scrapy stuff:
|
|
75
|
+
.scrapy
|
|
76
|
+
|
|
77
|
+
# Sphinx documentation
|
|
78
|
+
docs/_build/
|
|
79
|
+
|
|
80
|
+
# PyBuilder
|
|
81
|
+
.pybuilder/
|
|
82
|
+
target/
|
|
83
|
+
|
|
84
|
+
# Jupyter Notebook
|
|
85
|
+
.ipynb_checkpoints
|
|
86
|
+
|
|
87
|
+
# IPython
|
|
88
|
+
profile_default/
|
|
89
|
+
ipython_config.py
|
|
90
|
+
|
|
91
|
+
# pyenv
|
|
92
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
93
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
94
|
+
# .python-version
|
|
95
|
+
|
|
96
|
+
# pipenv
|
|
97
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
98
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
99
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
100
|
+
# install all needed dependencies.
|
|
101
|
+
#Pipfile.lock
|
|
102
|
+
|
|
103
|
+
# UV
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
#uv.lock
|
|
108
|
+
|
|
109
|
+
# poetry
|
|
110
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
111
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
112
|
+
# commonly ignored for libraries.
|
|
113
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
114
|
+
#poetry.lock
|
|
115
|
+
#poetry.toml
|
|
116
|
+
|
|
117
|
+
# pdm
|
|
118
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
119
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
120
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
121
|
+
#pdm.lock
|
|
122
|
+
#pdm.toml
|
|
123
|
+
.pdm-python
|
|
124
|
+
.pdm-build/
|
|
125
|
+
|
|
126
|
+
# pixi
|
|
127
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
128
|
+
#pixi.lock
|
|
129
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
130
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
131
|
+
.pixi
|
|
132
|
+
|
|
133
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
134
|
+
__pypackages__/
|
|
135
|
+
|
|
136
|
+
# Celery stuff
|
|
137
|
+
celerybeat-schedule
|
|
138
|
+
celerybeat.pid
|
|
139
|
+
|
|
140
|
+
# SageMath parsed files
|
|
141
|
+
*.sage.py
|
|
142
|
+
|
|
143
|
+
# Environments
|
|
144
|
+
.env
|
|
145
|
+
.envrc
|
|
146
|
+
.venv
|
|
147
|
+
env/
|
|
148
|
+
venv/
|
|
149
|
+
ENV/
|
|
150
|
+
env.bak/
|
|
151
|
+
venv.bak/
|
|
152
|
+
|
|
153
|
+
# Spyder project settings
|
|
154
|
+
.spyderproject
|
|
155
|
+
.spyproject
|
|
156
|
+
|
|
157
|
+
# Rope project settings
|
|
158
|
+
.ropeproject
|
|
159
|
+
|
|
160
|
+
# mkdocs documentation
|
|
161
|
+
/site
|
|
162
|
+
|
|
163
|
+
# mypy
|
|
164
|
+
.mypy_cache/
|
|
165
|
+
.dmypy.json
|
|
166
|
+
dmypy.json
|
|
167
|
+
|
|
168
|
+
# Pyre type checker
|
|
169
|
+
.pyre/
|
|
170
|
+
|
|
171
|
+
# pytype static type analyzer
|
|
172
|
+
.pytype/
|
|
173
|
+
|
|
174
|
+
# Cython debug symbols
|
|
175
|
+
cython_debug/
|
|
176
|
+
|
|
177
|
+
# PyCharm
|
|
178
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
179
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
180
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
181
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
182
|
+
#.idea/
|
|
183
|
+
|
|
184
|
+
# Abstra
|
|
185
|
+
# Abstra is an AI-powered process automation framework.
|
|
186
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
187
|
+
# Learn more at https://abstra.io/docs
|
|
188
|
+
.abstra/
|
|
189
|
+
|
|
190
|
+
# Visual Studio Code
|
|
191
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
192
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
193
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
194
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
195
|
+
# .vscode/
|
|
196
|
+
|
|
197
|
+
# Ruff stuff:
|
|
198
|
+
.ruff_cache/
|
|
199
|
+
|
|
200
|
+
# PyPI configuration file
|
|
201
|
+
.pypirc
|
|
202
|
+
|
|
203
|
+
# Cursor
|
|
204
|
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
|
205
|
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
|
206
|
+
# refer to https://docs.cursor.com/context/ignore-files
|
|
207
|
+
.cursorignore
|
|
208
|
+
.cursorindexingignore
|
|
209
|
+
|
|
210
|
+
# Marimo
|
|
211
|
+
marimo/_static/
|
|
212
|
+
marimo/_lsp/
|
|
213
|
+
__marimo__/
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Mahé Perrette
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datamanifestpy
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python port of DataManifest.jl — declare and manage data dependencies for scientific projects
|
|
5
|
+
Author-email: Mahé Perrette <mahe.perrette@gmail.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2025 Mahé Perrette
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Requires-Python: >=3.10
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
License-File: LICENSE
|
|
31
|
+
Requires-Dist: httpx
|
|
32
|
+
Requires-Dist: tqdm
|
|
33
|
+
Requires-Dist: tomli_w
|
|
34
|
+
Requires-Dist: tomli; python_version < "3.11"
|
|
35
|
+
Provides-Extra: csv
|
|
36
|
+
Requires-Dist: pandas; extra == "csv"
|
|
37
|
+
Provides-Extra: parquet
|
|
38
|
+
Requires-Dist: pandas; extra == "parquet"
|
|
39
|
+
Requires-Dist: pyarrow; extra == "parquet"
|
|
40
|
+
Provides-Extra: nc
|
|
41
|
+
Requires-Dist: xarray; extra == "nc"
|
|
42
|
+
Requires-Dist: netcdf4; extra == "nc"
|
|
43
|
+
Provides-Extra: yaml
|
|
44
|
+
Requires-Dist: pyyaml; extra == "yaml"
|
|
45
|
+
Provides-Extra: all
|
|
46
|
+
Requires-Dist: pandas; extra == "all"
|
|
47
|
+
Requires-Dist: pyarrow; extra == "all"
|
|
48
|
+
Requires-Dist: xarray; extra == "all"
|
|
49
|
+
Requires-Dist: netcdf4; extra == "all"
|
|
50
|
+
Requires-Dist: pyyaml; extra == "all"
|
|
51
|
+
Provides-Extra: dev
|
|
52
|
+
Requires-Dist: pytest; extra == "dev"
|
|
53
|
+
Dynamic: license-file
|
|
54
|
+
|
|
55
|
+
# datamanifest
|
|
56
|
+
|
|
57
|
+
[![CI](https://github.com/perrette/datamanifest/actions/workflows/ci.yaml/badge.svg)](https://github.com/perrette/datamanifest/actions/workflows/ci.yaml)
|
|
58
|
+
|
|
59
|
+
Keep track of datasets used in a scientific project.
|
|
60
|
+
|
|
61
|
+
`datamanifest` provides a simple way to declare data dependencies — URLs, git repositories, checksums, formats — in a `datasets.toml` file, and handles download, verification, extraction, and loading. It is a Python port of [`DataManifest.jl`](https://github.com/awi-esc/DataManifest.jl) (same author), with the same manifest format and feature surface.
|
|
62
|
+
|
|
63
|
+
## Installation
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install datamanifestpy
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
With optional loader backends:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install "datamanifestpy[csv]" # pandas CSV
|
|
73
|
+
pip install "datamanifestpy[parquet]" # pandas + pyarrow
|
|
74
|
+
pip install "datamanifestpy[nc]" # xarray + netcdf4
|
|
75
|
+
pip install "datamanifestpy[yaml]" # pyyaml
|
|
76
|
+
pip install "datamanifestpy[all]" # all of the above
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## API quickstart
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
import datamanifest
|
|
83
|
+
|
|
84
|
+
# Add a dataset (registers + downloads + auto-fills sha256)
|
|
85
|
+
datamanifest.add(
|
|
86
|
+
"https://github.com/jesstierney/lgmDA/archive/refs/tags/v2.1.zip",
|
|
87
|
+
name="jesstierney/lgmDA",
|
|
88
|
+
extract=True,
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
# Resolve the on-disk path
|
|
92
|
+
path = datamanifest.get_dataset_path("jesstierney/lgmDA")
|
|
93
|
+
|
|
94
|
+
# Download and load in one step
|
|
95
|
+
ds = datamanifest.load_dataset("my_nc_entry") # returns xarray.Dataset for nc format
|
|
96
|
+
|
|
97
|
+
# Explicit database (no pyproject.toml / env-var lookup)
|
|
98
|
+
db = datamanifest.Database("datasets.toml", "my-data-folder")
|
|
99
|
+
datamanifest.add(db, "https://zenodo.org/record/.../file.csv")
|
|
100
|
+
path = datamanifest.get_dataset_path(db, "file")
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
The module-level functions (`add`, `download_dataset`, `load_dataset`, `get_dataset_path`, …) look up a process-wide default `Database` via `pyproject.toml` discovery, the `DATAMANIFEST_TOML` / `DATASETS_TOML` environment variables, or a `datasets.toml` / `datamanifest.toml` file in the working tree. Pass an explicit `db` as the first argument to bypass auto-discovery.
|
|
104
|
+
|
|
105
|
+
## CLI usage
|
|
106
|
+
|
|
107
|
+
```
|
|
108
|
+
datamanifest COMMAND [OPTIONS]
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
| Command | Description |
|
|
112
|
+
|---|---|
|
|
113
|
+
| `list [--present\|--missing\|--all]` | List datasets; default shows present first, then missing |
|
|
114
|
+
| `download [NAME ...] [--all] [--overwrite]` | Download specific datasets or all of them |
|
|
115
|
+
| `path NAME` | Print the resolved on-disk path (composable in shell) |
|
|
116
|
+
| `add URI [--name N] [--no-download] [--extract]` | Register and (by default) download a dataset |
|
|
117
|
+
| `remove NAME [--keep-cache]` | Delete an entry, optionally preserving cached files |
|
|
118
|
+
| `show NAME` | Print full entry detail in TOML style |
|
|
119
|
+
| `verify [NAME ...]` | Re-check sha256 checksums; exits nonzero on any mismatch |
|
|
120
|
+
| `init [--folder PATH] [--force]` | Create a fresh `datasets.toml` in the current directory |
|
|
121
|
+
| `where` | Print active `datasets_toml` and `datasets_folder` paths |
|
|
122
|
+
|
|
123
|
+
Examples:
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
# Set up a new project
|
|
127
|
+
datamanifest init
|
|
128
|
+
|
|
129
|
+
# Add and download a dataset
|
|
130
|
+
datamanifest add "https://zenodo.org/record/.../file.zip" --extract
|
|
131
|
+
|
|
132
|
+
# Use the path in a shell pipeline
|
|
133
|
+
python analysis.py --data "$(datamanifest path file)"
|
|
134
|
+
|
|
135
|
+
# Verify all checksums before a paper submission
|
|
136
|
+
datamanifest verify
|
|
137
|
+
|
|
138
|
+
# Where is the active manifest?
|
|
139
|
+
datamanifest where
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Features
|
|
143
|
+
|
|
144
|
+
| Feature | Supported |
|
|
145
|
+
|---|---|
|
|
146
|
+
| HTTP / HTTPS download with progress | yes |
|
|
147
|
+
| Partial-download resume (Range header) | yes |
|
|
148
|
+
| git clone (`git://`, `ssh+git://`, `*.git`) | yes |
|
|
149
|
+
| SSH / rsync (`ssh://`, `sshfs://`, `rsync://`) | yes |
|
|
150
|
+
| Local file copy (`file://`) | yes |
|
|
151
|
+
| Multi-URI batch entries (`uris=`) | yes |
|
|
152
|
+
| SHA-256 checksum verification + auto-fill | yes |
|
|
153
|
+
| ZIP / tar / tar.gz extraction | yes |
|
|
154
|
+
| `requires=` dependency graph (topological order) | yes |
|
|
155
|
+
| Shell template hook (`shell=`) | yes |
|
|
156
|
+
| Python entry-point hook (`python=`) | yes |
|
|
157
|
+
| Named + default loaders (csv, parquet, nc, json, yaml, toml, zip, tar) | yes |
|
|
158
|
+
| TOML manifest round-trip (read `tomllib`, write `tomli_w`) | yes |
|
|
159
|
+
| Project-root auto-discovery (`pyproject.toml` walk, env vars) | yes |
|
|
160
|
+
| CLI (`datamanifest list/download/path/add/remove/show/verify/init/where`) | yes |
|
|
161
|
+
|
|
162
|
+
## Python adaptations
|
|
163
|
+
|
|
164
|
+
The Python port uses the same `datasets.toml` format as `DataManifest.jl`. Three fields differ:
|
|
165
|
+
|
|
166
|
+
- **`python=`** replaces `julia=`: an entry-point reference (`"pkg.mod:func"`) resolved via `importlib`. The callable receives keyword arguments `(download_path, project_root, entry, uri, key, version, doi, format, branch, requires_paths)`. No inline code execution (`exec`/`eval`) anywhere.
|
|
167
|
+
- **`callable=`** is an alias for `python=` accepted on read and normalized to `python=` on write. Intended for single-language projects that want a language-agnostic key.
|
|
168
|
+
- **`python_includes=`** is a list of directory paths prepended to `sys.path` during loader resolution (replaces `julia_modules`).
|
|
169
|
+
|
|
170
|
+
A single `datasets.toml` can be consumed by both tools: each reads the common fields and ignores the other's extension keys. The shared schema is documented at [perrette/datamanifest.toml](https://github.com/perrette/datamanifest.toml).
|
|
171
|
+
|
|
172
|
+
## Related projects
|
|
173
|
+
|
|
174
|
+
- [`awi-esc/DataManifest.jl`](https://github.com/awi-esc/DataManifest.jl) — the Julia implementation this port is based on.
|
|
175
|
+
- [`perrette/datamanifest.toml`](https://github.com/perrette/datamanifest.toml) — the shared TOML schema spec consumed by both implementations.
|
|
176
|
+
|
|
177
|
+
## Acknowledgments
|
|
178
|
+
|
|
179
|
+
`datamanifest` is a Python port of [`awi-esc/DataManifest.jl`](https://github.com/awi-esc/DataManifest.jl), written by the same author (Mahé Perrette). The Python port was implemented with assistance from [Anthropic's Claude](https://www.anthropic.com/claude).
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# datamanifest
|
|
2
|
+
|
|
3
|
+
[![CI](https://github.com/perrette/datamanifest/actions/workflows/ci.yaml/badge.svg)](https://github.com/perrette/datamanifest/actions/workflows/ci.yaml)
|
|
4
|
+
|
|
5
|
+
Keep track of datasets used in a scientific project.
|
|
6
|
+
|
|
7
|
+
`datamanifest` provides a simple way to declare data dependencies — URLs, git repositories, checksums, formats — in a `datasets.toml` file, and handles download, verification, extraction, and loading. It is a Python port of [`DataManifest.jl`](https://github.com/awi-esc/DataManifest.jl) (same author), with the same manifest format and feature surface.
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install datamanifestpy
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
With optional loader backends:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install "datamanifestpy[csv]" # pandas CSV
|
|
19
|
+
pip install "datamanifestpy[parquet]" # pandas + pyarrow
|
|
20
|
+
pip install "datamanifestpy[nc]" # xarray + netcdf4
|
|
21
|
+
pip install "datamanifestpy[yaml]" # pyyaml
|
|
22
|
+
pip install "datamanifestpy[all]" # all of the above
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## API quickstart
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
import datamanifest
|
|
29
|
+
|
|
30
|
+
# Add a dataset (registers + downloads + auto-fills sha256)
|
|
31
|
+
datamanifest.add(
|
|
32
|
+
"https://github.com/jesstierney/lgmDA/archive/refs/tags/v2.1.zip",
|
|
33
|
+
name="jesstierney/lgmDA",
|
|
34
|
+
extract=True,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
# Resolve the on-disk path
|
|
38
|
+
path = datamanifest.get_dataset_path("jesstierney/lgmDA")
|
|
39
|
+
|
|
40
|
+
# Download and load in one step
|
|
41
|
+
ds = datamanifest.load_dataset("my_nc_entry") # returns xarray.Dataset for nc format
|
|
42
|
+
|
|
43
|
+
# Explicit database (no pyproject.toml / env-var lookup)
|
|
44
|
+
db = datamanifest.Database("datasets.toml", "my-data-folder")
|
|
45
|
+
datamanifest.add(db, "https://zenodo.org/record/.../file.csv")
|
|
46
|
+
path = datamanifest.get_dataset_path(db, "file")
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
The module-level functions (`add`, `download_dataset`, `load_dataset`, `get_dataset_path`, …) look up a process-wide default `Database` via `pyproject.toml` discovery, the `DATAMANIFEST_TOML` / `DATASETS_TOML` environment variables, or a `datasets.toml` / `datamanifest.toml` file in the working tree. Pass an explicit `db` as the first argument to bypass auto-discovery.
|
|
50
|
+
|
|
51
|
+
## CLI usage
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
datamanifest COMMAND [OPTIONS]
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
| Command | Description |
|
|
58
|
+
|---|---|
|
|
59
|
+
| `list [--present\|--missing\|--all]` | List datasets; default shows present first, then missing |
|
|
60
|
+
| `download [NAME ...] [--all] [--overwrite]` | Download specific datasets or all of them |
|
|
61
|
+
| `path NAME` | Print the resolved on-disk path (composable in shell) |
|
|
62
|
+
| `add URI [--name N] [--no-download] [--extract]` | Register and (by default) download a dataset |
|
|
63
|
+
| `remove NAME [--keep-cache]` | Delete an entry, optionally preserving cached files |
|
|
64
|
+
| `show NAME` | Print full entry detail in TOML style |
|
|
65
|
+
| `verify [NAME ...]` | Re-check sha256 checksums; exits nonzero on any mismatch |
|
|
66
|
+
| `init [--folder PATH] [--force]` | Create a fresh `datasets.toml` in the current directory |
|
|
67
|
+
| `where` | Print active `datasets_toml` and `datasets_folder` paths |
|
|
68
|
+
|
|
69
|
+
Examples:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
# Set up a new project
|
|
73
|
+
datamanifest init
|
|
74
|
+
|
|
75
|
+
# Add and download a dataset
|
|
76
|
+
datamanifest add "https://zenodo.org/record/.../file.zip" --extract
|
|
77
|
+
|
|
78
|
+
# Use the path in a shell pipeline
|
|
79
|
+
python analysis.py --data "$(datamanifest path file)"
|
|
80
|
+
|
|
81
|
+
# Verify all checksums before a paper submission
|
|
82
|
+
datamanifest verify
|
|
83
|
+
|
|
84
|
+
# Where is the active manifest?
|
|
85
|
+
datamanifest where
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Features
|
|
89
|
+
|
|
90
|
+
| Feature | Supported |
|
|
91
|
+
|---|---|
|
|
92
|
+
| HTTP / HTTPS download with progress | yes |
|
|
93
|
+
| Partial-download resume (Range header) | yes |
|
|
94
|
+
| git clone (`git://`, `ssh+git://`, `*.git`) | yes |
|
|
95
|
+
| SSH / rsync (`ssh://`, `sshfs://`, `rsync://`) | yes |
|
|
96
|
+
| Local file copy (`file://`) | yes |
|
|
97
|
+
| Multi-URI batch entries (`uris=`) | yes |
|
|
98
|
+
| SHA-256 checksum verification + auto-fill | yes |
|
|
99
|
+
| ZIP / tar / tar.gz extraction | yes |
|
|
100
|
+
| `requires=` dependency graph (topological order) | yes |
|
|
101
|
+
| Shell template hook (`shell=`) | yes |
|
|
102
|
+
| Python entry-point hook (`python=`) | yes |
|
|
103
|
+
| Named + default loaders (csv, parquet, nc, json, yaml, toml, zip, tar) | yes |
|
|
104
|
+
| TOML manifest round-trip (read `tomllib`, write `tomli_w`) | yes |
|
|
105
|
+
| Project-root auto-discovery (`pyproject.toml` walk, env vars) | yes |
|
|
106
|
+
| CLI (`datamanifest list/download/path/add/remove/show/verify/init/where`) | yes |
|
|
107
|
+
|
|
108
|
+
## Python adaptations
|
|
109
|
+
|
|
110
|
+
The Python port uses the same `datasets.toml` format as `DataManifest.jl`. Three fields differ:
|
|
111
|
+
|
|
112
|
+
- **`python=`** replaces `julia=`: an entry-point reference (`"pkg.mod:func"`) resolved via `importlib`. The callable receives keyword arguments `(download_path, project_root, entry, uri, key, version, doi, format, branch, requires_paths)`. No inline code execution (`exec`/`eval`) anywhere.
|
|
113
|
+
- **`callable=`** is an alias for `python=` accepted on read and normalized to `python=` on write. Intended for single-language projects that want a language-agnostic key.
|
|
114
|
+
- **`python_includes=`** is a list of directory paths prepended to `sys.path` during loader resolution (replaces `julia_modules`).
|
|
115
|
+
|
|
116
|
+
A single `datasets.toml` can be consumed by both tools: each reads the common fields and ignores the other's extension keys. The shared schema is documented at [perrette/datamanifest.toml](https://github.com/perrette/datamanifest.toml).
|
|
117
|
+
|
|
118
|
+
## Related projects
|
|
119
|
+
|
|
120
|
+
- [`awi-esc/DataManifest.jl`](https://github.com/awi-esc/DataManifest.jl) — the Julia implementation this port is based on.
|
|
121
|
+
- [`perrette/datamanifest.toml`](https://github.com/perrette/datamanifest.toml) — the shared TOML schema spec consumed by both implementations.
|
|
122
|
+
|
|
123
|
+
## Acknowledgments
|
|
124
|
+
|
|
125
|
+
`datamanifest` is a Python port of [`awi-esc/DataManifest.jl`](https://github.com/awi-esc/DataManifest.jl), written by the same author (Mahé Perrette). The Python port was implemented with assistance from [Anthropic's Claude](https://www.anthropic.com/claude).
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
try:
|
|
2
|
+
from ._version import __version__
|
|
3
|
+
except ImportError:
|
|
4
|
+
__version__ = "unknown"
|
|
5
|
+
|
|
6
|
+
from .database import (
|
|
7
|
+
Database,
|
|
8
|
+
DatasetEntry,
|
|
9
|
+
delete_dataset as _delete_dataset_db,
|
|
10
|
+
get_default_database,
|
|
11
|
+
validate_loader,
|
|
12
|
+
validate_loaders,
|
|
13
|
+
)
|
|
14
|
+
from .pipelines import (
|
|
15
|
+
_module_add as add,
|
|
16
|
+
_module_delete_dataset as delete_dataset,
|
|
17
|
+
_module_download_dataset as download_dataset,
|
|
18
|
+
_module_download_datasets as download_datasets,
|
|
19
|
+
_module_get_dataset_path as get_dataset_path,
|
|
20
|
+
_module_load_dataset as load_dataset,
|
|
21
|
+
_module_register_dataset as register_dataset,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"__version__",
|
|
26
|
+
"Database",
|
|
27
|
+
"DatasetEntry",
|
|
28
|
+
"add",
|
|
29
|
+
"delete_dataset",
|
|
30
|
+
"download_dataset",
|
|
31
|
+
"download_datasets",
|
|
32
|
+
"get_dataset_path",
|
|
33
|
+
"get_default_database",
|
|
34
|
+
"load_dataset",
|
|
35
|
+
"register_dataset",
|
|
36
|
+
"validate_loader",
|
|
37
|
+
"validate_loaders",
|
|
38
|
+
]
|