datamaestro 1.5.1__tar.gz → 1.5.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamaestro-1.5.1 → datamaestro-1.5.2}/.github/workflows/python-publish.yml +2 -3
- datamaestro-1.5.2/.python-version +1 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/.readthedocs.yml +1 -1
- {datamaestro-1.5.1 → datamaestro-1.5.2}/PKG-INFO +18 -48
- datamaestro-1.5.2/pyproject.toml +62 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/__init__.py +1 -1
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/context.py +10 -3
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/definitions.py +3 -1
- datamaestro-1.5.2/src/datamaestro/version.py +4 -0
- datamaestro-1.5.2/uv.lock +1793 -0
- datamaestro-1.5.1/pyproject.toml +0 -13
- datamaestro-1.5.1/setup.cfg +0 -63
- datamaestro-1.5.1/setup.py +0 -6
- datamaestro-1.5.1/src/datamaestro/version.py +0 -21
- datamaestro-1.5.1/src/datamaestro.egg-info/PKG-INFO +0 -213
- datamaestro-1.5.1/src/datamaestro.egg-info/SOURCES.txt +0 -80
- datamaestro-1.5.1/src/datamaestro.egg-info/dependency_links.txt +0 -1
- datamaestro-1.5.1/src/datamaestro.egg-info/entry_points.txt +0 -5
- datamaestro-1.5.1/src/datamaestro.egg-info/not-zip-safe +0 -1
- datamaestro-1.5.1/src/datamaestro.egg-info/requires.txt +0 -16
- datamaestro-1.5.1/src/datamaestro.egg-info/top_level.txt +0 -1
- {datamaestro-1.5.1 → datamaestro-1.5.2}/.coverage +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/.github/workflows/pytest.yml +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/.gitignore +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/.pre-commit-config.yaml +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/CHANGELOG.md +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/LICENSE +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/MANIFEST.in +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/README.md +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/TODO.md +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/docs/Makefile +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/docs/make.bat +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/docs/requirements.txt +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/docs/source/api/data.md +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/docs/source/api/download.rst +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/docs/source/api/index.md +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/docs/source/api/records.rst +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/docs/source/conf.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/docs/source/datasets.rst +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/docs/source/developping.md +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/docs/source/index.md +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/docs/source/style.css +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/pytest.ini +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/requirements-dev.txt +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/requirements.txt +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/schema.yaml +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/__main__.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/annotations/__init__.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/annotations/agreement.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/commands/__init__.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/commands/mainstyle.css +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/commands/site.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/data/__init__.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/data/csv.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/data/huggingface.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/data/ml.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/data/tensor.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/download/__init__.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/download/archive.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/download/custom.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/download/huggingface.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/download/links.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/download/manual.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/download/multiple.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/download/single.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/download/sync.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/download/todo.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/download/wayback.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/record.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/registry.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/search.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/settings.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/sphinx.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/stream/__init__.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/stream/compress.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/stream/lines.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/templates/dataset.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/test/__init__.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/test/checks.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/test/conftest.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/test/test_annotations.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/test/test_download_handlers.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/test/test_record.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/src/datamaestro/utils.py +0 -0
- {datamaestro-1.5.1 → datamaestro-1.5.2}/tox.ini +0 -0
|
@@ -19,12 +19,11 @@ jobs:
|
|
|
19
19
|
python-version: "3.x"
|
|
20
20
|
- name: Install dependencies
|
|
21
21
|
run: |
|
|
22
|
-
python -m pip install --upgrade
|
|
23
|
-
pip install setuptools wheel twine
|
|
22
|
+
python -m pip install --upgrade setuptools twine uv
|
|
24
23
|
- name: Build and publish
|
|
25
24
|
env:
|
|
26
25
|
TWINE_USERNAME: __token__
|
|
27
26
|
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
|
28
27
|
run: |
|
|
29
|
-
|
|
28
|
+
uv build
|
|
30
29
|
twine upload dist/*
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.10
|
|
@@ -1,42 +1,31 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datamaestro
|
|
3
|
-
Version: 1.5.1
|
|
4
|
-
Summary:
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
Author-email: benjamin@piwowarski.fr
|
|
8
|
-
License: GPL-3
|
|
9
|
-
Keywords: dataset manager
|
|
10
|
-
Platform: any
|
|
3
|
+
Version: 1.5.2
|
|
4
|
+
Summary: Add your description here
|
|
5
|
+
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
6
|
+
License-File: LICENSE
|
|
11
7
|
Classifier: Development Status :: 4 - Beta
|
|
12
8
|
Classifier: Intended Audience :: Science/Research
|
|
13
9
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
14
10
|
Classifier: Operating System :: OS Independent
|
|
15
11
|
Classifier: Programming Language :: Python
|
|
16
|
-
Classifier: Programming Language :: Python :: 3
|
|
17
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
13
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
-
Requires-Python: >=3.
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
Requires-Dist: click
|
|
24
|
-
Requires-Dist:
|
|
25
|
-
Requires-Dist: urllib3
|
|
26
|
-
Requires-Dist: marshmallow
|
|
27
|
-
Requires-Dist: cached_property
|
|
28
|
-
Requires-Dist: requests
|
|
29
|
-
Requires-Dist: bitmath
|
|
14
|
+
Requires-Python: >=3.10
|
|
15
|
+
Requires-Dist: bitmath>=1.3.3.1
|
|
16
|
+
Requires-Dist: cached-property>=2.0.1
|
|
17
|
+
Requires-Dist: click>=8.2.1
|
|
18
|
+
Requires-Dist: docstring-parser>=0.16
|
|
30
19
|
Requires-Dist: experimaestro>=1.8.9
|
|
31
|
-
Requires-Dist:
|
|
32
|
-
Requires-Dist:
|
|
33
|
-
Requires-Dist: mkdocs
|
|
34
|
-
Requires-Dist: docstring_parser
|
|
20
|
+
Requires-Dist: marshmallow>=3.26.1
|
|
21
|
+
Requires-Dist: mkdocs-material>=9.6.15
|
|
22
|
+
Requires-Dist: mkdocs>=1.6.1
|
|
35
23
|
Requires-Dist: numpy
|
|
36
|
-
|
|
37
|
-
Requires-Dist:
|
|
38
|
-
|
|
39
|
-
|
|
24
|
+
Requires-Dist: pymdown-extensions>=10.16
|
|
25
|
+
Requires-Dist: requests>=2.32.4
|
|
26
|
+
Requires-Dist: tqdm>=4.67.1
|
|
27
|
+
Requires-Dist: urllib3>=2.5.0
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
40
29
|
|
|
41
30
|
[PyPI version](https://badge.fury.io/py/datamaestro) [pre-commit](https://github.com/pre-commit/pre-commit) [DOI](https://zenodo.org/badge/latestdoi/4573876)
|
|
42
31
|
|
|
@@ -192,22 +181,3 @@ This will allow to
|
|
|
192
181
|
|
|
193
182
|
1. Document the dataset
|
|
194
183
|
2. Allow to use the command line interface to manipulate it (download resources, etc.)
|
|
195
|
-
|
|
196
|
-
# 0.8.0
|
|
197
|
-
|
|
198
|
-
- Integration with other repositories: abstracting away the notion of dataset
|
|
199
|
-
- Repository prefix
|
|
200
|
-
- Set sub-datasets IDs automatically
|
|
201
|
-
|
|
202
|
-
# 0.7.3
|
|
203
|
-
|
|
204
|
-
- Updates for new experimaestro (0.8.5)
|
|
205
|
-
- Search types with "type:..."
|
|
206
|
-
|
|
207
|
-
# 0.6.17
|
|
208
|
-
|
|
209
|
-
- Allow remote access through rpyc
|
|
210
|
-
|
|
211
|
-
# 0.6.9
|
|
212
|
-
|
|
213
|
-
`version` command
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "datamaestro"
|
|
3
|
+
description = "Add your description here"
|
|
4
|
+
readme = "README.md"
|
|
5
|
+
requires-python = ">=3.10"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Benjamin Piwowarski", email = "benjamin@piwowarski.fr"}
|
|
8
|
+
]
|
|
9
|
+
dynamic = ["version"]
|
|
10
|
+
dependencies = [
|
|
11
|
+
"bitmath>=1.3.3.1",
|
|
12
|
+
"cached-property>=2.0.1",
|
|
13
|
+
"click>=8.2.1",
|
|
14
|
+
"docstring-parser>=0.16",
|
|
15
|
+
"experimaestro>=1.8.9",
|
|
16
|
+
"marshmallow>=3.26.1",
|
|
17
|
+
"mkdocs>=1.6.1",
|
|
18
|
+
"mkdocs-material>=9.6.15",
|
|
19
|
+
"numpy",
|
|
20
|
+
"pymdown-extensions>=10.16",
|
|
21
|
+
"requests>=2.32.4",
|
|
22
|
+
"tqdm>=4.67.1",
|
|
23
|
+
"urllib3>=2.5.0",
|
|
24
|
+
]
|
|
25
|
+
classifiers = [
|
|
26
|
+
"Development Status :: 4 - Beta",
|
|
27
|
+
"Intended Audience :: Science/Research",
|
|
28
|
+
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
|
|
29
|
+
"Operating System :: OS Independent",
|
|
30
|
+
"Programming Language :: Python",
|
|
31
|
+
"Programming Language :: Python :: 3",
|
|
32
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
33
|
+
]
|
|
34
|
+
[dependency-groups]
|
|
35
|
+
dev = [
|
|
36
|
+
"pre-commit>=4.2.0",
|
|
37
|
+
"pytest>=8.4.1",
|
|
38
|
+
"twine>=6.1.0",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
[project.scripts]
|
|
42
|
+
datamaestro = "datamaestro.__main__:main"
|
|
43
|
+
|
|
44
|
+
[project.entry-points."mkdocs.plugins"]
|
|
45
|
+
datamaestro = "datamaestro.commands.site:DatasetGenerator"
|
|
46
|
+
|
|
47
|
+
[build-system]
|
|
48
|
+
requires = ["hatchling", "uv-dynamic-versioning"]
|
|
49
|
+
build-backend = "hatchling.build"
|
|
50
|
+
|
|
51
|
+
[tool.hatch.version]
|
|
52
|
+
source = "uv-dynamic-versioning"
|
|
53
|
+
|
|
54
|
+
[tool.uv-dynamic-versioning]
|
|
55
|
+
fallback-version = "0.0.0"
|
|
56
|
+
|
|
57
|
+
[tool.hatch.build.hooks.version]
|
|
58
|
+
path = "src/datamaestro/version.py"
|
|
59
|
+
|
|
60
|
+
[tool.mypy]
|
|
61
|
+
python_version = 3.10
|
|
62
|
+
warn_unused_ignores = "True"
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
|
-
from typing import Iterable, Iterator, Dict, Union
|
|
2
|
+
from typing import Iterable, Iterator, Dict, Optional, Union
|
|
3
3
|
import importlib
|
|
4
4
|
import os
|
|
5
5
|
import hashlib
|
|
@@ -423,16 +423,23 @@ def find_dataset(dataset_id: str):
|
|
|
423
423
|
return AbstractDataset.find(dataset_id)
|
|
424
424
|
|
|
425
425
|
|
|
426
|
-
def prepare_dataset(
|
|
426
|
+
def prepare_dataset(
|
|
427
|
+
dataset_id: Union[str, "DatasetWrapper", Config],
|
|
428
|
+
context: Optional[Union[Context, Path]] = None,
|
|
429
|
+
):
|
|
427
430
|
"""Find a dataset given its id and download the resources"""
|
|
428
431
|
from .definitions import AbstractDataset, DatasetWrapper
|
|
429
432
|
|
|
433
|
+
match context:
|
|
434
|
+
case Path() | str():
|
|
435
|
+
context = Context(Path(context))
|
|
436
|
+
|
|
430
437
|
if isinstance(dataset_id, DatasetWrapper):
|
|
431
438
|
ds = dataset_id
|
|
432
439
|
elif isinstance(dataset_id, Config):
|
|
433
440
|
ds = dataset_id.__datamaestro_dataset__
|
|
434
441
|
else:
|
|
435
|
-
ds = AbstractDataset.find(dataset_id)
|
|
442
|
+
ds = AbstractDataset.find(dataset_id, context=context)
|
|
436
443
|
|
|
437
444
|
return ds.prepare(download=True)
|
|
438
445
|
|
|
@@ -236,10 +236,12 @@ class AbstractDataset(AbstractData):
|
|
|
236
236
|
return success
|
|
237
237
|
|
|
238
238
|
@staticmethod
|
|
239
|
-
def find(name: str) -> "DataDefinition":
|
|
239
|
+
def find(name: str, context: Optional["Context"] = None) -> "DataDefinition":
|
|
240
240
|
"""Find a dataset given its name"""
|
|
241
241
|
from datamaestro.context import Context # noqa: F811
|
|
242
242
|
|
|
243
|
+
context = Context.instance() if context is None else context
|
|
244
|
+
|
|
243
245
|
logging.debug("Searching dataset %s", name)
|
|
244
246
|
for repository in Context.instance().repositories():
|
|
245
247
|
logging.debug("Searching dataset %s in %s", name, repository)
|