datamaestro 1.5.0__tar.gz → 1.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {datamaestro-1.5.0 → datamaestro-1.5.2}/.github/workflows/python-publish.yml +2 -3
  2. datamaestro-1.5.2/.python-version +1 -0
  3. {datamaestro-1.5.0 → datamaestro-1.5.2}/.readthedocs.yml +1 -1
  4. {datamaestro-1.5.0 → datamaestro-1.5.2}/PKG-INFO +18 -48
  5. datamaestro-1.5.2/pyproject.toml +62 -0
  6. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/__init__.py +1 -1
  7. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/context.py +10 -3
  8. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/definitions.py +11 -1
  9. datamaestro-1.5.2/src/datamaestro/version.py +4 -0
  10. datamaestro-1.5.2/uv.lock +1793 -0
  11. datamaestro-1.5.0/pyproject.toml +0 -13
  12. datamaestro-1.5.0/setup.cfg +0 -63
  13. datamaestro-1.5.0/setup.py +0 -6
  14. datamaestro-1.5.0/src/datamaestro/version.py +0 -21
  15. datamaestro-1.5.0/src/datamaestro.egg-info/PKG-INFO +0 -213
  16. datamaestro-1.5.0/src/datamaestro.egg-info/SOURCES.txt +0 -80
  17. datamaestro-1.5.0/src/datamaestro.egg-info/dependency_links.txt +0 -1
  18. datamaestro-1.5.0/src/datamaestro.egg-info/entry_points.txt +0 -5
  19. datamaestro-1.5.0/src/datamaestro.egg-info/not-zip-safe +0 -1
  20. datamaestro-1.5.0/src/datamaestro.egg-info/requires.txt +0 -16
  21. datamaestro-1.5.0/src/datamaestro.egg-info/top_level.txt +0 -1
  22. {datamaestro-1.5.0 → datamaestro-1.5.2}/.coverage +0 -0
  23. {datamaestro-1.5.0 → datamaestro-1.5.2}/.github/workflows/pytest.yml +0 -0
  24. {datamaestro-1.5.0 → datamaestro-1.5.2}/.gitignore +0 -0
  25. {datamaestro-1.5.0 → datamaestro-1.5.2}/.pre-commit-config.yaml +0 -0
  26. {datamaestro-1.5.0 → datamaestro-1.5.2}/CHANGELOG.md +0 -0
  27. {datamaestro-1.5.0 → datamaestro-1.5.2}/LICENSE +0 -0
  28. {datamaestro-1.5.0 → datamaestro-1.5.2}/MANIFEST.in +0 -0
  29. {datamaestro-1.5.0 → datamaestro-1.5.2}/README.md +0 -0
  30. {datamaestro-1.5.0 → datamaestro-1.5.2}/TODO.md +0 -0
  31. {datamaestro-1.5.0 → datamaestro-1.5.2}/docs/Makefile +0 -0
  32. {datamaestro-1.5.0 → datamaestro-1.5.2}/docs/make.bat +0 -0
  33. {datamaestro-1.5.0 → datamaestro-1.5.2}/docs/requirements.txt +0 -0
  34. {datamaestro-1.5.0 → datamaestro-1.5.2}/docs/source/api/data.md +0 -0
  35. {datamaestro-1.5.0 → datamaestro-1.5.2}/docs/source/api/download.rst +0 -0
  36. {datamaestro-1.5.0 → datamaestro-1.5.2}/docs/source/api/index.md +0 -0
  37. {datamaestro-1.5.0 → datamaestro-1.5.2}/docs/source/api/records.rst +0 -0
  38. {datamaestro-1.5.0 → datamaestro-1.5.2}/docs/source/conf.py +0 -0
  39. {datamaestro-1.5.0 → datamaestro-1.5.2}/docs/source/datasets.rst +0 -0
  40. {datamaestro-1.5.0 → datamaestro-1.5.2}/docs/source/developping.md +0 -0
  41. {datamaestro-1.5.0 → datamaestro-1.5.2}/docs/source/index.md +0 -0
  42. {datamaestro-1.5.0 → datamaestro-1.5.2}/docs/source/style.css +0 -0
  43. {datamaestro-1.5.0 → datamaestro-1.5.2}/pytest.ini +0 -0
  44. {datamaestro-1.5.0 → datamaestro-1.5.2}/requirements-dev.txt +0 -0
  45. {datamaestro-1.5.0 → datamaestro-1.5.2}/requirements.txt +0 -0
  46. {datamaestro-1.5.0 → datamaestro-1.5.2}/schema.yaml +0 -0
  47. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/__main__.py +0 -0
  48. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/annotations/__init__.py +0 -0
  49. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/annotations/agreement.py +0 -0
  50. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/commands/__init__.py +0 -0
  51. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/commands/mainstyle.css +0 -0
  52. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/commands/site.py +0 -0
  53. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/data/__init__.py +0 -0
  54. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/data/csv.py +0 -0
  55. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/data/huggingface.py +0 -0
  56. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/data/ml.py +0 -0
  57. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/data/tensor.py +0 -0
  58. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/download/__init__.py +0 -0
  59. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/download/archive.py +0 -0
  60. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/download/custom.py +0 -0
  61. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/download/huggingface.py +0 -0
  62. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/download/links.py +0 -0
  63. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/download/manual.py +0 -0
  64. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/download/multiple.py +0 -0
  65. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/download/single.py +0 -0
  66. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/download/sync.py +0 -0
  67. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/download/todo.py +0 -0
  68. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/download/wayback.py +0 -0
  69. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/record.py +0 -0
  70. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/registry.py +0 -0
  71. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/search.py +0 -0
  72. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/settings.py +0 -0
  73. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/sphinx.py +0 -0
  74. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/stream/__init__.py +0 -0
  75. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/stream/compress.py +0 -0
  76. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/stream/lines.py +0 -0
  77. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/templates/dataset.py +0 -0
  78. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/test/__init__.py +0 -0
  79. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/test/checks.py +0 -0
  80. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/test/conftest.py +0 -0
  81. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/test/test_annotations.py +0 -0
  82. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/test/test_download_handlers.py +0 -0
  83. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/test/test_record.py +0 -0
  84. {datamaestro-1.5.0 → datamaestro-1.5.2}/src/datamaestro/utils.py +0 -0
  85. {datamaestro-1.5.0 → datamaestro-1.5.2}/tox.ini +0 -0
@@ -19,12 +19,11 @@ jobs:
19
19
  python-version: "3.x"
20
20
  - name: Install dependencies
21
21
  run: |
22
- python -m pip install --upgrade pip
23
- pip install setuptools wheel twine
22
+ python -m pip install --upgrade setuptools twine uv
24
23
  - name: Build and publish
25
24
  env:
26
25
  TWINE_USERNAME: __token__
27
26
  TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
28
27
  run: |
29
- python setup.py sdist bdist_wheel
28
+ uv build
30
29
  twine upload dist/*
@@ -0,0 +1 @@
1
+ 3.10
@@ -11,7 +11,7 @@ sphinx:
11
11
  build:
12
12
  os: "ubuntu-20.04"
13
13
  tools:
14
- python: "3.9"
14
+ python: "3.10"
15
15
 
16
16
  # Install the package
17
17
  python:
@@ -1,42 +1,31 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro
3
- Version: 1.5.0
4
- Summary: "Dataset management command line and API"
5
- Home-page: https://github.com/experimaestro/datamaestro
6
- Author: Benjamin Piwowarski
7
- Author-email: benjamin@piwowarski.fr
8
- License: GPL-3
9
- Keywords: dataset manager
10
- Platform: any
3
+ Version: 1.5.2
4
+ Summary: Add your description here
5
+ Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
+ License-File: LICENSE
11
7
  Classifier: Development Status :: 4 - Beta
12
8
  Classifier: Intended Audience :: Science/Research
13
9
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
14
10
  Classifier: Operating System :: OS Independent
15
11
  Classifier: Programming Language :: Python
16
- Classifier: Programming Language :: Python :: 3.9
17
- Classifier: Programming Language :: Python :: 3.10
18
- Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3
19
13
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
- Requires-Python: >=3.8
21
- Description-Content-Type: text/markdown
22
- License-File: LICENSE
23
- Requires-Dist: click
24
- Requires-Dist: tqdm
25
- Requires-Dist: urllib3
26
- Requires-Dist: marshmallow
27
- Requires-Dist: cached_property
28
- Requires-Dist: requests
29
- Requires-Dist: bitmath
14
+ Requires-Python: >=3.10
15
+ Requires-Dist: bitmath>=1.3.3.1
16
+ Requires-Dist: cached-property>=2.0.1
17
+ Requires-Dist: click>=8.2.1
18
+ Requires-Dist: docstring-parser>=0.16
30
19
  Requires-Dist: experimaestro>=1.8.9
31
- Requires-Dist: mkdocs
32
- Requires-Dist: pymdown-extensions
33
- Requires-Dist: mkdocs-material
34
- Requires-Dist: docstring_parser
20
+ Requires-Dist: marshmallow>=3.26.1
21
+ Requires-Dist: mkdocs-material>=9.6.15
22
+ Requires-Dist: mkdocs>=1.6.1
35
23
  Requires-Dist: numpy
36
- Provides-Extra: test
37
- Requires-Dist: tox; extra == "test"
38
- Dynamic: license-file
39
- Dynamic: requires-dist
24
+ Requires-Dist: pymdown-extensions>=10.16
25
+ Requires-Dist: requests>=2.32.4
26
+ Requires-Dist: tqdm>=4.67.1
27
+ Requires-Dist: urllib3>=2.5.0
28
+ Description-Content-Type: text/markdown
40
29
 
41
30
  [![PyPI version](https://badge.fury.io/py/datamaestro.svg)](https://badge.fury.io/py/datamaestro) [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![DOI](https://zenodo.org/badge/4573876.svg)](https://zenodo.org/badge/latestdoi/4573876)
42
31
 
@@ -192,22 +181,3 @@ This will allow to
192
181
 
193
182
  1. Document the dataset
194
183
  2. Allow to use the command line interface to manipulate it (download resources, etc.)
195
-
196
- # 0.8.0
197
-
198
- - Integration with other repositories: abstracting away the notion of dataset
199
- - Repository prefix
200
- - Set sub-datasets IDs automatically
201
-
202
- # 0.7.3
203
-
204
- - Updates for new experimaestro (0.8.5)
205
- - Search types with "type:..."
206
-
207
- # 0.6.17
208
-
209
- - Allow remote access through rpyc
210
-
211
- # 0.6.9
212
-
213
- `version` command
@@ -0,0 +1,62 @@
1
+ [project]
2
+ name = "datamaestro"
3
+ description = "Add your description here"
4
+ readme = "README.md"
5
+ requires-python = ">=3.10"
6
+ authors = [
7
+ { name = "Benjamin Piwowarski", email = "benjamin@piwowarski.fr"}
8
+ ]
9
+ dynamic = ["version"]
10
+ dependencies = [
11
+ "bitmath>=1.3.3.1",
12
+ "cached-property>=2.0.1",
13
+ "click>=8.2.1",
14
+ "docstring-parser>=0.16",
15
+ "experimaestro>=1.8.9",
16
+ "marshmallow>=3.26.1",
17
+ "mkdocs>=1.6.1",
18
+ "mkdocs-material>=9.6.15",
19
+ "numpy",
20
+ "pymdown-extensions>=10.16",
21
+ "requests>=2.32.4",
22
+ "tqdm>=4.67.1",
23
+ "urllib3>=2.5.0",
24
+ ]
25
+ classifiers = [
26
+ "Development Status :: 4 - Beta",
27
+ "Intended Audience :: Science/Research",
28
+ "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
29
+ "Operating System :: OS Independent",
30
+ "Programming Language :: Python",
31
+ "Programming Language :: Python :: 3",
32
+ "Topic :: Software Development :: Libraries :: Python Modules",
33
+ ]
34
+ [dependency-groups]
35
+ dev = [
36
+ "pre-commit>=4.2.0",
37
+ "pytest>=8.4.1",
38
+ "twine>=6.1.0",
39
+ ]
40
+
41
+ [project.scripts]
42
+ datamaestro = "datamaestro.__main__:main"
43
+
44
+ [project.entry-points."mkdocs.plugins"]
45
+ datamaestro = "datamaestro.commands.site:DatasetGenerator"
46
+
47
+ [build-system]
48
+ requires = ["hatchling", "uv-dynamic-versioning"]
49
+ build-backend = "hatchling.build"
50
+
51
+ [tool.hatch.version]
52
+ source = "uv-dynamic-versioning"
53
+
54
+ [tool.uv-dynamic-versioning]
55
+ fallback-version = "0.0.0"
56
+
57
+ [tool.hatch.build.hooks.version]
58
+ path = "src/datamaestro/version.py"
59
+
60
+ [tool.mypy]
61
+ python_version = 3.10
62
+ warn_unused_ignores = "True"
@@ -10,4 +10,4 @@ from .context import (
10
10
  from pkg_resources import get_distribution, DistributionNotFound
11
11
  from .definitions import dataset, metadata
12
12
  from .data import Base
13
- from .version import version, version_tuple
13
+ from .version import __version__
@@ -1,5 +1,5 @@
1
1
  from pathlib import Path
2
- from typing import Iterable, Iterator, Dict, Union
2
+ from typing import Iterable, Iterator, Dict, Optional, Union
3
3
  import importlib
4
4
  import os
5
5
  import hashlib
@@ -423,16 +423,23 @@ def find_dataset(dataset_id: str):
423
423
  return AbstractDataset.find(dataset_id)
424
424
 
425
425
 
426
- def prepare_dataset(dataset_id: Union[str, "DatasetWrapper", Config]):
426
+ def prepare_dataset(
427
+ dataset_id: Union[str, "DatasetWrapper", Config],
428
+ context: Optional[Union[Context, Path]] = None,
429
+ ):
427
430
  """Find a dataset given its id and download the resources"""
428
431
  from .definitions import AbstractDataset, DatasetWrapper
429
432
 
433
+ match context:
434
+ case Path() | str():
435
+ context = Context(Path(context))
436
+
430
437
  if isinstance(dataset_id, DatasetWrapper):
431
438
  ds = dataset_id
432
439
  elif isinstance(dataset_id, Config):
433
440
  ds = dataset_id.__datamaestro_dataset__
434
441
  else:
435
- ds = AbstractDataset.find(dataset_id)
442
+ ds = AbstractDataset.find(dataset_id, context=context)
436
443
 
437
444
  return ds.prepare(download=True)
438
445
 
@@ -204,6 +204,14 @@ class AbstractDataset(AbstractData):
204
204
  from datamaestro.data import Base
205
205
 
206
206
  if isinstance(data, Base):
207
+ try:
208
+ if data.id:
209
+ # There is already an ID, skip this
210
+ # and the descendants
211
+ return
212
+ except KeyError:
213
+ pass
214
+
207
215
  if self.repository is None:
208
216
  data.id = id
209
217
  else:
@@ -228,10 +236,12 @@ class AbstractDataset(AbstractData):
228
236
  return success
229
237
 
230
238
  @staticmethod
231
- def find(name: str) -> "DataDefinition":
239
+ def find(name: str, context: Optional["Context"] = None) -> "DataDefinition":
232
240
  """Find a dataset given its name"""
233
241
  from datamaestro.context import Context # noqa: F811
234
242
 
243
+ context = Context.instance() if context is None else context
244
+
235
245
  logging.debug("Searching dataset %s", name)
236
246
  for repository in Context.instance().repositories():
237
247
  logging.debug("Searching dataset %s in %s", name, repository)
@@ -0,0 +1,4 @@
1
+ # This file is auto-generated by Hatchling. As such, do not:
2
+ # - modify
3
+ # - track in version control e.g. be sure to add to .gitignore
4
+ __version__ = VERSION = '1.5.2'