datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. datamaestro/__init__.py +11 -7
  2. datamaestro/__main__.py +29 -8
  3. datamaestro/annotations/__init__.py +1 -1
  4. datamaestro/annotations/agreement.py +9 -3
  5. datamaestro/commands/site.py +27 -15
  6. datamaestro/context.py +143 -87
  7. datamaestro/data/__init__.py +23 -11
  8. datamaestro/data/csv.py +12 -12
  9. datamaestro/data/huggingface.py +25 -0
  10. datamaestro/data/ml.py +19 -10
  11. datamaestro/data/tensor.py +32 -24
  12. datamaestro/definitions.py +492 -131
  13. datamaestro/download/__init__.py +610 -24
  14. datamaestro/download/archive.py +129 -77
  15. datamaestro/download/custom.py +53 -0
  16. datamaestro/download/huggingface.py +77 -0
  17. datamaestro/download/links.py +106 -50
  18. datamaestro/download/multiple.py +27 -5
  19. datamaestro/download/single.py +114 -51
  20. datamaestro/download/sync.py +0 -1
  21. datamaestro/download/todo.py +9 -4
  22. datamaestro/download/wayback.py +164 -0
  23. datamaestro/record.py +232 -0
  24. datamaestro/registry.py +1 -0
  25. datamaestro/search.py +1 -1
  26. datamaestro/settings.py +3 -1
  27. datamaestro/sphinx.py +224 -0
  28. datamaestro/stream/__init__.py +0 -2
  29. datamaestro/stream/lines.py +10 -7
  30. datamaestro/templates/dataset.py +5 -4
  31. datamaestro/test/__init__.py +3 -1
  32. datamaestro/test/checks.py +1 -5
  33. datamaestro/test/conftest.py +1 -6
  34. datamaestro/test/test_annotations.py +2 -2
  35. datamaestro/test/test_download_handlers.py +3 -4
  36. datamaestro/test/test_record.py +72 -0
  37. datamaestro/test/test_resource.py +1388 -0
  38. datamaestro/utils.py +15 -9
  39. datamaestro/v2.md +301 -0
  40. datamaestro/version.py +4 -0
  41. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
  42. datamaestro-1.7.0.dist-info/RECORD +49 -0
  43. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
  44. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
  45. datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
  46. datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
  47. datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
  48. datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
  49. datamaestro/__pycache__/context.cpython-38.pyc +0 -0
  50. datamaestro/__pycache__/context.cpython-39.pyc +0 -0
  51. datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
  52. datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
  53. datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
  54. datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
  55. datamaestro/__pycache__/search.cpython-38.pyc +0 -0
  56. datamaestro/__pycache__/search.cpython-39.pyc +0 -0
  57. datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
  58. datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
  59. datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
  60. datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
  61. datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
  62. datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
  63. datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
  64. datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
  65. datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
  66. datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
  67. datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
  68. datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
  69. datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
  70. datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
  71. datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
  72. datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
  73. datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
  74. datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
  75. datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
  76. datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
  77. datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
  78. datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
  79. datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
  80. datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
  81. datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
  82. datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
  83. datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
  84. datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
  85. datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
  86. datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
  87. datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
  88. datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
  89. datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
  90. datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
  91. datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
  92. datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
  93. datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
  94. datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
  95. datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
  96. datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
  97. datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
  98. datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
  99. datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
  100. datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
  101. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
  102. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
  103. datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
  104. datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
  105. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
  106. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
  107. datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
  108. datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
  109. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
  110. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
  111. datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
  112. datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
  113. datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
  114. datamaestro-0.8.1.dist-info/RECORD +0 -109
  115. datamaestro-0.8.1.dist-info/top_level.txt +0 -1
  116. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
datamaestro/utils.py CHANGED
@@ -1,5 +1,4 @@
  import logging
- import os.path as op
  from experimaestro import Config
  import json
  from pathlib import PosixPath, Path
@@ -43,9 +42,7 @@ def copyfileobjs(fsrc, fdsts, length=0):


  class FileChecker:
- def check(self, path: Path):
- """Check if the file is correct and throws an exception if not"""
- raise NotImplementedError()
+ """Checks a file"""

  def check(self, path: Path):
  """Check the given file
@@ -69,9 +66,12 @@ class FileChecker:


  class HashCheck(FileChecker):
- """Check a file against a hash"""
-
  def __init__(self, hashstr: str, hasherfn=hashlib.md5):
+ """Check a file against a hash
+
+ :param hashstr: The HASH value
+ :param hasherfn: The hash computer, defaults to hashlib.md5
+ """
  self.hashstr = hashstr
  self.hasherfn = hasherfn
  self.hasher = None
@@ -158,6 +158,11 @@ def downloadURL(url: str, path: Path, resume: bool = False, size: int = None):
  if response is None:
  response = requests.get(url, stream=True)

+ # Valid response
+ assert response.status_code >= 200 and response.status_code < 300, (
+ f"Status code is not 2XX ({response.status_code})"
+ )
+
  # Get the total size (or use the provided one)
  total_size = int(response.headers.get("content-length", size or 0))

@@ -166,9 +171,10 @@ def downloadURL(url: str, path: Path, resume: bool = False, size: int = None):
  total_size += pos

  CHUNK_SIZE = 1024
- with path.open("ab") as f, tqdm(
- initial=pos, total=total_size, unit_scale=True, unit="B"
- ) as t:
+ with (
+ path.open("ab") as f,
+ tqdm(initial=pos, total=total_size, unit_scale=True, unit="B") as t,
+ ):
  for data in response.iter_content(chunk_size=CHUNK_SIZE):
  f.write(data)
  t.update(len(data))
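For orientation, the two helpers touched above can be combined to fetch and verify a file. A minimal sketch using only the signatures visible in this hunk; the URL and MD5 value are placeholders:

```python
from pathlib import Path

from datamaestro.utils import HashCheck, downloadURL

url = "https://example.com/data/train-images.gz"   # placeholder URL
target = Path("/tmp/train-images.gz")

# resume=True appends to a partially downloaded file, as in the hunk above
downloadURL(url, target, resume=True)

# check() raises if the file does not match the expected MD5
HashCheck("0123456789abcdef0123456789abcdef").check(target)
```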
datamaestro/v2.md ADDED
@@ -0,0 +1,301 @@
+ # Resource Interface (v2)
+
+ ## Overview
+
+ Resources represent steps in a dataset preparation pipeline. They form a
+ directed acyclic graph (DAG) where each resource can depend on other resources.
+
+ Key concepts:
+
+ - **Two-path system**: resources write to `transient_path` during download,
+   then the framework moves data to `path` and marks the resource as COMPLETE.
+ - **Three states**: NONE, PARTIAL, COMPLETE (persisted in `.state.json`)
+ - **Transient resources**: intermediate resources that can be deleted after all
+   dependents are COMPLETE (eager cleanup)
+ - **`can_recover` property**: subclasses override to preserve PARTIAL data on error
+
+ ## Modern API: Class-based datasets (preferred)
+
+ ```python
+ from datamaestro.definitions import dataset
+ from datamaestro.download.single import FileDownloader
+
+ @dataset(url="http://yann.lecun.com/exdb/mnist/")
+ class ProcessedMNIST(ImageClassification):
+     """The MNIST database of handwritten digits."""
+
+     # Resources are class attributes — no decorators needed
+     TRAIN_IMAGES = FileDownloader(
+         "train_images.idx",
+         "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
+         transient=True,
+     )
+     TRAIN_IMAGES_NP = NumpyTensorFile.from_idx(TRAIN_IMAGES)
+
+     TRAIN_LABELS = FileDownloader(
+         "train_labels.idx",
+         "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz",
+         transient=True,
+     )
+     TRAIN_LABELS_NP = NumpyTensorFile.from_idx(TRAIN_LABELS)
+
+     TEST_IMAGES = FileDownloader(
+         "test_images.idx",
+         "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
+         transient=True,
+     )
+     TEST_IMAGES_NP = NumpyTensorFile.from_idx(TEST_IMAGES)
+
+     TEST_LABELS = FileDownloader(
+         "test_labels.idx",
+         "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz",
+         transient=True,
+     )
+     TEST_LABELS_NP = NumpyTensorFile.from_idx(TEST_LABELS)
+
+     @classmethod
+     def __create_dataset__(cls, dataset: AbstractDataset):
+         return cls.C(
+             train=LabelledImages(
+                 images=NumpyTensorFile(path=cls.TRAIN_IMAGES_NP.path),
+                 labels=NumpyTensorFile(path=cls.TRAIN_LABELS_NP.path),
+             ),
+             test=LabelledImages(
+                 images=NumpyTensorFile(path=cls.TEST_IMAGES_NP.path),
+                 labels=NumpyTensorFile(path=cls.TEST_LABELS_NP.path),
+             ),
+         )
+ ```
+
+ Advantages:
+
+ 1. **Explicit pipeline** — dependencies between resources are visible
+ 2. **Transient intermediaries** — intermediate files can be deleted after processing
+ 3. **No varname** — resource names are auto-detected from class attribute names
+ 4. **Two-path safety** — incomplete downloads never appear at the final path
+
+ ## Resource hierarchy
+
+ ```
+ Resource (ABC)
+ ├── FileResource — produces a single file
+ ├── FolderResource — produces a directory
+ ├── ValueResource — produces an in-memory value (no files)
+ ├── reference — references another dataset
+ └── Download — (deprecated alias for Resource)
+ ```
+
+ ### `ResourceState`
+
+ ```python
+ class ResourceState(str, Enum):
+     NONE = "none"          # Not started
+     PARTIAL = "partial"    # Started but incomplete
+     COMPLETE = "complete"  # Fully available
+ ```
+
+ ### `Resource` base class
+
+ | Property / Method | Description |
+ |---|---|
+ | `name: str` | Resource name (auto-set from class attribute name) |
+ | `dataset` | Back-reference to the owning `AbstractDataset` |
+ | `transient: bool` | Whether data can be deleted after dependents complete |
+ | `can_recover: bool` | Property. If True, PARTIAL data is preserved on error |
+ | `dependencies` | List of resources that must be COMPLETE first |
+ | `dependents` | Computed inverse of dependencies |
+ | `path: Path` | Final storage path (after COMPLETE) |
+ | `transient_path: Path` | Temp path where `download()` writes |
+ | `state: ResourceState` | Current state (from `.state.json` metadata file) |
+ | `download(force)` | Abstract. Execute download/processing step |
+ | `prepare()` | Abstract. Return value for dataset construction |
+ | `cleanup()` | Remove data from disk, set state to NONE |
+ | `has_files() -> bool` | Whether this resource produces files on disk |
+ | `bind(name, dataset)` | Bind to a dataset (called by framework) |
+ | `stream() -> IO \| None` | (FileResource only) Return byte stream or None |
+
+ ### `FileResource`
+
+ Base for resources that produce a single file. Subclasses implement
+ `_download(destination: Path)`.
+
+ ```python
+ class MyFileResource(FileResource):
+     def __init__(self, filename, url, **kw):
+         super().__init__(filename, **kw)
+         self.url = url
+
+     def _download(self, destination: Path):
+         # Write to destination (which is self.transient_path)
+         ...
+ ```
+
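Because `downloadURL` in `datamaestro/utils.py` accepts a `resume` flag (see the hunk earlier in this diff), a recoverable download can be sketched by combining it with `can_recover`. The class name and override set below are assumptions based on the table and the sketch above, not code from the package:

```python
from pathlib import Path

from datamaestro.download import FileResource
from datamaestro.utils import downloadURL


class ResumableDownloader(FileResource):
    """Hypothetical single-file download that can resume PARTIAL data."""

    def __init__(self, filename, url, **kw):
        super().__init__(filename, **kw)
        self.url = url

    @property
    def can_recover(self) -> bool:
        # Keep whatever was already written to transient_path on failure,
        # so the next run resumes instead of starting over.
        return True

    def _download(self, destination: Path):
        # downloadURL appends to an existing partial file when resume=True
        downloadURL(self.url, destination, resume=True)
```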
+ ### `FolderResource`
+
+ Base for resources that produce a directory. Subclasses implement
+ `_download(destination: Path)`.
+
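By analogy with the `FileResource` sketch, a folder-producing resource writes everything under the destination directory. A hypothetical sketch; it assumes `FolderResource` is importable from `datamaestro.download` like `FileResource`, and the archive handling is illustrative rather than the package's `ZipDownloader`:

```python
import urllib.request
import zipfile
from pathlib import Path

from datamaestro.download import FolderResource  # assumed import location


class UnpackedArchive(FolderResource):
    """Hypothetical resource: fetch a ZIP archive and unpack it into a folder."""

    def __init__(self, name, url, **kw):
        super().__init__(name, **kw)
        self.url = url

    def _download(self, destination: Path):
        # destination is the transient directory; the framework moves it to
        # the final path once the resource is marked COMPLETE
        destination.mkdir(parents=True, exist_ok=True)
        archive = destination / "archive.zip"
        urllib.request.urlretrieve(self.url, archive)
        with zipfile.ZipFile(archive) as zf:
            zf.extractall(destination)
        archive.unlink()  # keep only the extracted content
```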
+ ### `ValueResource`
+
+ Base for resources that produce in-memory values (no files on disk).
+ `has_files()` returns False.
+
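A hypothetical sketch of a value-producing resource. The exact abstract methods for `ValueResource` are not spelled out above, so this assumes the `download()`/`prepare()` hooks from the `Resource` table and an import location next to `FileResource`:

```python
from datamaestro.download import ValueResource  # assumed import location


class TokenCount(ValueResource):
    """Hypothetical resource holding a computed value rather than files."""

    def __init__(self, name, source, **kw):
        super().__init__(name, **kw)
        self._dependencies = [source]  # same pattern as MyProcessor below

    def download(self, force=False):
        # Nothing is written to disk: has_files() is False for ValueResource
        pass

    def prepare(self):
        # Compute the value from the dependency's downloaded file
        source_path = self.dependencies[0].path
        return sum(len(line.split()) for line in source_path.open())
```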
+ ## Custom resource handlers (modern)
+
+ ```python
+ from datamaestro.download import FileResource
+
+ class MyProcessor(FileResource):
+     """Process a source file into a numpy array."""
+
+     @property
+     def can_recover(self) -> bool:
+         return False  # or True for resumable downloads
+
+     def __init__(self, filename, source, **kw):
+         super().__init__(filename, **kw)
+         self._dependencies = [source]
+
+     def _download(self, destination):
+         # Read from dependency, write to destination
+         source_path = self.dependencies[0].path
+         data = load(source_path)
+         save(process(data), destination)
+
+     @classmethod
+     def from_source(cls, source):
+         return cls("processed.npy", source)
+
+ # Factory alias
+ my_processor = MyProcessor.from_source
+ ```
+
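Such a handler plugs into a class-based dataset like the built-in downloaders. A short hypothetical usage sketch; the dataset name, URL, and the generic `Base` data type are illustrative assumptions, and `__create_dataset__` is omitted (see the MNIST example above):

```python
from datamaestro.data import Base                  # assumed generic data type
from datamaestro.definitions import dataset
from datamaestro.download.single import FileDownloader


@dataset(url="https://example.com/corpus")         # placeholder URL
class MyCorpus(Base):
    RAW = FileDownloader(
        "raw.src",
        "https://example.com/corpus/raw.src",
        transient=True,                            # deleted once PROCESSED is COMPLETE
    )
    # my_processor (defined above) records RAW as a dependency,
    # so it only runs after RAW is COMPLETE
    PROCESSED = my_processor(RAW)
```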
+ ## Built-in resource types
+
+ | Class | Module | Factory alias | Base |
+ |---|---|---|---|
+ | `FileDownloader` | `download.single` | `filedownloader` | `FileResource` |
+ | `ConcatDownloader` | `download.single` | `concatdownload` | `FileResource` |
+ | `ZipDownloader` | `download.archive` | `zipdownloader` | `FolderResource` |
+ | `TarDownloader` | `download.archive` | `tardownloader` | `FolderResource` |
+ | `HFDownloader` | `download.huggingface` | `hf_download` | `ValueResource` |
+ | `custom_download` | `download.custom` | — | `Resource` |
+ | `links` | `download.links` | — | `Resource` |
+ | `linkfolder` | `download.links` | — | `Resource` |
+ | `linkfile` | `download.links` | — | `Resource` |
+ | `reference` | `download` | — | `Resource` |
+
+ ## Two-path download flow
+
+ The framework (in `AbstractDataset.download()`) orchestrates:
+
+ ```
+ 1. Topological sort resources by dependencies
+ 2. For each resource:
+    a. COMPLETE and not force → skip
+    b. PARTIAL and not can_recover → delete transient_path, set NONE
+    c. Call resource.download(force)
+       → Resource writes to transient_path
+    d. On success: move transient_path → path, set COMPLETE
+    e. On failure: if can_recover → set PARTIAL, else delete → NONE
+    f. Eager cleanup: for each transient dependency with all
+       dependents COMPLETE → cleanup
+ ```
+
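The same flow written out as a Python sketch. This is an illustration of the contract above, not the framework's implementation: the state assignments stand in for the `.state.json` bookkeeping, and `ResourceState` is the enum defined earlier:

```python
import shutil
from graphlib import TopologicalSorter


def _discard(path):
    # Remove a partial file or directory, ignoring missing paths
    if path.is_dir():
        shutil.rmtree(path, ignore_errors=True)
    elif path.exists():
        path.unlink()


def materialize(resources, force=False):
    # Step 1: dependencies come before their dependents
    order = TopologicalSorter({r: r.dependencies for r in resources}).static_order()
    for resource in order:
        if resource.state == ResourceState.COMPLETE and not force:        # 2a
            continue
        if resource.state == ResourceState.PARTIAL and not resource.can_recover:
            _discard(resource.transient_path)                             # 2b
            resource.state = ResourceState.NONE

        try:
            resource.download(force)                                      # 2c
            shutil.move(resource.transient_path, resource.path)           # 2d
            resource.state = ResourceState.COMPLETE
        except Exception:
            if resource.can_recover:
                resource.state = ResourceState.PARTIAL                    # 2e
            else:
                _discard(resource.transient_path)
                resource.state = ResourceState.NONE
            raise

        # 2f: eager cleanup of transient dependencies
        for dep in resource.dependencies:
            if dep.transient and all(
                d.state == ResourceState.COMPLETE for d in dep.dependents
            ):
                dep.cleanup()
```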
+ ## State metadata file
+
+ Location: `<dataset.datapath>/.downloads/.state.json`
+
+ ```json
+ {
+   "version": 1,
+   "resources": {
+     "TRAIN_IMAGES": {"state": "complete"},
+     "TRAIN_LABELS": {"state": "partial"}
+   }
+ }
+ ```
+
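For illustration, reading this file back could look like the sketch below. The directory layout follows the location above; the framework normally manages the file itself, so everything else here is an assumption:

```python
import json
from enum import Enum
from pathlib import Path


class ResourceState(str, Enum):  # copy of the enum shown earlier
    NONE = "none"
    PARTIAL = "partial"
    COMPLETE = "complete"


def read_states(datapath: Path) -> dict[str, ResourceState]:
    """Map resource names to their recorded state."""
    state_file = datapath / ".downloads" / ".state.json"
    if not state_file.exists():
        return {}
    payload = json.loads(state_file.read_text())
    return {
        name: ResourceState(entry.get("state", "none"))
        for name, entry in payload.get("resources", {}).items()
    }
```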
+ ---
+
+ ## Deprecated: decorator-based datasets
+
+ > **Deprecated.** The decorator-based API still works but emits deprecation
+ > warnings. Migrate to the class-based approach above.
+
+ ```python
+ # DEPRECATED — use class-based approach instead
+ @filedownloader("train_images.idx", "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")
+ @filedownloader("train_labels.idx", "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz")
+ @filedownloader("test_images.idx", "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")
+ @filedownloader("test_labels.idx", "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")
+ @dataset(
+     ImageClassification,
+     url="http://yann.lecun.com/exdb/mnist/",
+ )
+ def MNIST(train_images, train_labels, test_images, test_labels):
+     """The MNIST database"""
+     return {
+         "train": LabelledImages(
+             images=IDX(path=train_images),
+             labels=IDX(path=train_labels)
+         ),
+         "test": LabelledImages(
+             images=IDX(path=test_images),
+             labels=IDX(path=test_labels)
+         ),
+     }
+ ```
+
+ ### Deprecated names
+
+ | Deprecated | Replacement |
+ |---|---|
+ | `Download` (base class) | `Resource` |
+ | `hasfiles()` | `has_files()` |
+ | `Resource.definition` | `Resource.dataset` |
+ | `Resource.varname` | `Resource.name` |
+ | `@filedownloader(...)` (decorator) | `FileDownloader(...)` (class attr) |
+ | `SingleDownload` | `FileDownloader` |
+
+ ### Deprecated custom handler pattern
+
+ ```python
+ # DEPRECATED
+ class MyDownload(Download):
+     def __init__(self, varname, custom_param):
+         super().__init__(varname)
+         self.custom_param = custom_param
+
+     def prepare(self):
+         return self._download_and_process()
+
+     def download(self, force=False):
+         if force or not self._is_cached():
+             self._do_download()
+
+     def hasfiles(self) -> bool:
+         return True
+
+ def mydownloader(varname, custom_param):
+     def decorator(dataset):
+         download = MyDownload(varname, custom_param)
+         download.register(dataset)
+         return dataset
+     return decorator
+ ```
+
+ Modern equivalent:
+
+ ```python
+ class MyDownload(FileResource):
+     def __init__(self, filename, custom_param, **kw):
+         super().__init__(filename, **kw)
+         self.custom_param = custom_param
+
+     def _download(self, destination):
+         # Write output to destination (self.transient_path)
+         self._do_download(destination)
+
+ mydownloader = MyDownload.apply
+ ```
datamaestro/version.py ADDED
@@ -0,0 +1,4 @@
+ # This file is auto-generated by Hatchling. As such, do not:
+ # - modify
+ # - track in version control e.g. be sure to add to .gitignore
+ __version__ = VERSION = '1.7.0'
{datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA CHANGED
@@ -1,39 +1,37 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: datamaestro
- Version: 0.8.1
- Summary: "Dataset management command line and API"
- Home-page: https://github.com/experimaestro/datamaestro
- Author: Benjamin Piwowarski
- Author-email: benjamin@piwowarski.fr
- License: GPL-3
- Keywords: dataset manager
- Platform: any
+ Version: 1.7.0
+ Summary: Add your description here
+ Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
+ License-File: LICENSE
  Classifier: Development Status :: 4 - Beta
  Classifier: Intended Audience :: Science/Research
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
  Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python
- Classifier: Programming Language :: Python :: 3.7
- Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
- Requires-Python: >=3.5
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: click
- Requires-Dist: tqdm
- Requires-Dist: urllib3
- Requires-Dist: marshmallow
- Requires-Dist: cached-property
- Requires-Dist: requests
- Requires-Dist: bitmath
- Requires-Dist: experimaestro (>=0.9.5)
- Requires-Dist: mkdocs
- Requires-Dist: pymdown-extensions
- Requires-Dist: mkdocs-material
- Requires-Dist: docstring-parser
+ Requires-Python: >=3.10
+ Requires-Dist: bitmath>=1.3.3.1
+ Requires-Dist: cached-property>=2.0.1
+ Requires-Dist: click>=8.2.1
+ Requires-Dist: docstring-parser>=0.16
+ Requires-Dist: experimaestro>=1.8.9
+ Requires-Dist: marshmallow>=3.26.1
+ Requires-Dist: mkdocs-material>=9.6.15
+ Requires-Dist: mkdocs>=1.6.1
  Requires-Dist: numpy
- Provides-Extra: test
- Requires-Dist: tox ; extra == 'test'
+ Requires-Dist: pymdown-extensions>=10.16
+ Requires-Dist: requests>=2.32.4
+ Requires-Dist: tqdm>=4.67.1
+ Requires-Dist: urllib3>=2.5.0
+ Provides-Extra: docs
+ Requires-Dist: myst-parser>0.18; extra == 'docs'
+ Requires-Dist: sphinx-codeautolink>=0.15; extra == 'docs'
+ Requires-Dist: sphinx-rtd-theme==1.2.2; extra == 'docs'
+ Requires-Dist: sphinx-toolbox>=4.1.2; extra == 'docs'
+ Requires-Dist: sphinx>=4.2; extra == 'docs'
+ Description-Content-Type: text/markdown

  [![PyPI version](https://badge.fury.io/py/datamaestro.svg)](https://badge.fury.io/py/datamaestro) [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![DOI](https://zenodo.org/badge/4573876.svg)](https://zenodo.org/badge/latestdoi/4573876)

@@ -94,22 +92,10 @@ $ datamaestro search tag:image
  [image] com.lecun.mnist

  $ datamaestro prepare com.lecun.mnist
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-labels-idx1-ubyte
- INFO:root:Transforming file
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-labels-idx1-ubyte
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
- INFO:root:Transforming file
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
- Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz: 32.8kB [00:00, 92.1kB/s] INFO:root:Transforming file
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
- Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz: 9.92MB [00:00, 10.6MB/s]
- INFO:root:Transforming file
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
- ...JSON...
+ INFO:root:Materializing 4 resources
+ INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/train_images.idx
+ INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/test_images.idx
+ INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz into .../datamaestro/store/com/lecun/test_labels.idx
  ```

  The previous command also returns a JSON on standard output
@@ -147,68 +133,50 @@ Out[3]: (dtype('uint8'), (60000, 28, 28))

  ## Python definition of datasets

- Each dataset (or a set of related datasets) is described in Python using a mix of declarative
- and imperative statements. This allows to quickly define how to download dataset using the
- datamaestro declarative API; the imperative part is used when creating the JSON output,
- and is integrated with [experimaestro](http://experimaestro.github.io/experimaestro-python).
-
- Its syntax is described in the [documentation](http://experimaestro.github.io/datamaestro/).
-
-
- For MNIST, this corresponds to.
+ Datasets are defined as Python classes with resource attributes that describe how
+ to download and process data. The framework automatically builds a dependency graph
+ and handles downloads with two-path safety and state tracking.

  ```python
- from datamaestro_image.data import ImageClassification, LabelledImages, Base, IDXImage
- from datamaestro.download.single import filedownloader
- from datamaestro.definitions import argument, datatasks, datatags, dataset
+ from datamaestro_image.data import ImageClassification, LabelledImages
  from datamaestro.data.tensor import IDX
-
-
- @filedownloader("train_images.idx", "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")
- @filedownloader("train_labels.idx", "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz")
- @filedownloader("test_images.idx", "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")
- @filedownloader("test_labels.idx", "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")
- @dataset(
-     ImageClassification,
-     url="http://yann.lecun.com/exdb/mnist/",
- )
- def MNIST(train_images, train_labels, test_images, test_labels):
-     """The MNIST database
-
-     The MNIST database of handwritten digits, available from this page, has a
-     training set of 60,000 examples, and a test set of 10,000 examples. It is a
-     subset of a larger set available from NIST. The digits have been
-     size-normalized and centered in a fixed-size image.
-     """
-     return {
-         "train": LabelledImages(
-             images=IDXImage(path=train_images),
-             labels=IDX(path=train_labels)
-         ),
-         "test": LabelledImages(
-             images=IDXImage(path=test_images),
-             labels=IDX(path=test_labels)
-         ),
-     }
+ from datamaestro.download.single import FileDownloader
+ from datamaestro.definitions import AbstractDataset, dataset
+
+
+ @dataset(url="http://yann.lecun.com/exdb/mnist/")
+ class MNIST(ImageClassification):
+     """The MNIST database of handwritten digits."""
+
+     TRAIN_IMAGES = FileDownloader(
+         "train_images.idx",
+         "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
+     )
+     TRAIN_LABELS = FileDownloader(
+         "train_labels.idx",
+         "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz",
+     )
+     TEST_IMAGES = FileDownloader(
+         "test_images.idx",
+         "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
+     )
+     TEST_LABELS = FileDownloader(
+         "test_labels.idx",
+         "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz",
+     )
+
+     @classmethod
+     def __create_dataset__(cls, dataset: AbstractDataset):
+         return cls.C(
+             train=LabelledImages(
+                 images=IDX(path=cls.TRAIN_IMAGES.path),
+                 labels=IDX(path=cls.TRAIN_LABELS.path),
+             ),
+             test=LabelledImages(
+                 images=IDX(path=cls.TEST_IMAGES.path),
+                 labels=IDX(path=cls.TEST_LABELS.path),
+             ),
+         )
  ```

- # 0.8.0
-
- - Integration with other repositories: abstracting away the notion of dataset
- - Repository prefix
- - Set sub-datasets IDs automatically
-
- # 0.7.3
-
- - Updates for new experimaestro (0.8.5)
- - Search types with "type:..."
-
- # 0.6.17
-
- - Allow remote access through rpyc
-
- # 0.6.9
-
- `version` command
-
-
+ Its syntax is described in the [documentation](https://datamaestro.readthedocs.io).
datamaestro-1.7.0.dist-info/RECORD ADDED
@@ -0,0 +1,49 @@
+ datamaestro/__init__.py,sha256=oh9M4VODuvTc9EFHKirtDxpCJkLUANzpzBOIwzHc_mw,246
+ datamaestro/__main__.py,sha256=22v54rQoO2umL1frFO2FOQuuRljr-Jw-ER-OATTpVxw,9218
+ datamaestro/context.py,sha256=AL2BTi6dLA8rDGBE0PFyfV9ua29JHvBgx6_w6hDj9Dg,13977
+ datamaestro/definitions.py,sha256=kIwyrXZWg1tZw3G1PuUyGJ13ZPunocmu0wuxydVesbQ,27167
+ datamaestro/record.py,sha256=e5fjRV3ni7ZxXwYH45bVDB_jpD-n9quvh4ie4uI-MM4,7140
+ datamaestro/registry.py,sha256=M7QJkcWJP_cxAoqIioLQ01ou2Zg9RqGQvW0XGVspYFE,1421
+ datamaestro/search.py,sha256=bRT-91-2VJJ2JSfNaS1mzaVfqq_HMVBVs-RBj0w-ypM,2906
+ datamaestro/settings.py,sha256=NuUbe_C31GDlzdio2ryz7tPzuo4hsmmdCM5Cyuhqbzs,1294
+ datamaestro/sphinx.py,sha256=WWXB63gd0ZgEwFr_YwO2Hmuly5OoiFlu9mDvJSHFYuY,6966
+ datamaestro/utils.py,sha256=JUrvtVYnjNKRo0_ZypmXSQ9R4uOyImDjW1GZ14MYzKM,6547
+ datamaestro/v2.md,sha256=pLCxQUdfVkd4CM9Ie0ZxCnxUntqoA7k_0m7x1etcr7Y,9801
+ datamaestro/version.py,sha256=aCGW8aYYQ-ZQNfHZo9TrCX1MKqWbHUjj3X57h-DmRAs,171
+ datamaestro/annotations/__init__.py,sha256=jLprrxSBa5QIqc--vqycEcxU4CR9WjVNRaqR5lH0EuE,39
+ datamaestro/annotations/agreement.py,sha256=xEH0ddZxdJ_oG_150PoOa-WjY_OaeQja3FzMzY5IB6k,955
+ datamaestro/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datamaestro/commands/mainstyle.css,sha256=EAWq6hKWjLYZ-gUrGV-z3L8LtkubD7mLoYdSIC7kLOo,465
+ datamaestro/commands/site.py,sha256=mVU5JKhwx9LTMf5FddcWgGh41qXtANJqB8qWKSKa-70,14432
+ datamaestro/data/__init__.py,sha256=s81ZxT8MQrBGkcu45xr4NaInIsMeunHOLnkLrJE47So,1496
+ datamaestro/data/csv.py,sha256=jcXFVBOEQoSi3YL60bqtwjCf2YXHboaMpUmiXZpzuPM,2506
+ datamaestro/data/huggingface.py,sha256=rCMiMqVgNI9zRAgm9PYnbwb7musYryBoIP3HuJmH4sg,691
+ datamaestro/data/ml.py,sha256=4PlH6FJFZwtfTEStkOjOucV8t8yY8LFaPsnDBvEqAPs,710
+ datamaestro/data/tensor.py,sha256=in36UQz4cdUEVmCS62pInu9RNekohRON667Z_JqNdhk,2254
+ datamaestro/download/__init__.py,sha256=qbmSLtzo4zTLuc1cAVSAKDdbIJROsJa6BMP6ksVJWvU,19375
+ datamaestro/download/archive.py,sha256=fz1ElRggB9gYb6F7fek0Tkw9eAj6Glotc_Mit9OcCZU,6986
+ datamaestro/download/custom.py,sha256=dxyvwbweVuz0xveExtvta8xycoqTjpDZz_P98ucintA,1287
+ datamaestro/download/huggingface.py,sha256=inZbB5EdVvczW9CfM59SqL1Nl-H4y3bWxv1SWjrYeOs,1996
+ datamaestro/download/links.py,sha256=NCCpBFAIYznaskJV5NSFX3NoorqHDKyAiRCWSnEnb9E,4364
+ datamaestro/download/manual.py,sha256=-T2QWxKAiN3ZbSujjQUVeWDEDFonw9VnlzCfBIHcLao,190
+ datamaestro/download/multiple.py,sha256=iX3gtgQT1eskHok0pAecU_mgd56of1Sadz7_o95ItaA,2736
+ datamaestro/download/single.py,sha256=nFWBH1LeGO-WmMUBbdV6bzkd6Lfe74uhfcWeLZkCC3M,5737
+ datamaestro/download/sync.py,sha256=QlpoOkamiX9yE-4P8-ppCZ_wgA2P4oBSOQCX98gWnCc,784
+ datamaestro/download/todo.py,sha256=d-mfi_gJlrOvAoa7dXN2ecXYY-cgB-NHzU1J-dzkEkI,444
+ datamaestro/download/wayback.py,sha256=wpbrTtE321AwsO8Poj1a4qwEKy1kE0wEbxWgMEf5nLo,5489
+ datamaestro/stream/__init__.py,sha256=Angu_Yg9rNKXb8s4at-DXYcnE-OTgSMLfUEfrL6APD8,896
+ datamaestro/stream/compress.py,sha256=0ViFGpJc6pdvZGUNERE-3XV8jAOTSvhJurb2t0NW2eU,260
+ datamaestro/stream/lines.py,sha256=DhptjIqhhAJ1tu3e-uoOepHHNALSXS8qz8ASUAyaSkM,2074
+ datamaestro/templates/dataset.py,sha256=5065rTMAIl4gtzQ96GFiV1_46tY08miIx3WspTP8yGA,346
+ datamaestro/test/__init__.py,sha256=9xXqLvUgiIn74AY6k8qyYX7rq6hWz7dOJFBrUgwuX88,61
+ datamaestro/test/checks.py,sha256=1eTkz4YJhAPOcnQSsz4vPnvzwwfrEnpn6H_s1ADISpo,1704
+ datamaestro/test/conftest.py,sha256=z8rF0OIKVuCgIYJ-4fQxQL8KhgIfg_4kfkIZNETfNJ0,793
+ datamaestro/test/test_annotations.py,sha256=XUjDWb3FJimSD91wcItJ0lLwTBmvN4wVu_EgTKSvV2c,278
+ datamaestro/test/test_download_handlers.py,sha256=-Gofr89zqIyeI8C4rZqfYR3JfiZVImdcSz9s6q361zQ,641
+ datamaestro/test/test_record.py,sha256=hNZ3uo2i5FZ0VsOHRwvLO1Z6Zce92PdipAF65UptPB8,1156
+ datamaestro/test/test_resource.py,sha256=meUCDaoPg5XT3gWIToqXvaofE1vrZq_qG7gZtOHIOfQ,41044
+ datamaestro-1.7.0.dist-info/METADATA,sha256=tutvO9o9gHY7DLbF7zliiwcz2ajn7jnufkmotlA-cDQ,7433
+ datamaestro-1.7.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ datamaestro-1.7.0.dist-info/entry_points.txt,sha256=8qMhwSRvFG2iBqtJYVD22Zd4s4c3YkODtcp0Ajw1knw,133
+ datamaestro-1.7.0.dist-info/licenses/LICENSE,sha256=WJ7YI-moTFb-uVrFjnzzhGJrnL9P2iqQe8NuED3hutI,35141
+ datamaestro-1.7.0.dist-info/RECORD,,
{datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL CHANGED
@@ -1,5 +1,4 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.36.2)
+ Generator: hatchling 1.28.0
  Root-Is-Purelib: true
  Tag: py3-none-any
-
{datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt CHANGED
@@ -3,4 +3,3 @@ datamaestro = datamaestro.__main__:main

  [mkdocs.plugins]
  datamaestro = datamaestro.commands.site:DatasetGenerator
-