datamaestro 1.3.2__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamaestro/__init__.py CHANGED
@@ -7,8 +7,6 @@ from .context import (
7
7
  prepare_dataset,
8
8
  )
9
9
 
10
- from .datasets.yaml_repository import YAMLRepository
11
-
12
10
  from pkg_resources import get_distribution, DistributionNotFound
13
11
  from .definitions import dataset, metadata
14
12
  from .data import Base
datamaestro/__main__.py CHANGED
@@ -319,13 +319,17 @@ def search(config: Config, searchterms):
319
319
 
320
320
  logging.debug("Search: %s", condition)
321
321
  for dataset in config.context.datasets():
322
- if condition.match(dataset):
323
- cfg = dataset.configtype
324
- print(
325
- "[%s] %s (%s)"
326
- % (
327
- dataset.repository.id,
328
- dataset.id,
329
- cfg.__name__ if cfg is not None else "?",
322
+ try:
323
+ if condition.match(dataset):
324
+ cfg = dataset.configtype
325
+ print(
326
+ "[%s] %s (%s)"
327
+ % (
328
+ dataset.repository.id,
329
+ dataset.id,
330
+ cfg.__name__ if cfg is not None else "?",
331
+ )
330
332
  )
331
- )
333
+ except Exception:
334
+ logging.error("Error while matching with dataset %s", dataset)
335
+ raise
datamaestro/context.py CHANGED
@@ -88,11 +88,6 @@ class Context:
88
88
 
89
89
  return ContextManager()
90
90
 
91
- @property
92
- def storepath(self):
93
- """Replaces the data path"""
94
- return self._path.joinpath("store")
95
-
96
91
  @property
97
92
  def datapath(self):
98
93
  return self._path.joinpath("data")
@@ -7,7 +7,6 @@ import inspect
7
7
  from pathlib import Path
8
8
  from itertools import chain
9
9
  from abc import ABC, abstractmethod
10
- from contextlib import contextmanager
11
10
  import traceback
12
11
  from typing import (
13
12
  Dict,
@@ -19,7 +18,6 @@ from typing import (
19
18
  Callable,
20
19
  TYPE_CHECKING,
21
20
  Union,
22
- ClassVar,
23
21
  _GenericAlias,
24
22
  )
25
23
  from experimaestro import ( # noqa: F401 (re-exports)
@@ -217,8 +215,8 @@ class AbstractDataset(AbstractData):
217
215
  def download(self, force=False):
218
216
  """Download all the necessary resources"""
219
217
  success = True
220
- logging.info("Materializing %d resources", len(self.ordered_resources))
221
218
  self.prepare()
219
+ logging.info("Materializing %d resources", len(self.ordered_resources))
222
220
  for resource in self.ordered_resources:
223
221
  try:
224
222
  resource.download(force)
@@ -274,9 +272,6 @@ class DatasetWrapper(AbstractDataset):
274
272
  annotations (otherwise, derive from `AbstractDataset`).
275
273
  """
276
274
 
277
- BUILDING: ClassVar[list["DatasetWrapper"]] = []
278
- """Currently built dataset"""
279
-
280
275
  def __init__(self, annotation, t: type):
281
276
  self.config = None
282
277
  self.repository: Optional[Repository] = None
@@ -287,6 +282,11 @@ class DatasetWrapper(AbstractDataset):
287
282
  repository, components = DataDefinition.repository_relpath(t)
288
283
  super().__init__(repository)
289
284
 
285
+ self.module_name = None
286
+ if repository is None:
287
+ # Try to find the module name
288
+ self.module_name, _ = t.__module__.split(".", 1)
289
+
290
290
  # Set some variables
291
291
  self.url = annotation.url
292
292
  self.doi = annotation.doi
@@ -361,12 +361,6 @@ class DatasetWrapper(AbstractDataset):
361
361
  self._prepare()
362
362
  return super().download(force=force)
363
363
 
364
- @contextmanager
365
- def building(self):
366
- DatasetWrapper.BUILDING.append(self)
367
- yield self
368
- DatasetWrapper.BUILDING.pop()
369
-
370
364
  def _prepare(self) -> "Base":
371
365
  if self.config is not None:
372
366
  return self.config
@@ -378,8 +372,7 @@ class DatasetWrapper(AbstractDataset):
378
372
  # Construct the object
379
373
  resources = {key: value.prepare() for key, value in self.resources.items()}
380
374
 
381
- with self.building():
382
- result = self.t(**resources)
375
+ result = self.t(**resources)
383
376
 
384
377
  # Download resources
385
378
  logging.debug("Building with data type %s and dataset %s", self.base, self.t)
@@ -425,18 +418,11 @@ class DatasetWrapper(AbstractDataset):
425
418
  @property
426
419
  def datapath(self):
427
420
  """Returns the destination path for downloads"""
428
- from datamaestro import Context # noqa: F811
429
-
430
- path = Context.instance().storepath / self._path
431
-
432
- if (self.repository is not None) and (not path.exists()):
433
- old_path: Path = self.repository.datapath / self._path
434
- if old_path.exists():
435
- logging.info(
436
- "Moving from old path [%s] to new path [%s]", old_path, path
437
- )
438
- path.parent.mkdir(exist_ok=True, parents=True)
439
- old_path.rename(path)
421
+ if self.repository is not None:
422
+ return self.repository.datapath / self._path
423
+
424
+ # No repository, use __custom__/[MODULE NAME]
425
+ path = self.context.datapath / "__custom__" / self.module_name / self._path
440
426
 
441
427
  return path
442
428
 
@@ -31,7 +31,7 @@ class Resource(DatasetAnnotation, ABC):
31
31
  self.varname = varname
32
32
  # Ensures that the object is initialized
33
33
  self._post = False
34
- self.definition = None
34
+ self.definition: AbstractDataset = None
35
35
 
36
36
  def annotate(self, dataset: AbstractDataset):
37
37
  assert self.definition is None
@@ -45,9 +45,9 @@ class Resource(DatasetAnnotation, ABC):
45
45
 
46
46
  def contextualize(self):
47
47
  """When using an annotation inline, uses the current dataset wrapper object"""
48
- from datamaestro.definitions import DatasetWrapper
48
+ from datamaestro.definitions import AbstractDataset
49
49
 
50
- wrapper = DatasetWrapper.BUILDING[-1]
50
+ wrapper = AbstractDataset.processing()
51
51
  self.annotate(wrapper)
52
52
 
53
53
  @property
@@ -1,7 +1,6 @@
1
1
  from typing import Protocol
2
2
  from pathlib import Path
3
3
  from datamaestro import Context
4
- from datamaestro.definitions import DatasetWrapper
5
4
  from datamaestro.download import Resource
6
5
 
7
6
 
@@ -10,20 +9,13 @@ class Downloader(Protocol):
10
9
  pass
11
10
 
12
11
 
13
- class CustomResource(Resource):
14
- def __init__(self, ds_wrapper: DatasetWrapper, downloader: Downloader):
15
- self.ds_wrapper = ds_wrapper
12
+ class custom_download(Resource):
13
+ def __init__(self, varname: str, downloader: Downloader):
14
+ super().__init__(varname)
16
15
  self.downloader = downloader
17
16
 
18
17
  def prepare(self):
19
- pass
18
+ return self.definition.datapath
20
19
 
21
20
  def download(self, force=False):
22
- self.downloader(self.context, self.ds_wrapper.datapath, force=force)
23
-
24
-
25
- def custom_download(downloader: Downloader) -> Path:
26
- ds_wrapper = DatasetWrapper.BUILDING[-1]
27
- ds_wrapper.ordered_resources.append(CustomResource(ds_wrapper, downloader))
28
-
29
- return ds_wrapper.datapath
21
+ self.downloader(self.context, self.definition.datapath, force=force)
@@ -5,7 +5,7 @@ from datamaestro.download import Download
5
5
 
6
6
 
7
7
  class hf_download(Download):
8
- """Use Hugging Face to donwload a file"""
8
+ """Use Hugging Face to download a file"""
9
9
 
10
10
  def __init__(
11
11
  self,
@@ -9,7 +9,7 @@ import os
9
9
  import urllib3
10
10
  from pathlib import Path
11
11
  import re
12
- from datamaestro.utils import copyfileobjs, FileChecker
12
+ from datamaestro.utils import copyfileobjs
13
13
  from datamaestro.stream import Transform
14
14
  from datamaestro.download import Download
15
15
 
@@ -35,7 +35,7 @@ class SingleDownload(Download):
35
35
  return self.path
36
36
 
37
37
  def download(self, force=False):
38
- if not self.path.is_file():
38
+ if not self.path.is_file() and not force:
39
39
  self._download(self.path)
40
40
 
41
41
 
@@ -96,20 +96,6 @@ class filedownloader(SingleDownload):
96
96
  logging.info("Created file %s" % destination)
97
97
 
98
98
 
99
- def file_from_url(
100
- filename: str,
101
- url: str,
102
- *,
103
- size: Optional[int] = None,
104
- transforms: Optional[Transform] = None,
105
- checker: Optional[FileChecker] = None,
106
- ) -> Path:
107
- """Defines a file that should be downloaded from"""
108
- downloader = filedownloader(filename, url, size, transforms, checker)
109
- downloader.contextualize()
110
- return downloader.path
111
-
112
-
113
99
  class concatdownload(SingleDownload):
114
100
  """Concatenate all files in an archive"""
115
101
 
datamaestro/version.py CHANGED
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '1.3.2'
21
- __version_tuple__ = version_tuple = (1, 3, 2)
20
+ __version__ = version = '1.4.0'
21
+ __version_tuple__ = version_tuple = (1, 4, 0)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro
3
- Version: 1.3.2
3
+ Version: 1.4.0
4
4
  Summary: "Dataset management command line and API"
5
5
  Home-page: https://github.com/experimaestro/datamaestro
6
6
  Author: Benjamin Piwowarski
@@ -97,22 +97,10 @@ $ datamaestro search tag:image
97
97
  [image] com.lecun.mnist
98
98
 
99
99
  $ datamaestro prepare com.lecun.mnist
100
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-labels-idx1-ubyte
101
- INFO:root:Transforming file
102
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-labels-idx1-ubyte
103
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
104
- INFO:root:Transforming file
105
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
106
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
107
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
108
- Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz: 32.8kB [00:00, 92.1kB/s] INFO:root:Transforming file
109
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
110
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
111
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
112
- Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz: 9.92MB [00:00, 10.6MB/s]
113
- INFO:root:Transforming file
114
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
115
- ...JSON...
100
+ INFO:root:Materializing 4 resources
101
+ INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/train_images.idx
102
+ INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/test_images.idx
103
+ INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz into .../datamaestro/store/com/lecun/test_labels.idx
116
104
  ```
117
105
 
118
106
  The previous command also prints a JSON document on standard output
@@ -158,13 +146,12 @@ and is integrated with [experimaestro](http://experimaestro.github.io/experimaes
158
146
  Its syntax is described in the [documentation](https://datamaestro.readthedocs.io).
159
147
 
160
148
 
161
- For MNIST, this corresponds to.
149
+ For instance, the MNIST dataset can be described by the following definition:
162
150
 
163
151
  ```python
164
- from datamaestro_image.data import ImageClassification, LabelledImages, Base, IDXImage
165
- from datamaestro.download.single import filedownloader
166
- from datamaestro.definitions import argument, datatasks, datatags, dataset
167
- from datamaestro.data.tensor import IDX
152
+ from datamaestro import dataset
153
+ from datamaestro.download.single import download_file
154
+ from datamaestro_image.data import ImageClassification, LabelledImages, IDXImage
168
155
 
169
156
 
170
157
  @filedownloader("train_images.idx", "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")
@@ -175,26 +162,37 @@ from datamaestro.data.tensor import IDX
175
162
  ImageClassification,
176
163
  url="http://yann.lecun.com/exdb/mnist/",
177
164
  )
178
- def MNIST(train_images, train_labels, test_images, test_labels):
179
- """The MNIST database
180
-
181
- The MNIST database of handwritten digits, available from this page, has a
182
- training set of 60,000 examples, and a test set of 10,000 examples. It is a
183
- subset of a larger set available from NIST. The digits have been
184
- size-normalized and centered in a fixed-size image.
185
- """
186
- return {
187
- "train": LabelledImages(
188
- images=IDXImage(path=train_images),
189
- labels=IDX(path=train_labels)
190
- ),
191
- "test": LabelledImages(
192
- images=IDXImage(path=test_images),
193
- labels=IDX(path=test_labels)
194
- ),
195
- }
165
+
166
+ return ImageClassification(
167
+ train=LabelledImages(
168
+ images=IDXImage(path=train_images), labels=IDXImage(path=train_labels)
169
+ ),
170
+ test=LabelledImages(
171
+ images=IDXImage(path=test_images), labels=IDXImage(path=test_labels)
172
+ ),
173
+ )
174
+ ```
175
+
176
+ When building dataset modules, some extra documentation can be provided:
177
+
178
+ ```yaml
179
+ ids: [com.lecun.mnist]
180
+ entry_point: "datamaestro_image.config.com.lecun:mnist"
181
+ title: The MNIST database
182
+ url: http://yann.lecun.com/exdb/mnist/
183
+ groups: [image-classification]
184
+ description: |
185
+ The MNIST database of handwritten digits, available from this page,
186
+ has a training set of 60,000 examples, and a test set of 10,000
187
+ examples. It is a subset of a larger set available from NIST. The
188
+ digits have been size-normalized and centered in a fixed-size image.
196
189
  ```
197
190
 
191
+ This makes it possible to:
192
+
193
+ 1. Document the dataset
194
+ 2. Use the command line interface to manipulate it (download resources, etc.)
195
+
198
196
  # 0.8.0
199
197
 
200
198
  - Integration with other repositories: abstracting away the notion of dataset
@@ -1,14 +1,14 @@
1
- datamaestro/__init__.py,sha256=gnbxrPFzIuG4oR2Qrw9UYS0SNVsf4yCtqNvzSjstdak,376
2
- datamaestro/__main__.py,sha256=tJTf1sTWKRIatvBcHlWDIZRZodAZ2B2zkD01pD89MYk,9024
3
- datamaestro/context.py,sha256=S7sQ6RQVLjtoY5iyAikfyvfbqoaoDzcHt4-js8t6mMg,13653
4
- datamaestro/definitions.py,sha256=HEnwB32Reb4ouLOjboEOe_j88keBZPQ0SU6OrO_ohLU,18764
1
+ datamaestro/__init__.py,sha256=LR8nx7H3Fo97O0gJXV2PxQezsmSTDLAg_nQEXB5QAjc,322
2
+ datamaestro/__main__.py,sha256=2p36ZcJcZAL9NZBUkMaYRUhKyqhheVPXMGw6K1KNwhk,9196
3
+ datamaestro/context.py,sha256=KsXYNTt4xX4zEVrnd2hciP7PVCh1StRzjU1Ih6VeCtU,13532
4
+ datamaestro/definitions.py,sha256=HsldRC6QJMyxKp4yKa5_c0zUn5MlYW7S-CrWHmij7t0,18337
5
5
  datamaestro/record.py,sha256=m3WGsPcZ1LouQXNJOBUK3QusAIRiuy6T_oqhq09-Ckg,5504
6
6
  datamaestro/registry.py,sha256=M7QJkcWJP_cxAoqIioLQ01ou2Zg9RqGQvW0XGVspYFE,1421
7
7
  datamaestro/search.py,sha256=bRT-91-2VJJ2JSfNaS1mzaVfqq_HMVBVs-RBj0w-ypM,2906
8
8
  datamaestro/settings.py,sha256=HYSElTUYZ6DZocBb9o3ifm6WW9knRO64XJUwxGIpvwQ,1304
9
9
  datamaestro/sphinx.py,sha256=bp7x_2BFoTSwTqcVZDM8R8cWa7G2pz0Zb8GS054lLYM,6996
10
10
  datamaestro/utils.py,sha256=9m-AVVww6InAZfGFiGy6XJzfExpYNqH1fhWQEezjafA,6536
11
- datamaestro/version.py,sha256=2MIKMHG_bp3KmQVZwa0rSvoTHIRfxwkSxFOUhMK4eQc,511
11
+ datamaestro/version.py,sha256=rcWNYDlh913lujUvTfOu9iOIPdrTXg64R9wl7ENLjFU,511
12
12
  datamaestro/annotations/__init__.py,sha256=jLprrxSBa5QIqc--vqycEcxU4CR9WjVNRaqR5lH0EuE,39
13
13
  datamaestro/annotations/agreement.py,sha256=xEH0ddZxdJ_oG_150PoOa-WjY_OaeQja3FzMzY5IB6k,955
14
14
  datamaestro/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -19,16 +19,14 @@ datamaestro/data/csv.py,sha256=jcXFVBOEQoSi3YL60bqtwjCf2YXHboaMpUmiXZpzuPM,2506
19
19
  datamaestro/data/huggingface.py,sha256=rCMiMqVgNI9zRAgm9PYnbwb7musYryBoIP3HuJmH4sg,691
20
20
  datamaestro/data/ml.py,sha256=7Rv4Tb9g17HDj8mOBJpIDjgolGQAd5Wrb0mHlnm-bPE,709
21
21
  datamaestro/data/tensor.py,sha256=in36UQz4cdUEVmCS62pInu9RNekohRON667Z_JqNdhk,2254
22
- datamaestro/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
- datamaestro/datasets/yaml_repository.py,sha256=X5JjA2dQ5xfdYSUgL2EbZhrOYn-FPiBOAK97kw4kwqo,2533
24
- datamaestro/download/__init__.py,sha256=XcRw9acAq1IwhLQZpj2HpMNEaMesA5BbllJpbRCkOwA,2846
22
+ datamaestro/download/__init__.py,sha256=EBoAcw2wErS8ymEYs7LJKez4UO-Gwhe4YgqRAysOxRY,2865
25
23
  datamaestro/download/archive.py,sha256=G-2gzepknqT7Us3naMGAApGVGJMeHQIxM-tSpaa9ark,5608
26
- datamaestro/download/custom.py,sha256=2-gFoOgQ8J93HjH9sc7u6wjVYm7DmSytP1ty2O6-d8k,839
27
- datamaestro/download/huggingface.py,sha256=LkzmZo2Z0yccqAfj7di7jDNGFrMKN9m8IM8SfexOomY,1125
24
+ datamaestro/download/custom.py,sha256=DUjDVAWuHC6sV_apMQb44Yjd6HUXkHY6Ob52FQY3t-M,587
25
+ datamaestro/download/huggingface.py,sha256=b4Y437ATYrugdkvqZrPQmqiXXSrmYyqEKDVI0wnIGDE,1125
28
26
  datamaestro/download/links.py,sha256=GFnq_AzI_uen7JBuGWD9qveeC9QFBWDrSnj7pOcwWwM,3352
29
27
  datamaestro/download/manual.py,sha256=-T2QWxKAiN3ZbSujjQUVeWDEDFonw9VnlzCfBIHcLao,190
30
28
  datamaestro/download/multiple.py,sha256=Mrr0ObHM5cE1CPSHE9PKIrox3qZVgxwRyxLzNXp0LqM,2159
31
- datamaestro/download/single.py,sha256=bMDLldvODp2ZXyxXeKLT4qbL-v4igA6A7HVjIt2Cf8c,4526
29
+ datamaestro/download/single.py,sha256=fCIfZdR14YN09MQTgcxL21PWu5CjELfIClgWjFpR5mg,4148
32
30
  datamaestro/download/sync.py,sha256=Z_LsXj4kbZWIYKTVJZEhfdpYiv6wXOOIyw8LahmEcqs,836
33
31
  datamaestro/download/todo.py,sha256=y3YnmWC_i-u23ce-vreIwIXZcoO-uA0HXErgJPThnco,256
34
32
  datamaestro/download/wayback.py,sha256=B9X1P9jElvd_qnUs9aX0TAO-NrNyvuHLYDAcpNq354w,5430
@@ -42,9 +40,9 @@ datamaestro/test/conftest.py,sha256=it4S5Qq1CA_U8qM0pr4m7v-1dhLj5Y49WjVg5Ee3mpM,
42
40
  datamaestro/test/test_annotations.py,sha256=XUjDWb3FJimSD91wcItJ0lLwTBmvN4wVu_EgTKSvV2c,278
43
41
  datamaestro/test/test_download_handlers.py,sha256=-Gofr89zqIyeI8C4rZqfYR3JfiZVImdcSz9s6q361zQ,641
44
42
  datamaestro/test/test_record.py,sha256=hNZ3uo2i5FZ0VsOHRwvLO1Z6Zce92PdipAF65UptPB8,1156
45
- datamaestro-1.3.2.dist-info/licenses/LICENSE,sha256=WJ7YI-moTFb-uVrFjnzzhGJrnL9P2iqQe8NuED3hutI,35141
46
- datamaestro-1.3.2.dist-info/METADATA,sha256=1RJCcSxd3VdZ1VOMrVlQEA_cQuCBbFGC-fB1NjOWVPY,8990
47
- datamaestro-1.3.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
48
- datamaestro-1.3.2.dist-info/entry_points.txt,sha256=8qMhwSRvFG2iBqtJYVD22Zd4s4c3YkODtcp0Ajw1knw,133
49
- datamaestro-1.3.2.dist-info/top_level.txt,sha256=XSznaMNAA8jELV7-TOqaAgDsjLzUf9G9MxL7C4helT0,12
50
- datamaestro-1.3.2.dist-info/RECORD,,
43
+ datamaestro-1.4.0.dist-info/licenses/LICENSE,sha256=WJ7YI-moTFb-uVrFjnzzhGJrnL9P2iqQe8NuED3hutI,35141
44
+ datamaestro-1.4.0.dist-info/METADATA,sha256=0q8MT6YvMD4WD3j79fpkGGFvHbLey-WgZnAV5VoRzdg,8189
45
+ datamaestro-1.4.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
46
+ datamaestro-1.4.0.dist-info/entry_points.txt,sha256=8qMhwSRvFG2iBqtJYVD22Zd4s4c3YkODtcp0Ajw1knw,133
47
+ datamaestro-1.4.0.dist-info/top_level.txt,sha256=XSznaMNAA8jELV7-TOqaAgDsjLzUf9G9MxL7C4helT0,12
48
+ datamaestro-1.4.0.dist-info/RECORD,,
File without changes
@@ -1,103 +0,0 @@
1
- import re
2
- from typing import Iterator, Optional
3
- from functools import cached_property
4
- from attrs import field
5
- import importlib
6
- from omegaconf import OmegaConf
7
- from functools import partial
8
- from attrs import define
9
- from datamaestro import BaseRepository
10
- from datamaestro.definitions import AbstractDataset, DatasetWrapper
11
- from datamaestro.data import Base
12
-
13
-
14
- re_spec = re.compile(r"""^(\w\.)+:(\w+)""")
15
-
16
-
17
- @define
18
- class RepositoryDataset:
19
- ids: list[str]
20
- """ID(s) of this dataset"""
21
-
22
- entry_point: str = field(validator=re_spec.match)
23
- """The entry point"""
24
-
25
- title: str
26
- """The full name of the dataset"""
27
-
28
- description: str
29
- """Description of the dataset"""
30
-
31
- url: Optional[str]
32
- """The URL"""
33
-
34
- groups: Optional[list[str]]
35
- """Groups to which this repository belongs"""
36
-
37
-
38
- @define
39
- class RepositoryAuthors:
40
- name: str
41
- email: str
42
-
43
-
44
- @define
45
- class RepositoryGroup:
46
- name: str
47
- tasks: list[str]
48
- tags: list[str]
49
-
50
-
51
- @define
52
- class RepositoryConfiguration:
53
- namespace: str
54
- authors: list[RepositoryAuthors]
55
- description: str
56
- groups: dict[str, RepositoryGroup]
57
- datasets: list[RepositoryDataset]
58
-
59
-
60
- class YAMLDataset(AbstractDataset):
61
- def __init__(self, repository: "YAMLRepository", information: RepositoryDataset):
62
- super().__init__(repository)
63
- self.information = information
64
- self.id = self.information.ids[0]
65
- self.aliases = set(self.information.ids)
66
-
67
- @cached_property
68
- def wrapper(self) -> DatasetWrapper:
69
- module, func_name = self.information.entry_point.split(":")
70
- wrapper = getattr(importlib.import_module(module), func_name)
71
- return wrapper
72
-
73
- def _prepare(self) -> "Base":
74
- return self.wrapper()
75
-
76
- def download(self, **kwargs):
77
- return self.wrapper.download(**kwargs)
78
-
79
-
80
- class YAMLRepository(BaseRepository):
81
- """YAML-based repository"""
82
-
83
- @property
84
- def id(self):
85
- return self.configuration.namespace
86
-
87
- @property
88
- def name(self):
89
- return self.configuration.namespace
90
-
91
- @cached_property
92
- def configuration(self):
93
- schema = OmegaConf.structured(RepositoryConfiguration)
94
- with importlib.resources.path(
95
- self.__class__.__module__, "datamaestro.yaml"
96
- ) as fp:
97
- conf = OmegaConf.load(fp)
98
-
99
- conf: RepositoryConfiguration = OmegaConf.merge(schema, conf)
100
- return conf
101
-
102
- def __iter__(self) -> Iterator["AbstractDataset"]:
103
- return map(partial(YAMLDataset, self), self.configuration.datasets)