datamaestro-1.2.0.tar.gz → datamaestro-1.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. {datamaestro-1.2.0 → datamaestro-1.3.0}/PKG-INFO +4 -2
  2. {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/api/data.md +0 -5
  3. {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/api/download.rst +11 -0
  4. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/__init__.py +5 -1
  5. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/context.py +71 -67
  6. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/data/__init__.py +18 -11
  7. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/data/csv.py +8 -6
  8. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/data/ml.py +3 -3
  9. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/data/tensor.py +30 -23
  10. datamaestro-1.3.0/src/datamaestro/datasets/__init__.py +0 -0
  11. datamaestro-1.3.0/src/datamaestro/datasets/yaml_repository.py +103 -0
  12. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/definitions.py +135 -53
  13. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/__init__.py +8 -1
  14. datamaestro-1.3.0/src/datamaestro/download/custom.py +29 -0
  15. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/single.py +15 -1
  16. datamaestro-1.3.0/src/datamaestro/download/wayback.py +163 -0
  17. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/search.py +1 -1
  18. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/test/test_annotations.py +2 -1
  19. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/test/test_download_handlers.py +3 -0
  20. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/utils.py +2 -0
  21. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/version.py +9 -4
  22. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro.egg-info/PKG-INFO +4 -2
  23. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro.egg-info/SOURCES.txt +4 -0
  24. {datamaestro-1.2.0 → datamaestro-1.3.0}/.coverage +0 -0
  25. {datamaestro-1.2.0 → datamaestro-1.3.0}/.github/workflows/pytest.yml +0 -0
  26. {datamaestro-1.2.0 → datamaestro-1.3.0}/.github/workflows/python-publish.yml +0 -0
  27. {datamaestro-1.2.0 → datamaestro-1.3.0}/.gitignore +0 -0
  28. {datamaestro-1.2.0 → datamaestro-1.3.0}/.pre-commit-config.yaml +0 -0
  29. {datamaestro-1.2.0 → datamaestro-1.3.0}/.readthedocs.yml +0 -0
  30. {datamaestro-1.2.0 → datamaestro-1.3.0}/CHANGELOG.md +0 -0
  31. {datamaestro-1.2.0 → datamaestro-1.3.0}/LICENSE +0 -0
  32. {datamaestro-1.2.0 → datamaestro-1.3.0}/MANIFEST.in +0 -0
  33. {datamaestro-1.2.0 → datamaestro-1.3.0}/README.md +0 -0
  34. {datamaestro-1.2.0 → datamaestro-1.3.0}/TODO.md +0 -0
  35. {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/Makefile +0 -0
  36. {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/make.bat +0 -0
  37. {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/requirements.txt +0 -0
  38. {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/api/index.md +0 -0
  39. {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/api/records.rst +0 -0
  40. {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/conf.py +0 -0
  41. {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/datasets.rst +0 -0
  42. {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/developping.md +0 -0
  43. {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/index.md +0 -0
  44. {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/style.css +0 -0
  45. {datamaestro-1.2.0 → datamaestro-1.3.0}/mkdocs.yml +0 -0
  46. {datamaestro-1.2.0 → datamaestro-1.3.0}/pyproject.toml +0 -0
  47. {datamaestro-1.2.0 → datamaestro-1.3.0}/pytest.ini +0 -0
  48. {datamaestro-1.2.0 → datamaestro-1.3.0}/requirements-dev.txt +0 -0
  49. {datamaestro-1.2.0 → datamaestro-1.3.0}/requirements.txt +0 -0
  50. {datamaestro-1.2.0 → datamaestro-1.3.0}/schema.yaml +0 -0
  51. {datamaestro-1.2.0 → datamaestro-1.3.0}/setup.cfg +0 -0
  52. {datamaestro-1.2.0 → datamaestro-1.3.0}/setup.py +0 -0
  53. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/__main__.py +0 -0
  54. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/annotations/__init__.py +0 -0
  55. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/annotations/agreement.py +0 -0
  56. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/commands/__init__.py +0 -0
  57. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/commands/mainstyle.css +0 -0
  58. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/commands/site.py +0 -0
  59. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/data/huggingface.py +0 -0
  60. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/archive.py +0 -0
  61. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/huggingface.py +0 -0
  62. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/links.py +0 -0
  63. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/manual.py +0 -0
  64. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/multiple.py +0 -0
  65. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/sync.py +0 -0
  66. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/todo.py +0 -0
  67. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/record.py +0 -0
  68. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/registry.py +0 -0
  69. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/settings.py +0 -0
  70. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/sphinx.py +0 -0
  71. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/stream/__init__.py +0 -0
  72. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/stream/compress.py +0 -0
  73. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/stream/lines.py +0 -0
  74. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/templates/dataset.py +0 -0
  75. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/test/__init__.py +0 -0
  76. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/test/checks.py +0 -0
  77. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/test/conftest.py +0 -0
  78. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/test/test_record.py +0 -0
  79. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro.egg-info/dependency_links.txt +0 -0
  80. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro.egg-info/entry_points.txt +0 -0
  81. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro.egg-info/not-zip-safe +0 -0
  82. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro.egg-info/requires.txt +0 -0
  83. {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro.egg-info/top_level.txt +0 -0
  84. {datamaestro-1.2.0 → datamaestro-1.3.0}/tox.ini +0 -0
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: datamaestro
- Version: 1.2.0
+ Version: 1.3.0
  Summary: "Dataset management command line and API"
  Home-page: https://github.com/experimaestro/datamaestro
  Author: Benjamin Piwowarski
@@ -36,6 +36,8 @@ Requires-Dist: docstring_parser
  Requires-Dist: numpy
  Provides-Extra: test
  Requires-Dist: tox; extra == "test"
+ Dynamic: license-file
+ Dynamic: requires-dist

  [![PyPI version](https://badge.fury.io/py/datamaestro.svg)](https://badge.fury.io/py/datamaestro) [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![DOI](https://zenodo.org/badge/4573876.svg)](https://zenodo.org/badge/latestdoi/4573876)

@@ -39,11 +39,6 @@ Package `datamaestro.data.ml`
  .. autoxpmconfig:: datamaestro.data.ml.Supervised
  ```

- ```{eval-rst}
- .. autoxpmconfig:: datamaestro.data.ml.FolderBased
- ```
-
-
  ## Tensor

  Package `datamaestro.data.tensor`
@@ -40,6 +40,10 @@ Package `datamaestro.download.links`
  .. autofunction:: datamaestro.download.links.linkfile


+ Other
+ =====
+
+ .. autofunction:: datamaestro.download.wayback.wayback_documents



@@ -58,3 +62,10 @@ File hashes can be checked with the following checker
  .. autoclass:: datamaestro.utils.FileChecker
  .. autoclass:: datamaestro.utils.HashCheck
     :members: __init__
+
+
+ Custom
+ ======
+
+ .. autofunction:: datamaestro.download.custom.Downloader
+ .. autoclass:: datamaestro.download.custom.custom_download
@@ -2,10 +2,14 @@
  from .context import (
      Context,
      Repository,
+     BaseRepository,
      get_dataset,
      prepare_dataset,
  )

- from pkg_resources import get_distribution, DistributionNotFound
+ from .datasets.yaml_repository import YAMLRepository

+ from pkg_resources import get_distribution, DistributionNotFound
+ from .definitions import dataset, metadata
+ from .data import Base
  from .version import version, version_tuple
@@ -1,21 +1,22 @@
  from pathlib import Path
- from experimaestro.compat import cached_property
+ from typing import Iterable, Iterator, Dict, Union
  import importlib
  import os
  import hashlib
  import logging
  import inspect
  import json
- from experimaestro.mkdocs.metaloader import Module
+ from abc import ABC, abstractmethod
+ from experimaestro import Config
  import pkg_resources
- from typing import Iterable, Iterator, List, Dict
+ from experimaestro.compat import cached_property
+ from experimaestro.mkdocs.metaloader import Module
  from .utils import CachedFile, downloadURL
  from .settings import UserSettings, Settings
-
  from typing import TYPE_CHECKING

  if TYPE_CHECKING:
-     from datamaestro.definitions import AbstractDataset
+     from datamaestro.definitions import AbstractDataset, DatasetWrapper


  class Compression:
@@ -87,6 +88,11 @@ class Context:

          return ContextManager()

+     @property
+     def storepath(self):
+         """Replaces the data path"""
+         return self._path.joinpath("store")
+
      @property
      def datapath(self):
          return self._path.joinpath("data")
@@ -98,7 +104,9 @@
      @cached_property
      def repositorymap(self) -> Dict[str, "Repository"]:
          return {
-             repository.basemodule(): repository for repository in self.repositories()
+             repository.basemodule(): repository
+             for repository in self.repositories()
+             if repository.basemodule() is not None
          }

      def repositories(self) -> Iterable["Repository"]:
@@ -286,10 +294,53 @@ class Datasets(Iterable["AbstractDataset"]):
          yield value.__dataset__


- class Repository:
-     """A repository regroup a set of datasets and their corresponding specific
+ class BaseRepository(ABC):
+     """A repository groups a set of datasets and their corresponding specific
      handlers (downloading, filtering, etc.)"""

+     def __init__(self, context: Context):
+         self.context = context
+         p = inspect.getabsfile(self.__class__)
+         self.basedir = Path(p).parent
+
+     @abstractmethod
+     def __iter__(self) -> Iterator["AbstractDataset"]:
+         ...
+
+     def search(self, name: str):
+         """Search for a dataset in the definitions"""
+         for dataset in self:
+             if name in dataset.aliases:
+                 return dataset
+
+     @classmethod
+     def instance(cls, context=None):
+         try:
+             return cls.__getattribute__(cls, "INSTANCE")
+         except AttributeError:
+             return cls(context if context else Context.instance())
+
+     @classmethod
+     def basemodule(cls):
+         return cls.__module__
+
+     @property
+     def generatedpath(self):
+         return self.basedir / "generated"
+
+     @property
+     def datapath(self):
+         return self.context.datapath.joinpath(self.id)
+
+     @property
+     def extrapath(self):
+         """Path to the directory containing extra configuration files"""
+         return self.basedir / "data"
+
+
+ class Repository(BaseRepository):
+     """(deprecated) Repository where datasets are located in __module__.config"""
+
      def __init__(self, context: Context):
          """Initialize a new repository

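The new BaseRepository narrows the repository contract to a single abstract method: __iter__ must yield AbstractDataset objects, and the inherited search() resolves a name against each dataset's aliases. A minimal subclass sketch (the class and its dataset list are hypothetical, not part of the package):

    from typing import Iterator

    from datamaestro import BaseRepository
    from datamaestro.definitions import AbstractDataset


    class InMemoryRepository(BaseRepository):
        """Hypothetical repository serving a pre-built list of datasets"""

        id = "in-memory"  # BaseRepository.datapath appends this to the data path

        def __init__(self, context, datasets: list[AbstractDataset]):
            super().__init__(context)
            self._datasets = datasets

        def __iter__(self) -> Iterator[AbstractDataset]:
            # BaseRepository.search() iterates over this and matches aliases
            return iter(self._datasets)
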
@@ -297,26 +348,14 @@ class Repository:
          :param basedir: The base directory of the repository
              (by default, the same as the repository class)
          """
+         super().__init__(context)
          self.context = context
-         p = inspect.getabsfile(self.__class__)
-         self.basedir = Path(p).parent
          self.configdir = self.basedir.joinpath("config")
          self.id = self.__class__.NAMESPACE
          self.name = self.id
          self.module = self.__class__.__module__
          self.__class__.INSTANCE = self

-     @classmethod
-     def basemodule(cls):
-         return cls.__module__
-
-     @classmethod
-     def instance(cls, context=None):
-         try:
-             return cls.__getattribute__(cls, "INSTANCE")
-         except AttributeError:
-             return cls(context if context else Context.instance())
-
      @classmethod
      def version(cls):
          from pkg_resources import get_distribution, DistributionNotFound
@@ -336,36 +375,8 @@
          assert isinstance(other, Repository)
          return self.basedir == other.basedir

-     def search(self, name: str):
-         """Search for a dataset in the definitions"""
-         logging.debug("Searching for %s in %s", name, self.configdir)
-
-         candidates: List[str] = []
-         components = name.split(".")
-         path = self.configdir
-         for i, c in enumerate(components):
-             path = path / c
-
-             if (path / "__init__.py").is_file():
-                 candidates.append(".".join(components[: i + 1]))
-
-             if path.with_suffix(".py").is_file():
-                 candidates.append(".".join(components[: i + 1]))
-
-             if not path.is_dir():
-                 break
-
-         # Get the dataset
-         for candidate in candidates[::-1]:
-             logging.debug("Searching in module %s.config.%s", self.module, candidate)
-             module = importlib.import_module("%s.config.%s" % (self.module, candidate))
-             for value in Datasets(module):
-                 if name in value.aliases:
-                     return value
-
-         return None
-
-     def datasets(self, candidate):
+     def datasets(self, candidate: str):
+         """Returns the dataset candidates from a module"""
          try:
              module = importlib.import_module("%s.config.%s" % (self.module, candidate))
          except ModuleNotFoundError:
@@ -409,19 +420,6 @@
          for dataset in datasets:
              yield dataset

-     @property
-     def generatedpath(self):
-         return self.basedir.joinpath("generated")
-
-     @property
-     def datapath(self):
-         return self.context.datapath.joinpath(self.id)
-
-     @property
-     def extrapath(self):
-         """Path to the directory containing extra configuration files"""
-         return self.basedir.joinpath("data")
-

  def find_dataset(dataset_id: str):
      """Find a dataset given its id"""
@@ -430,11 +428,17 @@ def find_dataset(dataset_id: str):
      return AbstractDataset.find(dataset_id)


- def prepare_dataset(dataset_id: str):
+ def prepare_dataset(dataset_id: Union[str, "DatasetWrapper", Config]):
      """Find a dataset given its id and download the resources"""
-     from .definitions import AbstractDataset
+     from .definitions import AbstractDataset, DatasetWrapper
+
+     if isinstance(dataset_id, DatasetWrapper):
+         ds = dataset_id
+     elif isinstance(dataset_id, Config):
+         ds = dataset_id.__datamaestro_dataset__
+     else:
+         ds = AbstractDataset.find(dataset_id)

-     ds = AbstractDataset.find(dataset_id)
      return ds.prepare(download=True)


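prepare_dataset is therefore no longer string-only: it also accepts a DatasetWrapper directly, or an experimaestro Config carrying a __datamaestro_dataset__ back-reference. A usage sketch (the dataset id is hypothetical):

    from datamaestro import prepare_dataset

    # By id, as in 1.2.0
    ds = prepare_dataset("com.example.mnist")

    # Re-preparing from the returned configuration also works now: the
    # Config branch recovers the dataset via __datamaestro_dataset__
    ds = prepare_dataset(ds)
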
@@ -1,22 +1,18 @@
+ from abc import abstractmethod
  import logging
  from pathlib import Path
  from typing import Any, Dict
- from datamaestro.definitions import AbstractDataset, argument, Param
- from experimaestro import Config
- from experimaestro import documentation  # noqa: F401
+ from experimaestro import Config, Param, Meta
+ from datamaestro.definitions import AbstractDataset


  class Base(Config):
      """Base object for all data types"""

      id: Param[str]
-     """The unique dataset ID"""
+     """The unique (sub-)dataset ID"""

-     __datamaestro_dataset__: AbstractDataset
-
-     def download(self):
-         """Download the dataset"""
-         self.__datamaestro_dataset__.download()
+     __datamaestro_dataset__: "AbstractDataset"

      def dataset_information(self) -> Dict[str, Any]:
          """Returns document meta-informations"""
@@ -26,6 +22,16 @@ class Base(Config):
              "description": self.__datamaestro_dataset__.description,
          }

+     def download(self):
+         """Download the dataset"""
+         self.__datamaestro_dataset__.download()
+
+     @abstractmethod
+     def prepare(self, *args, **kwargs):
+         """Prepare the dataset"""
+         self.__datamaestro_dataset__.prepare()
+         return self
+

  class Generic(Base):
      """Generic dataset
@@ -44,16 +50,17 @@ class Generic(Base):
  class File(Base):
      """A data file"""

-     path: Param[Path]
+     path: Meta[Path]
      """The path of the file"""

      def open(self, mode):
          return self.path.open(mode)


- @argument("path", type=Path)
  class Folder(Base):
      """A data folder"""

+     path: Meta[Path]
+
      def open(self, mode):
          return self.path.open(mode)
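The same migration runs through the data classes: @argument decorators become experimaestro class annotations, and path moves from Param to Meta, taking it out of the configuration identifier. A sketch of the new declaration style (the TSV class is hypothetical):

    from experimaestro import Param
    from datamaestro.data import File


    class TSV(File):
        """Hypothetical tab-separated data file"""

        # Param values identify the configuration; Meta values, such as
        # the `path` inherited from File, do not
        encoding: Param[str] = "utf-8"
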
@@ -1,7 +1,8 @@
+ from typing import Optional, Tuple, List, Any
  from csv import reader as csv_reader
- from . import File, argument, documentation
- from datamaestro.definitions import Meta
- from typing import Tuple, List, Any
+ from experimaestro import Param, Meta
+ from experimaestro import documentation
+ from . import File


  class Generic(File):
@@ -26,12 +27,13 @@ class Generic(File):
          return row


- @argument("names_row", type=int, default=-1)
- @argument("size_row", type=int, default=-1)
- @argument("target", type=str, default=None)
  class Matrix(Generic):
      """A numerical dataset"""

+     names_row: Param[int] = -1
+     size_row: Param[int] = -1
+     target: Param[Optional[str]] = None
+
      @documentation
      def data(self) -> Tuple[List[str], Any]:
          """Returns the list of fields and the numeric data
@@ -1,7 +1,7 @@
  """Machine learning generic data formats"""
- from typing import Generic, TypeVar, Optional
  from pathlib import Path
- from experimaestro import Param, Meta, argument
+ from typing import Generic, TypeVar, Optional
+ from experimaestro import Param, Meta
  from . import Base

  Train = TypeVar("Train", bound=Base)
@@ -20,8 +20,8 @@ class Supervised(Base, Generic[Train, Validation, Test]):
      """The training optional"""


- @argument("classes")
  class FolderBased(Base):
      """Classification dataset where folders give the basis"""

+     classes: Param[list[str]]
      path: Meta[Path]
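With annotations instead of @argument, values are set at instantiation like for any experimaestro Config; a sketch, assuming the usual keyword-argument construction (all values are invented):

    from pathlib import Path
    from datamaestro.data.ml import FolderBased

    dataset = FolderBased(classes=["cat", "dog"], path=Path("/data/pets"))
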
@@ -1,44 +1,50 @@
- from pathlib import Path
+ from abc import ABC, abstractmethod
  from struct import Struct
- from . import File
+ from typing import TYPE_CHECKING
+ from . import File, Base

+ if TYPE_CHECKING:
+     import numpy as np

- class IDX(File):
+
+ class Tensor(Base, ABC):
+     @abstractmethod
+     def data(self) -> "np.ndarray":
+         """Returns the tensor in numpy format"""
+         pass
+
+
+ class IDX(Tensor, File):
      """IDX File format

-     The IDX file format is a simple format for vectors and multidimensional matrices of various numerical types.
+     The IDX file format is a simple format for vectors and multidimensional
+     matrices of various numerical types.

      The basic format is:

-     magic number
-     size in dimension 0
-     size in dimension 1
-     size in dimension 2
-     .....
-     size in dimension N
-     data
+     magic number size in dimension 0 size in dimension 1 size in dimension 2
+     ..... size in dimension N data

      The magic number is an integer (MSB first). The first 2 bytes are always 0.

-     The third byte codes the type of the data:
-     0x08: unsigned byte
-     0x09: signed byte
-     0x0B: short (2 bytes)
-     0x0C: int (4 bytes)
-     0x0D: float (4 bytes)
-     0x0E: double (8 bytes)
+     The third byte codes the type of the data: 0x08: unsigned byte 0x09: signed
+     byte 0x0B: short (2 bytes) 0x0C: int (4 bytes) 0x0D: float (4 bytes) 0x0E:
+     double (8 bytes)

-     The 4-th byte codes the number of dimensions of the vector/matrix: 1 for vectors, 2 for matrices....
+     The 4-th byte codes the number of dimensions of the vector/matrix: 1 for
+     vectors, 2 for matrices....

-     The sizes in each dimension are 4-byte integers (MSB first, high endian, like in most non-Intel processors).
+     The sizes in each dimension are 4-byte integers (MSB first, high endian,
+     like in most non-Intel processors).

-     The data is stored like in a C array, i.e. the index in the last dimension changes the fastest.
+     The data is stored like in a C array, i.e. the index in the last dimension
+     changes the fastest.
      """

      MAGIC_NUMBER = Struct(">HBB")
      DIM = Struct(">I")

-     def data(self):
+     def data(self) -> "np.ndarray":
          """Returns the tensor"""
          import numpy as np

@@ -59,7 +65,8 @@ class IDX(File):
          shape = [IDX.DIM.unpack_from(fp.read(IDX.DIM.size))[0] for i in range(size)]

          size = np.prod(shape)
-         # Could use np.fromfile... if it were not broken - see https://github.com/numpy/numpy/issues/7989
+         # Could use np.fromfile... if it were not broken
+         # see https://github.com/numpy/numpy/issues/7989
          data = np.frombuffer(fp.read(), dtype=dtype, count=size)
          data = data.reshape(shape, order="C")
          return data
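The IDX docstring above fully specifies the header layout, so a standalone header reader is easy to sketch (a minimal illustration, independent of the IDX class; the dtype mapping is an assumption consistent with the documented type codes):

    from struct import Struct

    MAGIC = Struct(">HBB")  # two zero bytes, type code, number of dimensions
    DIM = Struct(">I")      # each dimension size is a 4-byte MSB-first integer

    # Documented type codes, mapped to (big-endian) numpy dtype strings
    DTYPES = {0x08: "u1", 0x09: "i1", 0x0B: ">i2",
              0x0C: ">i4", 0x0D: ">f4", 0x0E: ">f8"}

    def read_idx_header(fp):
        """Return (dtype, shape) for a binary stream positioned at byte 0"""
        zero, typecode, ndims = MAGIC.unpack(fp.read(MAGIC.size))
        assert zero == 0, "the first two bytes of the magic number must be 0"
        shape = [DIM.unpack(fp.read(DIM.size))[0] for _ in range(ndims)]
        return DTYPES[typecode], shape
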
File without changes
@@ -0,0 +1,103 @@
+ import regex
+ from typing import Iterator, Optional
+ from functools import cached_property
+ from attrs import field
+ import importlib
+ from omegaconf import OmegaConf
+ from functools import partial
+ from attrs import define
+ from datamaestro import BaseRepository
+ from datamaestro.definitions import AbstractDataset, DatasetWrapper
+ from datamaestro.data import Base
+
+
+ re_spec = regex.compile(r"""^(\w\.)+:(\w+)""")
+
+
+ @define
+ class RepositoryDataset:
+     ids: list[str]
+     """ID(s) of this dataset"""
+
+     entry_point: str = field(validator=re_spec.match)
+     """The entry point"""
+
+     title: str
+     """The full name of the dataset"""
+
+     description: str
+     """Description of the dataset"""
+
+     url: Optional[str]
+     """The URL"""
+
+     groups: Optional[list[str]]
+     """Groups to which this repository belongs"""
+
+
+ @define
+ class RepositoryAuthors:
+     name: str
+     email: str
+
+
+ @define
+ class RepositoryGroup:
+     name: str
+     tasks: list[str]
+     tags: list[str]
+
+
+ @define
+ class RepositoryConfiguration:
+     namespace: str
+     authors: list[RepositoryAuthors]
+     description: str
+     groups: dict[str, RepositoryGroup]
+     datasets: list[RepositoryDataset]
+
+
+ class YAMLDataset(AbstractDataset):
+     def __init__(self, repository: "YAMLRepository", information: RepositoryDataset):
+         super().__init__(repository)
+         self.information = information
+         self.id = self.information.ids[0]
+         self.aliases = set(self.information.ids)
+
+     @cached_property
+     def wrapper(self) -> DatasetWrapper:
+         module, func_name = self.information.entry_point.split(":")
+         wrapper = getattr(importlib.import_module(module), func_name)
+         return wrapper
+
+     def _prepare(self) -> "Base":
+         return self.wrapper()
+
+     def download(self, **kwargs):
+         return self.wrapper.download(**kwargs)
+
+
+ class YAMLRepository(BaseRepository):
+     """YAML-based repository"""
+
+     @property
+     def id(self):
+         return self.configuration.namespace
+
+     @property
+     def name(self):
+         return self.configuration.namespace
+
+     @cached_property
+     def configuration(self):
+         schema = OmegaConf.structured(RepositoryConfiguration)
+         with importlib.resources.path(
+             self.__class__.__module__, "datamaestro.yaml"
+         ) as fp:
+             conf = OmegaConf.load(fp)
+
+         conf: RepositoryConfiguration = OmegaConf.merge(schema, conf)
+         return conf
+
+     def __iter__(self) -> Iterator["AbstractDataset"]:
+         return map(partial(YAMLDataset, self), self.configuration.datasets)
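The attrs/OmegaConf schema above implies a repository manifest shaped like the following datamaestro.yaml (a hypothetical sketch; every name, id and URL is invented for illustration):

    namespace: example
    description: Example YAML-based dataset repository
    authors:
      - name: Jane Doe
        email: jane@example.org
    groups:
      images:
        name: Image datasets
        tasks: [classification]
        tags: [vision]
    datasets:
      - ids: [example.mnist-like]
        entry_point: example.config:mnist_like
        title: An MNIST-like dataset
        description: Toy dataset entry for illustration
        url: https://example.org/datasets/mnist-like
        groups: [images]

Each entry_point follows the module:function convention that YAMLDataset.wrapper splits and imports lazily, so dataset definitions are only loaded when a dataset is actually resolved.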