datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. datamaestro/__init__.py +11 -7
  2. datamaestro/__main__.py +29 -8
  3. datamaestro/annotations/__init__.py +1 -1
  4. datamaestro/annotations/agreement.py +9 -3
  5. datamaestro/commands/site.py +27 -15
  6. datamaestro/context.py +143 -87
  7. datamaestro/data/__init__.py +23 -11
  8. datamaestro/data/csv.py +12 -12
  9. datamaestro/data/huggingface.py +25 -0
  10. datamaestro/data/ml.py +19 -10
  11. datamaestro/data/tensor.py +32 -24
  12. datamaestro/definitions.py +492 -131
  13. datamaestro/download/__init__.py +610 -24
  14. datamaestro/download/archive.py +129 -77
  15. datamaestro/download/custom.py +53 -0
  16. datamaestro/download/huggingface.py +77 -0
  17. datamaestro/download/links.py +106 -50
  18. datamaestro/download/multiple.py +27 -5
  19. datamaestro/download/single.py +114 -51
  20. datamaestro/download/sync.py +0 -1
  21. datamaestro/download/todo.py +9 -4
  22. datamaestro/download/wayback.py +164 -0
  23. datamaestro/record.py +232 -0
  24. datamaestro/registry.py +1 -0
  25. datamaestro/search.py +1 -1
  26. datamaestro/settings.py +3 -1
  27. datamaestro/sphinx.py +224 -0
  28. datamaestro/stream/__init__.py +0 -2
  29. datamaestro/stream/lines.py +10 -7
  30. datamaestro/templates/dataset.py +5 -4
  31. datamaestro/test/__init__.py +3 -1
  32. datamaestro/test/checks.py +1 -5
  33. datamaestro/test/conftest.py +1 -6
  34. datamaestro/test/test_annotations.py +2 -2
  35. datamaestro/test/test_download_handlers.py +3 -4
  36. datamaestro/test/test_record.py +72 -0
  37. datamaestro/test/test_resource.py +1388 -0
  38. datamaestro/utils.py +15 -9
  39. datamaestro/v2.md +301 -0
  40. datamaestro/version.py +4 -0
  41. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
  42. datamaestro-1.7.0.dist-info/RECORD +49 -0
  43. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
  44. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
  45. datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
  46. datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
  47. datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
  48. datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
  49. datamaestro/__pycache__/context.cpython-38.pyc +0 -0
  50. datamaestro/__pycache__/context.cpython-39.pyc +0 -0
  51. datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
  52. datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
  53. datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
  54. datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
  55. datamaestro/__pycache__/search.cpython-38.pyc +0 -0
  56. datamaestro/__pycache__/search.cpython-39.pyc +0 -0
  57. datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
  58. datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
  59. datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
  60. datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
  61. datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
  62. datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
  63. datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
  64. datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
  65. datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
  66. datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
  67. datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
  68. datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
  69. datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
  70. datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
  71. datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
  72. datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
  73. datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
  74. datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
  75. datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
  76. datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
  77. datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
  78. datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
  79. datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
  80. datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
  81. datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
  82. datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
  83. datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
  84. datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
  85. datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
  86. datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
  87. datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
  88. datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
  89. datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
  90. datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
  91. datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
  92. datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
  93. datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
  94. datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
  95. datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
  96. datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
  97. datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
  98. datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
  99. datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
  100. datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
  101. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
  102. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
  103. datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
  104. datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
  105. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
  106. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
  107. datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
  108. datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
  109. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
  110. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
  111. datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
  112. datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
  113. datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
  114. datamaestro-0.8.1.dist-info/RECORD +0 -109
  115. datamaestro-0.8.1.dist-info/top_level.txt +0 -1
  116. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
datamaestro/data/csv.py CHANGED
@@ -1,17 +1,16 @@
1
- from pathlib import Path
1
+ from typing import Optional, Tuple, List, Any
2
2
  from csv import reader as csv_reader
3
- from . import File, argument, documentation
4
- from datamaestro.definitions import Meta
5
- from typing import Tuple, List
3
+ from experimaestro import Param, Meta
4
+ from experimaestro import documentation
5
+ from . import File
6
6
 
7
7
 
8
- @argument("ignore", type=int, default=0)
9
- @argument("names_row", type=int, default=-1)
10
8
  class Generic(File):
11
9
  """A generic CSV file"""
12
10
 
11
+ delimiter: Meta[str] = ","
13
12
  ignore: Meta[int] = 0
14
- names_row: Meta[int] = 1
13
+ names_row: Meta[int] = -1
15
14
 
16
15
  @documentation
17
16
  def columns(self):
@@ -23,19 +22,20 @@ class Generic(File):
23
22
  for i in range(self.ignore):
24
23
  fp.readline()
25
24
 
26
- for ix, row in enumerate(csv_reader(fp)):
25
+ for ix, row in enumerate(csv_reader(fp, delimiter=self.delimiter)):
27
26
  if ix == self.names_row:
28
27
  return row
29
28
 
30
29
 
31
- @argument("names_row", type=int, default=-1)
32
- @argument("size_row", type=int, default=-1)
33
- @argument("target", type=str, default=None)
34
30
  class Matrix(Generic):
35
31
  """A numerical dataset"""
36
32
 
33
+ names_row: Param[int] = -1
34
+ size_row: Param[int] = -1
35
+ target: Param[Optional[str]] = None
36
+
37
37
  @documentation
38
- def data(self) -> Tuple[List[str], "numpy.array"]:
38
+ def data(self) -> Tuple[List[str], Any]:
39
39
  """Returns the list of fields and the numeric data
40
40
 
41
41
 
datamaestro/data/huggingface.py ADDED
@@ -0,0 +1,25 @@
1
+ """Huggingface datamaestro adapters"""
2
+
3
+ from functools import cached_property
4
+ from typing import Optional
5
+ from . import Base
6
+ import logging
7
+ from experimaestro import Param
8
+
9
+
10
+ class HuggingFaceDataset(Base):
11
+ repo_id: Param[str]
12
+ data_files: Param[Optional[str]] = None
13
+ split: Param[Optional[str]] = None
14
+
15
+ @cached_property
16
+ def data(self):
17
+ try:
18
+ from datasets import load_dataset
19
+ except ModuleNotFoundError:
20
+ logging.error("the datasets library is not installed:")
21
+ logging.error("pip install datasets")
22
+ raise
23
+
24
+ ds = load_dataset(self.repo_id, data_files=self.data_files, split=self.split)
25
+ return ds
datamaestro/data/ml.py CHANGED
@@ -1,19 +1,28 @@
1
1
  """Machine learning generic data formats"""
2
- from typing import List
2
+
3
3
  from pathlib import Path
4
- from . import Base, argument
4
+ from typing import Generic, TypeVar, Optional
5
+ from experimaestro import Param, Meta
6
+ from . import Base
7
+
8
+ Train = TypeVar("Train", bound=Base)
9
+ Validation = TypeVar("Validation", bound=Base)
10
+ Test = TypeVar("Test", bound=Base)
11
+
12
+
13
+ class Supervised(Base, Generic[Train, Validation, Test]):
14
+ train: Param[Base]
15
+ """The training dataset"""
5
16
 
17
+ validation: Param[Optional[Base]] = None
18
+ """The validation dataset (optional)"""
6
19
 
7
- @argument("train", type=Base, help="The training dataset")
8
- @argument("validation", type=Base, help="The validation dataset", required=False)
9
- @argument("test", type=Base, help="The test dataset", required=False)
10
- class Supervised(Base):
11
- pass
20
+ test: Param[Optional[Base]] = None
21
+ """The training optional"""
12
22
 
13
23
 
14
- @argument("path", type=Path)
15
- @argument("classes")
16
24
  class FolderBased(Base):
17
25
  """Classification dataset where folders give the basis"""
18
26
 
19
- pass
27
+ classes: Param[list[str]]
28
+ path: Meta[Path]
datamaestro/data/tensor.py CHANGED
@@ -1,43 +1,50 @@
1
- from pathlib import Path
1
+ from abc import ABC, abstractmethod
2
2
  from struct import Struct
3
- from . import File
3
+ from typing import TYPE_CHECKING
4
+ from . import File, Base
4
5
 
6
+ if TYPE_CHECKING:
7
+ import numpy as np
5
8
 
6
- class IDX(File):
9
+
10
+ class Tensor(Base, ABC):
11
+ @abstractmethod
12
+ def data(self) -> "np.ndarray":
13
+ """Returns the tensor in numpy format"""
14
+ pass
15
+
16
+
17
+ class IDX(Tensor, File):
7
18
  """IDX File format
8
- the IDX file format is a simple format for vectors and multidimensional matrices of various numerical types.
9
19
 
10
- The basic format is
20
+ The IDX file format is a simple format for vectors and multidimensional
21
+ matrices of various numerical types.
22
+
23
+ The basic format is:
11
24
 
12
- magic number
13
- size in dimension 0
14
- size in dimension 1
15
- size in dimension 2
16
- .....
17
- size in dimension N
18
- data
25
+ magic number size in dimension 0 size in dimension 1 size in dimension 2
26
+ ..... size in dimension N data
19
27
 
20
28
  The magic number is an integer (MSB first). The first 2 bytes are always 0.
21
29
 
22
- The third byte codes the type of the data:
23
- 0x08: unsigned byte
24
- 0x09: signed byte
25
- 0x0B: short (2 bytes)
26
- 0x0C: int (4 bytes)
27
- 0x0D: float (4 bytes)
28
- 0x0E: double (8 bytes)
30
+ The third byte codes the type of the data: 0x08: unsigned byte 0x09: signed
31
+ byte 0x0B: short (2 bytes) 0x0C: int (4 bytes) 0x0D: float (4 bytes) 0x0E:
32
+ double (8 bytes)
29
33
 
30
- The 4-th byte codes the number of dimensions of the vector/matrix: 1 for vectors, 2 for matrices....
34
+ The 4-th byte codes the number of dimensions of the vector/matrix: 1 for
35
+ vectors, 2 for matrices....
31
36
 
32
- The sizes in each dimension are 4-byte integers (MSB first, high endian, like in most non-Intel processors).
37
+ The sizes in each dimension are 4-byte integers (MSB first, high endian,
38
+ like in most non-Intel processors).
33
39
 
34
- The data is stored like in a C array, i.e. the index in the last dimension changes the fastest.
40
+ The data is stored like in a C array, i.e. the index in the last dimension
41
+ changes the fastest.
35
42
  """
36
43
 
37
44
  MAGIC_NUMBER = Struct(">HBB")
38
45
  DIM = Struct(">I")
39
46
 
40
- def data(self):
47
+ def data(self) -> "np.ndarray":
41
48
  """Returns the tensor"""
42
49
  import numpy as np
43
50
 
@@ -58,7 +65,8 @@ class IDX(File):
58
65
  shape = [IDX.DIM.unpack_from(fp.read(IDX.DIM.size))[0] for i in range(size)]
59
66
 
60
67
  size = np.prod(shape)
61
- # Could use np.fromfile... if it were not broken - see https://github.com/numpy/numpy/issues/7989
68
+ # Could use np.fromfile... if it were not broken
69
+ # see https://github.com/numpy/numpy/issues/7989
62
70
  data = np.frombuffer(fp.read(), dtype=dtype, count=size)
63
71
  data = data.reshape(shape, order="C")
64
72
  return data