datamaestro 1.3.1__tar.gz → 1.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. {datamaestro-1.3.1 → datamaestro-1.4.0}/.github/workflows/pytest.yml +1 -1
  2. {datamaestro-1.3.1 → datamaestro-1.4.0}/PKG-INFO +39 -42
  3. {datamaestro-1.3.1 → datamaestro-1.4.0}/README.md +37 -39
  4. datamaestro-1.4.0/pyproject.toml +13 -0
  5. {datamaestro-1.3.1 → datamaestro-1.4.0}/requirements.txt +1 -1
  6. {datamaestro-1.3.1 → datamaestro-1.4.0}/setup.cfg +0 -1
  7. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/__init__.py +0 -2
  8. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/__main__.py +13 -9
  9. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/context.py +0 -5
  10. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/definitions.py +12 -26
  11. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/download/__init__.py +3 -3
  12. datamaestro-1.4.0/src/datamaestro/download/custom.py +21 -0
  13. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/download/huggingface.py +1 -1
  14. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/download/single.py +2 -16
  15. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/version.py +2 -2
  16. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro.egg-info/PKG-INFO +39 -42
  17. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro.egg-info/SOURCES.txt +0 -2
  18. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro.egg-info/requires.txt +1 -1
  19. datamaestro-1.3.1/pyproject.toml +0 -6
  20. datamaestro-1.3.1/src/datamaestro/datasets/__init__.py +0 -0
  21. datamaestro-1.3.1/src/datamaestro/datasets/yaml_repository.py +0 -103
  22. datamaestro-1.3.1/src/datamaestro/download/custom.py +0 -29
  23. {datamaestro-1.3.1 → datamaestro-1.4.0}/.coverage +0 -0
  24. {datamaestro-1.3.1 → datamaestro-1.4.0}/.github/workflows/python-publish.yml +0 -0
  25. {datamaestro-1.3.1 → datamaestro-1.4.0}/.gitignore +0 -0
  26. {datamaestro-1.3.1 → datamaestro-1.4.0}/.pre-commit-config.yaml +0 -0
  27. {datamaestro-1.3.1 → datamaestro-1.4.0}/.readthedocs.yml +0 -0
  28. {datamaestro-1.3.1 → datamaestro-1.4.0}/CHANGELOG.md +0 -0
  29. {datamaestro-1.3.1 → datamaestro-1.4.0}/LICENSE +0 -0
  30. {datamaestro-1.3.1 → datamaestro-1.4.0}/MANIFEST.in +0 -0
  31. {datamaestro-1.3.1 → datamaestro-1.4.0}/TODO.md +0 -0
  32. {datamaestro-1.3.1 → datamaestro-1.4.0}/docs/Makefile +0 -0
  33. {datamaestro-1.3.1 → datamaestro-1.4.0}/docs/make.bat +0 -0
  34. {datamaestro-1.3.1 → datamaestro-1.4.0}/docs/requirements.txt +0 -0
  35. {datamaestro-1.3.1 → datamaestro-1.4.0}/docs/source/api/data.md +0 -0
  36. {datamaestro-1.3.1 → datamaestro-1.4.0}/docs/source/api/download.rst +0 -0
  37. {datamaestro-1.3.1 → datamaestro-1.4.0}/docs/source/api/index.md +0 -0
  38. {datamaestro-1.3.1 → datamaestro-1.4.0}/docs/source/api/records.rst +0 -0
  39. {datamaestro-1.3.1 → datamaestro-1.4.0}/docs/source/conf.py +0 -0
  40. {datamaestro-1.3.1 → datamaestro-1.4.0}/docs/source/datasets.rst +0 -0
  41. {datamaestro-1.3.1 → datamaestro-1.4.0}/docs/source/developping.md +0 -0
  42. {datamaestro-1.3.1 → datamaestro-1.4.0}/docs/source/index.md +0 -0
  43. {datamaestro-1.3.1 → datamaestro-1.4.0}/docs/source/style.css +0 -0
  44. {datamaestro-1.3.1 → datamaestro-1.4.0}/mkdocs.yml +0 -0
  45. {datamaestro-1.3.1 → datamaestro-1.4.0}/pytest.ini +0 -0
  46. {datamaestro-1.3.1 → datamaestro-1.4.0}/requirements-dev.txt +0 -0
  47. {datamaestro-1.3.1 → datamaestro-1.4.0}/schema.yaml +0 -0
  48. {datamaestro-1.3.1 → datamaestro-1.4.0}/setup.py +0 -0
  49. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/annotations/__init__.py +0 -0
  50. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/annotations/agreement.py +0 -0
  51. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/commands/__init__.py +0 -0
  52. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/commands/mainstyle.css +0 -0
  53. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/commands/site.py +0 -0
  54. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/data/__init__.py +0 -0
  55. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/data/csv.py +0 -0
  56. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/data/huggingface.py +0 -0
  57. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/data/ml.py +0 -0
  58. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/data/tensor.py +0 -0
  59. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/download/archive.py +0 -0
  60. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/download/links.py +0 -0
  61. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/download/manual.py +0 -0
  62. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/download/multiple.py +0 -0
  63. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/download/sync.py +0 -0
  64. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/download/todo.py +0 -0
  65. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/download/wayback.py +0 -0
  66. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/record.py +0 -0
  67. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/registry.py +0 -0
  68. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/search.py +0 -0
  69. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/settings.py +0 -0
  70. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/sphinx.py +0 -0
  71. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/stream/__init__.py +0 -0
  72. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/stream/compress.py +0 -0
  73. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/stream/lines.py +0 -0
  74. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/templates/dataset.py +0 -0
  75. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/test/__init__.py +0 -0
  76. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/test/checks.py +0 -0
  77. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/test/conftest.py +0 -0
  78. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/test/test_annotations.py +0 -0
  79. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/test/test_download_handlers.py +0 -0
  80. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/test/test_record.py +0 -0
  81. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/utils.py +0 -0
  82. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro.egg-info/dependency_links.txt +0 -0
  83. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro.egg-info/entry_points.txt +0 -0
  84. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro.egg-info/not-zip-safe +0 -0
  85. {datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro.egg-info/top_level.txt +0 -0
  86. {datamaestro-1.3.1 → datamaestro-1.4.0}/tox.ini +0 -0
@@ -15,7 +15,7 @@ jobs:
15
15
  runs-on: ubuntu-latest
16
16
  strategy:
17
17
  matrix:
18
- python-version: [3.8, 3.9, "3.10", "3.11"]
18
+ python-version: ["3.9", "3.10", "3.11"]
19
19
 
20
20
  steps:
21
21
  - uses: actions/checkout@v2
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro
3
- Version: 1.3.1
3
+ Version: 1.4.0
4
4
  Summary: "Dataset management command line and API"
5
5
  Home-page: https://github.com/experimaestro/datamaestro
6
6
  Author: Benjamin Piwowarski
@@ -13,7 +13,6 @@ Classifier: Intended Audience :: Science/Research
13
13
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
14
14
  Classifier: Operating System :: OS Independent
15
15
  Classifier: Programming Language :: Python
16
- Classifier: Programming Language :: Python :: 3.8
17
16
  Classifier: Programming Language :: Python :: 3.9
18
17
  Classifier: Programming Language :: Python :: 3.10
19
18
  Classifier: Programming Language :: Python :: 3.11
@@ -28,7 +27,7 @@ Requires-Dist: marshmallow
28
27
  Requires-Dist: cached_property
29
28
  Requires-Dist: requests
30
29
  Requires-Dist: bitmath
31
- Requires-Dist: experimaestro>=1.5.0
30
+ Requires-Dist: experimaestro>=1.6
32
31
  Requires-Dist: mkdocs
33
32
  Requires-Dist: pymdown-extensions
34
33
  Requires-Dist: mkdocs-material
@@ -98,22 +97,10 @@ $ datamaestro search tag:image
98
97
  [image] com.lecun.mnist
99
98
 
100
99
  $ datamaestro prepare com.lecun.mnist
101
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-labels-idx1-ubyte
102
- INFO:root:Transforming file
103
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-labels-idx1-ubyte
104
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
105
- INFO:root:Transforming file
106
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
107
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
108
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
109
- Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz: 32.8kB [00:00, 92.1kB/s] INFO:root:Transforming file
110
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
111
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
112
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
113
- Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz: 9.92MB [00:00, 10.6MB/s]
114
- INFO:root:Transforming file
115
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
116
- ...JSON...
100
+ INFO:root:Materializing 4 resources
101
+ INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/train_images.idx
102
+ INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/test_images.idx
103
+ INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz into .../datamaestro/store/com/lecun/test_labels.idx
117
104
  ```
118
105
 
119
106
  The previous command also returns a JSON on standard output
@@ -159,13 +146,12 @@ and is integrated with [experimaestro](http://experimaestro.github.io/experimaes
159
146
  Its syntax is described in the [documentation](https://datamaestro.readthedocs.io).
160
147
 
161
148
 
162
- For MNIST, this corresponds to.
149
+ For instance, the MNIST dataset can be described by the following definition:
163
150
 
164
151
  ```python
165
- from datamaestro_image.data import ImageClassification, LabelledImages, Base, IDXImage
166
- from datamaestro.download.single import filedownloader
167
- from datamaestro.definitions import argument, datatasks, datatags, dataset
168
- from datamaestro.data.tensor import IDX
152
+ from datamaestro import dataset
153
+ from datamaestro.download.single import filedownloader
154
+ from datamaestro_image.data import ImageClassification, LabelledImages, IDXImage
169
155
 
170
156
 
171
157
  @filedownloader("train_images.idx", "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")
@@ -176,26 +162,37 @@ from datamaestro.data.tensor import IDX
176
162
  ImageClassification,
177
163
  url="http://yann.lecun.com/exdb/mnist/",
178
164
  )
179
- def MNIST(train_images, train_labels, test_images, test_labels):
180
- """The MNIST database
181
-
182
- The MNIST database of handwritten digits, available from this page, has a
183
- training set of 60,000 examples, and a test set of 10,000 examples. It is a
184
- subset of a larger set available from NIST. The digits have been
185
- size-normalized and centered in a fixed-size image.
186
- """
187
- return {
188
- "train": LabelledImages(
189
- images=IDXImage(path=train_images),
190
- labels=IDX(path=train_labels)
191
- ),
192
- "test": LabelledImages(
193
- images=IDXImage(path=test_images),
194
- labels=IDX(path=test_labels)
195
- ),
196
- }
165
+
166
+ return ImageClassification(
167
+ train=LabelledImages(
168
+ images=IDXImage(path=train_images), labels=IDXImage(path=train_labels)
169
+ ),
170
+ test=LabelledImages(
171
+ images=IDXImage(path=test_images), labels=IDXImage(path=test_labels)
172
+ ),
173
+ )
174
+ ```
175
+
176
+ When building dataset modules, some extra documentation can be provided:
177
+
178
+ ```yaml
179
+ ids: [com.lecun.mnist]
180
+ entry_point: "datamaestro_image.config.com.lecun:mnist"
181
+ title: The MNIST database
182
+ url: http://yann.lecun.com/exdb/mnist/
183
+ groups: [image-classification]
184
+ description: |
185
+ The MNIST database of handwritten digits, available from this page,
186
+ has a training set of 60,000 examples, and a test set of 10,000
187
+ examples. It is a subset of a larger set available from NIST. The
188
+ digits have been size-normalized and centered in a fixed-size image.
197
189
  ```
198
190
 
191
+ This makes it possible to:
192
+
193
+ 1. Document the dataset
194
+ 2. Use the command line interface to manipulate it (download resources, etc.)
195
+
199
196
  # 0.8.0
200
197
 
201
198
  - Integration with other repositories: abstracting away the notion of dataset
@@ -57,22 +57,10 @@ $ datamaestro search tag:image
57
57
  [image] com.lecun.mnist
58
58
 
59
59
  $ datamaestro prepare com.lecun.mnist
60
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-labels-idx1-ubyte
61
- INFO:root:Transforming file
62
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-labels-idx1-ubyte
63
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
64
- INFO:root:Transforming file
65
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
66
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
67
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
68
- Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz: 32.8kB [00:00, 92.1kB/s] INFO:root:Transforming file
69
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
70
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
71
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
72
- Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz: 9.92MB [00:00, 10.6MB/s]
73
- INFO:root:Transforming file
74
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
75
- ...JSON...
60
+ INFO:root:Materializing 4 resources
61
+ INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/train_images.idx
62
+ INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/test_images.idx
63
+ INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz into .../datamaestro/store/com/lecun/test_labels.idx
76
64
  ```
77
65
 
78
66
  The previous command also returns a JSON on standard output
@@ -118,13 +106,12 @@ and is integrated with [experimaestro](http://experimaestro.github.io/experimaes
118
106
  Its syntax is described in the [documentation](https://datamaestro.readthedocs.io).
119
107
 
120
108
 
121
- For MNIST, this corresponds to.
109
+ For instance, the MNIST dataset can be described by the following definition:
122
110
 
123
111
  ```python
124
- from datamaestro_image.data import ImageClassification, LabelledImages, Base, IDXImage
125
- from datamaestro.download.single import filedownloader
126
- from datamaestro.definitions import argument, datatasks, datatags, dataset
127
- from datamaestro.data.tensor import IDX
112
+ from datamaestro import dataset
113
+ from datamaestro.download.single import filedownloader
114
+ from datamaestro_image.data import ImageClassification, LabelledImages, IDXImage
128
115
 
129
116
 
130
117
  @filedownloader("train_images.idx", "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")
@@ -135,22 +122,33 @@ from datamaestro.data.tensor import IDX
135
122
  ImageClassification,
136
123
  url="http://yann.lecun.com/exdb/mnist/",
137
124
  )
138
- def MNIST(train_images, train_labels, test_images, test_labels):
139
- """The MNIST database
140
-
141
- The MNIST database of handwritten digits, available from this page, has a
142
- training set of 60,000 examples, and a test set of 10,000 examples. It is a
143
- subset of a larger set available from NIST. The digits have been
144
- size-normalized and centered in a fixed-size image.
145
- """
146
- return {
147
- "train": LabelledImages(
148
- images=IDXImage(path=train_images),
149
- labels=IDX(path=train_labels)
150
- ),
151
- "test": LabelledImages(
152
- images=IDXImage(path=test_images),
153
- labels=IDX(path=test_labels)
154
- ),
155
- }
125
+
126
+ return ImageClassification(
127
+ train=LabelledImages(
128
+ images=IDXImage(path=train_images), labels=IDXImage(path=train_labels)
129
+ ),
130
+ test=LabelledImages(
131
+ images=IDXImage(path=test_images), labels=IDXImage(path=test_labels)
132
+ ),
133
+ )
134
+ ```
135
+
136
+ When building dataset modules, some extra documentation can be provided:
137
+
138
+ ```yaml
139
+ ids: [com.lecun.mnist]
140
+ entry_point: "datamaestro_image.config.com.lecun:mnist"
141
+ title: The MNIST database
142
+ url: http://yann.lecun.com/exdb/mnist/
143
+ groups: [image-classification]
144
+ description: |
145
+ The MNIST database of handwritten digits, available from this page,
146
+ has a training set of 60,000 examples, and a test set of 10,000
147
+ examples. It is a subset of a larger set available from NIST. The
148
+ digits have been size-normalized and centered in a fixed-size image.
156
149
  ```
150
+
151
+ This makes it possible to:
152
+
153
+ 1. Document the dataset
154
+ 2. Use the command line interface to manipulate it (download resources, etc.)
@@ -0,0 +1,13 @@
1
+ [tool.setuptools_scm]
2
+ write_to = "src/datamaestro/version.py"
3
+ fallback_version = "0.0.0-dev"
4
+
5
+ [build-system]
6
+ requires = ["setuptools", "setuptools-scm", "wheel"] # PEP 508 specifications.
7
+
8
+ [tool.flake8]
9
+ doctests = "True"
10
+ exclude = ".git, .eggs, __pycache__, tests/, docs/, build/, dist/, app/"
11
+ max-line-length = "88"
12
+ # See https://github.com/PyCQA/pycodestyle/issues/373
13
+ extend-ignore = "E203"
@@ -10,7 +10,7 @@ requests
10
10
  bitmath
11
11
 
12
12
  # Experimaestro for data definitions
13
- experimaestro>=1.5.0
13
+ experimaestro>=1.6
14
14
 
15
15
  # Mkdocs
16
16
  mkdocs
@@ -16,7 +16,6 @@ classifiers =
16
16
  License :: OSI Approved :: GNU General Public License v3 (GPLv3)
17
17
  Operating System :: OS Independent
18
18
  Programming Language :: Python
19
- Programming Language :: Python :: 3.8
20
19
  Programming Language :: Python :: 3.9
21
20
  Programming Language :: Python :: 3.10
22
21
  Programming Language :: Python :: 3.11
@@ -7,8 +7,6 @@ from .context import (
7
7
  prepare_dataset,
8
8
  )
9
9
 
10
- from .datasets.yaml_repository import YAMLRepository
11
-
12
10
  from pkg_resources import get_distribution, DistributionNotFound
13
11
  from .definitions import dataset, metadata
14
12
  from .data import Base
@@ -319,13 +319,17 @@ def search(config: Config, searchterms):
319
319
 
320
320
  logging.debug("Search: %s", condition)
321
321
  for dataset in config.context.datasets():
322
- if condition.match(dataset):
323
- cfg = dataset.configtype
324
- print(
325
- "[%s] %s (%s)"
326
- % (
327
- dataset.repository.id,
328
- dataset.id,
329
- cfg.__name__ if cfg is not None else "?",
322
+ try:
323
+ if condition.match(dataset):
324
+ cfg = dataset.configtype
325
+ print(
326
+ "[%s] %s (%s)"
327
+ % (
328
+ dataset.repository.id,
329
+ dataset.id,
330
+ cfg.__name__ if cfg is not None else "?",
331
+ )
330
332
  )
331
- )
333
+ except Exception:
334
+ logging.error("Error while matching with dataset %s", dataset)
335
+ raise
@@ -88,11 +88,6 @@ class Context:
88
88
 
89
89
  return ContextManager()
90
90
 
91
- @property
92
- def storepath(self):
93
- """Replaces the data path"""
94
- return self._path.joinpath("store")
95
-
96
91
  @property
97
92
  def datapath(self):
98
93
  return self._path.joinpath("data")
@@ -7,7 +7,6 @@ import inspect
7
7
  from pathlib import Path
8
8
  from itertools import chain
9
9
  from abc import ABC, abstractmethod
10
- from contextlib import contextmanager
11
10
  import traceback
12
11
  from typing import (
13
12
  Dict,
@@ -19,7 +18,6 @@ from typing import (
19
18
  Callable,
20
19
  TYPE_CHECKING,
21
20
  Union,
22
- ClassVar,
23
21
  _GenericAlias,
24
22
  )
25
23
  from experimaestro import ( # noqa: F401 (re-exports)
@@ -217,8 +215,8 @@ class AbstractDataset(AbstractData):
217
215
  def download(self, force=False):
218
216
  """Download all the necessary resources"""
219
217
  success = True
220
- logging.info("Materializing %d resources", len(self.ordered_resources))
221
218
  self.prepare()
219
+ logging.info("Materializing %d resources", len(self.ordered_resources))
222
220
  for resource in self.ordered_resources:
223
221
  try:
224
222
  resource.download(force)
@@ -274,9 +272,6 @@ class DatasetWrapper(AbstractDataset):
274
272
  annotations (otherwise, derive from `AbstractDataset`).
275
273
  """
276
274
 
277
- BUILDING: ClassVar[list["DatasetWrapper"]] = []
278
- """Currently built dataset"""
279
-
280
275
  def __init__(self, annotation, t: type):
281
276
  self.config = None
282
277
  self.repository: Optional[Repository] = None
@@ -287,6 +282,11 @@ class DatasetWrapper(AbstractDataset):
287
282
  repository, components = DataDefinition.repository_relpath(t)
288
283
  super().__init__(repository)
289
284
 
285
+ self.module_name = None
286
+ if repository is None:
287
+ # Try to find the module name
288
+ self.module_name, _ = t.__module__.split(".", 1)
289
+
290
290
  # Set some variables
291
291
  self.url = annotation.url
292
292
  self.doi = annotation.doi
@@ -361,12 +361,6 @@ class DatasetWrapper(AbstractDataset):
361
361
  self._prepare()
362
362
  return super().download(force=force)
363
363
 
364
- @contextmanager
365
- def building(self):
366
- DatasetWrapper.BUILDING.append(self)
367
- yield self
368
- DatasetWrapper.BUILDING.pop()
369
-
370
364
  def _prepare(self) -> "Base":
371
365
  if self.config is not None:
372
366
  return self.config
@@ -378,8 +372,7 @@ class DatasetWrapper(AbstractDataset):
378
372
  # Construct the object
379
373
  resources = {key: value.prepare() for key, value in self.resources.items()}
380
374
 
381
- with self.building():
382
- result = self.t(**resources)
375
+ result = self.t(**resources)
383
376
 
384
377
  # Download resources
385
378
  logging.debug("Building with data type %s and dataset %s", self.base, self.t)
@@ -425,18 +418,11 @@ class DatasetWrapper(AbstractDataset):
425
418
  @property
426
419
  def datapath(self):
427
420
  """Returns the destination path for downloads"""
428
- from datamaestro import Context # noqa: F811
429
-
430
- path = Context.instance().storepath / self._path
431
-
432
- if (self.repository is not None) and (not path.exists()):
433
- old_path: Path = self.repository.datapath / self._path
434
- if old_path.exists():
435
- logging.info(
436
- "Moving from old path [%s] to new path [%s]", old_path, path
437
- )
438
- path.parent.mkdir(exist_ok=True, parents=True)
439
- old_path.rename(path)
421
+ if self.repository is not None:
422
+ return self.repository.datapath / self._path
423
+
424
+ # No repository, use __custom__/[MODULE NAME]
425
+ path = self.context.datapath / "__custom__" / self.module_name / self._path
440
426
 
441
427
  return path
442
428
 
@@ -31,7 +31,7 @@ class Resource(DatasetAnnotation, ABC):
31
31
  self.varname = varname
32
32
  # Ensures that the object is initialized
33
33
  self._post = False
34
- self.definition = None
34
+ self.definition: AbstractDataset = None
35
35
 
36
36
  def annotate(self, dataset: AbstractDataset):
37
37
  assert self.definition is None
@@ -45,9 +45,9 @@ class Resource(DatasetAnnotation, ABC):
45
45
 
46
46
  def contextualize(self):
47
47
  """When using an annotation inline, uses the current dataset wrapper object"""
48
- from datamaestro.definitions import DatasetWrapper
48
+ from datamaestro.definitions import AbstractDataset
49
49
 
50
- wrapper = DatasetWrapper.BUILDING[-1]
50
+ wrapper = AbstractDataset.processing()
51
51
  self.annotate(wrapper)
52
52
 
53
53
  @property
@@ -0,0 +1,21 @@
1
+ from typing import Protocol
2
+ from pathlib import Path
3
+ from datamaestro import Context
4
+ from datamaestro.download import Resource
5
+
6
+
7
+ class Downloader(Protocol):
8
+ def __call__(self, context: Context, root: Path, *, force=False):
9
+ pass
10
+
11
+
12
+ class custom_download(Resource):
13
+ def __init__(self, varname: str, downloader: Downloader):
14
+ super().__init__(varname)
15
+ self.downloader = downloader
16
+
17
+ def prepare(self):
18
+ return self.definition.datapath
19
+
20
+ def download(self, force=False):
21
+ self.downloader(self.context, self.definition.datapath, force=force)
@@ -5,7 +5,7 @@ from datamaestro.download import Download
5
5
 
6
6
 
7
7
  class hf_download(Download):
8
- """Use Hugging Face to donwload a file"""
8
+ """Use Hugging Face to download a file"""
9
9
 
10
10
  def __init__(
11
11
  self,
@@ -9,7 +9,7 @@ import os
9
9
  import urllib3
10
10
  from pathlib import Path
11
11
  import re
12
- from datamaestro.utils import copyfileobjs, FileChecker
12
+ from datamaestro.utils import copyfileobjs
13
13
  from datamaestro.stream import Transform
14
14
  from datamaestro.download import Download
15
15
 
@@ -35,7 +35,7 @@ class SingleDownload(Download):
35
35
  return self.path
36
36
 
37
37
  def download(self, force=False):
38
- if not self.path.is_file():
38
+ if not self.path.is_file() and not force:
39
39
  self._download(self.path)
40
40
 
41
41
 
@@ -96,20 +96,6 @@ class filedownloader(SingleDownload):
96
96
  logging.info("Created file %s" % destination)
97
97
 
98
98
 
99
- def file_from_url(
100
- filename: str,
101
- url: str,
102
- *,
103
- size: Optional[int] = None,
104
- transforms: Optional[Transform] = None,
105
- checker: Optional[FileChecker] = None,
106
- ) -> Path:
107
- """Defines a file that should be downloaded from"""
108
- downloader = filedownloader(filename, url, size, transforms, checker)
109
- downloader.contextualize()
110
- return downloader.path
111
-
112
-
113
99
  class concatdownload(SingleDownload):
114
100
  """Concatenate all files in an archive"""
115
101
 
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '1.3.1'
21
- __version_tuple__ = version_tuple = (1, 3, 1)
20
+ __version__ = version = '1.4.0'
21
+ __version_tuple__ = version_tuple = (1, 4, 0)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro
3
- Version: 1.3.1
3
+ Version: 1.4.0
4
4
  Summary: "Dataset management command line and API"
5
5
  Home-page: https://github.com/experimaestro/datamaestro
6
6
  Author: Benjamin Piwowarski
@@ -13,7 +13,6 @@ Classifier: Intended Audience :: Science/Research
13
13
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
14
14
  Classifier: Operating System :: OS Independent
15
15
  Classifier: Programming Language :: Python
16
- Classifier: Programming Language :: Python :: 3.8
17
16
  Classifier: Programming Language :: Python :: 3.9
18
17
  Classifier: Programming Language :: Python :: 3.10
19
18
  Classifier: Programming Language :: Python :: 3.11
@@ -28,7 +27,7 @@ Requires-Dist: marshmallow
28
27
  Requires-Dist: cached_property
29
28
  Requires-Dist: requests
30
29
  Requires-Dist: bitmath
31
- Requires-Dist: experimaestro>=1.5.0
30
+ Requires-Dist: experimaestro>=1.6
32
31
  Requires-Dist: mkdocs
33
32
  Requires-Dist: pymdown-extensions
34
33
  Requires-Dist: mkdocs-material
@@ -98,22 +97,10 @@ $ datamaestro search tag:image
98
97
  [image] com.lecun.mnist
99
98
 
100
99
  $ datamaestro prepare com.lecun.mnist
101
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-labels-idx1-ubyte
102
- INFO:root:Transforming file
103
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-labels-idx1-ubyte
104
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
105
- INFO:root:Transforming file
106
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
107
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
108
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
109
- Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz: 32.8kB [00:00, 92.1kB/s] INFO:root:Transforming file
110
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
111
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
112
- INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
113
- Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz: 9.92MB [00:00, 10.6MB/s]
114
- INFO:root:Transforming file
115
- INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
116
- ...JSON...
100
+ INFO:root:Materializing 4 resources
101
+ INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/train_images.idx
102
+ INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/test_images.idx
103
+ INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz into .../datamaestro/store/com/lecun/test_labels.idx
117
104
  ```
118
105
 
119
106
  The previous command also returns a JSON on standard output
@@ -159,13 +146,12 @@ and is integrated with [experimaestro](http://experimaestro.github.io/experimaes
159
146
  Its syntax is described in the [documentation](https://datamaestro.readthedocs.io).
160
147
 
161
148
 
162
- For MNIST, this corresponds to.
149
+ For instance, the MNIST dataset can be described by the following definition:
163
150
 
164
151
  ```python
165
- from datamaestro_image.data import ImageClassification, LabelledImages, Base, IDXImage
166
- from datamaestro.download.single import filedownloader
167
- from datamaestro.definitions import argument, datatasks, datatags, dataset
168
- from datamaestro.data.tensor import IDX
152
+ from datamaestro import dataset
153
+ from datamaestro.download.single import filedownloader
154
+ from datamaestro_image.data import ImageClassification, LabelledImages, IDXImage
169
155
 
170
156
 
171
157
  @filedownloader("train_images.idx", "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")
@@ -176,26 +162,37 @@ from datamaestro.data.tensor import IDX
176
162
  ImageClassification,
177
163
  url="http://yann.lecun.com/exdb/mnist/",
178
164
  )
179
- def MNIST(train_images, train_labels, test_images, test_labels):
180
- """The MNIST database
181
-
182
- The MNIST database of handwritten digits, available from this page, has a
183
- training set of 60,000 examples, and a test set of 10,000 examples. It is a
184
- subset of a larger set available from NIST. The digits have been
185
- size-normalized and centered in a fixed-size image.
186
- """
187
- return {
188
- "train": LabelledImages(
189
- images=IDXImage(path=train_images),
190
- labels=IDX(path=train_labels)
191
- ),
192
- "test": LabelledImages(
193
- images=IDXImage(path=test_images),
194
- labels=IDX(path=test_labels)
195
- ),
196
- }
165
+
166
+ return ImageClassification(
167
+ train=LabelledImages(
168
+ images=IDXImage(path=train_images), labels=IDXImage(path=train_labels)
169
+ ),
170
+ test=LabelledImages(
171
+ images=IDXImage(path=test_images), labels=IDXImage(path=test_labels)
172
+ ),
173
+ )
174
+ ```
175
+
176
+ When building dataset modules, some extra documentation can be provided:
177
+
178
+ ```yaml
179
+ ids: [com.lecun.mnist]
180
+ entry_point: "datamaestro_image.config.com.lecun:mnist"
181
+ title: The MNIST database
182
+ url: http://yann.lecun.com/exdb/mnist/
183
+ groups: [image-classification]
184
+ description: |
185
+ The MNIST database of handwritten digits, available from this page,
186
+ has a training set of 60,000 examples, and a test set of 10,000
187
+ examples. It is a subset of a larger set available from NIST. The
188
+ digits have been size-normalized and centered in a fixed-size image.
197
189
  ```
198
190
 
191
+ This makes it possible to:
192
+
193
+ 1. Document the dataset
194
+ 2. Use the command line interface to manipulate it (download resources, etc.)
195
+
199
196
  # 0.8.0
200
197
 
201
198
  - Integration with other repositories: abstracting away the notion of dataset
@@ -58,8 +58,6 @@ src/datamaestro/data/csv.py
58
58
  src/datamaestro/data/huggingface.py
59
59
  src/datamaestro/data/ml.py
60
60
  src/datamaestro/data/tensor.py
61
- src/datamaestro/datasets/__init__.py
62
- src/datamaestro/datasets/yaml_repository.py
63
61
  src/datamaestro/download/__init__.py
64
62
  src/datamaestro/download/archive.py
65
63
  src/datamaestro/download/custom.py
@@ -5,7 +5,7 @@ marshmallow
5
5
  cached_property
6
6
  requests
7
7
  bitmath
8
- experimaestro>=1.5.0
8
+ experimaestro>=1.6
9
9
  mkdocs
10
10
  pymdown-extensions
11
11
  mkdocs-material
@@ -1,6 +0,0 @@
1
- [tool.setuptools_scm]
2
- write_to = "src/datamaestro/version.py"
3
- fallback_version = "0.0.0-dev"
4
-
5
- [build-system]
6
- requires = ["setuptools", "setuptools-scm", "wheel"] # PEP 508 specifications.
File without changes
@@ -1,103 +0,0 @@
1
- import re
2
- from typing import Iterator, Optional
3
- from functools import cached_property
4
- from attrs import field
5
- import importlib
6
- from omegaconf import OmegaConf
7
- from functools import partial
8
- from attrs import define
9
- from datamaestro import BaseRepository
10
- from datamaestro.definitions import AbstractDataset, DatasetWrapper
11
- from datamaestro.data import Base
12
-
13
-
14
- re_spec = re.compile(r"""^(\w\.)+:(\w+)""")
15
-
16
-
17
- @define
18
- class RepositoryDataset:
19
- ids: list[str]
20
- """ID(s) of this dataset"""
21
-
22
- entry_point: str = field(validator=re_spec.match)
23
- """The entry point"""
24
-
25
- title: str
26
- """The full name of the dataset"""
27
-
28
- description: str
29
- """Description of the dataset"""
30
-
31
- url: Optional[str]
32
- """The URL"""
33
-
34
- groups: Optional[list[str]]
35
- """Groups to which this repository belongs"""
36
-
37
-
38
- @define
39
- class RepositoryAuthors:
40
- name: str
41
- email: str
42
-
43
-
44
- @define
45
- class RepositoryGroup:
46
- name: str
47
- tasks: list[str]
48
- tags: list[str]
49
-
50
-
51
- @define
52
- class RepositoryConfiguration:
53
- namespace: str
54
- authors: list[RepositoryAuthors]
55
- description: str
56
- groups: dict[str, RepositoryGroup]
57
- datasets: list[RepositoryDataset]
58
-
59
-
60
- class YAMLDataset(AbstractDataset):
61
- def __init__(self, repository: "YAMLRepository", information: RepositoryDataset):
62
- super().__init__(repository)
63
- self.information = information
64
- self.id = self.information.ids[0]
65
- self.aliases = set(self.information.ids)
66
-
67
- @cached_property
68
- def wrapper(self) -> DatasetWrapper:
69
- module, func_name = self.information.entry_point.split(":")
70
- wrapper = getattr(importlib.import_module(module), func_name)
71
- return wrapper
72
-
73
- def _prepare(self) -> "Base":
74
- return self.wrapper()
75
-
76
- def download(self, **kwargs):
77
- return self.wrapper.download(**kwargs)
78
-
79
-
80
- class YAMLRepository(BaseRepository):
81
- """YAML-based repository"""
82
-
83
- @property
84
- def id(self):
85
- return self.configuration.namespace
86
-
87
- @property
88
- def name(self):
89
- return self.configuration.namespace
90
-
91
- @cached_property
92
- def configuration(self):
93
- schema = OmegaConf.structured(RepositoryConfiguration)
94
- with importlib.resources.path(
95
- self.__class__.__module__, "datamaestro.yaml"
96
- ) as fp:
97
- conf = OmegaConf.load(fp)
98
-
99
- conf: RepositoryConfiguration = OmegaConf.merge(schema, conf)
100
- return conf
101
-
102
- def __iter__(self) -> Iterator["AbstractDataset"]:
103
- return map(partial(YAMLDataset, self), self.configuration.datasets)
@@ -1,29 +0,0 @@
1
- from typing import Protocol
2
- from pathlib import Path
3
- from datamaestro import Context
4
- from datamaestro.definitions import DatasetWrapper
5
- from datamaestro.download import Resource
6
-
7
-
8
- class Downloader(Protocol):
9
- def __call__(self, context: Context, root: Path, *, force=False):
10
- pass
11
-
12
-
13
- class CustomResource(Resource):
14
- def __init__(self, ds_wrapper: DatasetWrapper, downloader: Downloader):
15
- self.ds_wrapper = ds_wrapper
16
- self.downloader = downloader
17
-
18
- def prepare(self):
19
- pass
20
-
21
- def download(self, force=False):
22
- self.downloader(self.context, self.ds_wrapper.datapath, force=force)
23
-
24
-
25
- def custom_download(downloader: Downloader) -> Path:
26
- ds_wrapper = DatasetWrapper.BUILDING[-1]
27
- ds_wrapper.ordered_resources.append(CustomResource(ds_wrapper, downloader))
28
-
29
- return ds_wrapper.datapath
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes