datamaestro 1.6.1__tar.gz → 1.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. datamaestro-1.7.0/.github/workflows/pytest.yml +33 -0
  2. datamaestro-1.7.0/.pre-commit-config.yaml +23 -0
  3. {datamaestro-1.6.1 → datamaestro-1.7.0}/.readthedocs.yml +7 -8
  4. {datamaestro-1.6.1 → datamaestro-1.7.0}/PKG-INFO +46 -47
  5. {datamaestro-1.6.1 → datamaestro-1.7.0}/README.md +39 -46
  6. datamaestro-1.7.0/docs/source/api/data.md +280 -0
  7. datamaestro-1.7.0/docs/source/api/download.rst +513 -0
  8. datamaestro-1.7.0/docs/source/api/index.md +181 -0
  9. datamaestro-1.7.0/docs/source/api/records.rst +181 -0
  10. datamaestro-1.7.0/docs/source/cli.md +225 -0
  11. {datamaestro-1.6.1 → datamaestro-1.7.0}/docs/source/conf.py +13 -2
  12. datamaestro-1.7.0/docs/source/configuration.md +181 -0
  13. datamaestro-1.7.0/docs/source/datasets.rst +401 -0
  14. datamaestro-1.7.0/docs/source/developping.md +316 -0
  15. datamaestro-1.7.0/docs/source/getting-started.md +179 -0
  16. datamaestro-1.7.0/docs/source/index.md +209 -0
  17. {datamaestro-1.6.1 → datamaestro-1.7.0}/pyproject.toml +37 -0
  18. datamaestro-1.7.0/release-notes.md +5 -0
  19. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/__main__.py +9 -6
  20. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/commands/site.py +16 -5
  21. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/data/ml.py +1 -0
  22. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/definitions.py +234 -15
  23. datamaestro-1.7.0/src/datamaestro/download/__init__.py +655 -0
  24. datamaestro-1.7.0/src/datamaestro/download/archive.py +218 -0
  25. datamaestro-1.7.0/src/datamaestro/download/custom.py +53 -0
  26. datamaestro-1.7.0/src/datamaestro/download/huggingface.py +77 -0
  27. datamaestro-1.7.0/src/datamaestro/download/links.py +181 -0
  28. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/download/multiple.py +27 -5
  29. datamaestro-1.7.0/src/datamaestro/download/single.py +183 -0
  30. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/download/sync.py +0 -1
  31. datamaestro-1.7.0/src/datamaestro/download/todo.py +15 -0
  32. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/download/wayback.py +3 -3
  33. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/record.py +48 -2
  34. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/settings.py +2 -1
  35. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/sphinx.py +1 -3
  36. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/stream/lines.py +8 -6
  37. datamaestro-1.7.0/src/datamaestro/test/__init__.py +3 -0
  38. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/test/conftest.py +1 -2
  39. datamaestro-1.7.0/src/datamaestro/test/test_resource.py +1388 -0
  40. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/utils.py +7 -6
  41. datamaestro-1.7.0/src/datamaestro/v2.md +301 -0
  42. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/version.py +1 -1
  43. {datamaestro-1.6.1 → datamaestro-1.7.0}/uv.lock +1404 -737
  44. datamaestro-1.6.1/.github/workflows/pytest.yml +0 -40
  45. datamaestro-1.6.1/.pre-commit-config.yaml +0 -17
  46. datamaestro-1.6.1/docs/source/api/data.md +0 -48
  47. datamaestro-1.6.1/docs/source/api/download.rst +0 -71
  48. datamaestro-1.6.1/docs/source/api/index.md +0 -17
  49. datamaestro-1.6.1/docs/source/api/records.rst +0 -112
  50. datamaestro-1.6.1/docs/source/datasets.rst +0 -68
  51. datamaestro-1.6.1/docs/source/developping.md +0 -12
  52. datamaestro-1.6.1/docs/source/index.md +0 -136
  53. datamaestro-1.6.1/release-notes.md +0 -5
  54. datamaestro-1.6.1/src/datamaestro/download/__init__.py +0 -112
  55. datamaestro-1.6.1/src/datamaestro/download/archive.py +0 -174
  56. datamaestro-1.6.1/src/datamaestro/download/custom.py +0 -21
  57. datamaestro-1.6.1/src/datamaestro/download/huggingface.py +0 -45
  58. datamaestro-1.6.1/src/datamaestro/download/links.py +0 -124
  59. datamaestro-1.6.1/src/datamaestro/download/single.py +0 -126
  60. datamaestro-1.6.1/src/datamaestro/download/todo.py +0 -10
  61. datamaestro-1.6.1/src/datamaestro/test/__init__.py +0 -1
  62. {datamaestro-1.6.1 → datamaestro-1.7.0}/.coverage +0 -0
  63. {datamaestro-1.6.1 → datamaestro-1.7.0}/.flake8 +0 -0
  64. {datamaestro-1.6.1 → datamaestro-1.7.0}/.github/workflows/python-publish.yml +0 -0
  65. {datamaestro-1.6.1 → datamaestro-1.7.0}/.gitignore +0 -0
  66. {datamaestro-1.6.1 → datamaestro-1.7.0}/.python-version +0 -0
  67. {datamaestro-1.6.1 → datamaestro-1.7.0}/CHANGELOG.md +0 -0
  68. {datamaestro-1.6.1 → datamaestro-1.7.0}/LICENSE +0 -0
  69. {datamaestro-1.6.1 → datamaestro-1.7.0}/MANIFEST.in +0 -0
  70. {datamaestro-1.6.1 → datamaestro-1.7.0}/TODO.md +0 -0
  71. {datamaestro-1.6.1 → datamaestro-1.7.0}/cliff.toml +0 -0
  72. {datamaestro-1.6.1 → datamaestro-1.7.0}/docs/Makefile +0 -0
  73. {datamaestro-1.6.1 → datamaestro-1.7.0}/docs/make.bat +0 -0
  74. {datamaestro-1.6.1 → datamaestro-1.7.0}/docs/requirements.txt +0 -0
  75. {datamaestro-1.6.1 → datamaestro-1.7.0}/docs/source/style.css +0 -0
  76. {datamaestro-1.6.1 → datamaestro-1.7.0}/pytest.ini +0 -0
  77. {datamaestro-1.6.1 → datamaestro-1.7.0}/requirements-dev.txt +0 -0
  78. {datamaestro-1.6.1 → datamaestro-1.7.0}/requirements.txt +0 -0
  79. {datamaestro-1.6.1 → datamaestro-1.7.0}/schema.yaml +0 -0
  80. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/__init__.py +0 -0
  81. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/annotations/__init__.py +0 -0
  82. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/annotations/agreement.py +0 -0
  83. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/commands/__init__.py +0 -0
  84. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/commands/mainstyle.css +0 -0
  85. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/context.py +0 -0
  86. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/data/__init__.py +0 -0
  87. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/data/csv.py +0 -0
  88. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/data/huggingface.py +0 -0
  89. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/data/tensor.py +0 -0
  90. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/download/manual.py +0 -0
  91. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/registry.py +0 -0
  92. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/search.py +0 -0
  93. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/stream/__init__.py +0 -0
  94. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/stream/compress.py +0 -0
  95. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/templates/dataset.py +0 -0
  96. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/test/checks.py +0 -0
  97. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/test/test_annotations.py +0 -0
  98. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/test/test_download_handlers.py +0 -0
  99. {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/test/test_record.py +0 -0
  100. {datamaestro-1.6.1 → datamaestro-1.7.0}/tox.ini +0 -0
@@ -0,0 +1,33 @@
1
+ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2
+ # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3
+
4
+ name: Python test
5
+
6
+ on:
7
+ push:
8
+ branches: [ master ]
9
+ pull_request:
10
+ branches: [ master ]
11
+
12
+ jobs:
13
+ build:
14
+
15
+ runs-on: ubuntu-latest
16
+ strategy:
17
+ matrix:
18
+ python-version: ["3.10", "3.11", "3.12"]
19
+
20
+ steps:
21
+ - uses: actions/checkout@v4
22
+ - name: Install uv
23
+ uses: astral-sh/setup-uv@v5
24
+ - name: Set up Python ${{ matrix.python-version }}
25
+ run: uv python install ${{ matrix.python-version }}
26
+ - name: Install dependencies
27
+ run: uv sync --group dev
28
+ - name: Lint with ruff
29
+ run: |
30
+ uv run ruff check .
31
+ uv run ruff format --check .
32
+ - name: Test with pytest
33
+ run: uv run pytest
@@ -0,0 +1,23 @@
1
+ default_install_hook_types:
2
+ - pre-commit
3
+ - commit-msg
4
+
5
+ repos:
6
+ - hooks:
7
+ - id: check-yaml
8
+ - id: end-of-file-fixer
9
+ - id: trailing-whitespace
10
+ repo: https://github.com/pre-commit/pre-commit-hooks
11
+ rev: v5.0.0
12
+ - hooks:
13
+ - id: ruff
14
+ args: [--fix]
15
+ - id: ruff-format
16
+ repo: https://github.com/astral-sh/ruff-pre-commit
17
+ rev: v0.9.6
18
+ - hooks:
19
+ - id: conventional-pre-commit
20
+ stages:
21
+ - commit-msg
22
+ repo: https://github.com/compilerla/conventional-pre-commit
23
+ rev: v4.0.0
@@ -2,20 +2,19 @@
2
2
  # Read the Docs configuration file
3
3
  # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4
4
 
5
- # Required
6
5
  version: 2
7
6
 
8
- sphinx:
9
- configuration: docs/source/conf.py
10
-
11
7
  build:
12
- os: "ubuntu-20.04"
8
+ os: "ubuntu-22.04"
13
9
  tools:
14
- python: "3.10"
10
+ python: "3.11"
11
+
12
+ sphinx:
13
+ configuration: docs/source/conf.py
15
14
 
16
- # Install the package
17
15
  python:
18
16
  install:
19
17
  - method: pip
20
18
  path: .
21
- - requirements: docs/requirements.txt
19
+ extra_requirements:
20
+ - docs
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro
3
- Version: 1.6.1
3
+ Version: 1.7.0
4
4
  Summary: Add your description here
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License-File: LICENSE
@@ -25,6 +25,12 @@ Requires-Dist: pymdown-extensions>=10.16
25
25
  Requires-Dist: requests>=2.32.4
26
26
  Requires-Dist: tqdm>=4.67.1
27
27
  Requires-Dist: urllib3>=2.5.0
28
+ Provides-Extra: docs
29
+ Requires-Dist: myst-parser>0.18; extra == 'docs'
30
+ Requires-Dist: sphinx-codeautolink>=0.15; extra == 'docs'
31
+ Requires-Dist: sphinx-rtd-theme==1.2.2; extra == 'docs'
32
+ Requires-Dist: sphinx-toolbox>=4.1.2; extra == 'docs'
33
+ Requires-Dist: sphinx>=4.2; extra == 'docs'
28
34
  Description-Content-Type: text/markdown
29
35
 
30
36
  [![PyPI version](https://badge.fury.io/py/datamaestro.svg)](https://badge.fury.io/py/datamaestro) [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![DOI](https://zenodo.org/badge/4573876.svg)](https://zenodo.org/badge/latestdoi/4573876)
@@ -127,57 +133,50 @@ Out[3]: (dtype('uint8'), (60000, 28, 28))
127
133
 
128
134
  ## Python definition of datasets
129
135
 
130
- Each dataset (or a set of related datasets) is described in Python using a mix of declarative
131
- and imperative statements. This allows to quickly define how to download dataset using the
132
- datamaestro declarative API; the imperative part is used when creating the JSON output,
133
- and is integrated with [experimaestro](http://experimaestro.github.io/experimaestro-python).
136
+ Datasets are defined as Python classes with resource attributes that describe how
137
+ to download and process data. The framework automatically builds a dependency graph
138
+ and handles downloads with two-path safety and state tracking.
134
139
 
135
- Its syntax is described in the [documentation](https://datamaestro.readthedocs.io).
140
+ ```python
141
+ from datamaestro_image.data import ImageClassification, LabelledImages
142
+ from datamaestro.data.tensor import IDX
143
+ from datamaestro.download.single import FileDownloader
144
+ from datamaestro.definitions import AbstractDataset, dataset
136
145
 
137
146
 
138
- For instance, the MNIST dataset can be described by the following
147
+ @dataset(url="http://yann.lecun.com/exdb/mnist/")
148
+ class MNIST(ImageClassification):
149
+ """The MNIST database of handwritten digits."""
139
150
 
140
- ```python
141
- from datamaestro import dataset
142
- from datamaestro.download.single import download_file
143
- from datamaestro_image.data import ImageClassification, LabelledImages, IDXImage
144
-
145
-
146
- @filedownloader("train_images.idx", "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")
147
- @filedownloader("train_labels.idx", "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz")
148
- @filedownloader("test_images.idx", "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")
149
- @filedownloader("test_labels.idx", "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")
150
- @dataset(
151
- ImageClassification,
152
- url="http://yann.lecun.com/exdb/mnist/",
153
- )
154
-
155
- return ImageClassification(
156
- train=LabelledImages(
157
- images=IDXImage(path=train_images), labels=IDXImage(path=train_labels)
158
- ),
159
- test=LabelledImages(
160
- images=IDXImage(path=test_images), labels=IDXImage(path=test_labels)
161
- ),
151
+ TRAIN_IMAGES = FileDownloader(
152
+ "train_images.idx",
153
+ "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
154
+ )
155
+ TRAIN_LABELS = FileDownloader(
156
+ "train_labels.idx",
157
+ "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz",
158
+ )
159
+ TEST_IMAGES = FileDownloader(
160
+ "test_images.idx",
161
+ "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
162
+ )
163
+ TEST_LABELS = FileDownloader(
164
+ "test_labels.idx",
165
+ "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz",
162
166
  )
163
- ```
164
167
 
165
- When building dataset modules, some extra documentation can be provided:
166
-
167
- ```yaml
168
- ids: [com.lecun.mnist]
169
- entry_point: "datamaestro_image.config.com.lecun:mnist"
170
- title: The MNIST database
171
- url: http://yann.lecun.com/exdb/mnist/
172
- groups: [image-classification]
173
- description: |
174
- The MNIST database of handwritten digits, available from this page,
175
- has a training set of 60,000 examples, and a test set of 10,000
176
- examples. It is a subset of a larger set available from NIST. The
177
- digits have been size-normalized and centered in a fixed-size image.
168
+ @classmethod
169
+ def __create_dataset__(cls, dataset: AbstractDataset):
170
+ return cls.C(
171
+ train=LabelledImages(
172
+ images=IDX(path=cls.TRAIN_IMAGES.path),
173
+ labels=IDX(path=cls.TRAIN_LABELS.path),
174
+ ),
175
+ test=LabelledImages(
176
+ images=IDX(path=cls.TEST_IMAGES.path),
177
+ labels=IDX(path=cls.TEST_LABELS.path),
178
+ ),
179
+ )
178
180
  ```
179
181
 
180
- This will allow to
181
-
182
- 1. Document the dataset
183
- 2. Allow to use the command line interface to manipulate it (download resources, etc.)
182
+ Its syntax is described in the [documentation](https://datamaestro.readthedocs.io).
@@ -98,57 +98,50 @@ Out[3]: (dtype('uint8'), (60000, 28, 28))
98
98
 
99
99
  ## Python definition of datasets
100
100
 
101
- Each dataset (or a set of related datasets) is described in Python using a mix of declarative
102
- and imperative statements. This allows to quickly define how to download dataset using the
103
- datamaestro declarative API; the imperative part is used when creating the JSON output,
104
- and is integrated with [experimaestro](http://experimaestro.github.io/experimaestro-python).
101
+ Datasets are defined as Python classes with resource attributes that describe how
102
+ to download and process data. The framework automatically builds a dependency graph
103
+ and handles downloads with two-path safety and state tracking.
105
104
 
106
- Its syntax is described in the [documentation](https://datamaestro.readthedocs.io).
105
+ ```python
106
+ from datamaestro_image.data import ImageClassification, LabelledImages
107
+ from datamaestro.data.tensor import IDX
108
+ from datamaestro.download.single import FileDownloader
109
+ from datamaestro.definitions import AbstractDataset, dataset
107
110
 
108
111
 
109
- For instance, the MNIST dataset can be described by the following
112
+ @dataset(url="http://yann.lecun.com/exdb/mnist/")
113
+ class MNIST(ImageClassification):
114
+ """The MNIST database of handwritten digits."""
110
115
 
111
- ```python
112
- from datamaestro import dataset
113
- from datamaestro.download.single import download_file
114
- from datamaestro_image.data import ImageClassification, LabelledImages, IDXImage
115
-
116
-
117
- @filedownloader("train_images.idx", "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")
118
- @filedownloader("train_labels.idx", "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz")
119
- @filedownloader("test_images.idx", "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")
120
- @filedownloader("test_labels.idx", "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")
121
- @dataset(
122
- ImageClassification,
123
- url="http://yann.lecun.com/exdb/mnist/",
124
- )
125
-
126
- return ImageClassification(
127
- train=LabelledImages(
128
- images=IDXImage(path=train_images), labels=IDXImage(path=train_labels)
129
- ),
130
- test=LabelledImages(
131
- images=IDXImage(path=test_images), labels=IDXImage(path=test_labels)
132
- ),
116
+ TRAIN_IMAGES = FileDownloader(
117
+ "train_images.idx",
118
+ "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
119
+ )
120
+ TRAIN_LABELS = FileDownloader(
121
+ "train_labels.idx",
122
+ "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz",
123
+ )
124
+ TEST_IMAGES = FileDownloader(
125
+ "test_images.idx",
126
+ "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
127
+ )
128
+ TEST_LABELS = FileDownloader(
129
+ "test_labels.idx",
130
+ "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz",
133
131
  )
134
- ```
135
132
 
136
- When building dataset modules, some extra documentation can be provided:
137
-
138
- ```yaml
139
- ids: [com.lecun.mnist]
140
- entry_point: "datamaestro_image.config.com.lecun:mnist"
141
- title: The MNIST database
142
- url: http://yann.lecun.com/exdb/mnist/
143
- groups: [image-classification]
144
- description: |
145
- The MNIST database of handwritten digits, available from this page,
146
- has a training set of 60,000 examples, and a test set of 10,000
147
- examples. It is a subset of a larger set available from NIST. The
148
- digits have been size-normalized and centered in a fixed-size image.
133
+ @classmethod
134
+ def __create_dataset__(cls, dataset: AbstractDataset):
135
+ return cls.C(
136
+ train=LabelledImages(
137
+ images=IDX(path=cls.TRAIN_IMAGES.path),
138
+ labels=IDX(path=cls.TRAIN_LABELS.path),
139
+ ),
140
+ test=LabelledImages(
141
+ images=IDX(path=cls.TEST_IMAGES.path),
142
+ labels=IDX(path=cls.TEST_LABELS.path),
143
+ ),
144
+ )
149
145
  ```
150
146
 
151
- This will allow to
152
-
153
- 1. Document the dataset
154
- 2. Allow to use the command line interface to manipulate it (download resources, etc.)
147
+ Its syntax is described in the [documentation](https://datamaestro.readthedocs.io).
@@ -0,0 +1,280 @@
1
+ # Data Types
2
+
3
+ Data types define the structure of dataset contents. They inherit from `datamaestro.data.Base`
4
+ and use experimaestro's configuration system for type-safe parameter handling.
5
+
6
+ ## Base Types
7
+
8
+ ### Base
9
+
10
+ The root class for all data types:
11
+
12
+ ```python
13
+ from datamaestro.data import Base
14
+
15
+ class MyData(Base):
16
+ """Custom data type"""
17
+ pass
18
+ ```
19
+
20
+ ```{eval-rst}
21
+ .. autoxpmconfig:: datamaestro.data.Base
22
+ ```
23
+
24
+ ### Generic
25
+
26
+ Generic data with a path:
27
+
28
+ ```{eval-rst}
29
+ .. autoxpmconfig:: datamaestro.data.Generic
30
+ ```
31
+
32
+ ### File
33
+
34
+ Single file reference:
35
+
36
+ ```python
37
+ from datamaestro.data import File
38
+
39
+ # In dataset definition
40
+ return File(path=downloaded_path)
41
+
42
+ # Usage
43
+ print(ds.path) # Path to the file
44
+ ```
45
+
46
+ ```{eval-rst}
47
+ .. autoxpmconfig:: datamaestro.data.File
48
+ ```
49
+
50
+ ## CSV Data
51
+
52
+ Package: `datamaestro.data.csv`
53
+
54
+ ### Generic CSV
55
+
56
+ ```python
57
+ from datamaestro.data.csv import Generic
58
+
59
+ return Generic(
60
+ path=csv_path,
61
+ separator=",",
62
+ names_row=0, # Header row index
63
+ size=1000, # Number of rows (optional)
64
+ )
65
+ ```
66
+
67
+ ```{eval-rst}
68
+ .. autoxpmconfig:: datamaestro.data.csv.Generic
69
+ ```
70
+
71
+ ### Matrix CSV
72
+
73
+ For numeric CSV data:
74
+
75
+ ```python
76
+ from datamaestro.data.csv import Matrix
77
+
78
+ return Matrix(
79
+ path=csv_path,
80
+ separator=",",
81
+ target=-1, # Target column index (-1 for last)
82
+ )
83
+ ```
84
+
85
+ ```{eval-rst}
86
+ .. autoxpmconfig:: datamaestro.data.csv.Matrix
87
+ ```
88
+
89
+ ## Tensor Data
90
+
91
+ Package: `datamaestro.data.tensor`
92
+
93
+ ### IDX Format
94
+
95
+ The IDX format is used by MNIST and similar datasets:
96
+
97
+ ```python
98
+ from datamaestro.data.tensor import IDX
99
+
100
+ idx_data = IDX(path=idx_file_path)
101
+
102
+ # Load as numpy array
103
+ array = idx_data.data()
104
+ print(array.shape) # e.g., (60000, 28, 28)
105
+ print(array.dtype) # e.g., uint8
106
+ ```
107
+
108
+ ```{eval-rst}
109
+ .. autoxpmconfig:: datamaestro.data.tensor.IDX
110
+ ```
111
+
112
+ ## Machine Learning
113
+
114
+ Package: `datamaestro.data.ml`
115
+
116
+ ### Supervised Learning
117
+
118
+ For supervised learning datasets with train/test splits:
119
+
120
+ ```python
121
+ from datamaestro.data.ml import Supervised
122
+
123
+ return Supervised(
124
+ train=train_data,
125
+ test=test_data,
126
+ validation=validation_data, # Optional
127
+ )
128
+ ```
129
+
130
+ ```{eval-rst}
131
+ .. autoxpmconfig:: datamaestro.data.ml.Supervised
132
+ ```
133
+
134
+ ## HuggingFace Integration
135
+
136
+ Package: `datamaestro.data.huggingface`
137
+
138
+ For datasets from the HuggingFace Hub:
139
+
140
+ ```python
141
+ from datamaestro.data.huggingface import DatasetDict
142
+
143
+ return DatasetDict(
144
+ dataset_id="squad",
145
+ config=None, # Optional config name
146
+ )
147
+ ```
148
+
149
+ ## Creating Custom Data Types
150
+
151
+ ### Basic Custom Type
152
+
153
+ Create custom data types by inheriting from {py:class}`~datamaestro.data.Base`.
154
+ Use `Param` from experimaestro to define typed parameters:
155
+
156
+ ```python
157
+ from pathlib import Path
158
+ from experimaestro import Param
159
+ from datamaestro.data import Base
160
+
161
+ class TextCorpus(Base):
162
+ """A text corpus with documents"""
163
+
164
+ path: Param[Path]
165
+ """Path to the corpus directory"""
166
+
167
+ encoding: Param[str] = "utf-8"
168
+ """Text encoding"""
169
+
170
+ def documents(self):
171
+ """Iterate over documents"""
172
+ for file in self.path.glob("*.txt"):
173
+ yield file.read_text(encoding=self.encoding)
174
+
175
+ def __len__(self):
176
+ return len(list(self.path.glob("*.txt")))
177
+ ```
178
+
179
+ ### Nested Data Types
180
+
181
+ ```python
182
+ from experimaestro import Param
183
+ from datamaestro.data import Base
184
+
185
+ class LabelledData(Base):
186
+ """Data with labels"""
187
+
188
+ data: Param[Base]
189
+ """The actual data"""
190
+
191
+ labels: Param[Base]
192
+ """The labels"""
193
+
194
+ class ImageClassification(Base):
195
+ """Image classification dataset"""
196
+
197
+ train: Param[LabelledData]
198
+ """Training split"""
199
+
200
+ test: Param[LabelledData]
201
+ """Test split"""
202
+
203
+ num_classes: Param[int]
204
+ """Number of classes"""
205
+ ```
206
+
207
+ ### With Data Loading Methods
208
+
209
+ ```python
210
+ from pathlib import Path
211
+ from experimaestro import Param
212
+ from datamaestro.data import Base
213
+
214
+ class JSONLData(Base):
215
+ """JSON Lines format data"""
216
+
217
+ path: Param[Path]
218
+
219
+ def __iter__(self):
220
+ """Iterate over records"""
221
+ import json
222
+ with open(self.path) as f:
223
+ for line in f:
224
+ yield json.loads(line)
225
+
226
+ def to_pandas(self):
227
+ """Load as pandas DataFrame"""
228
+ import pandas as pd
229
+ return pd.read_json(self.path, lines=True)
230
+
231
+ def to_list(self):
232
+ """Load all records into a list"""
233
+ return list(self)
234
+ ```
235
+
236
+ ### Inheriting from Existing Types
237
+
238
+ ```python
239
+ from datamaestro.data.csv import Matrix
240
+
241
+ class ClassificationMatrix(Matrix):
242
+ """CSV matrix for classification tasks"""
243
+
244
+ num_classes: Param[int]
245
+ """Number of target classes"""
246
+
247
+ class_names: Param[list] = None
248
+ """Optional class names"""
249
+
250
+ def get_class_name(self, index: int) -> str:
251
+ if self.class_names:
252
+ return self.class_names[index]
253
+ return str(index)
254
+ ```
255
+
256
+ ## Type Annotations with Experimaestro
257
+
258
+ Data types use experimaestro's annotation system ({py:class}`~experimaestro.Param`,
259
+ {py:class}`~experimaestro.Option`, {py:class}`~experimaestro.Meta`):
260
+
261
+ ```python
262
+ from experimaestro import Param, Option, Meta
263
+ from datamaestro.data import Base
264
+
265
+ class MyData(Base):
266
+ # Required parameter
267
+ path: Param[Path]
268
+
269
+ # Optional parameter with default
270
+ encoding: Param[str] = "utf-8"
271
+
272
+ # Option (not serialized, for runtime configuration)
273
+ cache_size: Option[int] = 1000
274
+
275
+ # Metadata (not part of configuration identity)
276
+ description: Meta[str] = ""
277
+ ```
278
+
279
+ See the [experimaestro documentation](https://experimaestro-python.readthedocs.io/)
280
+ for more details on the configuration system.