datamaestro 1.6.1__tar.gz → 1.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro-1.7.0/.github/workflows/pytest.yml +33 -0
- datamaestro-1.7.0/.pre-commit-config.yaml +23 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/.readthedocs.yml +7 -8
- {datamaestro-1.6.1 → datamaestro-1.7.0}/PKG-INFO +46 -47
- {datamaestro-1.6.1 → datamaestro-1.7.0}/README.md +39 -46
- datamaestro-1.7.0/docs/source/api/data.md +280 -0
- datamaestro-1.7.0/docs/source/api/download.rst +513 -0
- datamaestro-1.7.0/docs/source/api/index.md +181 -0
- datamaestro-1.7.0/docs/source/api/records.rst +181 -0
- datamaestro-1.7.0/docs/source/cli.md +225 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/docs/source/conf.py +13 -2
- datamaestro-1.7.0/docs/source/configuration.md +181 -0
- datamaestro-1.7.0/docs/source/datasets.rst +401 -0
- datamaestro-1.7.0/docs/source/developping.md +316 -0
- datamaestro-1.7.0/docs/source/getting-started.md +179 -0
- datamaestro-1.7.0/docs/source/index.md +209 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/pyproject.toml +37 -0
- datamaestro-1.7.0/release-notes.md +5 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/__main__.py +9 -6
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/commands/site.py +16 -5
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/data/ml.py +1 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/definitions.py +234 -15
- datamaestro-1.7.0/src/datamaestro/download/__init__.py +655 -0
- datamaestro-1.7.0/src/datamaestro/download/archive.py +218 -0
- datamaestro-1.7.0/src/datamaestro/download/custom.py +53 -0
- datamaestro-1.7.0/src/datamaestro/download/huggingface.py +77 -0
- datamaestro-1.7.0/src/datamaestro/download/links.py +181 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/download/multiple.py +27 -5
- datamaestro-1.7.0/src/datamaestro/download/single.py +183 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/download/sync.py +0 -1
- datamaestro-1.7.0/src/datamaestro/download/todo.py +15 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/download/wayback.py +3 -3
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/record.py +48 -2
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/settings.py +2 -1
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/sphinx.py +1 -3
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/stream/lines.py +8 -6
- datamaestro-1.7.0/src/datamaestro/test/__init__.py +3 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/test/conftest.py +1 -2
- datamaestro-1.7.0/src/datamaestro/test/test_resource.py +1388 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/utils.py +7 -6
- datamaestro-1.7.0/src/datamaestro/v2.md +301 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/version.py +1 -1
- {datamaestro-1.6.1 → datamaestro-1.7.0}/uv.lock +1404 -737
- datamaestro-1.6.1/.github/workflows/pytest.yml +0 -40
- datamaestro-1.6.1/.pre-commit-config.yaml +0 -17
- datamaestro-1.6.1/docs/source/api/data.md +0 -48
- datamaestro-1.6.1/docs/source/api/download.rst +0 -71
- datamaestro-1.6.1/docs/source/api/index.md +0 -17
- datamaestro-1.6.1/docs/source/api/records.rst +0 -112
- datamaestro-1.6.1/docs/source/datasets.rst +0 -68
- datamaestro-1.6.1/docs/source/developping.md +0 -12
- datamaestro-1.6.1/docs/source/index.md +0 -136
- datamaestro-1.6.1/release-notes.md +0 -5
- datamaestro-1.6.1/src/datamaestro/download/__init__.py +0 -112
- datamaestro-1.6.1/src/datamaestro/download/archive.py +0 -174
- datamaestro-1.6.1/src/datamaestro/download/custom.py +0 -21
- datamaestro-1.6.1/src/datamaestro/download/huggingface.py +0 -45
- datamaestro-1.6.1/src/datamaestro/download/links.py +0 -124
- datamaestro-1.6.1/src/datamaestro/download/single.py +0 -126
- datamaestro-1.6.1/src/datamaestro/download/todo.py +0 -10
- datamaestro-1.6.1/src/datamaestro/test/__init__.py +0 -1
- {datamaestro-1.6.1 → datamaestro-1.7.0}/.coverage +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/.flake8 +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/.github/workflows/python-publish.yml +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/.gitignore +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/.python-version +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/CHANGELOG.md +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/LICENSE +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/MANIFEST.in +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/TODO.md +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/cliff.toml +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/docs/Makefile +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/docs/make.bat +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/docs/requirements.txt +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/docs/source/style.css +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/pytest.ini +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/requirements-dev.txt +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/requirements.txt +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/schema.yaml +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/__init__.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/annotations/__init__.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/annotations/agreement.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/commands/__init__.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/commands/mainstyle.css +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/context.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/data/__init__.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/data/csv.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/data/huggingface.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/data/tensor.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/download/manual.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/registry.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/search.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/stream/__init__.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/stream/compress.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/templates/dataset.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/test/checks.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/test/test_annotations.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/test/test_download_handlers.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/src/datamaestro/test/test_record.py +0 -0
- {datamaestro-1.6.1 → datamaestro-1.7.0}/tox.ini +0 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
|
|
2
|
+
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
|
|
3
|
+
|
|
4
|
+
name: Python test
|
|
5
|
+
|
|
6
|
+
on:
|
|
7
|
+
push:
|
|
8
|
+
branches: [ master ]
|
|
9
|
+
pull_request:
|
|
10
|
+
branches: [ master ]
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
build:
|
|
14
|
+
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
strategy:
|
|
17
|
+
matrix:
|
|
18
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
19
|
+
|
|
20
|
+
steps:
|
|
21
|
+
- uses: actions/checkout@v4
|
|
22
|
+
- name: Install uv
|
|
23
|
+
uses: astral-sh/setup-uv@v5
|
|
24
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
25
|
+
run: uv python install ${{ matrix.python-version }}
|
|
26
|
+
- name: Install dependencies
|
|
27
|
+
run: uv sync --group dev
|
|
28
|
+
- name: Lint with ruff
|
|
29
|
+
run: |
|
|
30
|
+
uv run ruff check .
|
|
31
|
+
uv run ruff format --check .
|
|
32
|
+
- name: Test with pytest
|
|
33
|
+
run: uv run pytest
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
default_install_hook_types:
|
|
2
|
+
- pre-commit
|
|
3
|
+
- commit-msg
|
|
4
|
+
|
|
5
|
+
repos:
|
|
6
|
+
- hooks:
|
|
7
|
+
- id: check-yaml
|
|
8
|
+
- id: end-of-file-fixer
|
|
9
|
+
- id: trailing-whitespace
|
|
10
|
+
repo: https://github.com/pre-commit/pre-commit-hooks
|
|
11
|
+
rev: v5.0.0
|
|
12
|
+
- hooks:
|
|
13
|
+
- id: ruff
|
|
14
|
+
args: [--fix]
|
|
15
|
+
- id: ruff-format
|
|
16
|
+
repo: https://github.com/astral-sh/ruff-pre-commit
|
|
17
|
+
rev: v0.9.6
|
|
18
|
+
- hooks:
|
|
19
|
+
- id: conventional-pre-commit
|
|
20
|
+
stages:
|
|
21
|
+
- commit-msg
|
|
22
|
+
repo: https://github.com/compilerla/conventional-pre-commit
|
|
23
|
+
rev: v4.0.0
|
|
@@ -2,20 +2,19 @@
|
|
|
2
2
|
# Read the Docs configuration file
|
|
3
3
|
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
|
|
4
4
|
|
|
5
|
-
# Required
|
|
6
5
|
version: 2
|
|
7
6
|
|
|
8
|
-
sphinx:
|
|
9
|
-
configuration: docs/source/conf.py
|
|
10
|
-
|
|
11
7
|
build:
|
|
12
|
-
os: "ubuntu-
|
|
8
|
+
os: "ubuntu-22.04"
|
|
13
9
|
tools:
|
|
14
|
-
python: "3.
|
|
10
|
+
python: "3.11"
|
|
11
|
+
|
|
12
|
+
sphinx:
|
|
13
|
+
configuration: docs/source/conf.py
|
|
15
14
|
|
|
16
|
-
# Install the package
|
|
17
15
|
python:
|
|
18
16
|
install:
|
|
19
17
|
- method: pip
|
|
20
18
|
path: .
|
|
21
|
-
|
|
19
|
+
extra_requirements:
|
|
20
|
+
- docs
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datamaestro
|
|
3
|
-
Version: 1.6.1
|
|
3
|
+
Version: 1.7.0
|
|
4
4
|
Summary: Add your description here
|
|
5
5
|
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -25,6 +25,12 @@ Requires-Dist: pymdown-extensions>=10.16
|
|
|
25
25
|
Requires-Dist: requests>=2.32.4
|
|
26
26
|
Requires-Dist: tqdm>=4.67.1
|
|
27
27
|
Requires-Dist: urllib3>=2.5.0
|
|
28
|
+
Provides-Extra: docs
|
|
29
|
+
Requires-Dist: myst-parser>0.18; extra == 'docs'
|
|
30
|
+
Requires-Dist: sphinx-codeautolink>=0.15; extra == 'docs'
|
|
31
|
+
Requires-Dist: sphinx-rtd-theme==1.2.2; extra == 'docs'
|
|
32
|
+
Requires-Dist: sphinx-toolbox>=4.1.2; extra == 'docs'
|
|
33
|
+
Requires-Dist: sphinx>=4.2; extra == 'docs'
|
|
28
34
|
Description-Content-Type: text/markdown
|
|
29
35
|
|
|
30
36
|
[PyPI version](https://badge.fury.io/py/datamaestro) [pre-commit](https://github.com/pre-commit/pre-commit) [DOI](https://zenodo.org/badge/latestdoi/4573876)
|
|
@@ -127,57 +133,50 @@ Out[3]: (dtype('uint8'), (60000, 28, 28))
|
|
|
127
133
|
|
|
128
134
|
## Python definition of datasets
|
|
129
135
|
|
|
130
|
-
|
|
131
|
-
and
|
|
132
|
-
|
|
133
|
-
and is integrated with [experimaestro](http://experimaestro.github.io/experimaestro-python).
|
|
136
|
+
Datasets are defined as Python classes with resource attributes that describe how
|
|
137
|
+
to download and process data. The framework automatically builds a dependency graph
|
|
138
|
+
and handles downloads with two-path safety and state tracking.
|
|
134
139
|
|
|
135
|
-
|
|
140
|
+
```python
|
|
141
|
+
from datamaestro_image.data import ImageClassification, LabelledImages
|
|
142
|
+
from datamaestro.data.tensor import IDX
|
|
143
|
+
from datamaestro.download.single import FileDownloader
|
|
144
|
+
from datamaestro.definitions import AbstractDataset, dataset
|
|
136
145
|
|
|
137
146
|
|
|
138
|
-
|
|
147
|
+
@dataset(url="http://yann.lecun.com/exdb/mnist/")
|
|
148
|
+
class MNIST(ImageClassification):
|
|
149
|
+
"""The MNIST database of handwritten digits."""
|
|
139
150
|
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
return ImageClassification(
|
|
156
|
-
train=LabelledImages(
|
|
157
|
-
images=IDXImage(path=train_images), labels=IDXImage(path=train_labels)
|
|
158
|
-
),
|
|
159
|
-
test=LabelledImages(
|
|
160
|
-
images=IDXImage(path=test_images), labels=IDXImage(path=test_labels)
|
|
161
|
-
),
|
|
151
|
+
TRAIN_IMAGES = FileDownloader(
|
|
152
|
+
"train_images.idx",
|
|
153
|
+
"http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
|
|
154
|
+
)
|
|
155
|
+
TRAIN_LABELS = FileDownloader(
|
|
156
|
+
"train_labels.idx",
|
|
157
|
+
"http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz",
|
|
158
|
+
)
|
|
159
|
+
TEST_IMAGES = FileDownloader(
|
|
160
|
+
"test_images.idx",
|
|
161
|
+
"http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
|
|
162
|
+
)
|
|
163
|
+
TEST_LABELS = FileDownloader(
|
|
164
|
+
"test_labels.idx",
|
|
165
|
+
"http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz",
|
|
162
166
|
)
|
|
163
|
-
```
|
|
164
167
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
digits have been size-normalized and centered in a fixed-size image.
|
|
168
|
+
@classmethod
|
|
169
|
+
def __create_dataset__(cls, dataset: AbstractDataset):
|
|
170
|
+
return cls.C(
|
|
171
|
+
train=LabelledImages(
|
|
172
|
+
images=IDX(path=cls.TRAIN_IMAGES.path),
|
|
173
|
+
labels=IDX(path=cls.TRAIN_LABELS.path),
|
|
174
|
+
),
|
|
175
|
+
test=LabelledImages(
|
|
176
|
+
images=IDX(path=cls.TEST_IMAGES.path),
|
|
177
|
+
labels=IDX(path=cls.TEST_LABELS.path),
|
|
178
|
+
),
|
|
179
|
+
)
|
|
178
180
|
```
|
|
179
181
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
1. Document the dataset
|
|
183
|
-
2. Allow to use the command line interface to manipulate it (download resources, etc.)
|
|
182
|
+
The dataset definition syntax is described in the [documentation](https://datamaestro.readthedocs.io).
|
|
@@ -98,57 +98,50 @@ Out[3]: (dtype('uint8'), (60000, 28, 28))
|
|
|
98
98
|
|
|
99
99
|
## Python definition of datasets
|
|
100
100
|
|
|
101
|
-
|
|
102
|
-
and
|
|
103
|
-
|
|
104
|
-
and is integrated with [experimaestro](http://experimaestro.github.io/experimaestro-python).
|
|
101
|
+
Datasets are defined as Python classes with resource attributes that describe how
|
|
102
|
+
to download and process data. The framework automatically builds a dependency graph
|
|
103
|
+
and handles downloads with two-path safety and state tracking.
|
|
105
104
|
|
|
106
|
-
|
|
105
|
+
```python
|
|
106
|
+
from datamaestro_image.data import ImageClassification, LabelledImages
|
|
107
|
+
from datamaestro.data.tensor import IDX
|
|
108
|
+
from datamaestro.download.single import FileDownloader
|
|
109
|
+
from datamaestro.definitions import AbstractDataset, dataset
|
|
107
110
|
|
|
108
111
|
|
|
109
|
-
|
|
112
|
+
@dataset(url="http://yann.lecun.com/exdb/mnist/")
|
|
113
|
+
class MNIST(ImageClassification):
|
|
114
|
+
"""The MNIST database of handwritten digits."""
|
|
110
115
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
return ImageClassification(
|
|
127
|
-
train=LabelledImages(
|
|
128
|
-
images=IDXImage(path=train_images), labels=IDXImage(path=train_labels)
|
|
129
|
-
),
|
|
130
|
-
test=LabelledImages(
|
|
131
|
-
images=IDXImage(path=test_images), labels=IDXImage(path=test_labels)
|
|
132
|
-
),
|
|
116
|
+
TRAIN_IMAGES = FileDownloader(
|
|
117
|
+
"train_images.idx",
|
|
118
|
+
"http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
|
|
119
|
+
)
|
|
120
|
+
TRAIN_LABELS = FileDownloader(
|
|
121
|
+
"train_labels.idx",
|
|
122
|
+
"http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz",
|
|
123
|
+
)
|
|
124
|
+
TEST_IMAGES = FileDownloader(
|
|
125
|
+
"test_images.idx",
|
|
126
|
+
"http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
|
|
127
|
+
)
|
|
128
|
+
TEST_LABELS = FileDownloader(
|
|
129
|
+
"test_labels.idx",
|
|
130
|
+
"http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz",
|
|
133
131
|
)
|
|
134
|
-
```
|
|
135
132
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
digits have been size-normalized and centered in a fixed-size image.
|
|
133
|
+
@classmethod
|
|
134
|
+
def __create_dataset__(cls, dataset: AbstractDataset):
|
|
135
|
+
return cls.C(
|
|
136
|
+
train=LabelledImages(
|
|
137
|
+
images=IDX(path=cls.TRAIN_IMAGES.path),
|
|
138
|
+
labels=IDX(path=cls.TRAIN_LABELS.path),
|
|
139
|
+
),
|
|
140
|
+
test=LabelledImages(
|
|
141
|
+
images=IDX(path=cls.TEST_IMAGES.path),
|
|
142
|
+
labels=IDX(path=cls.TEST_LABELS.path),
|
|
143
|
+
),
|
|
144
|
+
)
|
|
149
145
|
```
|
|
150
146
|
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
1. Document the dataset
|
|
154
|
-
2. Allow to use the command line interface to manipulate it (download resources, etc.)
|
|
147
|
+
The dataset definition syntax is described in the [documentation](https://datamaestro.readthedocs.io).
|
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
# Data Types
|
|
2
|
+
|
|
3
|
+
Data types define the structure of dataset contents. They inherit from `datamaestro.data.Base`
|
|
4
|
+
and use experimaestro's configuration system for type-safe parameter handling.
|
|
5
|
+
|
|
6
|
+
## Base Types
|
|
7
|
+
|
|
8
|
+
### Base
|
|
9
|
+
|
|
10
|
+
The root class for all data types:
|
|
11
|
+
|
|
12
|
+
```python
|
|
13
|
+
from datamaestro.data import Base
|
|
14
|
+
|
|
15
|
+
class MyData(Base):
|
|
16
|
+
"""Custom data type"""
|
|
17
|
+
pass
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
```{eval-rst}
|
|
21
|
+
.. autoxpmconfig:: datamaestro.data.Base
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### Generic
|
|
25
|
+
|
|
26
|
+
Generic data with a path:
|
|
27
|
+
|
|
28
|
+
```{eval-rst}
|
|
29
|
+
.. autoxpmconfig:: datamaestro.data.Generic
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### File
|
|
33
|
+
|
|
34
|
+
Single file reference:
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from datamaestro.data import File
|
|
38
|
+
|
|
39
|
+
# In dataset definition
|
|
40
|
+
return File(path=downloaded_path)
|
|
41
|
+
|
|
42
|
+
# Usage
|
|
43
|
+
print(ds.path) # Path to the file
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
```{eval-rst}
|
|
47
|
+
.. autoxpmconfig:: datamaestro.data.File
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## CSV Data
|
|
51
|
+
|
|
52
|
+
Package: `datamaestro.data.csv`
|
|
53
|
+
|
|
54
|
+
### Generic CSV
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from datamaestro.data.csv import Generic
|
|
58
|
+
|
|
59
|
+
return Generic(
|
|
60
|
+
path=csv_path,
|
|
61
|
+
separator=",",
|
|
62
|
+
names_row=0, # Header row index
|
|
63
|
+
size=1000, # Number of rows (optional)
|
|
64
|
+
)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
```{eval-rst}
|
|
68
|
+
.. autoxpmconfig:: datamaestro.data.csv.Generic
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Matrix CSV
|
|
72
|
+
|
|
73
|
+
For numeric CSV data:
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from datamaestro.data.csv import Matrix
|
|
77
|
+
|
|
78
|
+
return Matrix(
|
|
79
|
+
path=csv_path,
|
|
80
|
+
separator=",",
|
|
81
|
+
target=-1, # Target column index (-1 for last)
|
|
82
|
+
)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
```{eval-rst}
|
|
86
|
+
.. autoxpmconfig:: datamaestro.data.csv.Matrix
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Tensor Data
|
|
90
|
+
|
|
91
|
+
Package: `datamaestro.data.tensor`
|
|
92
|
+
|
|
93
|
+
### IDX Format
|
|
94
|
+
|
|
95
|
+
The IDX format is used by MNIST and similar datasets:
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from datamaestro.data.tensor import IDX
|
|
99
|
+
|
|
100
|
+
idx_data = IDX(path=idx_file_path)
|
|
101
|
+
|
|
102
|
+
# Load as numpy array
|
|
103
|
+
array = idx_data.data()
|
|
104
|
+
print(array.shape) # e.g., (60000, 28, 28)
|
|
105
|
+
print(array.dtype) # e.g., uint8
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
```{eval-rst}
|
|
109
|
+
.. autoxpmconfig:: datamaestro.data.tensor.IDX
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Machine Learning
|
|
113
|
+
|
|
114
|
+
Package: `datamaestro.data.ml`
|
|
115
|
+
|
|
116
|
+
### Supervised Learning
|
|
117
|
+
|
|
118
|
+
For supervised learning datasets with train/test splits:
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
from datamaestro.data.ml import Supervised
|
|
122
|
+
|
|
123
|
+
return Supervised(
|
|
124
|
+
train=train_data,
|
|
125
|
+
test=test_data,
|
|
126
|
+
validation=validation_data, # Optional
|
|
127
|
+
)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
```{eval-rst}
|
|
131
|
+
.. autoxpmconfig:: datamaestro.data.ml.Supervised
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## HuggingFace Integration
|
|
135
|
+
|
|
136
|
+
Package: `datamaestro.data.huggingface`
|
|
137
|
+
|
|
138
|
+
For datasets from the HuggingFace Hub:
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from datamaestro.data.huggingface import DatasetDict
|
|
142
|
+
|
|
143
|
+
return DatasetDict(
|
|
144
|
+
dataset_id="squad",
|
|
145
|
+
config=None, # Optional config name
|
|
146
|
+
)
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Creating Custom Data Types
|
|
150
|
+
|
|
151
|
+
### Basic Custom Type
|
|
152
|
+
|
|
153
|
+
Create custom data types by inheriting from {py:class}`~datamaestro.data.Base`.
|
|
154
|
+
Use `Param` from experimaestro to define typed parameters:
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
from pathlib import Path
|
|
158
|
+
from experimaestro import Param
|
|
159
|
+
from datamaestro.data import Base
|
|
160
|
+
|
|
161
|
+
class TextCorpus(Base):
|
|
162
|
+
"""A text corpus with documents"""
|
|
163
|
+
|
|
164
|
+
path: Param[Path]
|
|
165
|
+
"""Path to the corpus directory"""
|
|
166
|
+
|
|
167
|
+
encoding: Param[str] = "utf-8"
|
|
168
|
+
"""Text encoding"""
|
|
169
|
+
|
|
170
|
+
def documents(self):
|
|
171
|
+
"""Iterate over documents"""
|
|
172
|
+
for file in self.path.glob("*.txt"):
|
|
173
|
+
yield file.read_text(encoding=self.encoding)
|
|
174
|
+
|
|
175
|
+
def __len__(self):
|
|
176
|
+
return len(list(self.path.glob("*.txt")))
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### Nested Data Types
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
from experimaestro import Param
|
|
183
|
+
from datamaestro.data import Base
|
|
184
|
+
|
|
185
|
+
class LabelledData(Base):
|
|
186
|
+
"""Data with labels"""
|
|
187
|
+
|
|
188
|
+
data: Param[Base]
|
|
189
|
+
"""The actual data"""
|
|
190
|
+
|
|
191
|
+
labels: Param[Base]
|
|
192
|
+
"""The labels"""
|
|
193
|
+
|
|
194
|
+
class ImageClassification(Base):
|
|
195
|
+
"""Image classification dataset"""
|
|
196
|
+
|
|
197
|
+
train: Param[LabelledData]
|
|
198
|
+
"""Training split"""
|
|
199
|
+
|
|
200
|
+
test: Param[LabelledData]
|
|
201
|
+
"""Test split"""
|
|
202
|
+
|
|
203
|
+
num_classes: Param[int]
|
|
204
|
+
"""Number of classes"""
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
### With Data Loading Methods
|
|
208
|
+
|
|
209
|
+
```python
|
|
210
|
+
from pathlib import Path
|
|
211
|
+
from experimaestro import Param
|
|
212
|
+
from datamaestro.data import Base
|
|
213
|
+
|
|
214
|
+
class JSONLData(Base):
|
|
215
|
+
"""JSON Lines format data"""
|
|
216
|
+
|
|
217
|
+
path: Param[Path]
|
|
218
|
+
|
|
219
|
+
def __iter__(self):
|
|
220
|
+
"""Iterate over records"""
|
|
221
|
+
import json
|
|
222
|
+
with open(self.path) as f:
|
|
223
|
+
for line in f:
|
|
224
|
+
yield json.loads(line)
|
|
225
|
+
|
|
226
|
+
def to_pandas(self):
|
|
227
|
+
"""Load as pandas DataFrame"""
|
|
228
|
+
import pandas as pd
|
|
229
|
+
return pd.read_json(self.path, lines=True)
|
|
230
|
+
|
|
231
|
+
def to_list(self):
|
|
232
|
+
"""Load all records into a list"""
|
|
233
|
+
return list(self)
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
### Inheriting from Existing Types
|
|
237
|
+
|
|
238
|
+
```python
|
|
239
|
+
from datamaestro.data.csv import Matrix
|
|
240
|
+
|
|
241
|
+
class ClassificationMatrix(Matrix):
|
|
242
|
+
"""CSV matrix for classification tasks"""
|
|
243
|
+
|
|
244
|
+
num_classes: Param[int]
|
|
245
|
+
"""Number of target classes"""
|
|
246
|
+
|
|
247
|
+
class_names: Param[list] = None
|
|
248
|
+
"""Optional class names"""
|
|
249
|
+
|
|
250
|
+
def get_class_name(self, index: int) -> str:
|
|
251
|
+
if self.class_names:
|
|
252
|
+
return self.class_names[index]
|
|
253
|
+
return str(index)
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
## Type Annotations with Experimaestro
|
|
257
|
+
|
|
258
|
+
Data types use experimaestro's annotation system ({py:class}`~experimaestro.Param`,
|
|
259
|
+
{py:class}`~experimaestro.Option`, {py:class}`~experimaestro.Meta`):
|
|
260
|
+
|
|
261
|
+
```python
|
|
262
|
+
from experimaestro import Param, Option, Meta
|
|
263
|
+
from datamaestro.data import Base
|
|
264
|
+
|
|
265
|
+
class MyData(Base):
|
|
266
|
+
# Required parameter
|
|
267
|
+
path: Param[Path]
|
|
268
|
+
|
|
269
|
+
# Optional parameter with default
|
|
270
|
+
encoding: Param[str] = "utf-8"
|
|
271
|
+
|
|
272
|
+
# Option (not serialized, for runtime configuration)
|
|
273
|
+
cache_size: Option[int] = 1000
|
|
274
|
+
|
|
275
|
+
# Metadata (not part of configuration identity)
|
|
276
|
+
description: Meta[str] = ""
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
See the [experimaestro documentation](https://experimaestro-python.readthedocs.io/)
|
|
280
|
+
for more details on the configuration system.
|