datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro/__init__.py +11 -7
- datamaestro/__main__.py +29 -8
- datamaestro/annotations/__init__.py +1 -1
- datamaestro/annotations/agreement.py +9 -3
- datamaestro/commands/site.py +27 -15
- datamaestro/context.py +143 -87
- datamaestro/data/__init__.py +23 -11
- datamaestro/data/csv.py +12 -12
- datamaestro/data/huggingface.py +25 -0
- datamaestro/data/ml.py +19 -10
- datamaestro/data/tensor.py +32 -24
- datamaestro/definitions.py +492 -131
- datamaestro/download/__init__.py +610 -24
- datamaestro/download/archive.py +129 -77
- datamaestro/download/custom.py +53 -0
- datamaestro/download/huggingface.py +77 -0
- datamaestro/download/links.py +106 -50
- datamaestro/download/multiple.py +27 -5
- datamaestro/download/single.py +114 -51
- datamaestro/download/sync.py +0 -1
- datamaestro/download/todo.py +9 -4
- datamaestro/download/wayback.py +164 -0
- datamaestro/record.py +232 -0
- datamaestro/registry.py +1 -0
- datamaestro/search.py +1 -1
- datamaestro/settings.py +3 -1
- datamaestro/sphinx.py +224 -0
- datamaestro/stream/__init__.py +0 -2
- datamaestro/stream/lines.py +10 -7
- datamaestro/templates/dataset.py +5 -4
- datamaestro/test/__init__.py +3 -1
- datamaestro/test/checks.py +1 -5
- datamaestro/test/conftest.py +1 -6
- datamaestro/test/test_annotations.py +2 -2
- datamaestro/test/test_download_handlers.py +3 -4
- datamaestro/test/test_record.py +72 -0
- datamaestro/test/test_resource.py +1388 -0
- datamaestro/utils.py +15 -9
- datamaestro/v2.md +301 -0
- datamaestro/version.py +4 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
- datamaestro-1.7.0.dist-info/RECORD +49 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
- datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/context.cpython-38.pyc +0 -0
- datamaestro/__pycache__/context.cpython-39.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
- datamaestro/__pycache__/search.cpython-38.pyc +0 -0
- datamaestro/__pycache__/search.cpython-39.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
- datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro-0.8.1.dist-info/RECORD +0 -109
- datamaestro-0.8.1.dist-info/top_level.txt +0 -1
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
datamaestro/data/csv.py
CHANGED
|
@@ -1,17 +1,16 @@
|
|
|
1
|
-
from
|
|
1
|
+
from typing import Optional, Tuple, List, Any
|
|
2
2
|
from csv import reader as csv_reader
|
|
3
|
-
from
|
|
4
|
-
from
|
|
5
|
-
from
|
|
3
|
+
from experimaestro import Param, Meta
|
|
4
|
+
from experimaestro import documentation
|
|
5
|
+
from . import File
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
@argument("ignore", type=int, default=0)
|
|
9
|
-
@argument("names_row", type=int, default=-1)
|
|
10
8
|
class Generic(File):
|
|
11
9
|
"""A generic CSV file"""
|
|
12
10
|
|
|
11
|
+
delimiter: Meta[str] = ","
|
|
13
12
|
ignore: Meta[int] = 0
|
|
14
|
-
names_row: Meta[int] = 1
|
|
13
|
+
names_row: Meta[int] = -1
|
|
15
14
|
|
|
16
15
|
@documentation
|
|
17
16
|
def columns(self):
|
|
@@ -23,19 +22,20 @@ class Generic(File):
|
|
|
23
22
|
for i in range(self.ignore):
|
|
24
23
|
fp.readline()
|
|
25
24
|
|
|
26
|
-
for ix, row in enumerate(csv_reader(fp)):
|
|
25
|
+
for ix, row in enumerate(csv_reader(fp, delimiter=self.delimiter)):
|
|
27
26
|
if ix == self.names_row:
|
|
28
27
|
return row
|
|
29
28
|
|
|
30
29
|
|
|
31
|
-
@argument("names_row", type=int, default=-1)
|
|
32
|
-
@argument("size_row", type=int, default=-1)
|
|
33
|
-
@argument("target", type=str, default=None)
|
|
34
30
|
class Matrix(Generic):
|
|
35
31
|
"""A numerical dataset"""
|
|
36
32
|
|
|
33
|
+
names_row: Param[int] = -1
|
|
34
|
+
size_row: Param[int] = -1
|
|
35
|
+
target: Param[Optional[str]] = None
|
|
36
|
+
|
|
37
37
|
@documentation
|
|
38
|
-
def data(self) -> Tuple[List[str],
|
|
38
|
+
def data(self) -> Tuple[List[str], Any]:
|
|
39
39
|
"""Returns the list of fields and the numeric data
|
|
40
40
|
|
|
41
41
|
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Huggingface datamaestro adapters"""
|
|
2
|
+
|
|
3
|
+
from functools import cached_property
|
|
4
|
+
from typing import Optional
|
|
5
|
+
from . import Base
|
|
6
|
+
import logging
|
|
7
|
+
from experimaestro import Param
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class HuggingFaceDataset(Base):
|
|
11
|
+
repo_id: Param[str]
|
|
12
|
+
data_files: Param[Optional[str]] = None
|
|
13
|
+
split: Param[Optional[str]] = None
|
|
14
|
+
|
|
15
|
+
@cached_property
|
|
16
|
+
def data(self):
|
|
17
|
+
try:
|
|
18
|
+
from datasets import load_dataset
|
|
19
|
+
except ModuleNotFoundError:
|
|
20
|
+
logging.error("the datasets library is not installed:")
|
|
21
|
+
logging.error("pip install datasets")
|
|
22
|
+
raise
|
|
23
|
+
|
|
24
|
+
ds = load_dataset(self.repo_id, data_files=self.data_files, split=self.split)
|
|
25
|
+
return ds
|
datamaestro/data/ml.py
CHANGED
|
@@ -1,19 +1,28 @@
|
|
|
1
1
|
"""Machine learning generic data formats"""
|
|
2
|
-
|
|
2
|
+
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from
|
|
4
|
+
from typing import Generic, TypeVar, Optional
|
|
5
|
+
from experimaestro import Param, Meta
|
|
6
|
+
from . import Base
|
|
7
|
+
|
|
8
|
+
Train = TypeVar("Train", bound=Base)
|
|
9
|
+
Validation = TypeVar("Validation", bound=Base)
|
|
10
|
+
Test = TypeVar("Test", bound=Base)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Supervised(Base, Generic[Train, Validation, Test]):
|
|
14
|
+
train: Param[Base]
|
|
15
|
+
"""The training dataset"""
|
|
5
16
|
|
|
17
|
+
validation: Param[Optional[Base]] = None
|
|
18
|
+
"""The validation dataset (optional)"""
|
|
6
19
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
@argument("test", type=Base, help="The test dataset", required=False)
|
|
10
|
-
class Supervised(Base):
|
|
11
|
-
pass
|
|
20
|
+
test: Param[Optional[Base]] = None
|
|
21
|
+
"""The training optional"""
|
|
12
22
|
|
|
13
23
|
|
|
14
|
-
@argument("path", type=Path)
|
|
15
|
-
@argument("classes")
|
|
16
24
|
class FolderBased(Base):
|
|
17
25
|
"""Classification dataset where folders give the basis"""
|
|
18
26
|
|
|
19
|
-
|
|
27
|
+
classes: Param[list[str]]
|
|
28
|
+
path: Meta[Path]
|
datamaestro/data/tensor.py
CHANGED
|
@@ -1,43 +1,50 @@
|
|
|
1
|
-
from
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
2
|
from struct import Struct
|
|
3
|
-
from
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
from . import File, Base
|
|
4
5
|
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
import numpy as np
|
|
5
8
|
|
|
6
|
-
|
|
9
|
+
|
|
10
|
+
class Tensor(Base, ABC):
|
|
11
|
+
@abstractmethod
|
|
12
|
+
def data(self) -> "np.ndarray":
|
|
13
|
+
"""Returns the tensor in numpy format"""
|
|
14
|
+
pass
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class IDX(Tensor, File):
|
|
7
18
|
"""IDX File format
|
|
8
|
-
the IDX file format is a simple format for vectors and multidimensional matrices of various numerical types.
|
|
9
19
|
|
|
10
|
-
The
|
|
20
|
+
The IDX file format is a simple format for vectors and multidimensional
|
|
21
|
+
matrices of various numerical types.
|
|
22
|
+
|
|
23
|
+
The basic format is:
|
|
11
24
|
|
|
12
|
-
magic number
|
|
13
|
-
size in dimension
|
|
14
|
-
size in dimension 1
|
|
15
|
-
size in dimension 2
|
|
16
|
-
.....
|
|
17
|
-
size in dimension N
|
|
18
|
-
data
|
|
25
|
+
magic number size in dimension 0 size in dimension 1 size in dimension 2
|
|
26
|
+
..... size in dimension N data
|
|
19
27
|
|
|
20
28
|
The magic number is an integer (MSB first). The first 2 bytes are always 0.
|
|
21
29
|
|
|
22
|
-
The third byte codes the type of the data:
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
0x0B: short (2 bytes)
|
|
26
|
-
0x0C: int (4 bytes)
|
|
27
|
-
0x0D: float (4 bytes)
|
|
28
|
-
0x0E: double (8 bytes)
|
|
30
|
+
The third byte codes the type of the data: 0x08: unsigned byte 0x09: signed
|
|
31
|
+
byte 0x0B: short (2 bytes) 0x0C: int (4 bytes) 0x0D: float (4 bytes) 0x0E:
|
|
32
|
+
double (8 bytes)
|
|
29
33
|
|
|
30
|
-
The 4-th byte codes the number of dimensions of the vector/matrix: 1 for
|
|
34
|
+
The 4-th byte codes the number of dimensions of the vector/matrix: 1 for
|
|
35
|
+
vectors, 2 for matrices....
|
|
31
36
|
|
|
32
|
-
The sizes in each dimension are 4-byte integers (MSB first, high endian,
|
|
37
|
+
The sizes in each dimension are 4-byte integers (MSB first, high endian,
|
|
38
|
+
like in most non-Intel processors).
|
|
33
39
|
|
|
34
|
-
The data is stored like in a C array, i.e. the index in the last dimension
|
|
40
|
+
The data is stored like in a C array, i.e. the index in the last dimension
|
|
41
|
+
changes the fastest.
|
|
35
42
|
"""
|
|
36
43
|
|
|
37
44
|
MAGIC_NUMBER = Struct(">HBB")
|
|
38
45
|
DIM = Struct(">I")
|
|
39
46
|
|
|
40
|
-
def data(self):
|
|
47
|
+
def data(self) -> "np.ndarray":
|
|
41
48
|
"""Returns the tensor"""
|
|
42
49
|
import numpy as np
|
|
43
50
|
|
|
@@ -58,7 +65,8 @@ class IDX(File):
|
|
|
58
65
|
shape = [IDX.DIM.unpack_from(fp.read(IDX.DIM.size))[0] for i in range(size)]
|
|
59
66
|
|
|
60
67
|
size = np.prod(shape)
|
|
61
|
-
# Could use np.fromfile... if it were not broken
|
|
68
|
+
# Could use np.fromfile... if it were not broken
|
|
69
|
+
# see https://github.com/numpy/numpy/issues/7989
|
|
62
70
|
data = np.frombuffer(fp.read(), dtype=dtype, count=size)
|
|
63
71
|
data = data.reshape(shape, order="C")
|
|
64
72
|
return data
|