datamaestro 1.0.6__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamaestro-1.0.6 → datamaestro-1.2.0}/PKG-INFO +1 -1
- datamaestro-1.2.0/docs/source/api/records.rst +112 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/annotations/agreement.py +9 -3
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/context.py +18 -9
- datamaestro-1.2.0/src/datamaestro/data/ml.py +27 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/definitions.py +58 -18
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/__init__.py +31 -2
- datamaestro-1.2.0/src/datamaestro/record.py +177 -0
- datamaestro-1.2.0/src/datamaestro/test/test_record.py +72 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/version.py +2 -2
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/PKG-INFO +1 -1
- datamaestro-1.0.6/docs/source/api/records.rst +0 -59
- datamaestro-1.0.6/src/datamaestro/data/ml.py +0 -19
- datamaestro-1.0.6/src/datamaestro/record.py +0 -312
- datamaestro-1.0.6/src/datamaestro/test/test_record.py +0 -151
- {datamaestro-1.0.6 → datamaestro-1.2.0}/.coverage +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/.github/workflows/pytest.yml +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/.github/workflows/python-publish.yml +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/.gitignore +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/.pre-commit-config.yaml +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/.readthedocs.yml +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/CHANGELOG.md +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/LICENSE +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/MANIFEST.in +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/README.md +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/TODO.md +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/Makefile +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/make.bat +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/requirements.txt +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/api/data.md +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/api/download.rst +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/api/index.md +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/conf.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/datasets.rst +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/developping.md +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/index.md +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/style.css +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/mkdocs.yml +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/pyproject.toml +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/pytest.ini +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/requirements-dev.txt +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/requirements.txt +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/schema.yaml +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/setup.cfg +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/setup.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/__init__.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/__main__.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/annotations/__init__.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/commands/__init__.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/commands/mainstyle.css +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/commands/site.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/data/__init__.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/data/csv.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/data/huggingface.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/data/tensor.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/archive.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/huggingface.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/links.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/manual.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/multiple.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/single.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/sync.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/todo.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/registry.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/search.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/settings.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/sphinx.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/stream/__init__.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/stream/compress.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/stream/lines.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/templates/dataset.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/test/__init__.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/test/checks.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/test/conftest.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/test/test_annotations.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/test/test_download_handlers.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/utils.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/SOURCES.txt +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/dependency_links.txt +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/entry_points.txt +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/not-zip-safe +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/requires.txt +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/top_level.txt +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/tox.ini +0 -0
docs/source/api/records.rst (new, +112)

```diff
@@ -0,0 +1,112 @@
+Records
+=======
+
+Records are a flexible way to compose information coming from various sources. For instance,
+your processing chain can produce records that only contain an ID. Later, you can retrieve
+the item content and add it to the record. Further in the processing, you might want to add
+some transformation of the item content.
+
+Records allow performing this type of transformation by holding a set of **items**. Record
+types form a lattice of types, so checking that some item types are present in a record is easy.
+
+.. code-block:: python
+
+    @define
+    class AItem(Item):
+        a: int
+
+
+    @define
+    class A1Item(AItem):
+        a1: int
+
+
+    @define
+    class BItem(Item):
+        b: int
+
+
+    @define
+    class CItem(Item):
+        c: int
+
+
+    record = Record(AItem(1), BItem(2))
+    print(record[AItem].a)  # 1
+    print(record[BItem].b)  # 2
+
+    # record types are only defined by their item types
+    other_record = Record(A1Item(1, 2), BItem(2))
+
+    # records can be updated
+    new_record = record.update(BItem(3), CItem(4))
+    print(new_record[BItem].b)  # 3
+    print(new_record[CItem].c)  # 4
+
+    # records only hold one instance of a given item base type
+    new_record_a1 = record.update(A1Item(3, 4))
+    print(new_record_a1[AItem].a)  # 3
+    print(new_record_a1[A1Item].a)  # 3
+    print(new_record_a1[A1Item].a1)  # 4
+
+
+Working with record types
+*************************
+
+Record types form a lattice of types that can be used to check
+record properties beforehand.
+
+.. code-block:: python
+
+    ABRecord = record_type(AItem, BItem)
+    A1BRecord = record_type(A1Item, BItem)
+
+    # Hierarchy-based check
+    assert A1BRecord.contains(ABRecord)
+
+    # Checks for specific types
+    assert ABRecord.has(AItem) and ABRecord.has(BItem)
+
+Validating
+**********
+
+To ensure that a record fulfills the requested properties,
+one can use record types:
+
+.. code-block:: python
+
+    ABRecord = record_type(AItem, BItem)
+
+    # OK
+    ABRecord(AItem(1), BItem(2))
+
+    # Fails: A1Item is not AItem
+    ABRecord(A1Item(1, 2), BItem(2))
+
+    # Fails: AItem is not present
+    ABRecord(BItem(2))
+
+When updating, it is also possible to validate:
+
+.. code-block:: python
+
+    A1BRecord = record_type(A1Item, BItem)
+    record = Record(AItem(1), BItem(2))
+
+    # Update the A/B record into an A1/B one
+    record.update(A1Item(1, 2), target=A1BRecord)
+
+
+API
+***
+
+.. autoclass:: datamaestro.record.Item
+
+.. autoclass:: datamaestro.record.RecordType
+    :members: __call__, validate, sub
+
+.. autoclass:: datamaestro.record.Record
+    :members: update, has, get
+
+.. autofunction:: datamaestro.record.record_type
```
src/datamaestro/annotations/agreement.py (+9 -3)

```diff
@@ -1,9 +1,15 @@
-import
-from datamaestro.definitions import
+from typing import Optional
+from datamaestro.definitions import AbstractDataset, hook


 @hook("pre-use")
-def useragreement(definition: AbstractDataset, message, id=None):
+def useragreement(definition: AbstractDataset, message: str, id: Optional[str] = None):
+    """Asks for a user agreement
+
+    :param definition: The dataset for which the agreement is asked
+    :param message: The agreement text
+    :param id: The ID of the agreement (defaults to the dataset ID)
+    """
     # Skip agreement when testing
     if definition.context.running_test:
         return
```
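For context, `useragreement` is applied as an annotation on a dataset definition. A plausible sketch follows; the dataset name, URL, and agreement text are invented, and the exact decorator stacking is an assumption, not part of this diff:

```python
# Hypothetical usage: all identifiers below are made up for illustration.
from datamaestro.annotations.agreement import useragreement
from datamaestro.data import Base
from datamaestro.definitions import dataset


@useragreement(
    "By using this dataset, you agree to its terms of use.",  # the agreement text
    id="org.example.mydataset",  # optional; defaults to the dataset ID
)
@dataset(url="https://example.org/dataset")
def mydataset() -> Base:
    # Keyword arguments used to build the Base configuration
    return {}
```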
src/datamaestro/context.py (+18 -9)

```diff
@@ -110,19 +110,20 @@ class Context:
         if repositoryid is None:
             return None

-
+        entry_points = [
             x
             for x in pkg_resources.iter_entry_points(
                 "datamaestro.repositories", repositoryid
             )
         ]
-        if not
+        if not entry_points:
             raise Exception("No datasets repository named %s", repositoryid)
-        if len(
+        if len(entry_points) > 1:
             raise Exception(
-                "Too many datasets repository named %s (%d)"
+                "Too many datasets repository named %s (%d)"
+                % (repositoryid, len(entry_points))
             )
-        return
+        return entry_points[0].load()(self)

     @property
     def running_test(self):
@@ -175,7 +176,6 @@ class Context:
         if dlpath.is_file():
             logging.debug("Using cached file %s for %s", dlpath, url)
         else:
-
             logging.info("Downloading %s", url)
             tmppath = dlpath.with_suffix(".tmp")

@@ -188,7 +188,7 @@ class Context:

     def ask(self, question: str, options: Dict[str, str]):
         """Ask a question to the user"""
-        print(question)
+        print(question)  # noqa: T201
         answer = None
         while answer not in options:
             answer = input().strip().lower()
@@ -268,6 +268,7 @@ class Datasets(Iterable["AbstractDataset"]):

     def __iter__(self) -> Iterable["AbstractDataset"]:
         from .definitions import DatasetWrapper
+        from datamaestro.data import Base

         # Iterates over defined symbols
         for key, value in self.module.__dict__.items():
@@ -276,10 +277,18 @@ class Datasets(Iterable["AbstractDataset"]):
             # Ensure it comes from the module
             if self.module.__name__ == value.t.__module__:
                 yield value
+            elif (
+                inspect.isclass(value)
+                and issubclass(value, Base)
+                and hasattr(value, "__dataset__")
+            ):
+                if self.module.__name__ == value.__module__:
+                    yield value.__dataset__


 class Repository:
-    """A repository regroup a set of datasets and their corresponding specific
+    """A repository regroup a set of datasets and their corresponding specific
+    handlers (downloading, filtering, etc.)"""

     def __init__(self, context: Context):
         """Initialize a new repository
@@ -315,7 +324,7 @@ class Repository:
         try:
             return get_distribution(cls.__module__).version
         except DistributionNotFound:
-
+            return None

     def __repr__(self):
         return "Repository(%s)" % self.basedir
```
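The repository lookup above goes through the `datamaestro.repositories` entry point group and instantiates the loaded class with the current `Context` (`entry_points[0].load()(self)`). A minimal sketch of how a third-party package might register itself in that group; the distribution, module, and class names are hypothetical:

```python
# setup.py of a hypothetical repository package
from setuptools import setup

setup(
    name="datamaestro-myrepo",
    packages=["datamaestro_myrepo"],
    entry_points={
        # Context.repository("myrepo") loads MyRepository and calls it
        # with the Context instance
        "datamaestro.repositories": [
            "myrepo = datamaestro_myrepo:MyRepository",
        ],
    },
)
```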
src/datamaestro/data/ml.py (new, +27)

```diff
@@ -0,0 +1,27 @@
+"""Machine learning generic data formats"""
+from typing import Generic, TypeVar, Optional
+from pathlib import Path
+from experimaestro import Param, Meta, argument
+from . import Base
+
+Train = TypeVar("Train", bound=Base)
+Validation = TypeVar("Validation", bound=Base)
+Test = TypeVar("Test", bound=Base)
+
+
+class Supervised(Base, Generic[Train, Validation, Test]):
+    train: Param[Base]
+    """The training dataset"""
+
+    validation: Param[Optional[Base]] = None
+    """The validation dataset (optional)"""
+
+    test: Param[Optional[Base]] = None
+    """The test dataset (optional)"""
+
+
+@argument("classes")
+class FolderBased(Base):
+    """Classification dataset where folders give the classes"""
+
+    path: Meta[Path]
```
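A minimal usage sketch for these new configurations, assuming experimaestro configurations are built with keyword arguments and that `classes` takes the list of class names; the paths and classes are made up:

```python
from pathlib import Path

from datamaestro.data.ml import FolderBased, Supervised

# Hypothetical folder-based classification splits
train = FolderBased(classes=["cat", "dog"], path=Path("/data/pets/train"))
test = FolderBased(classes=["cat", "dog"], path=Path("/data/pets/test"))

# validation is optional and defaults to None
pets = Supervised(train=train, test=test)
```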
src/datamaestro/definitions.py (+58 -18)

```diff
@@ -127,6 +127,13 @@ class AbstractDataset(AbstractData):
     """

     name: Optional[str] = None
+    """The name of the dataset"""
+
+    url: Optional[str] = None
+    """The URL of the dataset"""
+
+    doi: Optional[str] = None
+    """The DOI of this dataset"""

     def __init__(self, repository: Optional["Repository"]):
         super().__init__()
@@ -136,6 +143,7 @@ class AbstractDataset(AbstractData):

         # Associated resources
         self.resources: Dict[str, "Download"] = {}
+        self.ordered_resources = []

         # Hooks
         # pre-use: before returning the dataset object
@@ -194,13 +202,15 @@ class AbstractDataset(AbstractData):
     def download(self, force=False):
         """Download all the necessary resources"""
         success = True
-
+        logging.info("Materializing %d resources", len(self.ordered_resources))
+        for resource in self.ordered_resources:
             try:
                 resource.download(force)
             except Exception:
-                logging.error("Could not download resource %s",
+                logging.error("Could not download resource %s", resource)
                 traceback.print_exc()
                 success = False
+                break
         return success

     @staticmethod
@@ -249,6 +259,7 @@ class DatasetWrapper(AbstractDataset):
     def __init__(self, annotation, t: type):
         self.t = t
         self.base = annotation.base
+        self.config = None
         assert self.base is not None, f"Could not set the Config type for {t}"

         repository, components = DataDefinition.repository_relpath(t)
@@ -256,6 +267,7 @@ class DatasetWrapper(AbstractDataset):

         # Set some variables
         self.url = annotation.url
+        self.doi = annotation.doi

         # Builds the ID:
         # Removes module_name.config prefix
@@ -322,7 +334,18 @@ class DatasetWrapper(AbstractDataset):
         """Returns a pointer to a potential attribute"""
         return FutureAttr(self, [key])

+    def download(self, force=False):
+        if self.base is self.t:
+            self._prepare()
+        return super().download(force=force)
+
     def _prepare(self, download=False) -> "Base":
+        if self.config is not None:
+            return self.config
+
+        if self.base is self.t:
+            self.config = self.base.__create_dataset__(self)
+
         if download:
             for hook in self.hooks["pre-download"]:
                 hook(self)
@@ -332,23 +355,23 @@ class DatasetWrapper(AbstractDataset):
         for hook in self.hooks["pre-use"]:
             hook(self)

-        resources = {key: value.prepare() for key, value in self.resources.items()}
-        dict = self.t(**resources)
-        if dict is None:
-            name = self.t.__name__
-            filename = inspect.getfile(self.t)
-            raise Exception(
-                f"The dataset method {name} defined in "
-                f"{filename} returned a null object"
-            )
-
         # Construct the object
-
+        if self.config is None:
+            resources = {key: value.prepare() for key, value in self.resources.items()}
+            dict = self.t(**resources)
+            if dict is None:
+                name = self.t.__name__
+                filename = inspect.getfile(self.t)
+                raise Exception(
+                    f"The dataset method {name} defined in "
+                    f"{filename} returned a null object"
+                )
+            self.config = self.base(**dict)

         # Set the ids
-        self.setDataIDs(
+        self.setDataIDs(self.config, self.id)

-        return
+        return self.config

     @property
     def _path(self) -> Path:
@@ -455,7 +478,9 @@ datatasks = DataTagging(lambda d: d.tasks)


 class dataset:
-    def __init__(
+    def __init__(
+        self, base=None, *, timestamp=None, id=None, url=None, size=None, doi=None
+    ):
         """Creates a new (meta)dataset

         Meta-datasets are not associated with any base type
@@ -473,6 +498,8 @@ class dataset:
             url {[type]} -- [description] (default: {None})

             size {str} -- The size (should be a parsable format)
+
+            doi {str} -- The DOI of the corresponding paper
         """
         if hasattr(base, "__datamaestro__") and isinstance(
             base.__datamaestro__, metadataset
@@ -486,18 +513,31 @@ class dataset:
         self.meta = False
         self.timestamp = timestamp
         self.size = size
+        self.doi = doi

     def __call__(self, t):
         try:
             if self.base is None:
-
-
+                from datamaestro.data import Base
+
+                if inspect.isclass(t) and issubclass(t, Base):
+                    self.base = t
+                else:
+                    # Get the type from the return annotation
+                    try:
+                        self.base = t.__annotations__["return"]
+                    except KeyError:
+                        logging.warning("No return annotation in %s", t)
+                        raise
             object.__getattribute__(t, "__datamaestro__")
             raise AssertionError("@data should only be called once")
         except AttributeError:
             pass

         dw = DatasetWrapper(self, t)
+        t.__dataset__ = dw
+        if inspect.isclass(t) and issubclass(t, Base):
+            return t
         return dw

```
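The reworked `dataset.__call__` now accepts two definition styles: a function whose return annotation provides the base type, and a `Base` subclass decorated directly, which is returned unchanged with its wrapper stored in `__dataset__`. A sketch of both, with hypothetical names:

```python
from datamaestro.data import Base
from datamaestro.definitions import dataset


# Function style: the base type comes from the return annotation;
# a missing annotation now logs a warning and raises.
@dataset(url="https://example.org/corpus")
def my_corpus() -> Base:
    return {}


# Class style: the decorated class itself is returned, and the
# DatasetWrapper is reachable through MyDataset.__dataset__
@dataset(id="example.mydataset", doi="10.0000/example")
class MyDataset(Base):
    pass
```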
src/datamaestro/download/__init__.py (+31 -2)

```diff
@@ -1,6 +1,8 @@
+from typing import Union
 from abc import ABC, abstractmethod
 from datamaestro.definitions import AbstractDataset, DatasetAnnotation
 from datamaestro.utils import deprecated
+from attrs import define


 def initialized(method):
@@ -15,7 +17,12 @@ def initialized(method):
     return wrapper


-
+@define(kw_only=True)
+class SetupOptions:
+    pass
+
+
+class Resource(DatasetAnnotation, ABC):
     """
     Base class for all download handlers
     """
@@ -24,13 +31,16 @@ class Download(DatasetAnnotation, ABC):
         self.varname = varname
         # Ensures that the object is initialized
         self._post = False
+        self.definition = None

     def annotate(self, dataset: AbstractDataset):
+        assert self.definition is None
         # Register as a resource download
         if self.varname in dataset.resources:
             raise AssertionError("Name %s already declared as a resource", self.varname)

         dataset.resources[self.varname] = self
+        dataset.ordered_resources.append(self)
         self.definition = dataset

     @property
@@ -53,10 +63,29 @@ class Download(DatasetAnnotation, ABC):
         """Prepares the dataset"""
         ...

+    def setup(
+        self,
+        dataset: Union[AbstractDataset],
+        options: SetupOptions = None,
+    ):
+        """Direct way to set up the resource (no annotation)"""
+        self(dataset)
+        return self.prepare()
+
+
+# Keeps downwards compatibility
+Download = Resource
+

 class reference(Download):
-    def __init__(self, varname, reference):
+    def __init__(self, varname=None, reference=None):
+        """References another dataset
+
+        :param varname: The name of the variable
+        :param reference: Another dataset
+        """
         super().__init__(varname)
+        assert reference is not None, "Reference cannot be null"
         self.reference = reference

     def prepare(self):
```
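The new `setup` method gives a direct, annotation-free path: it applies the resource to a dataset definition and immediately prepares it. A sketch using the `reference` handler above; both dataset objects are placeholders for existing `AbstractDataset` definitions:

```python
from datamaestro.download import reference

# `other_dataset` and `target_dataset` stand in for existing definitions
res = reference(varname="base", reference=other_dataset)

# Equivalent to applying the annotation, then calling prepare()
prepared = res.setup(target_dataset)
```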
src/datamaestro/record.py (new, +177)

```diff
@@ -0,0 +1,177 @@
+from typing import Type, TypeVar, Dict, Union, Optional
+
+
+class Item:
+    """Base class for all item types"""
+
+    @classmethod
+    def __get_base__(cls: Type) -> Type:
+        """Get the most generic superclass for this type of item"""
+        if base := getattr(cls, "__base__cache__", None):
+            return base
+
+        base = cls
+        for supercls in cls.__mro__:
+            if issubclass(supercls, Item) and supercls is not Item:
+                base = supercls
+        setattr(cls, "__base__cache__", base)
+        return base
+
+
+T = TypeVar("T", bound=Item)
+Items = Dict[Type[T], T]
+
+
+class RecordType:
+    def __init__(self, *item_types: Type[T]):
+        self.item_types = frozenset(item_types)
+        self.mapping = {item_type.__get_base__(): item_type for item_type in item_types}
+
+    def __repr__(self):
+        return f"""Record({",".join(item_type.__name__ for item_type in
+            self.item_types)})"""
+
+    def contains(self, other: "RecordType"):
+        """Checks that each item type in other has an item type of a compatible
+        type in self"""
+        if len(self.item_types) != len(other.item_types):
+            return False
+
+        for item_type in other.item_types:
+            if matching_type := self.mapping.get(item_type.__get_base__(), None):
+                if not issubclass(matching_type, item_type):
+                    return False
+            else:
+                return False
+
+        return True
+
+    def sub(self, *item_types: Type[T]):
+        """Returns a new record type based on self and new item types"""
+        cls_itemtypes = [x for x in self.item_types]
+        mapping = {
+            itemtype.__get_base__(): ix for ix, itemtype in enumerate(cls_itemtypes)
+        }
+
+        for itemtype in item_types:
+            if (ix := mapping.get(itemtype.__get_base__(), -1)) >= 0:
+                cls_itemtypes[ix] = itemtype
+            else:
+                cls_itemtypes.append(itemtype)
+
+        return record_type(*cls_itemtypes)
+
+    def __call__(self, *items: T):
+        record = Record(*items)
+        self.validate(record)
+        return record
+
+    def has(self, itemtype: Type[T]):
+        return issubclass(self.mapping[itemtype.__get_base__()], itemtype)
+
+    def validate(self, record: "Record"):
+        """Validates a record against this record type"""
+        if self.item_types:
+            for item_type in self.item_types:
+                try:
+                    record.__getitem__(item_type)
+                except KeyError:
+                    raise KeyError(f"Item of type {item_type} is missing")
+
+            if len(record.items) != len(self.item_types):
+                unregistered = [
+                    item
+                    for item in record.items.values()
+                    if all(
+                        not issubclass(item.__get_base__(), item_type)
+                        for item_type in self.item_types
+                    )
+                ]
+                raise KeyError(
+                    f"The record of type {self} contains unregistered items: {unregistered}"
+                )
+
+        # Returns the validated record
+        return record
+
+
+def record_type(*item_types: Type[T]):
+    """Returns a new record type"""
+    return RecordType(*item_types)
+
+
+class Record:
+    """Associates types with entries
+
+    A record is a composition of items; each item base class is unique.
+    """
+
+    #: Items for this record
+    items: Items
+
+    def __init__(self, *items: Union[Items, T], override=False):
+        self.items = {}
+
+        if len(items) == 1 and isinstance(items[0], dict):
+            # Just copy the dictionary
+            self.items = items[0]
+        else:
+            for entry in items:
+                # Each item base type may only appear once
+                base = entry.__get_base__()
+                if not override and base in self.items:
+                    raise RuntimeError(
+                        f"The item type {base} ({entry.__class__})"
+                        " is already in the record"
+                    )
+                self.items[base] = entry
+
+    def __str__(self):
+        return (
+            "{"
+            + ", ".join(
+                f"{key.__module__}.{key.__qualname__}: {value}"
+                for key, value in self.items.items()
+            )
+            + "}"
+        )
+
+    def __repr__(self):
+        return (
+            "{"
+            + ", ".join(
+                f"{key.__module__}.{key.__qualname__}: {repr(value)}"
+                for key, value in self.items.items()
+            )
+            + "}"
+        )
+
+    def get(self, key: Type[T]) -> Optional[T]:
+        """Get a given item or None if it does not exist"""
+        try:
+            return self[key]
+        except KeyError:
+            return None
+
+    def has(self, key: Type[T]) -> bool:
+        """Returns True if the record has the given item type"""
+        return key.__get_base__() in self.items
+
+    def __getitem__(self, key: Type[T]) -> T:
+        """Get an item given its type"""
+        base = key.__get_base__()
+        entry = self.items[base]
+
+        # Check if this matches the expected class
+        if not isinstance(entry, key):
+            raise KeyError(f"No entry with type {key}")
+        return entry
+
+    def update(self, *items: T, target: RecordType = None) -> "Record":
+        """Update some items"""
+        # Create our new dictionary
+        item_dict = {**self.items}
+        for item in items:
+            item_dict[item.__get_base__()] = item
+
+        return Record(item_dict)
```
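To make the record semantics concrete, here is a small self-contained sketch against the module above; the item classes are invented for illustration:

```python
from datamaestro.record import Item, Record, record_type


class IDItem(Item):
    def __init__(self, id: str):
        self.id = id


class TextItem(Item):
    def __init__(self, text: str):
        self.text = text


class ScoreItem(Item):
    def __init__(self, score: float):
        self.score = score


record = Record(IDItem("doc-1"))
record = record.update(TextItem("hello"))  # update returns a new record

assert record.has(IDItem)
assert record[TextItem].text == "hello"
assert record.get(ScoreItem) is None  # absent items return None

# Validate against an explicit record type
FullRecord = record_type(IDItem, TextItem)
FullRecord.validate(record)
```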