datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro/__init__.py +11 -7
- datamaestro/__main__.py +29 -8
- datamaestro/annotations/__init__.py +1 -1
- datamaestro/annotations/agreement.py +9 -3
- datamaestro/commands/site.py +27 -15
- datamaestro/context.py +143 -87
- datamaestro/data/__init__.py +23 -11
- datamaestro/data/csv.py +12 -12
- datamaestro/data/huggingface.py +25 -0
- datamaestro/data/ml.py +19 -10
- datamaestro/data/tensor.py +32 -24
- datamaestro/definitions.py +492 -131
- datamaestro/download/__init__.py +610 -24
- datamaestro/download/archive.py +129 -77
- datamaestro/download/custom.py +53 -0
- datamaestro/download/huggingface.py +77 -0
- datamaestro/download/links.py +106 -50
- datamaestro/download/multiple.py +27 -5
- datamaestro/download/single.py +114 -51
- datamaestro/download/sync.py +0 -1
- datamaestro/download/todo.py +9 -4
- datamaestro/download/wayback.py +164 -0
- datamaestro/record.py +232 -0
- datamaestro/registry.py +1 -0
- datamaestro/search.py +1 -1
- datamaestro/settings.py +3 -1
- datamaestro/sphinx.py +224 -0
- datamaestro/stream/__init__.py +0 -2
- datamaestro/stream/lines.py +10 -7
- datamaestro/templates/dataset.py +5 -4
- datamaestro/test/__init__.py +3 -1
- datamaestro/test/checks.py +1 -5
- datamaestro/test/conftest.py +1 -6
- datamaestro/test/test_annotations.py +2 -2
- datamaestro/test/test_download_handlers.py +3 -4
- datamaestro/test/test_record.py +72 -0
- datamaestro/test/test_resource.py +1388 -0
- datamaestro/utils.py +15 -9
- datamaestro/v2.md +301 -0
- datamaestro/version.py +4 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
- datamaestro-1.7.0.dist-info/RECORD +49 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
- datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/context.cpython-38.pyc +0 -0
- datamaestro/__pycache__/context.cpython-39.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
- datamaestro/__pycache__/search.cpython-38.pyc +0 -0
- datamaestro/__pycache__/search.cpython-39.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
- datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro-0.8.1.dist-info/RECORD +0 -109
- datamaestro-0.8.1.dist-info/top_level.txt +0 -1
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
datamaestro/record.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""Record module for type-safe heterogeneous containers.
|
|
2
|
+
|
|
3
|
+
.. deprecated:: 2.0
|
|
4
|
+
This module will be removed in v2. Use :class:`typing.TypedDict` instead
|
|
5
|
+
for type-safe heterogeneous data structures. TypedDict provides better IDE
|
|
6
|
+
support, type checking, and is part of the standard library.
|
|
7
|
+
|
|
8
|
+
When using TypedDict, define key constants in classes (e.g., ``MyItem.ID``)
|
|
9
|
+
to avoid typos and enable IDE autocomplete. Prefix keys with package name
|
|
10
|
+
using underscore ``_`` as delimiter to avoid conflicts between different
|
|
11
|
+
data sources.
|
|
12
|
+
|
|
13
|
+
Example migration::
|
|
14
|
+
|
|
15
|
+
# Old way (deprecated)
|
|
16
|
+
@define
|
|
17
|
+
class MyItem(Item):
|
|
18
|
+
value: int
|
|
19
|
+
|
|
20
|
+
record = Record(MyItem(42))
|
|
21
|
+
print(record[MyItem].value)
|
|
22
|
+
|
|
23
|
+
# New way (recommended)
|
|
24
|
+
from typing import TypedDict
|
|
25
|
+
|
|
26
|
+
# Define key constants in classes
|
|
27
|
+
class MyItem:
|
|
28
|
+
ID = "mypackage_value"
|
|
29
|
+
|
|
30
|
+
class MyRecord(TypedDict):
|
|
31
|
+
mypackage_value: int
|
|
32
|
+
|
|
33
|
+
data: MyRecord = {MyItem.ID: 42}
|
|
34
|
+
print(data[MyItem.ID])
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
import warnings
|
|
38
|
+
from typing import Type, TypeVar, Dict, Union, Optional
|
|
39
|
+
|
|
40
|
+
# Emit deprecation warning when module is imported; stacklevel=2 points the
# warning at the importing module rather than at this file.
warnings.warn(
    "The datamaestro.record module is deprecated and will be removed in v2. "
    "Use typing.TypedDict instead (use class constants like MyItem.ID for keys, "
    "prefixed with package name).",
    DeprecationWarning,
    stacklevel=2,
)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class Item:
    """Base class for all item types.

    Items are grouped by their *base* type: the most generic strict
    subclass of :class:`Item` found in their MRO. :class:`Record`
    instances key their entries by this base type.
    """

    @classmethod
    def __get_base__(cls: Type) -> Type:
        """Get the most generic superclass for this type of item.

        The result is cached on the class itself. The cache is looked up
        through ``cls.__dict__`` (not ``getattr``) so a subclass never
        picks up an ancestor's cached value: with ``getattr``, calling
        ``Item.__get_base__()`` would cache ``Item`` on ``Item`` and every
        subclass would then wrongly report ``Item`` as its base.
        """
        if base := cls.__dict__.get("__base__cache__", None):
            return base

        # Walk the MRO from cls towards object; the last strict subclass
        # of Item encountered is the most generic one.
        base = cls
        for supercls in cls.__mro__:
            if issubclass(supercls, Item) and supercls is not Item:
                base = supercls
        setattr(cls, "__base__cache__", base)
        return base
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# Type variable bound to Item, so APIs can return the precise item subclass
T = TypeVar("T", bound=Item)
# Storage type of a record: maps an item's base type to the item instance
Items = Dict[Type[T], T]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class RecordType:
    """Describes the set of item types that compose a record.

    Keeps the item types themselves (``item_types``) together with a
    mapping from each item's base type to the registered item type.
    """

    def __init__(self, *item_types: Type[T]):
        self.item_types = frozenset(item_types)
        self.mapping = {}
        for registered in item_types:
            self.mapping[registered.__get_base__()] = registered

    def __repr__(self):
        return f"Record({','.join(t.__name__ for t in self.item_types)})"

    def contains(self, other: "RecordType"):
        """Checks that each item type in other has an item type of a compatible
        type in self"""
        if len(other.item_types) != len(self.item_types):
            return False

        def compatible(candidate) -> bool:
            registered = self.mapping.get(candidate.__get_base__(), None)
            return registered is not None and issubclass(registered, candidate)

        return all(compatible(candidate) for candidate in other.item_types)

    def sub(self, *item_types: Type[T]):
        """Returns a new record type based on self and new item types"""
        merged = list(self.item_types)
        index_of = {t.__get_base__(): ix for ix, t in enumerate(merged)}

        for new_type in item_types:
            ix = index_of.get(new_type.__get_base__(), -1)
            if ix < 0:
                # New item family: append it
                merged.append(new_type)
            else:
                # Same base already declared: the new type replaces it
                merged[ix] = new_type

        return record_type(*merged)

    def __call__(self, *items: T):
        created = Record(*items)
        self.validate(created)
        return created

    def has(self, itemtype: Type[T]):
        # NOTE(review): raises KeyError when no item of the same base is
        # registered at all -- confirm this is intended rather than False.
        return issubclass(self.mapping[itemtype.__get_base__()], itemtype)

    def validate(self, record: "Record"):
        """Creates and validate a new record of this type"""
        if self.item_types:
            # Every declared item type must be present (and compatible)
            for item_type in self.item_types:
                try:
                    record.__getitem__(item_type)
                except KeyError:
                    raise KeyError(f"Item of type {item_type} is missing")

            # No extra (undeclared) item may be present
            if len(self.item_types) != len(record.items):
                unregistered = [
                    entry
                    for entry in record.items.values()
                    if not any(
                        issubclass(entry.__get_base__(), item_type)
                        for item_type in self.item_types
                    )
                ]
                raise KeyError(
                    f"The record of type {self} contains unregistered items: {unregistered}"
                )

        return record
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def record_type(*item_types: Type[T]):
    """Returns a new record type built from the given item types"""
    new_type = RecordType(*item_types)
    return new_type
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class Record:
    """Associate types with entries

    A record is a composition of items; each item base class is unique.
    """

    #: Items for this record (maps an item base type to the item instance)
    items: Items

    def __init__(self, *items: Union[Items, T], override=False):
        self.items = {}

        if len(items) == 1 and isinstance(items[0], dict):
            # A single dict argument is adopted as-is (no copy): callers
            # such as update() hand over a freshly built dictionary.
            self.items = items[0]
        else:
            for entry in items:
                base = entry.__get_base__()
                if base in self.items and not override:
                    raise RuntimeError(
                        f"The item type {base} ({entry.__class__})"
                        " is already in the record"
                    )
                self.items[base] = entry

    def __str__(self):
        inner = ", ".join(
            f"{key.__module__}.{key.__qualname__}: {value}"
            for key, value in self.items.items()
        )
        return "{" + inner + "}"

    def __repr__(self):
        inner = ", ".join(
            f"{key.__module__}.{key.__qualname__}: {repr(value)}"
            for key, value in self.items.items()
        )
        return "{" + inner + "}"

    def get(self, key: Type[T]) -> Optional[T]:
        """Get a given item or None if it does not exist"""
        try:
            return self.__getitem__(key)
        except KeyError:
            return None

    def has(self, key: Type[T]) -> bool:
        """Returns True if the record has the given item type"""
        base = key.__get_base__()
        return base in self.items

    def __getitem__(self, key: Type[T]) -> T:
        """Get an item given its type"""
        base = key.__get_base__()
        entry = self.items.get(base, None)

        # Missing base, or a stored entry that is not an instance of the
        # (possibly more specific) requested type: same error either way.
        if entry is None or not isinstance(entry, key):
            raise KeyError(
                f"""No entry with type {key}: """
                f"""{",".join(str(s) for s in self.items.keys())}"""
            )
        return entry

    def update(self, *items: T, target: RecordType = None) -> "Record":
        """Update some items"""
        # NOTE: ``target`` is accepted but not used by this implementation.
        # Build a fresh mapping so the original record is left untouched.
        updated = dict(self.items)
        for item in items:
            updated[item.__get_base__()] = item
        return Record(updated)
|
datamaestro/registry.py
CHANGED
datamaestro/search.py
CHANGED
datamaestro/settings.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
"""Global and user settings utility classes"""
|
|
2
|
+
|
|
2
3
|
import marshmallow as mm
|
|
3
|
-
from
|
|
4
|
+
from typing import Dict, Any
|
|
5
|
+
from experimaestro.utils.settings import JsonSettings
|
|
4
6
|
from pathlib import Path
|
|
5
7
|
|
|
6
8
|
# --- Global settings
|
datamaestro/sphinx.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
# Sphinx extension for datamaestro datasets
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, Tuple
|
|
4
|
+
from sphinx.ext.autodoc.mock import mock
|
|
5
|
+
|
|
6
|
+
from docutils import nodes
|
|
7
|
+
|
|
8
|
+
from sphinx.application import Sphinx
|
|
9
|
+
from sphinx.domains import Domain, ObjType
|
|
10
|
+
from sphinx.roles import XRefRole
|
|
11
|
+
from sphinx.util.docutils import SphinxDirective
|
|
12
|
+
from sphinx.locale import _
|
|
13
|
+
from sphinx import addnodes
|
|
14
|
+
from sphinx.util.nodes import make_refnode
|
|
15
|
+
import datamaestro
|
|
16
|
+
from datamaestro.data import AbstractDataset
|
|
17
|
+
import logging
|
|
18
|
+
from myst_parser.config.main import MdParserConfig
|
|
19
|
+
from myst_parser.mdit_to_docutils.base import DocutilsRenderer
|
|
20
|
+
from myst_parser.parsers.mdit import create_md_parser
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class DatasetNode(nodes.paragraph):
    """Docutils node marking a dataset description (registered in setup())."""

    pass
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def to_docutils(source: str):
    """Render a Markdown string into a docutils tree via myst-parser."""
    md_parser = create_md_parser(MdParserConfig(), DocutilsRenderer)
    rendered = md_parser.render(source)
    return rendered
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class DatasetsDirective(SphinxDirective):
    """Base directive providing the rendering of one dataset description."""

    def dataset_desc(self, ds: AbstractDataset):
        """Build and return a ``desc`` node describing dataset *ds*.

        Also registers the dataset anchor in the "dm" domain so that
        cross-references can later resolve to it.
        """
        dm = self.env.get_domain("dm")

        assert isinstance(dm, DatamaestroDomain)
        dm.add_dataset(ds.id)

        # indexnode = addnodes.index(entries=[])
        desc = addnodes.desc()
        desc["domain"] = DatamaestroDomain.name
        desc["objtype"] = desc["desctype"] = "dataset"
        desc["classes"].append(DatamaestroDomain.name)

        # Signature line: "Dataset <id>", anchored as "dataset-<id>"
        signodes = addnodes.desc_signature(ds.id, "", is_multiline=True)
        desc.append(signodes)

        signode = addnodes.desc_signature_line()
        signode += nodes.Text("Dataset ")
        signode += addnodes.desc_name(text=ds.id)
        signode["ids"].append("dataset" + "-" + ds.id)
        signodes.append(signode)

        content = addnodes.desc_content()
        desc.append(content)

        if ds.configtype:
            # Cross-reference to the experimaestro configuration class
            ctype = ds.configtype
            name = f"{ctype.__module__}.{ctype.__qualname__}"

            # NOTE(review): ``te`` is built but never attached to the tree --
            # dead code, or a lost "Experimaestro type:" label? Confirm.
            te = nodes.paragraph()
            te.append(nodes.Text("Experimaestro type: "))

            p = nodes.paragraph()
            returns = addnodes.desc_returns()
            xref = addnodes.pending_xref(
                "",
                nodes.Text(name),
                refdomain="py",
                reftype="class",
                reftarget=name,
            )
            returns.append(xref)
            p.append(returns)

            content.append(p)

        # node.append(nodes.Text(ds.id))
        if ds.name:
            # Human-readable dataset name, in bold
            content.append(
                nodes.paragraph("", "", nodes.strong("", nodes.Text(ds.name)))
            )

        if ds.tags or ds.tasks:
            if ds.tags:
                content.append(
                    nodes.paragraph(
                        "",
                        "",
                        nodes.strong("", nodes.Text("Tags: ")),
                        nodes.Text(", ".join(ds.tags)),
                    )
                )
            if ds.tasks:
                content.append(
                    nodes.paragraph(
                        "",
                        "",
                        nodes.strong("", "Tasks: "),
                        nodes.Text(", ".join(ds.tasks)),
                    )
                )

        if ds.url:
            # External link paragraph pointing at the dataset home page
            href = nodes.reference(refuri=ds.url)
            href.append(nodes.Text(ds.url))
            p = nodes.paragraph()
            p.append(nodes.Text("External link: "))
            p.append(href)
            content.append(p)

        if ds.description:
            # Dataset descriptions are Markdown; render through myst-parser
            content.extend(to_docutils(ds.description))

        return desc
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class RepositoryDirective(DatasetsDirective):
    """Generates the document for a whole repository

    One section is emitted per dataset module, each listing the module's
    datasets. The single required argument is the repository identifier.
    """

    has_content = True
    required_arguments = 1
    optional_arguments = 0

    def run(self):
        (repository_id,) = self.arguments
        # Mock autodoc imports so dataset modules import cleanly at doc build
        with mock(self.config.autodoc_mock_imports):
            repository = datamaestro.Context.instance().repository(repository_id)  # type: Optional[datamaestro.Repository]
            assert repository is not None

            docnodes = []
            for module in repository.modules():
                # One titled section per dataset module
                section = nodes.section(
                    ids=[f"dm-datasets-{repository_id}-{module.id}"]
                )
                docnodes.append(section)

                section += nodes.title("", nodes.Text(module.title))
                section += nodes.paragraph()
                if module.description:
                    # Module descriptions are Markdown
                    section += to_docutils(module.description).children

                for ds in iter(module):
                    section += self.dataset_desc(ds)

        return docnodes
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class DatasetDirective(DatasetsDirective):
    """Documents the datasets of one module.

    Arguments: ``<module> [<repository>]``; when the repository argument is
    omitted, the ``datamaestro_repository`` config value is used instead.
    """

    has_content = True
    required_arguments = 1
    optional_arguments = 1

    def run(self):
        # --- Retrieve the datasets
        if len(self.arguments) == 2:
            module_name, repository_name = self.arguments
        else:
            (module_name,) = self.arguments
            repository_name = self.env.config["datamaestro_repository"]

        datasets = None
        with mock(self.config.autodoc_mock_imports):
            # Find the first repository (matching by id when one was given)
            # that provides this module
            for repository in datamaestro.Context.instance().repositories():
                if repository_name is None or repository.id == repository_name:
                    datasets = repository.datasets(module_name)
                    if datasets is not None:
                        break

        assert datasets is not None

        # --- Start documenting

        docnodes = []
        # node.document = self.state.document
        if datasets.description:
            # Module description (Markdown)
            docnodes.extend(to_docutils(datasets.description))

        for ds in datasets:
            docnodes.append(self.dataset_desc(ds))
        return docnodes
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
class DatamaestroDomain(Domain):
    """Sphinx domain ("dm") registering datamaestro dataset anchors.

    Stores, per dataset id, the document name and anchor so that
    ``:dm:ref:`` roles can be resolved to the dataset description.
    """

    name = "dm"
    object_types = {
        "dataset": ObjType(_("dataset"), "ds"),
    }
    directives = {
        "repository": RepositoryDirective,
        "datasets": DatasetDirective,
    }
    roles = {"ref": XRefRole()}
    # NOTE(review): Sphinx documents ``indices`` as a list of Index classes;
    # an empty dict happens to iterate the same way -- confirm before filling.
    indices = {
        # TODO: Add indices for tags and tasks
    }
    initial_data: Dict[str, Dict[str, Tuple[str, str]]] = {
        "datasets": {},  # fullname -> dataset
        "tags": {},  # tag -> list of datasets,
        "tasks": {},  # task name -> list of datasets
    }

    def add_dataset(self, dsid):
        """Register dataset *dsid* as anchored in the current document."""
        self.data["datasets"][dsid] = (self.env.docname, f"dataset-{dsid}")

    def resolve_xref(self, env, fromdocname, builder, typ, target, node, contnode):
        """Resolve a dm cross-reference to its registered dataset anchor."""
        # Fix: the original passed ``target`` as a %-style argument with no
        # placeholder in the format string, so it was never rendered.
        logging.debug("[dm/sphinx] Searching for %s", target)

        ref = self.data["datasets"].get(target, None)
        if ref:
            docname, targ = ref
            return make_refnode(builder, fromdocname, docname, targ, contnode, targ)
        return None
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def setup(app: Sphinx) -> Dict[str, Any]:
    """Setup experimaestro for Sphinx documentation"""

    app.add_domain(DatamaestroDomain)
    app.add_node(DatasetNode)

    # Default repository for the dm:datasets directive when no explicit
    # repository argument is given; True => rebuild environment on change.
    app.add_config_value("datamaestro_repository", None, True)

    # NOTE(review): Sphinx expects "version" to be a string; confirm that
    # ``datamaestro.version`` resolves to one (a version.py module was added
    # in this release).
    return {"version": datamaestro.version, "parallel_read_safe": True}
|
datamaestro/stream/__init__.py
CHANGED
datamaestro/stream/lines.py
CHANGED
|
@@ -28,7 +28,8 @@ class LineTransformStream(io.RawIOBase):
|
|
|
28
28
|
self.current = self.transform(line).encode("utf-8")
|
|
29
29
|
|
|
30
30
|
def readinto(self, b):
|
|
31
|
-
"""Read bytes into a pre-allocated, writable bytes-like object b and
|
|
31
|
+
"""Read bytes into a pre-allocated, writable bytes-like object b and
|
|
32
|
+
return the number of bytes read"""
|
|
32
33
|
if self.current is None:
|
|
33
34
|
return 0
|
|
34
35
|
|
|
@@ -41,12 +42,14 @@ class LineTransformStream(io.RawIOBase):
|
|
|
41
42
|
return offset
|
|
42
43
|
|
|
43
44
|
# How many bytes to read from current line
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
b[offset : (offset +
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
45
|
+
chunk_len = min(lb, len(self.current) - self.offset)
|
|
46
|
+
|
|
47
|
+
b[offset : (offset + chunk_len)] = self.current[
|
|
48
|
+
self.offset : (self.offset + chunk_len)
|
|
49
|
+
]
|
|
50
|
+
lb -= chunk_len
|
|
51
|
+
offset += chunk_len
|
|
52
|
+
self.offset += chunk_len
|
|
50
53
|
|
|
51
54
|
return offset
|
|
52
55
|
|
datamaestro/templates/dataset.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# See documentation on
|
|
1
|
+
# See documentation on https://datamaestro.readthedocs.io
|
|
2
2
|
|
|
3
3
|
from datamaestro.definitions import datatasks, datatags, dataset
|
|
4
4
|
from datamaestro.data import Base
|
|
@@ -7,11 +7,12 @@ from datamaestro.data import Base
|
|
|
7
7
|
@datatags("tag1", "tag2")
|
|
8
8
|
@datatasks("task1", "task2")
|
|
9
9
|
@dataset(
|
|
10
|
-
Base,
|
|
10
|
+
Base,
|
|
11
|
+
url="__URL__",
|
|
11
12
|
)
|
|
12
13
|
def __IDENTIFIER__():
|
|
13
14
|
"""Line description
|
|
14
15
|
|
|
15
|
-
|
|
16
|
-
|
|
16
|
+
Long description
|
|
17
|
+
"""
|
|
17
18
|
return {}
|
datamaestro/test/__init__.py
CHANGED
datamaestro/test/checks.py
CHANGED
datamaestro/test/conftest.py
CHANGED
|
@@ -1,13 +1,8 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
|
-
import contextlib
|
|
3
|
-
import unittest
|
|
4
|
-
import tempfile
|
|
5
2
|
from datamaestro import Repository, Context
|
|
6
3
|
import shutil
|
|
7
4
|
import logging
|
|
8
5
|
import pytest
|
|
9
|
-
import os
|
|
10
|
-
import shutil
|
|
11
6
|
|
|
12
7
|
|
|
13
8
|
class MyRepository(Repository):
|
|
@@ -27,7 +22,7 @@ def context(tmp_path_factory):
|
|
|
27
22
|
context = Context(Path(dir))
|
|
28
23
|
logging.info("Created datamaestro test directory %s", dir)
|
|
29
24
|
|
|
30
|
-
|
|
25
|
+
_repository = MyRepository(context) # noqa: F841 - registered on creation
|
|
31
26
|
|
|
32
27
|
yield context
|
|
33
28
|
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
from datamaestro.annotations.agreement import useragreement
|
|
2
2
|
from datamaestro.definitions import AbstractDataset
|
|
3
|
-
from .conftest import repository
|
|
4
3
|
|
|
5
4
|
|
|
6
5
|
def test_useragreements(context):
|
|
7
6
|
# Fake dataset
|
|
8
7
|
class t(AbstractDataset):
|
|
9
|
-
|
|
8
|
+
def _prepare(self):
|
|
9
|
+
pass
|
|
10
10
|
|
|
11
11
|
useragreement("test")(t(None))
|
|
@@ -1,9 +1,5 @@
|
|
|
1
|
-
import unittest
|
|
2
|
-
import logging
|
|
3
1
|
from pathlib import Path
|
|
4
|
-
import shutil
|
|
5
2
|
import datamaestro.download.single as single
|
|
6
|
-
from datamaestro import Repository, Context
|
|
7
3
|
from datamaestro.definitions import AbstractDataset
|
|
8
4
|
from .conftest import MyRepository
|
|
9
5
|
|
|
@@ -16,6 +12,9 @@ class Dataset(AbstractDataset):
|
|
|
16
12
|
super().__init__(repository)
|
|
17
13
|
self.datapath = Path(repository.context._path)
|
|
18
14
|
|
|
15
|
+
def _prepare(self):
|
|
16
|
+
pass
|
|
17
|
+
|
|
19
18
|
|
|
20
19
|
def test_filedownloader(context):
|
|
21
20
|
repository = MyRepository(context)
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import pickle
|
|
2
|
+
from datamaestro.record import Item, record_type
|
|
3
|
+
from attrs import define
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@define
class AItem(Item):
    """Root item of the "A" family (records key entries by this base)."""

    a: int
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@define
class A1Item(AItem):
    """Specialization of AItem; shares AItem's base within a record."""

    a1: int
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@define
class BItem(Item):
    """Root item of the "B" family."""

    b: int
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@define
class B1Item(BItem):
    """Specialization of BItem; shares BItem's base within a record."""

    b1: int
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@define
class CItem(Item):
    """Item family not registered in any record type below."""

    c: int
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Record type hierarchy used by the tests below
ARecord = record_type(AItem)
BaseRecord = ARecord.sub(A1Item)  # A1Item replaces AItem (same base)
MyRecord = BaseRecord.sub(BItem)  # adds a second item family
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_record_simple():
    """A record resolves items both by their base and their concrete type."""
    a_item = A1Item(1, 2)
    b_item = BItem(4)
    rec = MyRecord(a_item, b_item)
    assert rec[AItem] is a_item
    assert rec[A1Item] is a_item
    assert rec[BItem] is b_item
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_record_missing_init():
    """Validation fails when a declared item is absent or too generic."""
    # AItem(1) is not an A1Item instance, so A1Item is considered missing
    with pytest.raises(KeyError):
        MyRecord(AItem(1), BItem(2))

    # BItem is missing entirely
    with pytest.raises(KeyError):
        MyRecord(A1Item(1, 2))
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_record_update():
    """update() returns a new record with the replaced item."""
    first = A1Item(1, 2)
    second = BItem(4)
    original = MyRecord(first, second)

    updated = original.update(BItem(3))
    assert updated is not original
    assert updated[BItem] is not second
|
|
64
|
+
|
|
65
|
+
def test_record_pickled():
    # Re-create an equivalent record type, then check that a record built
    # from it survives a pickle round-trip with its items intact.
    MyRecord2 = BaseRecord.sub(BItem)
    r = MyRecord2(A1Item(1, 2), BItem(2))
    r = pickle.loads(pickle.dumps(r))

    assert r[A1Item].a == 1
    assert r[BItem].b == 2
|