datamaestro 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro/__init__.py +1 -2
- datamaestro/__main__.py +11 -7
- datamaestro/commands/site.py +16 -5
- datamaestro/context.py +32 -16
- datamaestro/data/ml.py +1 -0
- datamaestro/definitions.py +246 -20
- datamaestro/download/__init__.py +583 -40
- datamaestro/download/archive.py +120 -76
- datamaestro/download/custom.py +38 -6
- datamaestro/download/huggingface.py +46 -14
- datamaestro/download/links.py +106 -49
- datamaestro/download/multiple.py +27 -5
- datamaestro/download/single.py +111 -54
- datamaestro/download/sync.py +0 -1
- datamaestro/download/todo.py +9 -4
- datamaestro/download/wayback.py +3 -3
- datamaestro/record.py +48 -2
- datamaestro/settings.py +2 -1
- datamaestro/sphinx.py +1 -3
- datamaestro/stream/lines.py +8 -6
- datamaestro/test/__init__.py +3 -1
- datamaestro/test/conftest.py +1 -2
- datamaestro/test/test_resource.py +1388 -0
- datamaestro/utils.py +7 -6
- datamaestro/v2.md +301 -0
- datamaestro/version.py +4 -21
- {datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/METADATA +63 -94
- datamaestro-1.7.0.dist-info/RECORD +49 -0
- {datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
- datamaestro-1.5.0.dist-info/RECORD +0 -48
- datamaestro-1.5.0.dist-info/top_level.txt +0 -1
- {datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -0
- {datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/licenses/LICENSE +0 -0
datamaestro/download/multiple.py
CHANGED
|
@@ -1,13 +1,31 @@
|
|
|
1
|
+
"""Multiple download resources (legacy).
|
|
2
|
+
|
|
3
|
+
Note: This module uses a legacy API pattern and needs deeper refactoring.
|
|
4
|
+
The List and Datasets classes use an older constructor signature that
|
|
5
|
+
differs from the modern Resource interface.
|
|
6
|
+
"""
|
|
7
|
+
|
|
1
8
|
import logging
|
|
2
|
-
from pathlib import Path
|
|
3
9
|
import os
|
|
10
|
+
import warnings
|
|
11
|
+
from pathlib import Path
|
|
4
12
|
|
|
5
|
-
from datamaestro import AbstractDataset
|
|
13
|
+
from datamaestro.definitions import AbstractDataset
|
|
6
14
|
from datamaestro.download import Download
|
|
7
15
|
|
|
16
|
+
warnings.warn(
|
|
17
|
+
"datamaestro.download.multiple uses a legacy API. "
|
|
18
|
+
"Consider migrating to class-attribute resource definitions.",
|
|
19
|
+
DeprecationWarning,
|
|
20
|
+
stacklevel=2,
|
|
21
|
+
)
|
|
22
|
+
|
|
8
23
|
|
|
9
24
|
class List(Download):
|
|
10
|
-
"""Download multiple files or directories given by a list
|
|
25
|
+
"""Download multiple files or directories given by a list.
|
|
26
|
+
|
|
27
|
+
Legacy: uses old-style constructor API.
|
|
28
|
+
"""
|
|
11
29
|
|
|
12
30
|
def __init__(self, dataset: AbstractDataset, definition: object):
|
|
13
31
|
super().__init__(dataset, definition)
|
|
@@ -32,7 +50,10 @@ class List(Download):
|
|
|
32
50
|
|
|
33
51
|
|
|
34
52
|
class Datasets(Download):
|
|
35
|
-
"""Use links to dataset files
|
|
53
|
+
"""Use links to dataset files.
|
|
54
|
+
|
|
55
|
+
Legacy: uses old-style constructor API.
|
|
56
|
+
"""
|
|
36
57
|
|
|
37
58
|
def __init__(self, dataset: AbstractDataset, definition: object):
|
|
38
59
|
super().__init__(dataset, definition)
|
|
@@ -48,7 +69,8 @@ class Datasets(Download):
|
|
|
48
69
|
if isinstance(files, Path):
|
|
49
70
|
if not files.is_dir():
|
|
50
71
|
raise AssertionError(
|
|
51
|
-
"Dataset path is not a directory: %s",
|
|
72
|
+
"Dataset path is not a directory: %s",
|
|
73
|
+
files,
|
|
52
74
|
)
|
|
53
75
|
path = destination / key
|
|
54
76
|
if not path.exists():
|
datamaestro/download/single.py
CHANGED
|
@@ -1,64 +1,83 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
1
|
+
"""Single file download resources.
|
|
2
|
+
|
|
3
|
+
Provides FileResource subclasses for downloading individual files
|
|
4
|
+
from URLs, with optional transforms and integrity checking.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
5
9
|
import io
|
|
6
10
|
import gzip
|
|
7
|
-
import
|
|
11
|
+
import logging
|
|
8
12
|
import os
|
|
9
|
-
import
|
|
13
|
+
import os.path as op
|
|
14
|
+
import shutil
|
|
15
|
+
import tarfile
|
|
10
16
|
from pathlib import Path
|
|
11
|
-
|
|
12
|
-
|
|
17
|
+
|
|
18
|
+
import urllib3
|
|
19
|
+
|
|
20
|
+
from datamaestro.download import FileResource
|
|
13
21
|
from datamaestro.stream import Transform
|
|
14
|
-
from datamaestro.
|
|
22
|
+
from datamaestro.utils import copyfileobjs
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
15
25
|
|
|
16
26
|
|
|
17
27
|
def open_ext(*args, **kwargs):
    """Open a file, picking the opener from the file name's extension.

    Args:
        *args: Positional arguments forwarded to the opener. The first one
            is the file name; ``endswith`` is called on it, so it must be
            a string.
        **kwargs: Keyword arguments forwarded unchanged to the opener.

    Returns:
        The object returned by ``gzip.open`` when the name ends with
        ``.gz``, otherwise the object returned by ``io.open``.
    """
    name = args[0]
    if name.endswith(".gz"):
        # Must be ``**kwargs``: a single star would unpack the dictionary's
        # *keys* as extra positional arguments (e.g. mode="rt" would reach
        # gzip.open as the positional string "mode") and drop the values.
        return gzip.open(*args, **kwargs)
    return io.open(*args, **kwargs)
|
|
23
33
|
|
|
24
34
|
|
|
25
|
-
class
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
35
|
+
class FileDownloader(FileResource):
|
|
36
|
+
"""Downloads a single file from a URL.
|
|
37
|
+
|
|
38
|
+
Supports optional transforms (e.g., gzip decompression)
|
|
39
|
+
and integrity checking.
|
|
29
40
|
|
|
30
|
-
|
|
31
|
-
def path(self):
|
|
32
|
-
return self.definition.datapath / self.name
|
|
41
|
+
Usage as class attribute (preferred)::
|
|
33
42
|
|
|
34
|
-
|
|
35
|
-
|
|
43
|
+
@dataset(url="...")
|
|
44
|
+
class MyDataset(Base):
|
|
45
|
+
DATA = FileDownloader.apply(
|
|
46
|
+
"data.csv", "http://example.com/data.csv.gz"
|
|
47
|
+
)
|
|
36
48
|
|
|
37
|
-
|
|
38
|
-
if not self.path.is_file() and not force:
|
|
39
|
-
self._download(self.path)
|
|
49
|
+
Usage as decorator (deprecated)::
|
|
40
50
|
|
|
51
|
+
@filedownloader("data.csv", "http://example.com/data.csv.gz")
|
|
52
|
+
@dataset(Base)
|
|
53
|
+
def my_dataset(data): ...
|
|
54
|
+
"""
|
|
41
55
|
|
|
42
|
-
class filedownloader(SingleDownload):
|
|
43
56
|
def __init__(
|
|
44
57
|
self,
|
|
45
58
|
filename: str,
|
|
46
59
|
url: str,
|
|
47
|
-
size: int = None,
|
|
48
|
-
transforms:
|
|
60
|
+
size: int | None = None,
|
|
61
|
+
transforms: Transform | None = None,
|
|
49
62
|
checker=None,
|
|
63
|
+
*,
|
|
64
|
+
varname: str | None = None,
|
|
65
|
+
transient: bool = False,
|
|
50
66
|
):
|
|
51
|
-
"""Downloads a file given by a URL
|
|
52
|
-
|
|
53
|
-
:param filename: The filename within the data folder; the variable name
|
|
54
|
-
corresponds to the filename without the extension.
|
|
55
|
-
|
|
56
|
-
:param url: The URL to download.
|
|
57
|
-
|
|
58
|
-
:param transforms: Transform the file before storing it size: size in
|
|
59
|
-
bytes (or None)
|
|
60
67
|
"""
|
|
61
|
-
|
|
68
|
+
Args:
|
|
69
|
+
filename: The filename within the data folder; the variable
|
|
70
|
+
name corresponds to the filename without the extension.
|
|
71
|
+
url: The URL to download.
|
|
72
|
+
size: Expected size in bytes (or None).
|
|
73
|
+
transforms: Transform the file before storing it.
|
|
74
|
+
Auto-detected from URL path if None.
|
|
75
|
+
checker: File integrity checker.
|
|
76
|
+
varname: Explicit resource name.
|
|
77
|
+
transient: If True, data can be deleted after dependents
|
|
78
|
+
complete.
|
|
79
|
+
"""
|
|
80
|
+
super().__init__(filename, varname=varname, transient=transient)
|
|
62
81
|
self.url = url
|
|
63
82
|
self.checker = checker
|
|
64
83
|
self.size = size
|
|
@@ -67,8 +86,8 @@ class filedownloader(SingleDownload):
|
|
|
67
86
|
path = Path(Path(p.path).name)
|
|
68
87
|
self.transforms = transforms if transforms else Transform.createFromPath(path)
|
|
69
88
|
|
|
70
|
-
def _download(self, destination):
|
|
71
|
-
|
|
89
|
+
def _download(self, destination: Path) -> None:
|
|
90
|
+
logger.info("Downloading %s into %s", self.url, destination)
|
|
72
91
|
|
|
73
92
|
# Creates directory if needed
|
|
74
93
|
dir = op.dirname(destination)
|
|
@@ -78,41 +97,69 @@ class filedownloader(SingleDownload):
|
|
|
78
97
|
with self.context.downloadURL(self.url, size=self.size) as file:
|
|
79
98
|
# Transform if need be
|
|
80
99
|
if self.transforms:
|
|
81
|
-
|
|
82
|
-
with
|
|
83
|
-
"
|
|
84
|
-
|
|
100
|
+
logger.info("Transforming file")
|
|
101
|
+
with (
|
|
102
|
+
self.transforms(file.path.open("rb")) as stream,
|
|
103
|
+
destination.open("wb") as out,
|
|
104
|
+
):
|
|
85
105
|
if self.checker:
|
|
86
106
|
copyfileobjs(stream, [out, self.checker])
|
|
87
107
|
self.checker.close()
|
|
88
108
|
else:
|
|
89
109
|
shutil.copyfileobj(stream, out)
|
|
90
110
|
else:
|
|
91
|
-
|
|
111
|
+
logger.info("Keeping original downloaded file %s", file.path)
|
|
92
112
|
if self.checker:
|
|
93
113
|
self.checker.check(file.path)
|
|
94
114
|
(shutil.copy if file.keep else shutil.move)(file.path, destination)
|
|
95
115
|
|
|
96
|
-
|
|
116
|
+
logger.info("Created file %s", destination)
|
|
97
117
|
|
|
98
118
|
|
|
99
|
-
|
|
100
|
-
|
|
119
|
+
# Factory alias for backward compat and convenient usage
|
|
120
|
+
filedownloader = FileDownloader.apply
|
|
101
121
|
|
|
102
|
-
def __init__(self, filename: str, url: str, transforms=None):
|
|
103
|
-
"""Concat the files in an archive
|
|
104
122
|
|
|
123
|
+
class ConcatDownloader(FileResource):
|
|
124
|
+
"""Concatenate all files from an archive into a single file.
|
|
125
|
+
|
|
126
|
+
Usage as class attribute (preferred)::
|
|
127
|
+
|
|
128
|
+
@dataset(url="...")
|
|
129
|
+
class MyDataset(Base):
|
|
130
|
+
DATA = ConcatDownloader.apply(
|
|
131
|
+
"data.txt", "http://example.com/data.tar.gz"
|
|
132
|
+
)
|
|
133
|
+
"""
|
|
134
|
+
|
|
135
|
+
def __init__(
|
|
136
|
+
self,
|
|
137
|
+
filename: str,
|
|
138
|
+
url: str,
|
|
139
|
+
transforms=None,
|
|
140
|
+
*,
|
|
141
|
+
varname: str | None = None,
|
|
142
|
+
transient: bool = False,
|
|
143
|
+
):
|
|
144
|
+
"""
|
|
105
145
|
Args:
|
|
106
|
-
filename: The filename within the data folder; the variable
|
|
107
|
-
|
|
108
|
-
|
|
146
|
+
filename: The filename within the data folder; the variable
|
|
147
|
+
name corresponds to the filename without the extension.
|
|
148
|
+
url: The URL to download.
|
|
149
|
+
transforms: Transform the file before storing it.
|
|
150
|
+
varname: Explicit resource name.
|
|
151
|
+
transient: If True, data can be deleted after dependents
|
|
152
|
+
complete.
|
|
109
153
|
"""
|
|
110
|
-
super().__init__(filename)
|
|
154
|
+
super().__init__(filename, varname=varname, transient=transient)
|
|
111
155
|
self.url = url
|
|
112
156
|
self.transforms = transforms
|
|
113
157
|
|
|
114
|
-
def _download(self, destination):
|
|
115
|
-
with
|
|
158
|
+
def _download(self, destination: Path) -> None:
|
|
159
|
+
with (
|
|
160
|
+
self.context.downloadURL(self.url) as dl,
|
|
161
|
+
tarfile.open(dl.path) as archive,
|
|
162
|
+
):
|
|
116
163
|
destination.parent.mkdir(parents=True, exist_ok=True)
|
|
117
164
|
|
|
118
165
|
with open(destination, "wb") as out:
|
|
@@ -121,6 +168,16 @@ class concatdownload(SingleDownload):
|
|
|
121
168
|
transforms = self.transforms or Transform.createFromPath(
|
|
122
169
|
Path(tarinfo.name)
|
|
123
170
|
)
|
|
124
|
-
|
|
171
|
+
logger.debug("Processing file %s", tarinfo.name)
|
|
125
172
|
with transforms(archive.fileobject(archive, tarinfo)) as fp:
|
|
126
173
|
shutil.copyfileobj(fp, out)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# Factory alias for backward compat
|
|
177
|
+
concatdownload = ConcatDownloader.apply
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
# --- Backward compat aliases ---
|
|
181
|
+
# Keep old class names importable but they now point to new classes
|
|
182
|
+
|
|
183
|
+
SingleDownload = FileDownloader
|
datamaestro/download/sync.py
CHANGED
datamaestro/download/todo.py
CHANGED
|
@@ -1,10 +1,15 @@
|
|
|
1
|
-
from
|
|
1
|
+
from datamaestro.download import Resource
|
|
2
2
|
|
|
3
|
-
from datamaestro.download import Download
|
|
4
3
|
|
|
4
|
+
class Todo(Resource):
|
|
5
|
+
"""Placeholder resource indicating download is not yet implemented."""
|
|
5
6
|
|
|
6
|
-
|
|
7
|
-
def download(self, destination: Path):
|
|
7
|
+
def download(self, force=False):
|
|
8
8
|
raise NotImplementedError(
|
|
9
9
|
"Download method not defined - please edit the definition file"
|
|
10
10
|
)
|
|
11
|
+
|
|
12
|
+
def prepare(self):
|
|
13
|
+
raise NotImplementedError(
|
|
14
|
+
"Prepare method not defined - please edit the definition file"
|
|
15
|
+
)
|
datamaestro/download/wayback.py
CHANGED
|
@@ -129,12 +129,12 @@ class wayback_documents(Resource):
|
|
|
129
129
|
self.urls_fn = urls_fn
|
|
130
130
|
|
|
131
131
|
def prepare(self):
|
|
132
|
-
return self.
|
|
132
|
+
return self.dataset.datapath / self.name
|
|
133
133
|
|
|
134
134
|
def download(self, force=False):
|
|
135
135
|
# Creates directory if needed
|
|
136
|
-
destination: Path = self.
|
|
137
|
-
self.
|
|
136
|
+
destination: Path = self.dataset.datapath / self.name
|
|
137
|
+
self.dataset.datapath.mkdir(exist_ok=True)
|
|
138
138
|
|
|
139
139
|
# Early exit
|
|
140
140
|
done_path = destination.with_suffix(".done")
|
datamaestro/record.py
CHANGED
|
@@ -1,5 +1,51 @@
|
|
|
1
|
+
"""Record module for type-safe heterogeneous containers.
|
|
2
|
+
|
|
3
|
+
.. deprecated:: 2.0
|
|
4
|
+
This module will be removed in v2. Use :class:`typing.TypedDict` instead
|
|
5
|
+
for type-safe heterogeneous data structures. TypedDict provides better IDE
|
|
6
|
+
support, type checking, and is part of the standard library.
|
|
7
|
+
|
|
8
|
+
When using TypedDict, define key constants in classes (e.g., ``MyItem.ID``)
|
|
9
|
+
to avoid typos and enable IDE autocomplete. Prefix keys with package name
|
|
10
|
+
using underscore ``_`` as delimiter to avoid conflicts between different
|
|
11
|
+
data sources.
|
|
12
|
+
|
|
13
|
+
Example migration::
|
|
14
|
+
|
|
15
|
+
# Old way (deprecated)
|
|
16
|
+
@define
|
|
17
|
+
class MyItem(Item):
|
|
18
|
+
value: int
|
|
19
|
+
|
|
20
|
+
record = Record(MyItem(42))
|
|
21
|
+
print(record[MyItem].value)
|
|
22
|
+
|
|
23
|
+
# New way (recommended)
|
|
24
|
+
from typing import TypedDict
|
|
25
|
+
|
|
26
|
+
# Define key constants in classes
|
|
27
|
+
class MyItem:
|
|
28
|
+
ID = "mypackage_value"
|
|
29
|
+
|
|
30
|
+
class MyRecord(TypedDict):
|
|
31
|
+
mypackage_value: int
|
|
32
|
+
|
|
33
|
+
data: MyRecord = {MyItem.ID: 42}
|
|
34
|
+
print(data[MyItem.ID])
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
import warnings
|
|
1
38
|
from typing import Type, TypeVar, Dict, Union, Optional
|
|
2
39
|
|
|
40
|
+
# Emit deprecation warning when module is imported
|
|
41
|
+
warnings.warn(
|
|
42
|
+
"The datamaestro.record module is deprecated and will be removed in v2. "
|
|
43
|
+
"Use typing.TypedDict instead (use class constants like MyItem.ID for keys, "
|
|
44
|
+
"prefixed with package name).",
|
|
45
|
+
DeprecationWarning,
|
|
46
|
+
stacklevel=2,
|
|
47
|
+
)
|
|
48
|
+
|
|
3
49
|
|
|
4
50
|
class Item:
|
|
5
51
|
"""Base class for all item types"""
|
|
@@ -28,8 +74,8 @@ class RecordType:
|
|
|
28
74
|
self.mapping = {item_type.__get_base__(): item_type for item_type in item_types}
|
|
29
75
|
|
|
30
76
|
def __repr__(self):
|
|
31
|
-
|
|
32
|
-
|
|
77
|
+
names = ",".join(item_type.__name__ for item_type in self.item_types)
|
|
78
|
+
return f"Record({names})"
|
|
33
79
|
|
|
34
80
|
def contains(self, other: "RecordType"):
|
|
35
81
|
"""Checks that each item type in other has an item type of a compatible
|
datamaestro/settings.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
"""Global and user settings utility classes"""
|
|
2
|
+
|
|
2
3
|
import marshmallow as mm
|
|
3
4
|
from typing import Dict, Any
|
|
4
|
-
from experimaestro.utils.settings import JsonSettings
|
|
5
|
+
from experimaestro.utils.settings import JsonSettings
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
|
|
7
8
|
# --- Global settings
|
datamaestro/sphinx.py
CHANGED
|
@@ -125,9 +125,7 @@ class RepositoryDirective(DatasetsDirective):
|
|
|
125
125
|
def run(self):
|
|
126
126
|
(repository_id,) = self.arguments
|
|
127
127
|
with mock(self.config.autodoc_mock_imports):
|
|
128
|
-
repository = datamaestro.Context.instance().repository(
|
|
129
|
-
repository_id
|
|
130
|
-
) # type: Optional[datamaestro.Repository]
|
|
128
|
+
repository = datamaestro.Context.instance().repository(repository_id) # type: Optional[datamaestro.Repository]
|
|
131
129
|
assert repository is not None
|
|
132
130
|
|
|
133
131
|
docnodes = []
|
datamaestro/stream/lines.py
CHANGED
|
@@ -42,12 +42,14 @@ class LineTransformStream(io.RawIOBase):
|
|
|
42
42
|
return offset
|
|
43
43
|
|
|
44
44
|
# How many bytes to read from current line
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
b[offset : (offset +
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
45
|
+
chunk_len = min(lb, len(self.current) - self.offset)
|
|
46
|
+
|
|
47
|
+
b[offset : (offset + chunk_len)] = self.current[
|
|
48
|
+
self.offset : (self.offset + chunk_len)
|
|
49
|
+
]
|
|
50
|
+
lb -= chunk_len
|
|
51
|
+
offset += chunk_len
|
|
52
|
+
self.offset += chunk_len
|
|
51
53
|
|
|
52
54
|
return offset
|
|
53
55
|
|
datamaestro/test/__init__.py
CHANGED
datamaestro/test/conftest.py
CHANGED
|
@@ -3,7 +3,6 @@ from datamaestro import Repository, Context
|
|
|
3
3
|
import shutil
|
|
4
4
|
import logging
|
|
5
5
|
import pytest
|
|
6
|
-
import shutil
|
|
7
6
|
|
|
8
7
|
|
|
9
8
|
class MyRepository(Repository):
|
|
@@ -23,7 +22,7 @@ def context(tmp_path_factory):
|
|
|
23
22
|
context = Context(Path(dir))
|
|
24
23
|
logging.info("Created datamaestro test directory %s", dir)
|
|
25
24
|
|
|
26
|
-
|
|
25
|
+
_repository = MyRepository(context) # noqa: F841 - registered on creation
|
|
27
26
|
|
|
28
27
|
yield context
|
|
29
28
|
|