datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro/__init__.py +11 -7
- datamaestro/__main__.py +29 -8
- datamaestro/annotations/__init__.py +1 -1
- datamaestro/annotations/agreement.py +9 -3
- datamaestro/commands/site.py +27 -15
- datamaestro/context.py +143 -87
- datamaestro/data/__init__.py +23 -11
- datamaestro/data/csv.py +12 -12
- datamaestro/data/huggingface.py +25 -0
- datamaestro/data/ml.py +19 -10
- datamaestro/data/tensor.py +32 -24
- datamaestro/definitions.py +492 -131
- datamaestro/download/__init__.py +610 -24
- datamaestro/download/archive.py +129 -77
- datamaestro/download/custom.py +53 -0
- datamaestro/download/huggingface.py +77 -0
- datamaestro/download/links.py +106 -50
- datamaestro/download/multiple.py +27 -5
- datamaestro/download/single.py +114 -51
- datamaestro/download/sync.py +0 -1
- datamaestro/download/todo.py +9 -4
- datamaestro/download/wayback.py +164 -0
- datamaestro/record.py +232 -0
- datamaestro/registry.py +1 -0
- datamaestro/search.py +1 -1
- datamaestro/settings.py +3 -1
- datamaestro/sphinx.py +224 -0
- datamaestro/stream/__init__.py +0 -2
- datamaestro/stream/lines.py +10 -7
- datamaestro/templates/dataset.py +5 -4
- datamaestro/test/__init__.py +3 -1
- datamaestro/test/checks.py +1 -5
- datamaestro/test/conftest.py +1 -6
- datamaestro/test/test_annotations.py +2 -2
- datamaestro/test/test_download_handlers.py +3 -4
- datamaestro/test/test_record.py +72 -0
- datamaestro/test/test_resource.py +1388 -0
- datamaestro/utils.py +15 -9
- datamaestro/v2.md +301 -0
- datamaestro/version.py +4 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
- datamaestro-1.7.0.dist-info/RECORD +49 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
- datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/context.cpython-38.pyc +0 -0
- datamaestro/__pycache__/context.cpython-39.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
- datamaestro/__pycache__/search.cpython-38.pyc +0 -0
- datamaestro/__pycache__/search.cpython-39.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
- datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro-0.8.1.dist-info/RECORD +0 -109
- datamaestro-0.8.1.dist-info/top_level.txt +0 -1
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
datamaestro/download/archive.py
CHANGED
|
@@ -1,37 +1,68 @@
|
|
|
1
|
+
"""Archive download resources.
|
|
2
|
+
|
|
3
|
+
Provides FolderResource subclasses for downloading and extracting
|
|
4
|
+
ZIP and TAR archives.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
1
9
|
import logging
|
|
2
|
-
|
|
3
|
-
import zipfile
|
|
10
|
+
import re
|
|
4
11
|
import shutil
|
|
5
|
-
import urllib3
|
|
6
12
|
import tarfile
|
|
7
|
-
import
|
|
8
|
-
import
|
|
9
|
-
from typing import
|
|
10
|
-
|
|
11
|
-
|
|
13
|
+
import zipfile
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Set
|
|
16
|
+
|
|
17
|
+
import urllib3
|
|
18
|
+
|
|
19
|
+
from datamaestro.download import FolderResource
|
|
20
|
+
from datamaestro.utils import CachedFile, FileChecker
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ArchiveDownloader(FolderResource):
|
|
26
|
+
"""Abstract base for all archive-related extractors.
|
|
27
|
+
|
|
28
|
+
Usage as class attribute (preferred)::
|
|
12
29
|
|
|
30
|
+
@dataset(url="...")
|
|
31
|
+
class MyDataset(Base):
|
|
32
|
+
DATA = ZipDownloader.apply(
|
|
33
|
+
"archive", "http://example.com/data.zip"
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
Usage as decorator (deprecated)::
|
|
13
37
|
|
|
14
|
-
|
|
15
|
-
|
|
38
|
+
@zipdownloader("archive", "http://example.com/data.zip")
|
|
39
|
+
@dataset(Base)
|
|
40
|
+
def my_dataset(archive): ...
|
|
41
|
+
"""
|
|
16
42
|
|
|
17
43
|
def __init__(
|
|
18
44
|
self,
|
|
19
|
-
varname,
|
|
45
|
+
varname: str,
|
|
20
46
|
url: str,
|
|
21
|
-
subpath: str = None,
|
|
22
|
-
checker: FileChecker = None,
|
|
23
|
-
files: Set[str] = None,
|
|
47
|
+
subpath: str | None = None,
|
|
48
|
+
checker: FileChecker | None = None,
|
|
49
|
+
files: Set[str] | None = None,
|
|
50
|
+
*,
|
|
51
|
+
transient: bool = False,
|
|
24
52
|
):
|
|
25
|
-
"""Downloads and extract the content of the archive
|
|
53
|
+
"""Downloads and extract the content of the archive.
|
|
26
54
|
|
|
27
55
|
Args:
|
|
28
|
-
varname: The name of the variable when defining the dataset
|
|
29
|
-
url: The archive URL
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
56
|
+
varname: The name of the variable when defining the dataset.
|
|
57
|
+
url: The archive URL.
|
|
58
|
+
subpath: A subpath in the archive; only files from this
|
|
59
|
+
subpath will be extracted.
|
|
60
|
+
checker: The hash check for the downloaded file.
|
|
61
|
+
files: A set of files; if present, only extract those.
|
|
62
|
+
transient: If True, data can be deleted after dependents
|
|
63
|
+
complete.
|
|
33
64
|
"""
|
|
34
|
-
super().__init__(varname)
|
|
65
|
+
super().__init__(varname=varname, transient=transient)
|
|
35
66
|
self.url = url
|
|
36
67
|
self.subpath = subpath
|
|
37
68
|
self.checker = checker
|
|
@@ -42,20 +73,33 @@ class ArchiveDownloader(Download):
|
|
|
42
73
|
def postinit(self):
|
|
43
74
|
# Define the path
|
|
44
75
|
p = urllib3.util.parse_url(self.url)
|
|
45
|
-
|
|
76
|
+
self._archive_name = self._name(Path(p.path).name)
|
|
46
77
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
78
|
+
@property
|
|
79
|
+
def path(self) -> Path:
|
|
80
|
+
"""Final path to the extracted directory."""
|
|
81
|
+
if not self._post:
|
|
82
|
+
self._post = True
|
|
83
|
+
self.postinit()
|
|
51
84
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
return self.
|
|
85
|
+
if len(self.dataset.resources) > 1:
|
|
86
|
+
return self.dataset.datapath / self._archive_name
|
|
87
|
+
return self.dataset.datapath
|
|
88
|
+
|
|
89
|
+
@property
|
|
90
|
+
def transient_path(self) -> Path:
|
|
91
|
+
"""Temporary path for extraction."""
|
|
92
|
+
if not self._post:
|
|
93
|
+
self._post = True
|
|
94
|
+
self.postinit()
|
|
95
|
+
|
|
96
|
+
if len(self.dataset.resources) > 1:
|
|
97
|
+
return self.dataset.datapath / ".downloads" / self._archive_name
|
|
98
|
+
return self.dataset.datapath / ".downloads" / self.name
|
|
55
99
|
|
|
56
100
|
@property
|
|
57
101
|
def extractall(self):
|
|
58
|
-
"""Returns whether everything can be extracted"""
|
|
102
|
+
"""Returns whether everything can be extracted."""
|
|
59
103
|
return self._files is None and self.subpath is None
|
|
60
104
|
|
|
61
105
|
def filter(self, iterable, getname):
|
|
@@ -63,91 +107,88 @@ class ArchiveDownloader(Download):
|
|
|
63
107
|
|
|
64
108
|
for info in iterable:
|
|
65
109
|
name = getname(info)
|
|
66
|
-
|
|
67
|
-
if self._files and not
|
|
110
|
+
logger.debug("Looking at %s", name)
|
|
111
|
+
if self._files and name not in self._files:
|
|
68
112
|
continue
|
|
69
113
|
|
|
70
|
-
if self.subpath and
|
|
114
|
+
if self.subpath and name.startswith(self.subpath):
|
|
71
115
|
yield info, name[L:]
|
|
72
116
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
@initialized
|
|
76
|
-
def download(self, force=False):
|
|
77
|
-
# Already downloaded
|
|
78
|
-
destination = self.definition.datapath
|
|
79
|
-
if destination.is_dir():
|
|
80
|
-
return
|
|
117
|
+
if not self.subpath:
|
|
118
|
+
yield info, name
|
|
81
119
|
|
|
82
|
-
|
|
120
|
+
def _download(self, destination: Path) -> None:
|
|
121
|
+
logger.info("Downloading %s into %s", self.url, destination)
|
|
83
122
|
|
|
84
123
|
destination.parent.mkdir(parents=True, exist_ok=True)
|
|
85
|
-
tmpdestination = destination.with_suffix(".tmp")
|
|
86
|
-
if tmpdestination.exists():
|
|
87
|
-
logging.warn("Removing temporary directory %s", tmpdestination)
|
|
88
|
-
shutil.rmtree(tmpdestination)
|
|
89
124
|
|
|
90
125
|
with self.context.downloadURL(self.url) as file:
|
|
91
126
|
if self.checker:
|
|
92
127
|
self.checker.check(file.path)
|
|
93
|
-
self.unarchive(file,
|
|
94
|
-
|
|
95
|
-
# Look at the content
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
"Moving single file/directory {} into destination {}".format(
|
|
104
|
-
path, destination
|
|
105
|
-
)
|
|
128
|
+
self.unarchive(file, destination)
|
|
129
|
+
|
|
130
|
+
# Look at the content - if single directory, unwrap
|
|
131
|
+
children = list(destination.iterdir())
|
|
132
|
+
if len(children) == 1 and children[0].is_dir():
|
|
133
|
+
single_dir = children[0]
|
|
134
|
+
logger.info(
|
|
135
|
+
"Moving single directory %s into destination %s",
|
|
136
|
+
single_dir,
|
|
137
|
+
destination,
|
|
106
138
|
)
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
shutil.
|
|
139
|
+
# Move contents up one level
|
|
140
|
+
tmp = destination.with_suffix(".unwrap")
|
|
141
|
+
shutil.move(str(single_dir), str(tmp))
|
|
142
|
+
shutil.rmtree(destination)
|
|
143
|
+
shutil.move(str(tmp), str(destination))
|
|
144
|
+
|
|
145
|
+
def unarchive(self, file, destination: Path):
|
|
146
|
+
raise NotImplementedError()
|
|
111
147
|
|
|
148
|
+
def _name(self, name: str) -> str:
|
|
149
|
+
raise NotImplementedError()
|
|
112
150
|
|
|
113
|
-
|
|
114
|
-
|
|
151
|
+
|
|
152
|
+
class ZipDownloader(ArchiveDownloader):
|
|
153
|
+
"""ZIP Archive handler."""
|
|
115
154
|
|
|
116
155
|
def _name(self, name):
|
|
117
156
|
return re.sub(r"\.zip$", "", name)
|
|
118
157
|
|
|
119
158
|
def unarchive(self, file, destination: Path):
|
|
120
|
-
|
|
159
|
+
logger.info("Unzipping file")
|
|
121
160
|
with zipfile.ZipFile(file.path) as zip:
|
|
122
161
|
if self.extractall:
|
|
123
162
|
zip.extractall(destination)
|
|
124
163
|
else:
|
|
125
164
|
for zip_info, name in self.filter(
|
|
126
|
-
zip.infolist(),
|
|
165
|
+
zip.infolist(),
|
|
166
|
+
lambda zip_info: zip_info.filename,
|
|
127
167
|
):
|
|
128
168
|
if zip_info.is_dir():
|
|
129
169
|
(destination / name).mkdir()
|
|
130
170
|
else:
|
|
131
|
-
|
|
171
|
+
logger.info(
|
|
132
172
|
"File %s (%s) to %s",
|
|
133
173
|
zip_info.filename,
|
|
134
174
|
name,
|
|
135
175
|
destination / name,
|
|
136
176
|
)
|
|
137
|
-
with
|
|
138
|
-
|
|
139
|
-
|
|
177
|
+
with (
|
|
178
|
+
zip.open(zip_info) as fp,
|
|
179
|
+
(destination / name).open("wb") as out,
|
|
180
|
+
):
|
|
140
181
|
shutil.copyfileobj(fp, out)
|
|
141
182
|
|
|
142
183
|
|
|
143
|
-
class
|
|
144
|
-
"""TAR archive handler"""
|
|
184
|
+
class TarDownloader(ArchiveDownloader):
|
|
185
|
+
"""TAR archive handler."""
|
|
145
186
|
|
|
146
187
|
def _name(self, name):
|
|
147
188
|
return re.sub(r"\.tar(\.gz|\.bz\|xz)?$", "", name)
|
|
148
189
|
|
|
149
190
|
def unarchive(self, file: CachedFile, destination: Path):
|
|
150
|
-
|
|
191
|
+
logger.info("Unarchiving file")
|
|
151
192
|
if self.subpath:
|
|
152
193
|
raise NotImplementedError()
|
|
153
194
|
|
|
@@ -159,8 +200,19 @@ class tardownloader(ArchiveDownloader):
|
|
|
159
200
|
if info.isdir():
|
|
160
201
|
(destination / name).mkdir()
|
|
161
202
|
else:
|
|
162
|
-
|
|
163
|
-
"File %s (%s) to %s",
|
|
203
|
+
logger.info(
|
|
204
|
+
"File %s (%s) to %s",
|
|
205
|
+
info.name,
|
|
206
|
+
name,
|
|
207
|
+
destination / name,
|
|
208
|
+
)
|
|
209
|
+
logger.info(
|
|
210
|
+
"Extracting into %s",
|
|
211
|
+
destination / name,
|
|
164
212
|
)
|
|
165
|
-
logging.info("Extracting into %s", destination / name)
|
|
166
213
|
tar.extract(info, destination / name)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# Factory aliases for backward compat and convenient usage
|
|
217
|
+
zipdownloader = ZipDownloader.apply
|
|
218
|
+
tardownloader = TarDownloader.apply
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Custom download resources.
|
|
2
|
+
|
|
3
|
+
Provides a Resource subclass that delegates to a user-defined
|
|
4
|
+
download function.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Protocol
|
|
11
|
+
|
|
12
|
+
from datamaestro import Context
|
|
13
|
+
from datamaestro.download import Resource
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Downloader(Protocol):
|
|
17
|
+
def __call__(self, context: Context, root: Path, *, force: bool = False):
|
|
18
|
+
pass
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class custom_download(Resource):
|
|
22
|
+
"""A resource that delegates to a user-defined download function.
|
|
23
|
+
|
|
24
|
+
Usage as class attribute (preferred)::
|
|
25
|
+
|
|
26
|
+
@dataset(url="...")
|
|
27
|
+
class MyDataset(Base):
|
|
28
|
+
DATA = custom_download(
|
|
29
|
+
"data", downloader=my_download_fn
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
Usage as decorator (deprecated)::
|
|
33
|
+
|
|
34
|
+
@custom_download("data", downloader=my_download_fn)
|
|
35
|
+
@dataset(Base)
|
|
36
|
+
def my_dataset(data): ...
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
def __init__(
|
|
40
|
+
self,
|
|
41
|
+
varname: str,
|
|
42
|
+
downloader: Downloader,
|
|
43
|
+
*,
|
|
44
|
+
transient: bool = False,
|
|
45
|
+
):
|
|
46
|
+
super().__init__(varname=varname, transient=transient)
|
|
47
|
+
self.downloader = downloader
|
|
48
|
+
|
|
49
|
+
def prepare(self):
|
|
50
|
+
return self.dataset.datapath
|
|
51
|
+
|
|
52
|
+
def download(self, force=False):
|
|
53
|
+
self.downloader(self.context, self.dataset.datapath, force=force)
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""HuggingFace Hub download resources.
|
|
2
|
+
|
|
3
|
+
Provides a ValueResource subclass for loading datasets from
|
|
4
|
+
the HuggingFace Hub.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
|
|
11
|
+
from datamaestro.download import ValueResource
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class HFDownloader(ValueResource):
|
|
17
|
+
"""Load a dataset from the HuggingFace Hub.
|
|
18
|
+
|
|
19
|
+
Usage as class attribute (preferred)::
|
|
20
|
+
|
|
21
|
+
@dataset(url="...")
|
|
22
|
+
class MyDataset(Base):
|
|
23
|
+
DATA = HFDownloader.apply(
|
|
24
|
+
"hf_data", repo_id="user/dataset"
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
Usage as decorator (deprecated)::
|
|
28
|
+
|
|
29
|
+
@hf_download("hf_data", repo_id="user/dataset")
|
|
30
|
+
@dataset(Base)
|
|
31
|
+
def my_dataset(hf_data): ...
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
def __init__(
|
|
35
|
+
self,
|
|
36
|
+
varname: str,
|
|
37
|
+
repo_id: str,
|
|
38
|
+
*,
|
|
39
|
+
data_files: str | None = None,
|
|
40
|
+
split: str | None = None,
|
|
41
|
+
transient: bool = False,
|
|
42
|
+
):
|
|
43
|
+
"""
|
|
44
|
+
Args:
|
|
45
|
+
varname: Variable name.
|
|
46
|
+
repo_id: The HuggingFace repository ID.
|
|
47
|
+
data_files: Specific data files to load.
|
|
48
|
+
split: Dataset split to load.
|
|
49
|
+
transient: If True, data can be deleted after dependents
|
|
50
|
+
complete.
|
|
51
|
+
"""
|
|
52
|
+
super().__init__(varname=varname, transient=transient)
|
|
53
|
+
self.repo_id = repo_id
|
|
54
|
+
self.data_files = data_files
|
|
55
|
+
self.split = split
|
|
56
|
+
|
|
57
|
+
def download(self, force=False):
|
|
58
|
+
try:
|
|
59
|
+
from datasets import load_dataset
|
|
60
|
+
except ModuleNotFoundError:
|
|
61
|
+
logger.error("the datasets library is not installed:")
|
|
62
|
+
logger.error("pip install datasets")
|
|
63
|
+
raise
|
|
64
|
+
|
|
65
|
+
self._dataset = load_dataset(self.repo_id, data_files=self.data_files)
|
|
66
|
+
return True
|
|
67
|
+
|
|
68
|
+
def prepare(self):
|
|
69
|
+
return {
|
|
70
|
+
"repo_id": self.repo_id,
|
|
71
|
+
"data_files": self.data_files,
|
|
72
|
+
"split": self.split,
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# Factory alias for backward compat
|
|
77
|
+
hf_download = HFDownloader.apply
|
datamaestro/download/links.py
CHANGED
|
@@ -1,30 +1,53 @@
|
|
|
1
|
+
"""Link-based resources.
|
|
2
|
+
|
|
3
|
+
Provides resources that create symlinks to other datasets or
|
|
4
|
+
user-specified paths.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
1
9
|
import logging
|
|
2
10
|
import os
|
|
3
|
-
from
|
|
4
|
-
from datamaestro.utils import deprecated
|
|
5
|
-
from datamaestro.definitions import AbstractDataset
|
|
11
|
+
from pathlib import Path
|
|
6
12
|
from typing import List
|
|
7
|
-
|
|
13
|
+
|
|
8
14
|
from datamaestro.context import ResolvablePath
|
|
9
|
-
from
|
|
10
|
-
import
|
|
11
|
-
import
|
|
15
|
+
from datamaestro.definitions import AbstractDataset
|
|
16
|
+
from datamaestro.download import Resource
|
|
17
|
+
from datamaestro.utils import deprecated
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class links(Resource):
|
|
23
|
+
"""Link with another dataset path.
|
|
12
24
|
|
|
25
|
+
Usage as class attribute (preferred)::
|
|
13
26
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
27
|
+
@dataset(url="...")
|
|
28
|
+
class MyDataset(Base):
|
|
29
|
+
DATA = links("data", ref1=other_dataset1)
|
|
17
30
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
31
|
+
Usage as decorator (deprecated)::
|
|
32
|
+
|
|
33
|
+
@links("data", ref1=other_dataset1)
|
|
34
|
+
@dataset(Base)
|
|
35
|
+
def my_dataset(data): ...
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
def __init__(
|
|
39
|
+
self,
|
|
40
|
+
varname: str,
|
|
41
|
+
*,
|
|
42
|
+
transient: bool = False,
|
|
43
|
+
**link_targets: List[AbstractDataset],
|
|
44
|
+
):
|
|
45
|
+
super().__init__(varname=varname, transient=transient)
|
|
46
|
+
self.links = link_targets
|
|
24
47
|
|
|
25
48
|
@property
|
|
26
49
|
def path(self):
|
|
27
|
-
return self.
|
|
50
|
+
return self.dataset.datapath
|
|
28
51
|
|
|
29
52
|
def prepare(self):
|
|
30
53
|
return self.path
|
|
@@ -39,24 +62,36 @@ class links(Download):
|
|
|
39
62
|
|
|
40
63
|
if not dest.exists():
|
|
41
64
|
if dest.is_symlink():
|
|
42
|
-
|
|
65
|
+
logger.info("Removing dangling symlink %s", dest)
|
|
43
66
|
dest.unlink()
|
|
44
67
|
os.symlink(path, dest)
|
|
45
68
|
|
|
69
|
+
def has_files(self):
|
|
70
|
+
return False
|
|
71
|
+
|
|
46
72
|
|
|
47
73
|
# Deprecated
|
|
48
74
|
Links = deprecated("Use @links instead of @Links", links)
|
|
49
75
|
|
|
50
76
|
|
|
51
|
-
class linkpath(
|
|
52
|
-
|
|
53
|
-
|
|
77
|
+
class linkpath(Resource):
|
|
78
|
+
"""Link to a path selected from proposals.
|
|
79
|
+
|
|
80
|
+
Usage as class attribute (preferred)::
|
|
54
81
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
82
|
+
@dataset(url="...")
|
|
83
|
+
class MyDataset(Base):
|
|
84
|
+
DATA = linkpath("data", proposals=[...])
|
|
85
|
+
"""
|
|
86
|
+
|
|
87
|
+
def __init__(
|
|
88
|
+
self,
|
|
89
|
+
varname: str,
|
|
90
|
+
proposals,
|
|
91
|
+
*,
|
|
92
|
+
transient: bool = False,
|
|
93
|
+
):
|
|
94
|
+
super().__init__(varname=varname, transient=transient)
|
|
60
95
|
self.proposals = proposals
|
|
61
96
|
|
|
62
97
|
def prepare(self):
|
|
@@ -64,62 +99,83 @@ class linkpath(Download):
|
|
|
64
99
|
|
|
65
100
|
@property
|
|
66
101
|
def path(self):
|
|
67
|
-
return self.
|
|
102
|
+
return self.dataset.datapath / self.name
|
|
68
103
|
|
|
69
|
-
def download(self,
|
|
104
|
+
def download(self, force=False):
|
|
70
105
|
if self.check(self.path):
|
|
71
106
|
return
|
|
72
107
|
|
|
73
108
|
if self.path.is_symlink():
|
|
74
|
-
|
|
109
|
+
logger.warning("Removing dangling symlink %s", self.path)
|
|
75
110
|
self.path.unlink()
|
|
76
111
|
|
|
77
112
|
path = None
|
|
78
113
|
|
|
79
114
|
for searchpath in self.proposals:
|
|
80
|
-
|
|
115
|
+
logger.info("Trying path %s", searchpath)
|
|
81
116
|
try:
|
|
82
117
|
path = ResolvablePath.resolve(self.context, searchpath)
|
|
83
118
|
if self.check(path):
|
|
84
119
|
break
|
|
85
|
-
|
|
120
|
+
logger.info("Path %s not found", path)
|
|
86
121
|
except KeyError:
|
|
87
|
-
|
|
122
|
+
logger.info("Could not expand path %s", searchpath)
|
|
88
123
|
|
|
89
124
|
# Ask the user
|
|
90
125
|
while path is None or not self.check(path):
|
|
91
|
-
path = Path(input("Path to %s: " % self.
|
|
126
|
+
path = Path(input("Path to %s: " % self.name))
|
|
92
127
|
assert path.name
|
|
93
128
|
|
|
94
|
-
|
|
129
|
+
logger.debug("Linking %s to %s", path, self.path)
|
|
95
130
|
self.path.parent.mkdir(exist_ok=True, parents=True)
|
|
96
131
|
os.symlink(path, self.path)
|
|
97
132
|
|
|
133
|
+
def check(self, path):
|
|
134
|
+
raise NotImplementedError()
|
|
135
|
+
|
|
98
136
|
|
|
99
137
|
class linkfolder(linkpath):
|
|
138
|
+
"""Link to a folder.
|
|
139
|
+
|
|
140
|
+
Usage as class attribute::
|
|
141
|
+
|
|
142
|
+
@dataset(url="...")
|
|
143
|
+
class MyDataset(Base):
|
|
144
|
+
DATA = linkfolder("data", proposals=[...])
|
|
145
|
+
"""
|
|
146
|
+
|
|
147
|
+
def __init__(
|
|
148
|
+
self,
|
|
149
|
+
varname: str,
|
|
150
|
+
proposals,
|
|
151
|
+
*,
|
|
152
|
+
transient: bool = False,
|
|
153
|
+
):
|
|
154
|
+
super().__init__(varname, proposals, transient=transient)
|
|
155
|
+
|
|
100
156
|
def check(self, path):
|
|
101
157
|
return path.is_dir()
|
|
102
158
|
|
|
103
|
-
def __init__(self, varname: str, proposals):
|
|
104
|
-
"""Link to a folder
|
|
105
159
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
proposals: List of potential paths
|
|
109
|
-
"""
|
|
110
|
-
super().__init__(varname, proposals)
|
|
160
|
+
class linkfile(linkpath):
|
|
161
|
+
"""Link to a file.
|
|
111
162
|
|
|
163
|
+
Usage as class attribute::
|
|
112
164
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
165
|
+
@dataset(url="...")
|
|
166
|
+
class MyDataset(Base):
|
|
167
|
+
DATA = linkfile("data", proposals=[...])
|
|
168
|
+
"""
|
|
116
169
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
170
|
+
def __init__(
|
|
171
|
+
self,
|
|
172
|
+
varname: str,
|
|
173
|
+
proposals,
|
|
174
|
+
*,
|
|
175
|
+
transient: bool = False,
|
|
176
|
+
):
|
|
177
|
+
super().__init__(varname, proposals, transient=transient)
|
|
122
178
|
|
|
123
179
|
def check(self, path):
|
|
124
|
-
|
|
180
|
+
logger.debug("Checking %s (exists: %s)", path, path.is_file())
|
|
125
181
|
return path.is_file()
|