datamaestro 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro/__init__.py +1 -2
- datamaestro/__main__.py +11 -7
- datamaestro/commands/site.py +16 -5
- datamaestro/context.py +32 -16
- datamaestro/data/ml.py +1 -0
- datamaestro/definitions.py +246 -20
- datamaestro/download/__init__.py +583 -40
- datamaestro/download/archive.py +120 -76
- datamaestro/download/custom.py +38 -6
- datamaestro/download/huggingface.py +46 -14
- datamaestro/download/links.py +106 -49
- datamaestro/download/multiple.py +27 -5
- datamaestro/download/single.py +111 -54
- datamaestro/download/sync.py +0 -1
- datamaestro/download/todo.py +9 -4
- datamaestro/download/wayback.py +3 -3
- datamaestro/record.py +48 -2
- datamaestro/settings.py +2 -1
- datamaestro/sphinx.py +1 -3
- datamaestro/stream/lines.py +8 -6
- datamaestro/test/__init__.py +3 -1
- datamaestro/test/conftest.py +1 -2
- datamaestro/test/test_resource.py +1388 -0
- datamaestro/utils.py +7 -6
- datamaestro/v2.md +301 -0
- datamaestro/version.py +4 -21
- {datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/METADATA +63 -94
- datamaestro-1.7.0.dist-info/RECORD +49 -0
- {datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
- datamaestro-1.5.0.dist-info/RECORD +0 -48
- datamaestro-1.5.0.dist-info/top_level.txt +0 -1
- {datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -0
- {datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/licenses/LICENSE +0 -0
datamaestro/download/archive.py
CHANGED
|
@@ -1,41 +1,68 @@
|
|
|
1
|
+
"""Archive download resources.
|
|
2
|
+
|
|
3
|
+
Provides FolderResource subclasses for downloading and extracting
|
|
4
|
+
ZIP and TAR archives.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
1
9
|
import logging
|
|
2
|
-
|
|
3
|
-
import zipfile
|
|
10
|
+
import re
|
|
4
11
|
import shutil
|
|
5
|
-
import urllib3
|
|
6
12
|
import tarfile
|
|
7
|
-
import
|
|
13
|
+
import zipfile
|
|
14
|
+
from pathlib import Path
|
|
8
15
|
from typing import Set
|
|
9
|
-
|
|
16
|
+
|
|
17
|
+
import urllib3
|
|
18
|
+
|
|
19
|
+
from datamaestro.download import FolderResource
|
|
10
20
|
from datamaestro.utils import CachedFile, FileChecker
|
|
11
21
|
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ArchiveDownloader(FolderResource):
|
|
26
|
+
"""Abstract base for all archive-related extractors.
|
|
27
|
+
|
|
28
|
+
Usage as class attribute (preferred)::
|
|
29
|
+
|
|
30
|
+
@dataset(url="...")
|
|
31
|
+
class MyDataset(Base):
|
|
32
|
+
DATA = ZipDownloader.apply(
|
|
33
|
+
"archive", "http://example.com/data.zip"
|
|
34
|
+
)
|
|
12
35
|
|
|
13
|
-
|
|
14
|
-
|
|
36
|
+
Usage as decorator (deprecated)::
|
|
37
|
+
|
|
38
|
+
@zipdownloader("archive", "http://example.com/data.zip")
|
|
39
|
+
@dataset(Base)
|
|
40
|
+
def my_dataset(archive): ...
|
|
41
|
+
"""
|
|
15
42
|
|
|
16
43
|
def __init__(
|
|
17
44
|
self,
|
|
18
|
-
varname,
|
|
45
|
+
varname: str,
|
|
19
46
|
url: str,
|
|
20
|
-
subpath: str = None,
|
|
21
|
-
checker: FileChecker = None,
|
|
22
|
-
files: Set[str] = None,
|
|
47
|
+
subpath: str | None = None,
|
|
48
|
+
checker: FileChecker | None = None,
|
|
49
|
+
files: Set[str] | None = None,
|
|
50
|
+
*,
|
|
51
|
+
transient: bool = False,
|
|
23
52
|
):
|
|
24
|
-
"""Downloads and extract the content of the archive
|
|
53
|
+
"""Downloads and extract the content of the archive.
|
|
25
54
|
|
|
26
55
|
Args:
|
|
27
|
-
varname: The name of the variable when defining the dataset
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
checker:
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
files: A set of files; if present, only extract those
|
|
56
|
+
varname: The name of the variable when defining the dataset.
|
|
57
|
+
url: The archive URL.
|
|
58
|
+
subpath: A subpath in the archive; only files from this
|
|
59
|
+
subpath will be extracted.
|
|
60
|
+
checker: The hash check for the downloaded file.
|
|
61
|
+
files: A set of files; if present, only extract those.
|
|
62
|
+
transient: If True, data can be deleted after dependents
|
|
63
|
+
complete.
|
|
37
64
|
"""
|
|
38
|
-
super().__init__(varname)
|
|
65
|
+
super().__init__(varname=varname, transient=transient)
|
|
39
66
|
self.url = url
|
|
40
67
|
self.subpath = subpath
|
|
41
68
|
self.checker = checker
|
|
@@ -46,20 +73,33 @@ class ArchiveDownloader(Download):
|
|
|
46
73
|
def postinit(self):
|
|
47
74
|
# Define the path
|
|
48
75
|
p = urllib3.util.parse_url(self.url)
|
|
49
|
-
|
|
76
|
+
self._archive_name = self._name(Path(p.path).name)
|
|
50
77
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
78
|
+
@property
|
|
79
|
+
def path(self) -> Path:
|
|
80
|
+
"""Final path to the extracted directory."""
|
|
81
|
+
if not self._post:
|
|
82
|
+
self._post = True
|
|
83
|
+
self.postinit()
|
|
84
|
+
|
|
85
|
+
if len(self.dataset.resources) > 1:
|
|
86
|
+
return self.dataset.datapath / self._archive_name
|
|
87
|
+
return self.dataset.datapath
|
|
88
|
+
|
|
89
|
+
@property
|
|
90
|
+
def transient_path(self) -> Path:
|
|
91
|
+
"""Temporary path for extraction."""
|
|
92
|
+
if not self._post:
|
|
93
|
+
self._post = True
|
|
94
|
+
self.postinit()
|
|
55
95
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
return self.
|
|
96
|
+
if len(self.dataset.resources) > 1:
|
|
97
|
+
return self.dataset.datapath / ".downloads" / self._archive_name
|
|
98
|
+
return self.dataset.datapath / ".downloads" / self.name
|
|
59
99
|
|
|
60
100
|
@property
|
|
61
101
|
def extractall(self):
|
|
62
|
-
"""Returns whether everything can be extracted"""
|
|
102
|
+
"""Returns whether everything can be extracted."""
|
|
63
103
|
return self._files is None and self.subpath is None
|
|
64
104
|
|
|
65
105
|
def filter(self, iterable, getname):
|
|
@@ -67,8 +107,8 @@ class ArchiveDownloader(Download):
|
|
|
67
107
|
|
|
68
108
|
for info in iterable:
|
|
69
109
|
name = getname(info)
|
|
70
|
-
|
|
71
|
-
if self._files and not
|
|
110
|
+
logger.debug("Looking at %s", name)
|
|
111
|
+
if self._files and name not in self._files:
|
|
72
112
|
continue
|
|
73
113
|
|
|
74
114
|
if self.subpath and name.startswith(self.subpath):
|
|
@@ -77,82 +117,78 @@ class ArchiveDownloader(Download):
|
|
|
77
117
|
if not self.subpath:
|
|
78
118
|
yield info, name
|
|
79
119
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
# Already downloaded
|
|
83
|
-
destination = self.definition.datapath
|
|
84
|
-
if destination.is_dir():
|
|
85
|
-
return
|
|
86
|
-
|
|
87
|
-
logging.info("Downloading %s into %s", self.url, destination)
|
|
120
|
+
def _download(self, destination: Path) -> None:
|
|
121
|
+
logger.info("Downloading %s into %s", self.url, destination)
|
|
88
122
|
|
|
89
123
|
destination.parent.mkdir(parents=True, exist_ok=True)
|
|
90
|
-
tmpdestination = destination.with_suffix(".tmp")
|
|
91
|
-
if tmpdestination.exists():
|
|
92
|
-
logging.warn("Removing temporary directory %s", tmpdestination)
|
|
93
|
-
shutil.rmtree(tmpdestination)
|
|
94
124
|
|
|
95
125
|
with self.context.downloadURL(self.url) as file:
|
|
96
126
|
if self.checker:
|
|
97
127
|
self.checker.check(file.path)
|
|
98
|
-
self.unarchive(file,
|
|
99
|
-
|
|
100
|
-
# Look at the content
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
"Moving single file/directory {} into destination {}".format(
|
|
109
|
-
path, destination
|
|
110
|
-
)
|
|
128
|
+
self.unarchive(file, destination)
|
|
129
|
+
|
|
130
|
+
# Look at the content - if single directory, unwrap
|
|
131
|
+
children = list(destination.iterdir())
|
|
132
|
+
if len(children) == 1 and children[0].is_dir():
|
|
133
|
+
single_dir = children[0]
|
|
134
|
+
logger.info(
|
|
135
|
+
"Moving single directory %s into destination %s",
|
|
136
|
+
single_dir,
|
|
137
|
+
destination,
|
|
111
138
|
)
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
shutil.
|
|
139
|
+
# Move contents up one level
|
|
140
|
+
tmp = destination.with_suffix(".unwrap")
|
|
141
|
+
shutil.move(str(single_dir), str(tmp))
|
|
142
|
+
shutil.rmtree(destination)
|
|
143
|
+
shutil.move(str(tmp), str(destination))
|
|
144
|
+
|
|
145
|
+
def unarchive(self, file, destination: Path):
|
|
146
|
+
raise NotImplementedError()
|
|
147
|
+
|
|
148
|
+
def _name(self, name: str) -> str:
|
|
149
|
+
raise NotImplementedError()
|
|
116
150
|
|
|
117
151
|
|
|
118
|
-
class
|
|
119
|
-
"""ZIP Archive handler"""
|
|
152
|
+
class ZipDownloader(ArchiveDownloader):
|
|
153
|
+
"""ZIP Archive handler."""
|
|
120
154
|
|
|
121
155
|
def _name(self, name):
|
|
122
156
|
return re.sub(r"\.zip$", "", name)
|
|
123
157
|
|
|
124
158
|
def unarchive(self, file, destination: Path):
|
|
125
|
-
|
|
159
|
+
logger.info("Unzipping file")
|
|
126
160
|
with zipfile.ZipFile(file.path) as zip:
|
|
127
161
|
if self.extractall:
|
|
128
162
|
zip.extractall(destination)
|
|
129
163
|
else:
|
|
130
164
|
for zip_info, name in self.filter(
|
|
131
|
-
zip.infolist(),
|
|
165
|
+
zip.infolist(),
|
|
166
|
+
lambda zip_info: zip_info.filename,
|
|
132
167
|
):
|
|
133
168
|
if zip_info.is_dir():
|
|
134
169
|
(destination / name).mkdir()
|
|
135
170
|
else:
|
|
136
|
-
|
|
171
|
+
logger.info(
|
|
137
172
|
"File %s (%s) to %s",
|
|
138
173
|
zip_info.filename,
|
|
139
174
|
name,
|
|
140
175
|
destination / name,
|
|
141
176
|
)
|
|
142
|
-
with
|
|
143
|
-
|
|
144
|
-
|
|
177
|
+
with (
|
|
178
|
+
zip.open(zip_info) as fp,
|
|
179
|
+
(destination / name).open("wb") as out,
|
|
180
|
+
):
|
|
145
181
|
shutil.copyfileobj(fp, out)
|
|
146
182
|
|
|
147
183
|
|
|
148
|
-
class
|
|
149
|
-
"""TAR archive handler"""
|
|
184
|
+
class TarDownloader(ArchiveDownloader):
|
|
185
|
+
"""TAR archive handler."""
|
|
150
186
|
|
|
151
187
|
def _name(self, name):
|
|
152
188
|
return re.sub(r"\.tar(\.gz|\.bz\|xz)?$", "", name)
|
|
153
189
|
|
|
154
190
|
def unarchive(self, file: CachedFile, destination: Path):
|
|
155
|
-
|
|
191
|
+
logger.info("Unarchiving file")
|
|
156
192
|
if self.subpath:
|
|
157
193
|
raise NotImplementedError()
|
|
158
194
|
|
|
@@ -164,11 +200,19 @@ class tardownloader(ArchiveDownloader):
|
|
|
164
200
|
if info.isdir():
|
|
165
201
|
(destination / name).mkdir()
|
|
166
202
|
else:
|
|
167
|
-
|
|
203
|
+
logger.info(
|
|
168
204
|
"File %s (%s) to %s",
|
|
169
205
|
info.name,
|
|
170
206
|
name,
|
|
171
207
|
destination / name,
|
|
172
208
|
)
|
|
173
|
-
|
|
209
|
+
logger.info(
|
|
210
|
+
"Extracting into %s",
|
|
211
|
+
destination / name,
|
|
212
|
+
)
|
|
174
213
|
tar.extract(info, destination / name)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# Factory aliases for backward compat and convenient usage
|
|
217
|
+
zipdownloader = ZipDownloader.apply
|
|
218
|
+
tardownloader = TarDownloader.apply
|
datamaestro/download/custom.py
CHANGED
|
@@ -1,21 +1,53 @@
|
|
|
1
|
-
|
|
1
|
+
"""Custom download resources.
|
|
2
|
+
|
|
3
|
+
Provides a Resource subclass that delegates to a user-defined
|
|
4
|
+
download function.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
2
9
|
from pathlib import Path
|
|
10
|
+
from typing import Protocol
|
|
11
|
+
|
|
3
12
|
from datamaestro import Context
|
|
4
13
|
from datamaestro.download import Resource
|
|
5
14
|
|
|
6
15
|
|
|
7
16
|
class Downloader(Protocol):
|
|
8
|
-
def __call__(self, context: Context, root: Path, *, force=False):
|
|
17
|
+
def __call__(self, context: Context, root: Path, *, force: bool = False):
|
|
9
18
|
pass
|
|
10
19
|
|
|
11
20
|
|
|
12
21
|
class custom_download(Resource):
|
|
13
|
-
|
|
14
|
-
|
|
22
|
+
"""A resource that delegates to a user-defined download function.
|
|
23
|
+
|
|
24
|
+
Usage as class attribute (preferred)::
|
|
25
|
+
|
|
26
|
+
@dataset(url="...")
|
|
27
|
+
class MyDataset(Base):
|
|
28
|
+
DATA = custom_download(
|
|
29
|
+
"data", downloader=my_download_fn
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
Usage as decorator (deprecated)::
|
|
33
|
+
|
|
34
|
+
@custom_download("data", downloader=my_download_fn)
|
|
35
|
+
@dataset(Base)
|
|
36
|
+
def my_dataset(data): ...
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
def __init__(
|
|
40
|
+
self,
|
|
41
|
+
varname: str,
|
|
42
|
+
downloader: Downloader,
|
|
43
|
+
*,
|
|
44
|
+
transient: bool = False,
|
|
45
|
+
):
|
|
46
|
+
super().__init__(varname=varname, transient=transient)
|
|
15
47
|
self.downloader = downloader
|
|
16
48
|
|
|
17
49
|
def prepare(self):
|
|
18
|
-
return self.
|
|
50
|
+
return self.dataset.datapath
|
|
19
51
|
|
|
20
52
|
def download(self, force=False):
|
|
21
|
-
self.downloader(self.context, self.
|
|
53
|
+
self.downloader(self.context, self.dataset.datapath, force=force)
|
|
@@ -1,27 +1,55 @@
|
|
|
1
|
+
"""HuggingFace Hub download resources.
|
|
2
|
+
|
|
3
|
+
Provides a ValueResource subclass for loading datasets from
|
|
4
|
+
the HuggingFace Hub.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
1
9
|
import logging
|
|
2
|
-
from typing import Optional
|
|
3
10
|
|
|
4
|
-
from datamaestro.download import
|
|
11
|
+
from datamaestro.download import ValueResource
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class HFDownloader(ValueResource):
|
|
17
|
+
"""Load a dataset from the HuggingFace Hub.
|
|
5
18
|
|
|
19
|
+
Usage as class attribute (preferred)::
|
|
6
20
|
|
|
7
|
-
|
|
8
|
-
|
|
21
|
+
@dataset(url="...")
|
|
22
|
+
class MyDataset(Base):
|
|
23
|
+
DATA = HFDownloader.apply(
|
|
24
|
+
"hf_data", repo_id="user/dataset"
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
Usage as decorator (deprecated)::
|
|
28
|
+
|
|
29
|
+
@hf_download("hf_data", repo_id="user/dataset")
|
|
30
|
+
@dataset(Base)
|
|
31
|
+
def my_dataset(hf_data): ...
|
|
32
|
+
"""
|
|
9
33
|
|
|
10
34
|
def __init__(
|
|
11
35
|
self,
|
|
12
36
|
varname: str,
|
|
13
37
|
repo_id: str,
|
|
14
38
|
*,
|
|
15
|
-
data_files:
|
|
16
|
-
split:
|
|
39
|
+
data_files: str | None = None,
|
|
40
|
+
split: str | None = None,
|
|
41
|
+
transient: bool = False,
|
|
17
42
|
):
|
|
18
|
-
"""
|
|
19
|
-
|
|
43
|
+
"""
|
|
20
44
|
Args:
|
|
21
|
-
varname: Variable name
|
|
22
|
-
repo_id: The HuggingFace repository ID
|
|
45
|
+
varname: Variable name.
|
|
46
|
+
repo_id: The HuggingFace repository ID.
|
|
47
|
+
data_files: Specific data files to load.
|
|
48
|
+
split: Dataset split to load.
|
|
49
|
+
transient: If True, data can be deleted after dependents
|
|
50
|
+
complete.
|
|
23
51
|
"""
|
|
24
|
-
super().__init__(varname)
|
|
52
|
+
super().__init__(varname=varname, transient=transient)
|
|
25
53
|
self.repo_id = repo_id
|
|
26
54
|
self.data_files = data_files
|
|
27
55
|
self.split = split
|
|
@@ -30,11 +58,11 @@ class hf_download(Download):
|
|
|
30
58
|
try:
|
|
31
59
|
from datasets import load_dataset
|
|
32
60
|
except ModuleNotFoundError:
|
|
33
|
-
|
|
34
|
-
|
|
61
|
+
logger.error("the datasets library is not installed:")
|
|
62
|
+
logger.error("pip install datasets")
|
|
35
63
|
raise
|
|
36
64
|
|
|
37
|
-
self.
|
|
65
|
+
self._dataset = load_dataset(self.repo_id, data_files=self.data_files)
|
|
38
66
|
return True
|
|
39
67
|
|
|
40
68
|
def prepare(self):
|
|
@@ -43,3 +71,7 @@ class hf_download(Download):
|
|
|
43
71
|
"data_files": self.data_files,
|
|
44
72
|
"split": self.split,
|
|
45
73
|
}
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# Factory alias for backward compat
|
|
77
|
+
hf_download = HFDownloader.apply
|
datamaestro/download/links.py
CHANGED
|
@@ -1,29 +1,53 @@
|
|
|
1
|
+
"""Link-based resources.
|
|
2
|
+
|
|
3
|
+
Provides resources that create symlinks to other datasets or
|
|
4
|
+
user-specified paths.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
1
9
|
import logging
|
|
2
10
|
import os
|
|
3
|
-
from
|
|
4
|
-
from datamaestro.definitions import AbstractDataset
|
|
11
|
+
from pathlib import Path
|
|
5
12
|
from typing import List
|
|
6
|
-
|
|
13
|
+
|
|
7
14
|
from datamaestro.context import ResolvablePath
|
|
8
|
-
from
|
|
9
|
-
import
|
|
10
|
-
import
|
|
15
|
+
from datamaestro.definitions import AbstractDataset
|
|
16
|
+
from datamaestro.download import Resource
|
|
17
|
+
from datamaestro.utils import deprecated
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class links(Resource):
|
|
23
|
+
"""Link with another dataset path.
|
|
11
24
|
|
|
25
|
+
Usage as class attribute (preferred)::
|
|
12
26
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
27
|
+
@dataset(url="...")
|
|
28
|
+
class MyDataset(Base):
|
|
29
|
+
DATA = links("data", ref1=other_dataset1)
|
|
16
30
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
31
|
+
Usage as decorator (deprecated)::
|
|
32
|
+
|
|
33
|
+
@links("data", ref1=other_dataset1)
|
|
34
|
+
@dataset(Base)
|
|
35
|
+
def my_dataset(data): ...
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
def __init__(
|
|
39
|
+
self,
|
|
40
|
+
varname: str,
|
|
41
|
+
*,
|
|
42
|
+
transient: bool = False,
|
|
43
|
+
**link_targets: List[AbstractDataset],
|
|
44
|
+
):
|
|
45
|
+
super().__init__(varname=varname, transient=transient)
|
|
46
|
+
self.links = link_targets
|
|
23
47
|
|
|
24
48
|
@property
|
|
25
49
|
def path(self):
|
|
26
|
-
return self.
|
|
50
|
+
return self.dataset.datapath
|
|
27
51
|
|
|
28
52
|
def prepare(self):
|
|
29
53
|
return self.path
|
|
@@ -38,24 +62,36 @@ class links(Download):
|
|
|
38
62
|
|
|
39
63
|
if not dest.exists():
|
|
40
64
|
if dest.is_symlink():
|
|
41
|
-
|
|
65
|
+
logger.info("Removing dangling symlink %s", dest)
|
|
42
66
|
dest.unlink()
|
|
43
67
|
os.symlink(path, dest)
|
|
44
68
|
|
|
69
|
+
def has_files(self):
|
|
70
|
+
return False
|
|
71
|
+
|
|
45
72
|
|
|
46
73
|
# Deprecated
|
|
47
74
|
Links = deprecated("Use @links instead of @Links", links)
|
|
48
75
|
|
|
49
76
|
|
|
50
|
-
class linkpath(
|
|
51
|
-
|
|
52
|
-
|
|
77
|
+
class linkpath(Resource):
|
|
78
|
+
"""Link to a path selected from proposals.
|
|
79
|
+
|
|
80
|
+
Usage as class attribute (preferred)::
|
|
53
81
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
82
|
+
@dataset(url="...")
|
|
83
|
+
class MyDataset(Base):
|
|
84
|
+
DATA = linkpath("data", proposals=[...])
|
|
85
|
+
"""
|
|
86
|
+
|
|
87
|
+
def __init__(
|
|
88
|
+
self,
|
|
89
|
+
varname: str,
|
|
90
|
+
proposals,
|
|
91
|
+
*,
|
|
92
|
+
transient: bool = False,
|
|
93
|
+
):
|
|
94
|
+
super().__init__(varname=varname, transient=transient)
|
|
59
95
|
self.proposals = proposals
|
|
60
96
|
|
|
61
97
|
def prepare(self):
|
|
@@ -63,62 +99,83 @@ class linkpath(Download):
|
|
|
63
99
|
|
|
64
100
|
@property
|
|
65
101
|
def path(self):
|
|
66
|
-
return self.
|
|
102
|
+
return self.dataset.datapath / self.name
|
|
67
103
|
|
|
68
|
-
def download(self,
|
|
104
|
+
def download(self, force=False):
|
|
69
105
|
if self.check(self.path):
|
|
70
106
|
return
|
|
71
107
|
|
|
72
108
|
if self.path.is_symlink():
|
|
73
|
-
|
|
109
|
+
logger.warning("Removing dangling symlink %s", self.path)
|
|
74
110
|
self.path.unlink()
|
|
75
111
|
|
|
76
112
|
path = None
|
|
77
113
|
|
|
78
114
|
for searchpath in self.proposals:
|
|
79
|
-
|
|
115
|
+
logger.info("Trying path %s", searchpath)
|
|
80
116
|
try:
|
|
81
117
|
path = ResolvablePath.resolve(self.context, searchpath)
|
|
82
118
|
if self.check(path):
|
|
83
119
|
break
|
|
84
|
-
|
|
120
|
+
logger.info("Path %s not found", path)
|
|
85
121
|
except KeyError:
|
|
86
|
-
|
|
122
|
+
logger.info("Could not expand path %s", searchpath)
|
|
87
123
|
|
|
88
124
|
# Ask the user
|
|
89
125
|
while path is None or not self.check(path):
|
|
90
|
-
path = Path(input("Path to %s: " % self.
|
|
126
|
+
path = Path(input("Path to %s: " % self.name))
|
|
91
127
|
assert path.name
|
|
92
128
|
|
|
93
|
-
|
|
129
|
+
logger.debug("Linking %s to %s", path, self.path)
|
|
94
130
|
self.path.parent.mkdir(exist_ok=True, parents=True)
|
|
95
131
|
os.symlink(path, self.path)
|
|
96
132
|
|
|
133
|
+
def check(self, path):
|
|
134
|
+
raise NotImplementedError()
|
|
135
|
+
|
|
97
136
|
|
|
98
137
|
class linkfolder(linkpath):
|
|
138
|
+
"""Link to a folder.
|
|
139
|
+
|
|
140
|
+
Usage as class attribute::
|
|
141
|
+
|
|
142
|
+
@dataset(url="...")
|
|
143
|
+
class MyDataset(Base):
|
|
144
|
+
DATA = linkfolder("data", proposals=[...])
|
|
145
|
+
"""
|
|
146
|
+
|
|
147
|
+
def __init__(
|
|
148
|
+
self,
|
|
149
|
+
varname: str,
|
|
150
|
+
proposals,
|
|
151
|
+
*,
|
|
152
|
+
transient: bool = False,
|
|
153
|
+
):
|
|
154
|
+
super().__init__(varname, proposals, transient=transient)
|
|
155
|
+
|
|
99
156
|
def check(self, path):
|
|
100
157
|
return path.is_dir()
|
|
101
158
|
|
|
102
|
-
def __init__(self, varname: str, proposals):
|
|
103
|
-
"""Link to a folder
|
|
104
159
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
proposals: List of potential paths
|
|
108
|
-
"""
|
|
109
|
-
super().__init__(varname, proposals)
|
|
160
|
+
class linkfile(linkpath):
|
|
161
|
+
"""Link to a file.
|
|
110
162
|
|
|
163
|
+
Usage as class attribute::
|
|
111
164
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
165
|
+
@dataset(url="...")
|
|
166
|
+
class MyDataset(Base):
|
|
167
|
+
DATA = linkfile("data", proposals=[...])
|
|
168
|
+
"""
|
|
115
169
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
170
|
+
def __init__(
|
|
171
|
+
self,
|
|
172
|
+
varname: str,
|
|
173
|
+
proposals,
|
|
174
|
+
*,
|
|
175
|
+
transient: bool = False,
|
|
176
|
+
):
|
|
177
|
+
super().__init__(varname, proposals, transient=transient)
|
|
121
178
|
|
|
122
179
|
def check(self, path):
|
|
123
|
-
|
|
180
|
+
logger.debug("Checking %s (exists: %s)", path, path.is_file())
|
|
124
181
|
return path.is_file()
|