datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. datamaestro/__init__.py +11 -7
  2. datamaestro/__main__.py +29 -8
  3. datamaestro/annotations/__init__.py +1 -1
  4. datamaestro/annotations/agreement.py +9 -3
  5. datamaestro/commands/site.py +27 -15
  6. datamaestro/context.py +143 -87
  7. datamaestro/data/__init__.py +23 -11
  8. datamaestro/data/csv.py +12 -12
  9. datamaestro/data/huggingface.py +25 -0
  10. datamaestro/data/ml.py +19 -10
  11. datamaestro/data/tensor.py +32 -24
  12. datamaestro/definitions.py +492 -131
  13. datamaestro/download/__init__.py +610 -24
  14. datamaestro/download/archive.py +129 -77
  15. datamaestro/download/custom.py +53 -0
  16. datamaestro/download/huggingface.py +77 -0
  17. datamaestro/download/links.py +106 -50
  18. datamaestro/download/multiple.py +27 -5
  19. datamaestro/download/single.py +114 -51
  20. datamaestro/download/sync.py +0 -1
  21. datamaestro/download/todo.py +9 -4
  22. datamaestro/download/wayback.py +164 -0
  23. datamaestro/record.py +232 -0
  24. datamaestro/registry.py +1 -0
  25. datamaestro/search.py +1 -1
  26. datamaestro/settings.py +3 -1
  27. datamaestro/sphinx.py +224 -0
  28. datamaestro/stream/__init__.py +0 -2
  29. datamaestro/stream/lines.py +10 -7
  30. datamaestro/templates/dataset.py +5 -4
  31. datamaestro/test/__init__.py +3 -1
  32. datamaestro/test/checks.py +1 -5
  33. datamaestro/test/conftest.py +1 -6
  34. datamaestro/test/test_annotations.py +2 -2
  35. datamaestro/test/test_download_handlers.py +3 -4
  36. datamaestro/test/test_record.py +72 -0
  37. datamaestro/test/test_resource.py +1388 -0
  38. datamaestro/utils.py +15 -9
  39. datamaestro/v2.md +301 -0
  40. datamaestro/version.py +4 -0
  41. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
  42. datamaestro-1.7.0.dist-info/RECORD +49 -0
  43. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
  44. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
  45. datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
  46. datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
  47. datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
  48. datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
  49. datamaestro/__pycache__/context.cpython-38.pyc +0 -0
  50. datamaestro/__pycache__/context.cpython-39.pyc +0 -0
  51. datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
  52. datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
  53. datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
  54. datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
  55. datamaestro/__pycache__/search.cpython-38.pyc +0 -0
  56. datamaestro/__pycache__/search.cpython-39.pyc +0 -0
  57. datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
  58. datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
  59. datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
  60. datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
  61. datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
  62. datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
  63. datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
  64. datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
  65. datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
  66. datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
  67. datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
  68. datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
  69. datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
  70. datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
  71. datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
  72. datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
  73. datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
  74. datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
  75. datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
  76. datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
  77. datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
  78. datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
  79. datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
  80. datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
  81. datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
  82. datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
  83. datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
  84. datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
  85. datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
  86. datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
  87. datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
  88. datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
  89. datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
  90. datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
  91. datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
  92. datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
  93. datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
  94. datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
  95. datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
  96. datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
  97. datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
  98. datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
  99. datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
  100. datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
  101. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
  102. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
  103. datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
  104. datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
  105. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
  106. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
  107. datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
  108. datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
  109. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
  110. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
  111. datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
  112. datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
  113. datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
  114. datamaestro-0.8.1.dist-info/RECORD +0 -109
  115. datamaestro-0.8.1.dist-info/top_level.txt +0 -1
  116. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
@@ -1,37 +1,68 @@
1
+ """Archive download resources.
2
+
3
+ Provides FolderResource subclasses for downloading and extracting
4
+ ZIP and TAR archives.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
1
9
  import logging
2
- from pathlib import Path
3
- import zipfile
10
+ import re
4
11
  import shutil
5
- import urllib3
6
12
  import tarfile
7
- import re
8
- import hashlib
9
- from typing import List, Set, Optional
10
- from datamaestro.download import Download, initialized
11
- from datamaestro.utils import CachedFile, HashCheck, FileChecker
13
+ import zipfile
14
+ from pathlib import Path
15
+ from typing import Set
16
+
17
+ import urllib3
18
+
19
+ from datamaestro.download import FolderResource
20
+ from datamaestro.utils import CachedFile, FileChecker
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ class ArchiveDownloader(FolderResource):
26
+ """Abstract base for all archive-related extractors.
27
+
28
+ Usage as class attribute (preferred)::
12
29
 
30
+ @dataset(url="...")
31
+ class MyDataset(Base):
32
+ DATA = ZipDownloader.apply(
33
+ "archive", "http://example.com/data.zip"
34
+ )
35
+
36
+ Usage as decorator (deprecated)::
13
37
 
14
- class ArchiveDownloader(Download):
15
- """Abstract class for all archive related extractors"""
38
+ @zipdownloader("archive", "http://example.com/data.zip")
39
+ @dataset(Base)
40
+ def my_dataset(archive): ...
41
+ """
16
42
 
17
43
  def __init__(
18
44
  self,
19
- varname,
45
+ varname: str,
20
46
  url: str,
21
- subpath: str = None,
22
- checker: FileChecker = None,
23
- files: Set[str] = None,
47
+ subpath: str | None = None,
48
+ checker: FileChecker | None = None,
49
+ files: Set[str] | None = None,
50
+ *,
51
+ transient: bool = False,
24
52
  ):
25
- """Downloads and extract the content of the archive
53
+ """Downloads and extract the content of the archive.
26
54
 
27
55
  Args:
28
- varname: The name of the variable when defining the dataset
29
- url: The archive URL
30
- checker: the hash check for the downloaded file, composed of two
31
- subpath: A subpath in the archive; only files from this subpath will be extracted
32
- files: A set of files; if present, only extract those
56
+ varname: The name of the variable when defining the dataset.
57
+ url: The archive URL.
58
+ subpath: A subpath in the archive; only files from this
59
+ subpath will be extracted.
60
+ checker: The hash check for the downloaded file.
61
+ files: A set of files; if present, only extract those.
62
+ transient: If True, data can be deleted after dependents
63
+ complete.
33
64
  """
34
- super().__init__(varname)
65
+ super().__init__(varname=varname, transient=transient)
35
66
  self.url = url
36
67
  self.subpath = subpath
37
68
  self.checker = checker
@@ -42,20 +73,33 @@ class ArchiveDownloader(Download):
42
73
  def postinit(self):
43
74
  # Define the path
44
75
  p = urllib3.util.parse_url(self.url)
45
- name = self._name(Path(p.path).name)
76
+ self._archive_name = self._name(Path(p.path).name)
46
77
 
47
- if len(self.definition.resources) > 1:
48
- self.path = self.definition.datapath / name
49
- else:
50
- self.path = self.definition.datapath
78
+ @property
79
+ def path(self) -> Path:
80
+ """Final path to the extracted directory."""
81
+ if not self._post:
82
+ self._post = True
83
+ self.postinit()
51
84
 
52
- @initialized
53
- def prepare(self):
54
- return self.path
85
+ if len(self.dataset.resources) > 1:
86
+ return self.dataset.datapath / self._archive_name
87
+ return self.dataset.datapath
88
+
89
+ @property
90
+ def transient_path(self) -> Path:
91
+ """Temporary path for extraction."""
92
+ if not self._post:
93
+ self._post = True
94
+ self.postinit()
95
+
96
+ if len(self.dataset.resources) > 1:
97
+ return self.dataset.datapath / ".downloads" / self._archive_name
98
+ return self.dataset.datapath / ".downloads" / self.name
55
99
 
56
100
  @property
57
101
  def extractall(self):
58
- """Returns whether everything can be extracted"""
102
+ """Returns whether everything can be extracted."""
59
103
  return self._files is None and self.subpath is None
60
104
 
61
105
  def filter(self, iterable, getname):
@@ -63,91 +107,88 @@ class ArchiveDownloader(Download):
63
107
 
64
108
  for info in iterable:
65
109
  name = getname(info)
66
- logging.debug("Looking at %s", name)
67
- if self._files and not (name in self._files):
110
+ logger.debug("Looking at %s", name)
111
+ if self._files and name not in self._files:
68
112
  continue
69
113
 
70
- if self.subpath and path.startswith(self.subpath):
114
+ if self.subpath and name.startswith(self.subpath):
71
115
  yield info, name[L:]
72
116
 
73
- yield info, name
74
-
75
- @initialized
76
- def download(self, force=False):
77
- # Already downloaded
78
- destination = self.definition.datapath
79
- if destination.is_dir():
80
- return
117
+ if not self.subpath:
118
+ yield info, name
81
119
 
82
- logging.info("Downloading %s into %s", self.url, destination)
120
+ def _download(self, destination: Path) -> None:
121
+ logger.info("Downloading %s into %s", self.url, destination)
83
122
 
84
123
  destination.parent.mkdir(parents=True, exist_ok=True)
85
- tmpdestination = destination.with_suffix(".tmp")
86
- if tmpdestination.exists():
87
- logging.warn("Removing temporary directory %s", tmpdestination)
88
- shutil.rmtree(tmpdestination)
89
124
 
90
125
  with self.context.downloadURL(self.url) as file:
91
126
  if self.checker:
92
127
  self.checker.check(file.path)
93
- self.unarchive(file, tmpdestination)
94
-
95
- # Look at the content
96
- for ix, path in enumerate(tmpdestination.iterdir()):
97
- if ix > 1:
98
- break
99
-
100
- # Just one folder: move
101
- if ix == 0 and path.is_dir():
102
- logging.info(
103
- "Moving single file/directory {} into destination {}".format(
104
- path, destination
105
- )
128
+ self.unarchive(file, destination)
129
+
130
+ # Look at the content - if single directory, unwrap
131
+ children = list(destination.iterdir())
132
+ if len(children) == 1 and children[0].is_dir():
133
+ single_dir = children[0]
134
+ logger.info(
135
+ "Moving single directory %s into destination %s",
136
+ single_dir,
137
+ destination,
106
138
  )
107
- shutil.move(str(path), str(destination))
108
- shutil.rmtree(tmpdestination)
109
- else:
110
- shutil.move(tmpdestination, destination)
139
+ # Move contents up one level
140
+ tmp = destination.with_suffix(".unwrap")
141
+ shutil.move(str(single_dir), str(tmp))
142
+ shutil.rmtree(destination)
143
+ shutil.move(str(tmp), str(destination))
144
+
145
+ def unarchive(self, file, destination: Path):
146
+ raise NotImplementedError()
111
147
 
148
+ def _name(self, name: str) -> str:
149
+ raise NotImplementedError()
112
150
 
113
- class zipdownloader(ArchiveDownloader):
114
- """ZIP Archive handler"""
151
+
152
+ class ZipDownloader(ArchiveDownloader):
153
+ """ZIP Archive handler."""
115
154
 
116
155
  def _name(self, name):
117
156
  return re.sub(r"\.zip$", "", name)
118
157
 
119
158
  def unarchive(self, file, destination: Path):
120
- logging.info("Unzipping file")
159
+ logger.info("Unzipping file")
121
160
  with zipfile.ZipFile(file.path) as zip:
122
161
  if self.extractall:
123
162
  zip.extractall(destination)
124
163
  else:
125
164
  for zip_info, name in self.filter(
126
- zip.infolist(), lambda zip_info: zip_info.filename
165
+ zip.infolist(),
166
+ lambda zip_info: zip_info.filename,
127
167
  ):
128
168
  if zip_info.is_dir():
129
169
  (destination / name).mkdir()
130
170
  else:
131
- logging.info(
171
+ logger.info(
132
172
  "File %s (%s) to %s",
133
173
  zip_info.filename,
134
174
  name,
135
175
  destination / name,
136
176
  )
137
- with zip.open(zip_info) as fp, (destination / name).open(
138
- "wb"
139
- ) as out:
177
+ with (
178
+ zip.open(zip_info) as fp,
179
+ (destination / name).open("wb") as out,
180
+ ):
140
181
  shutil.copyfileobj(fp, out)
141
182
 
142
183
 
143
- class tardownloader(ArchiveDownloader):
144
- """TAR archive handler"""
184
+ class TarDownloader(ArchiveDownloader):
185
+ """TAR archive handler."""
145
186
 
146
187
  def _name(self, name):
147
188
  return re.sub(r"\.tar(\.gz|\.bz\|xz)?$", "", name)
148
189
 
149
190
  def unarchive(self, file: CachedFile, destination: Path):
150
- logging.info("Unarchiving file")
191
+ logger.info("Unarchiving file")
151
192
  if self.subpath:
152
193
  raise NotImplementedError()
153
194
 
@@ -159,8 +200,19 @@ class tardownloader(ArchiveDownloader):
159
200
  if info.isdir():
160
201
  (destination / name).mkdir()
161
202
  else:
162
- logging.info(
163
- "File %s (%s) to %s", info.name, name, destination / name,
203
+ logger.info(
204
+ "File %s (%s) to %s",
205
+ info.name,
206
+ name,
207
+ destination / name,
208
+ )
209
+ logger.info(
210
+ "Extracting into %s",
211
+ destination / name,
164
212
  )
165
- logging.info("Extracting into %s", destination / name)
166
213
  tar.extract(info, destination / name)
214
+
215
+
216
+ # Factory aliases for backward compat and convenient usage
217
+ zipdownloader = ZipDownloader.apply
218
+ tardownloader = TarDownloader.apply
@@ -0,0 +1,53 @@
1
+ """Custom download resources.
2
+
3
+ Provides a Resource subclass that delegates to a user-defined
4
+ download function.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+ from typing import Protocol
11
+
12
+ from datamaestro import Context
13
+ from datamaestro.download import Resource
14
+
15
+
16
+ class Downloader(Protocol):
17
+ def __call__(self, context: Context, root: Path, *, force: bool = False):
18
+ pass
19
+
20
+
21
+ class custom_download(Resource):
22
+ """A resource that delegates to a user-defined download function.
23
+
24
+ Usage as class attribute (preferred)::
25
+
26
+ @dataset(url="...")
27
+ class MyDataset(Base):
28
+ DATA = custom_download(
29
+ "data", downloader=my_download_fn
30
+ )
31
+
32
+ Usage as decorator (deprecated)::
33
+
34
+ @custom_download("data", downloader=my_download_fn)
35
+ @dataset(Base)
36
+ def my_dataset(data): ...
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ varname: str,
42
+ downloader: Downloader,
43
+ *,
44
+ transient: bool = False,
45
+ ):
46
+ super().__init__(varname=varname, transient=transient)
47
+ self.downloader = downloader
48
+
49
+ def prepare(self):
50
+ return self.dataset.datapath
51
+
52
+ def download(self, force=False):
53
+ self.downloader(self.context, self.dataset.datapath, force=force)
@@ -0,0 +1,77 @@
1
+ """HuggingFace Hub download resources.
2
+
3
+ Provides a ValueResource subclass for loading datasets from
4
+ the HuggingFace Hub.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+
11
+ from datamaestro.download import ValueResource
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class HFDownloader(ValueResource):
17
+ """Load a dataset from the HuggingFace Hub.
18
+
19
+ Usage as class attribute (preferred)::
20
+
21
+ @dataset(url="...")
22
+ class MyDataset(Base):
23
+ DATA = HFDownloader.apply(
24
+ "hf_data", repo_id="user/dataset"
25
+ )
26
+
27
+ Usage as decorator (deprecated)::
28
+
29
+ @hf_download("hf_data", repo_id="user/dataset")
30
+ @dataset(Base)
31
+ def my_dataset(hf_data): ...
32
+ """
33
+
34
+ def __init__(
35
+ self,
36
+ varname: str,
37
+ repo_id: str,
38
+ *,
39
+ data_files: str | None = None,
40
+ split: str | None = None,
41
+ transient: bool = False,
42
+ ):
43
+ """
44
+ Args:
45
+ varname: Variable name.
46
+ repo_id: The HuggingFace repository ID.
47
+ data_files: Specific data files to load.
48
+ split: Dataset split to load.
49
+ transient: If True, data can be deleted after dependents
50
+ complete.
51
+ """
52
+ super().__init__(varname=varname, transient=transient)
53
+ self.repo_id = repo_id
54
+ self.data_files = data_files
55
+ self.split = split
56
+
57
+ def download(self, force=False):
58
+ try:
59
+ from datasets import load_dataset
60
+ except ModuleNotFoundError:
61
+ logger.error("the datasets library is not installed:")
62
+ logger.error("pip install datasets")
63
+ raise
64
+
65
+ self._dataset = load_dataset(self.repo_id, data_files=self.data_files)
66
+ return True
67
+
68
+ def prepare(self):
69
+ return {
70
+ "repo_id": self.repo_id,
71
+ "data_files": self.data_files,
72
+ "split": self.split,
73
+ }
74
+
75
+
76
+ # Factory alias for backward compat
77
+ hf_download = HFDownloader.apply
@@ -1,30 +1,53 @@
1
+ """Link-based resources.
2
+
3
+ Provides resources that create symlinks to other datasets or
4
+ user-specified paths.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
1
9
  import logging
2
10
  import os
3
- from datamaestro.download import Download
4
- from datamaestro.utils import deprecated
5
- from datamaestro.definitions import AbstractDataset
11
+ from pathlib import Path
6
12
  from typing import List
7
- from datamaestro.download import Download
13
+
8
14
  from datamaestro.context import ResolvablePath
9
- from pathlib import Path
10
- import os
11
- import logging
15
+ from datamaestro.definitions import AbstractDataset
16
+ from datamaestro.download import Resource
17
+ from datamaestro.utils import deprecated
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ class links(Resource):
23
+ """Link with another dataset path.
12
24
 
25
+ Usage as class attribute (preferred)::
13
26
 
14
- class links(Download):
15
- def __init__(self, varname: str, **links: List[AbstractDataset]):
16
- """Link with another dataset path
27
+ @dataset(url="...")
28
+ class MyDataset(Base):
29
+ DATA = links("data", ref1=other_dataset1)
17
30
 
18
- Args:
19
- varname: The name of the variable when defining the dataset
20
- links: A list of
21
- """
22
- super().__init__(varname)
23
- self.links = links
31
+ Usage as decorator (deprecated)::
32
+
33
+ @links("data", ref1=other_dataset1)
34
+ @dataset(Base)
35
+ def my_dataset(data): ...
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ varname: str,
41
+ *,
42
+ transient: bool = False,
43
+ **link_targets: List[AbstractDataset],
44
+ ):
45
+ super().__init__(varname=varname, transient=transient)
46
+ self.links = link_targets
24
47
 
25
48
  @property
26
49
  def path(self):
27
- return self.definition.datapath
50
+ return self.dataset.datapath
28
51
 
29
52
  def prepare(self):
30
53
  return self.path
@@ -39,24 +62,36 @@ class links(Download):
39
62
 
40
63
  if not dest.exists():
41
64
  if dest.is_symlink():
42
- logging.info("Removing dandling symlink %s", dest)
65
+ logger.info("Removing dangling symlink %s", dest)
43
66
  dest.unlink()
44
67
  os.symlink(path, dest)
45
68
 
69
+ def has_files(self):
70
+ return False
71
+
46
72
 
47
73
  # Deprecated
48
74
  Links = deprecated("Use @links instead of @Links", links)
49
75
 
50
76
 
51
- class linkpath(Download):
52
- def __init__(self, varname: str, proposals):
53
- """Link to a folder
77
+ class linkpath(Resource):
78
+ """Link to a path selected from proposals.
79
+
80
+ Usage as class attribute (preferred)::
54
81
 
55
- Args:
56
- varname: Name of the variable
57
- proposals: List of potential paths
58
- """
59
- super().__init__(varname)
82
+ @dataset(url="...")
83
+ class MyDataset(Base):
84
+ DATA = linkpath("data", proposals=[...])
85
+ """
86
+
87
+ def __init__(
88
+ self,
89
+ varname: str,
90
+ proposals,
91
+ *,
92
+ transient: bool = False,
93
+ ):
94
+ super().__init__(varname=varname, transient=transient)
60
95
  self.proposals = proposals
61
96
 
62
97
  def prepare(self):
@@ -64,62 +99,83 @@ class linkpath(Download):
64
99
 
65
100
  @property
66
101
  def path(self):
67
- return self.definition.datapath / self.varname
102
+ return self.dataset.datapath / self.name
68
103
 
69
- def download(self, destination):
104
+ def download(self, force=False):
70
105
  if self.check(self.path):
71
106
  return
72
107
 
73
108
  if self.path.is_symlink():
74
- logging.warning("Removing dandling symlink %s", self.path)
109
+ logger.warning("Removing dangling symlink %s", self.path)
75
110
  self.path.unlink()
76
111
 
77
112
  path = None
78
113
 
79
114
  for searchpath in self.proposals:
80
- logging.info("Trying path %s", searchpath)
115
+ logger.info("Trying path %s", searchpath)
81
116
  try:
82
117
  path = ResolvablePath.resolve(self.context, searchpath)
83
118
  if self.check(path):
84
119
  break
85
- logging.info("Path %s not found", path)
120
+ logger.info("Path %s not found", path)
86
121
  except KeyError:
87
- logging.info("Could not expand path %s", searchpath)
122
+ logger.info("Could not expand path %s", searchpath)
88
123
 
89
124
  # Ask the user
90
125
  while path is None or not self.check(path):
91
- path = Path(input("Path to %s: " % self.varname))
126
+ path = Path(input("Path to %s: " % self.name))
92
127
  assert path.name
93
128
 
94
- logging.debug("Linking %s to %s", path, self.path)
129
+ logger.debug("Linking %s to %s", path, self.path)
95
130
  self.path.parent.mkdir(exist_ok=True, parents=True)
96
131
  os.symlink(path, self.path)
97
132
 
133
+ def check(self, path):
134
+ raise NotImplementedError()
135
+
98
136
 
99
137
  class linkfolder(linkpath):
138
+ """Link to a folder.
139
+
140
+ Usage as class attribute::
141
+
142
+ @dataset(url="...")
143
+ class MyDataset(Base):
144
+ DATA = linkfolder("data", proposals=[...])
145
+ """
146
+
147
+ def __init__(
148
+ self,
149
+ varname: str,
150
+ proposals,
151
+ *,
152
+ transient: bool = False,
153
+ ):
154
+ super().__init__(varname, proposals, transient=transient)
155
+
100
156
  def check(self, path):
101
157
  return path.is_dir()
102
158
 
103
- def __init__(self, varname: str, proposals):
104
- """Link to a folder
105
159
 
106
- Args:
107
- varname: Name of the variable
108
- proposals: List of potential paths
109
- """
110
- super().__init__(varname, proposals)
160
+ class linkfile(linkpath):
161
+ """Link to a file.
111
162
 
163
+ Usage as class attribute::
112
164
 
113
- class linkfile(linkpath):
114
- def __init__(self, varname: str, proposals):
115
- """Link to a file
165
+ @dataset(url="...")
166
+ class MyDataset(Base):
167
+ DATA = linkfile("data", proposals=[...])
168
+ """
116
169
 
117
- Args:
118
- varname: Name of the variable
119
- proposals: List of potential paths
120
- """
121
- super().__init__(varname, proposals)
170
+ def __init__(
171
+ self,
172
+ varname: str,
173
+ proposals,
174
+ *,
175
+ transient: bool = False,
176
+ ):
177
+ super().__init__(varname, proposals, transient=transient)
122
178
 
123
179
  def check(self, path):
124
- print("Checking", path, path.is_file())
180
+ logger.debug("Checking %s (exists: %s)", path, path.is_file())
125
181
  return path.is_file()