datamaestro 1.6.2__py3-none-any.whl → 1.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,41 +1,68 @@
1
+ """Archive download resources.
2
+
3
+ Provides FolderResource subclasses for downloading and extracting
4
+ ZIP and TAR archives.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
1
9
  import logging
2
- from pathlib import Path
3
- import zipfile
10
+ import re
4
11
  import shutil
5
- import urllib3
6
12
  import tarfile
7
- import re
13
+ import zipfile
14
+ from pathlib import Path
8
15
  from typing import Set
9
- from datamaestro.download import Download, initialized
16
+
17
+ import urllib3
18
+
19
+ from datamaestro.download import FolderResource
10
20
  from datamaestro.utils import CachedFile, FileChecker
11
21
 
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ class ArchiveDownloader(FolderResource):
26
+ """Abstract base for all archive-related extractors.
27
+
28
+ Usage as class attribute (preferred)::
29
+
30
+ @dataset(url="...")
31
+ class MyDataset(Base):
32
+ DATA = ZipDownloader.apply(
33
+ "archive", "http://example.com/data.zip"
34
+ )
12
35
 
13
- class ArchiveDownloader(Download):
14
- """Abstract class for all archive related extractors"""
36
+ Usage as decorator (deprecated)::
37
+
38
+ @zipdownloader("archive", "http://example.com/data.zip")
39
+ @dataset(Base)
40
+ def my_dataset(archive): ...
41
+ """
15
42
 
16
43
  def __init__(
17
44
  self,
18
- varname,
45
+ varname: str,
19
46
  url: str,
20
- subpath: str = None,
21
- checker: FileChecker = None,
22
- files: Set[str] = None,
47
+ subpath: str | None = None,
48
+ checker: FileChecker | None = None,
49
+ files: Set[str] | None = None,
50
+ *,
51
+ transient: bool = False,
23
52
  ):
24
- """Downloads and extract the content of the archive
53
+ """Downloads and extract the content of the archive.
25
54
 
26
55
  Args:
27
- varname: The name of the variable when defining the dataset
28
-
29
- url: The archive URL
30
-
31
- checker: the hash check for the downloaded file, composed of two
32
-
33
- subpath: A subpath in the archive; only files from this subpath will
34
- be extracted
35
-
36
- files: A set of files; if present, only extract those
56
+ varname: The name of the variable when defining the dataset.
57
+ url: The archive URL.
58
+ subpath: A subpath in the archive; only files from this
59
+ subpath will be extracted.
60
+ checker: The hash check for the downloaded file.
61
+ files: A set of files; if present, only extract those.
62
+ transient: If True, data can be deleted after dependents
63
+ complete.
37
64
  """
38
- super().__init__(varname)
65
+ super().__init__(varname=varname, transient=transient)
39
66
  self.url = url
40
67
  self.subpath = subpath
41
68
  self.checker = checker
@@ -46,20 +73,33 @@ class ArchiveDownloader(Download):
46
73
  def postinit(self):
47
74
  # Define the path
48
75
  p = urllib3.util.parse_url(self.url)
49
- name = self._name(Path(p.path).name)
76
+ self._archive_name = self._name(Path(p.path).name)
50
77
 
51
- if len(self.definition.resources) > 1:
52
- self.path = self.definition.datapath / name
53
- else:
54
- self.path = self.definition.datapath
78
+ @property
79
+ def path(self) -> Path:
80
+ """Final path to the extracted directory."""
81
+ if not self._post:
82
+ self._post = True
83
+ self.postinit()
84
+
85
+ if len(self.dataset.resources) > 1:
86
+ return self.dataset.datapath / self._archive_name
87
+ return self.dataset.datapath
88
+
89
+ @property
90
+ def transient_path(self) -> Path:
91
+ """Temporary path for extraction."""
92
+ if not self._post:
93
+ self._post = True
94
+ self.postinit()
55
95
 
56
- @initialized
57
- def prepare(self):
58
- return self.path
96
+ if len(self.dataset.resources) > 1:
97
+ return self.dataset.datapath / ".downloads" / self._archive_name
98
+ return self.dataset.datapath / ".downloads" / self.name
59
99
 
60
100
  @property
61
101
  def extractall(self):
62
- """Returns whether everything can be extracted"""
102
+ """Returns whether everything can be extracted."""
63
103
  return self._files is None and self.subpath is None
64
104
 
65
105
  def filter(self, iterable, getname):
@@ -67,8 +107,8 @@ class ArchiveDownloader(Download):
67
107
 
68
108
  for info in iterable:
69
109
  name = getname(info)
70
- logging.debug("Looking at %s", name)
71
- if self._files and not (name in self._files):
110
+ logger.debug("Looking at %s", name)
111
+ if self._files and name not in self._files:
72
112
  continue
73
113
 
74
114
  if self.subpath and name.startswith(self.subpath):
@@ -77,82 +117,78 @@ class ArchiveDownloader(Download):
77
117
  if not self.subpath:
78
118
  yield info, name
79
119
 
80
- @initialized
81
- def download(self, force=False):
82
- # Already downloaded
83
- destination = self.definition.datapath
84
- if destination.is_dir():
85
- return
86
-
87
- logging.info("Downloading %s into %s", self.url, destination)
120
+ def _download(self, destination: Path) -> None:
121
+ logger.info("Downloading %s into %s", self.url, destination)
88
122
 
89
123
  destination.parent.mkdir(parents=True, exist_ok=True)
90
- tmpdestination = destination.with_suffix(".tmp")
91
- if tmpdestination.exists():
92
- logging.warn("Removing temporary directory %s", tmpdestination)
93
- shutil.rmtree(tmpdestination)
94
124
 
95
125
  with self.context.downloadURL(self.url) as file:
96
126
  if self.checker:
97
127
  self.checker.check(file.path)
98
- self.unarchive(file, tmpdestination)
99
-
100
- # Look at the content
101
- for ix, path in enumerate(tmpdestination.iterdir()):
102
- if ix > 1:
103
- break
104
-
105
- # Just one folder: move
106
- if ix == 0 and path.is_dir():
107
- logging.info(
108
- "Moving single file/directory {} into destination {}".format(
109
- path, destination
110
- )
128
+ self.unarchive(file, destination)
129
+
130
+ # Look at the content - if single directory, unwrap
131
+ children = list(destination.iterdir())
132
+ if len(children) == 1 and children[0].is_dir():
133
+ single_dir = children[0]
134
+ logger.info(
135
+ "Moving single directory %s into destination %s",
136
+ single_dir,
137
+ destination,
111
138
  )
112
- shutil.move(str(path), str(destination))
113
- shutil.rmtree(tmpdestination)
114
- else:
115
- shutil.move(tmpdestination, destination)
139
+ # Move contents up one level
140
+ tmp = destination.with_suffix(".unwrap")
141
+ shutil.move(str(single_dir), str(tmp))
142
+ shutil.rmtree(destination)
143
+ shutil.move(str(tmp), str(destination))
144
+
145
+ def unarchive(self, file, destination: Path):
146
+ raise NotImplementedError()
147
+
148
+ def _name(self, name: str) -> str:
149
+ raise NotImplementedError()
116
150
 
117
151
 
118
- class zipdownloader(ArchiveDownloader):
119
- """ZIP Archive handler"""
152
+ class ZipDownloader(ArchiveDownloader):
153
+ """ZIP Archive handler."""
120
154
 
121
155
  def _name(self, name):
122
156
  return re.sub(r"\.zip$", "", name)
123
157
 
124
158
  def unarchive(self, file, destination: Path):
125
- logging.info("Unzipping file")
159
+ logger.info("Unzipping file")
126
160
  with zipfile.ZipFile(file.path) as zip:
127
161
  if self.extractall:
128
162
  zip.extractall(destination)
129
163
  else:
130
164
  for zip_info, name in self.filter(
131
- zip.infolist(), lambda zip_info: zip_info.filename
165
+ zip.infolist(),
166
+ lambda zip_info: zip_info.filename,
132
167
  ):
133
168
  if zip_info.is_dir():
134
169
  (destination / name).mkdir()
135
170
  else:
136
- logging.info(
171
+ logger.info(
137
172
  "File %s (%s) to %s",
138
173
  zip_info.filename,
139
174
  name,
140
175
  destination / name,
141
176
  )
142
- with zip.open(zip_info) as fp, (destination / name).open(
143
- "wb"
144
- ) as out:
177
+ with (
178
+ zip.open(zip_info) as fp,
179
+ (destination / name).open("wb") as out,
180
+ ):
145
181
  shutil.copyfileobj(fp, out)
146
182
 
147
183
 
148
- class tardownloader(ArchiveDownloader):
149
- """TAR archive handler"""
184
+ class TarDownloader(ArchiveDownloader):
185
+ """TAR archive handler."""
150
186
 
151
187
  def _name(self, name):
152
188
  return re.sub(r"\.tar(\.gz|\.bz\|xz)?$", "", name)
153
189
 
154
190
  def unarchive(self, file: CachedFile, destination: Path):
155
- logging.info("Unarchiving file")
191
+ logger.info("Unarchiving file")
156
192
  if self.subpath:
157
193
  raise NotImplementedError()
158
194
 
@@ -164,11 +200,19 @@ class tardownloader(ArchiveDownloader):
164
200
  if info.isdir():
165
201
  (destination / name).mkdir()
166
202
  else:
167
- logging.info(
203
+ logger.info(
168
204
  "File %s (%s) to %s",
169
205
  info.name,
170
206
  name,
171
207
  destination / name,
172
208
  )
173
- logging.info("Extracting into %s", destination / name)
209
+ logger.info(
210
+ "Extracting into %s",
211
+ destination / name,
212
+ )
174
213
  tar.extract(info, destination / name)
214
+
215
+
216
+ # Factory aliases for backward compat and convenient usage
217
+ zipdownloader = ZipDownloader.apply
218
+ tardownloader = TarDownloader.apply
@@ -1,21 +1,53 @@
1
- from typing import Protocol
1
+ """Custom download resources.
2
+
3
+ Provides a Resource subclass that delegates to a user-defined
4
+ download function.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
2
9
  from pathlib import Path
10
+ from typing import Protocol
11
+
3
12
  from datamaestro import Context
4
13
  from datamaestro.download import Resource
5
14
 
6
15
 
7
16
  class Downloader(Protocol):
8
- def __call__(self, context: Context, root: Path, *, force=False):
17
+ def __call__(self, context: Context, root: Path, *, force: bool = False):
9
18
  pass
10
19
 
11
20
 
12
21
  class custom_download(Resource):
13
- def __init__(self, varname: str, downloader: Downloader):
14
- super().__init__(varname)
22
+ """A resource that delegates to a user-defined download function.
23
+
24
+ Usage as class attribute (preferred)::
25
+
26
+ @dataset(url="...")
27
+ class MyDataset(Base):
28
+ DATA = custom_download(
29
+ "data", downloader=my_download_fn
30
+ )
31
+
32
+ Usage as decorator (deprecated)::
33
+
34
+ @custom_download("data", downloader=my_download_fn)
35
+ @dataset(Base)
36
+ def my_dataset(data): ...
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ varname: str,
42
+ downloader: Downloader,
43
+ *,
44
+ transient: bool = False,
45
+ ):
46
+ super().__init__(varname=varname, transient=transient)
15
47
  self.downloader = downloader
16
48
 
17
49
  def prepare(self):
18
- return self.definition.datapath
50
+ return self.dataset.datapath
19
51
 
20
52
  def download(self, force=False):
21
- self.downloader(self.context, self.definition.datapath, force=force)
53
+ self.downloader(self.context, self.dataset.datapath, force=force)
@@ -1,27 +1,55 @@
1
+ """HuggingFace Hub download resources.
2
+
3
+ Provides a ValueResource subclass for loading datasets from
4
+ the HuggingFace Hub.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
1
9
  import logging
2
- from typing import Optional
3
10
 
4
- from datamaestro.download import Download
11
+ from datamaestro.download import ValueResource
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class HFDownloader(ValueResource):
17
+ """Load a dataset from the HuggingFace Hub.
5
18
 
19
+ Usage as class attribute (preferred)::
6
20
 
7
- class hf_download(Download):
8
- """Use Hugging Face to download a file"""
21
+ @dataset(url="...")
22
+ class MyDataset(Base):
23
+ DATA = HFDownloader.apply(
24
+ "hf_data", repo_id="user/dataset"
25
+ )
26
+
27
+ Usage as decorator (deprecated)::
28
+
29
+ @hf_download("hf_data", repo_id="user/dataset")
30
+ @dataset(Base)
31
+ def my_dataset(hf_data): ...
32
+ """
9
33
 
10
34
  def __init__(
11
35
  self,
12
36
  varname: str,
13
37
  repo_id: str,
14
38
  *,
15
- data_files: Optional[str] = None,
16
- split: Optional[str] = None
39
+ data_files: str | None = None,
40
+ split: str | None = None,
41
+ transient: bool = False,
17
42
  ):
18
- """Use
19
-
43
+ """
20
44
  Args:
21
- varname: Variable name
22
- repo_id: The HuggingFace repository ID
45
+ varname: Variable name.
46
+ repo_id: The HuggingFace repository ID.
47
+ data_files: Specific data files to load.
48
+ split: Dataset split to load.
49
+ transient: If True, data can be deleted after dependents
50
+ complete.
23
51
  """
24
- super().__init__(varname)
52
+ super().__init__(varname=varname, transient=transient)
25
53
  self.repo_id = repo_id
26
54
  self.data_files = data_files
27
55
  self.split = split
@@ -30,11 +58,11 @@ class hf_download(Download):
30
58
  try:
31
59
  from datasets import load_dataset
32
60
  except ModuleNotFoundError:
33
- logging.error("the datasets library is not installed:")
34
- logging.error("pip install datasets")
61
+ logger.error("the datasets library is not installed:")
62
+ logger.error("pip install datasets")
35
63
  raise
36
64
 
37
- self.dataset = load_dataset(self.repo_id, data_files=self.data_files)
65
+ self._dataset = load_dataset(self.repo_id, data_files=self.data_files)
38
66
  return True
39
67
 
40
68
  def prepare(self):
@@ -43,3 +71,7 @@ class hf_download(Download):
43
71
  "data_files": self.data_files,
44
72
  "split": self.split,
45
73
  }
74
+
75
+
76
+ # Factory alias for backward compat
77
+ hf_download = HFDownloader.apply
@@ -1,29 +1,53 @@
1
+ """Link-based resources.
2
+
3
+ Provides resources that create symlinks to other datasets or
4
+ user-specified paths.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
1
9
  import logging
2
10
  import os
3
- from datamaestro.utils import deprecated
4
- from datamaestro.definitions import AbstractDataset
11
+ from pathlib import Path
5
12
  from typing import List
6
- from datamaestro.download import Download
13
+
7
14
  from datamaestro.context import ResolvablePath
8
- from pathlib import Path
9
- import os
10
- import logging
15
+ from datamaestro.definitions import AbstractDataset
16
+ from datamaestro.download import Resource
17
+ from datamaestro.utils import deprecated
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ class links(Resource):
23
+ """Link with another dataset path.
11
24
 
25
+ Usage as class attribute (preferred)::
12
26
 
13
- class links(Download):
14
- def __init__(self, varname: str, **links: List[AbstractDataset]):
15
- """Link with another dataset path
27
+ @dataset(url="...")
28
+ class MyDataset(Base):
29
+ DATA = links("data", ref1=other_dataset1)
16
30
 
17
- Args:
18
- varname: The name of the variable when defining the dataset
19
- links: A list of
20
- """
21
- super().__init__(varname)
22
- self.links = links
31
+ Usage as decorator (deprecated)::
32
+
33
+ @links("data", ref1=other_dataset1)
34
+ @dataset(Base)
35
+ def my_dataset(data): ...
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ varname: str,
41
+ *,
42
+ transient: bool = False,
43
+ **link_targets: List[AbstractDataset],
44
+ ):
45
+ super().__init__(varname=varname, transient=transient)
46
+ self.links = link_targets
23
47
 
24
48
  @property
25
49
  def path(self):
26
- return self.definition.datapath
50
+ return self.dataset.datapath
27
51
 
28
52
  def prepare(self):
29
53
  return self.path
@@ -31,31 +55,51 @@ class links(Download):
31
55
  def download(self, force=False):
32
56
  self.path.mkdir(exist_ok=True, parents=True)
33
57
  for key, value in self.links.items():
34
- value.download(force)
58
+ # Resolve class-based datasets
59
+ if hasattr(value, "__dataset__"):
60
+ wrapper = value.__dataset__
61
+ wrapper.download(force)
62
+ path = wrapper()
63
+ elif hasattr(value, "download"):
64
+ value.download(force)
65
+ path = value()
66
+ else:
67
+ path = value # Already a path
35
68
 
36
- path = value()
37
69
  dest = self.path / key
38
70
 
39
71
  if not dest.exists():
40
72
  if dest.is_symlink():
41
- logging.info("Removing dandling symlink %s", dest)
73
+ logger.info("Removing dangling symlink %s", dest)
42
74
  dest.unlink()
43
75
  os.symlink(path, dest)
44
76
 
77
+ def has_files(self):
78
+ return False
79
+
45
80
 
46
81
  # Deprecated
47
82
  Links = deprecated("Use @links instead of @Links", links)
48
83
 
49
84
 
50
- class linkpath(Download):
51
- def __init__(self, varname: str, proposals):
52
- """Link to a folder
85
+ class linkpath(Resource):
86
+ """Link to a path selected from proposals.
87
+
88
+ Usage as class attribute (preferred)::
53
89
 
54
- Args:
55
- varname: Name of the variable
56
- proposals: List of potential paths
57
- """
58
- super().__init__(varname)
90
+ @dataset(url="...")
91
+ class MyDataset(Base):
92
+ DATA = linkpath("data", proposals=[...])
93
+ """
94
+
95
+ def __init__(
96
+ self,
97
+ varname: str,
98
+ proposals,
99
+ *,
100
+ transient: bool = False,
101
+ ):
102
+ super().__init__(varname=varname, transient=transient)
59
103
  self.proposals = proposals
60
104
 
61
105
  def prepare(self):
@@ -63,62 +107,83 @@ class linkpath(Download):
63
107
 
64
108
  @property
65
109
  def path(self):
66
- return self.definition.datapath / self.varname
110
+ return self.dataset.datapath / self.name
67
111
 
68
- def download(self, destination):
112
+ def download(self, force=False):
69
113
  if self.check(self.path):
70
114
  return
71
115
 
72
116
  if self.path.is_symlink():
73
- logging.warning("Removing dandling symlink %s", self.path)
117
+ logger.warning("Removing dangling symlink %s", self.path)
74
118
  self.path.unlink()
75
119
 
76
120
  path = None
77
121
 
78
122
  for searchpath in self.proposals:
79
- logging.info("Trying path %s", searchpath)
123
+ logger.info("Trying path %s", searchpath)
80
124
  try:
81
125
  path = ResolvablePath.resolve(self.context, searchpath)
82
126
  if self.check(path):
83
127
  break
84
- logging.info("Path %s not found", path)
128
+ logger.info("Path %s not found", path)
85
129
  except KeyError:
86
- logging.info("Could not expand path %s", searchpath)
130
+ logger.info("Could not expand path %s", searchpath)
87
131
 
88
132
  # Ask the user
89
133
  while path is None or not self.check(path):
90
- path = Path(input("Path to %s: " % self.varname))
134
+ path = Path(input("Path to %s: " % self.name))
91
135
  assert path.name
92
136
 
93
- logging.debug("Linking %s to %s", path, self.path)
137
+ logger.debug("Linking %s to %s", path, self.path)
94
138
  self.path.parent.mkdir(exist_ok=True, parents=True)
95
139
  os.symlink(path, self.path)
96
140
 
141
+ def check(self, path):
142
+ raise NotImplementedError()
143
+
97
144
 
98
145
  class linkfolder(linkpath):
146
+ """Link to a folder.
147
+
148
+ Usage as class attribute::
149
+
150
+ @dataset(url="...")
151
+ class MyDataset(Base):
152
+ DATA = linkfolder("data", proposals=[...])
153
+ """
154
+
155
+ def __init__(
156
+ self,
157
+ varname: str,
158
+ proposals,
159
+ *,
160
+ transient: bool = False,
161
+ ):
162
+ super().__init__(varname, proposals, transient=transient)
163
+
99
164
  def check(self, path):
100
165
  return path.is_dir()
101
166
 
102
- def __init__(self, varname: str, proposals):
103
- """Link to a folder
104
167
 
105
- Args:
106
- varname: Name of the variable
107
- proposals: List of potential paths
108
- """
109
- super().__init__(varname, proposals)
168
+ class linkfile(linkpath):
169
+ """Link to a file.
110
170
 
171
+ Usage as class attribute::
111
172
 
112
- class linkfile(linkpath):
113
- def __init__(self, varname: str, proposals):
114
- """Link to a file
173
+ @dataset(url="...")
174
+ class MyDataset(Base):
175
+ DATA = linkfile("data", proposals=[...])
176
+ """
115
177
 
116
- Args:
117
- varname: Name of the variable
118
- proposals: List of potential paths
119
- """
120
- super().__init__(varname, proposals)
178
+ def __init__(
179
+ self,
180
+ varname: str,
181
+ proposals,
182
+ *,
183
+ transient: bool = False,
184
+ ):
185
+ super().__init__(varname, proposals, transient=transient)
121
186
 
122
187
  def check(self, path):
123
- print("Checking", path, path.is_file())
188
+ logger.debug("Checking %s (exists: %s)", path, path.is_file())
124
189
  return path.is_file()