datamaestro 1.6.2__py3-none-any.whl → 1.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,31 @@
1
+ """Multiple download resources (legacy).
2
+
3
+ Note: This module uses a legacy API pattern and needs deeper refactoring.
4
+ The List and Datasets classes use an older constructor signature that
5
+ differs from the modern Resource interface.
6
+ """
7
+
1
8
  import logging
2
- from pathlib import Path
3
9
  import os
10
+ import warnings
11
+ from pathlib import Path
4
12
 
5
- from datamaestro import AbstractDataset
13
+ from datamaestro.definitions import AbstractDataset
6
14
  from datamaestro.download import Download
7
15
 
16
+ warnings.warn(
17
+ "datamaestro.download.multiple uses a legacy API. "
18
+ "Consider migrating to class-attribute resource definitions.",
19
+ DeprecationWarning,
20
+ stacklevel=2,
21
+ )
22
+
8
23
 
9
24
  class List(Download):
10
- """Download multiple files or directories given by a list"""
25
+ """Download multiple files or directories given by a list.
26
+
27
+ Legacy: uses old-style constructor API.
28
+ """
11
29
 
12
30
  def __init__(self, dataset: AbstractDataset, definition: object):
13
31
  super().__init__(dataset, definition)
@@ -32,7 +50,10 @@ class List(Download):
32
50
 
33
51
 
34
52
  class Datasets(Download):
35
- """Use links to dataset files"""
53
+ """Use links to dataset files.
54
+
55
+ Legacy: uses old-style constructor API.
56
+ """
36
57
 
37
58
  def __init__(self, dataset: AbstractDataset, definition: object):
38
59
  super().__init__(dataset, definition)
@@ -48,7 +69,8 @@ class Datasets(Download):
48
69
  if isinstance(files, Path):
49
70
  if not files.is_dir():
50
71
  raise AssertionError(
51
- "Dataset path is not a directory: %s", files
72
+ "Dataset path is not a directory: %s",
73
+ files,
52
74
  )
53
75
  path = destination / key
54
76
  if not path.exists():
@@ -1,64 +1,83 @@
1
- from typing import Optional
2
- import logging
3
- import shutil
4
- import tarfile
1
+ """Single file download resources.
2
+
3
+ Provides FileResource subclasses for downloading individual files
4
+ from URLs, with optional transforms and integrity checking.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
5
9
  import io
6
10
  import gzip
7
- import os.path as op
11
+ import logging
8
12
  import os
9
- import urllib3
13
+ import os.path as op
14
+ import shutil
15
+ import tarfile
10
16
  from pathlib import Path
11
- import re
12
- from datamaestro.utils import copyfileobjs
17
+
18
+ import urllib3
19
+
20
+ from datamaestro.download import FileResource
13
21
  from datamaestro.stream import Transform
14
- from datamaestro.download import Download
22
+ from datamaestro.utils import copyfileobjs
23
+
24
+ logger = logging.getLogger(__name__)
15
25
 
16
26
 
17
27
  def open_ext(*args, **kwargs):
18
- """Opens a file according to its extension"""
28
+ """Opens a file according to its extension."""
19
29
  name = args[0]
20
30
  if name.endswith(".gz"):
21
31
  return gzip.open(*args, *kwargs)
22
32
  return io.open(*args, **kwargs)
23
33
 
24
34
 
25
- class SingleDownload(Download):
26
- def __init__(self, filename: str):
27
- super().__init__(re.sub(r"\..*$", "", filename))
28
- self.name = filename
35
+ class FileDownloader(FileResource):
36
+ """Downloads a single file from a URL.
37
+
38
+ Supports optional transforms (e.g., gzip decompression)
39
+ and integrity checking.
29
40
 
30
- @property
31
- def path(self):
32
- return self.definition.datapath / self.name
41
+ Usage as class attribute (preferred)::
33
42
 
34
- def prepare(self):
35
- return self.path
43
+ @dataset(url="...")
44
+ class MyDataset(Base):
45
+ DATA = FileDownloader.apply(
46
+ "data.csv", "http://example.com/data.csv.gz"
47
+ )
36
48
 
37
- def download(self, force=False):
38
- if not self.path.is_file() and not force:
39
- self._download(self.path)
49
+ Usage as decorator (deprecated)::
40
50
 
51
+ @filedownloader("data.csv", "http://example.com/data.csv.gz")
52
+ @dataset(Base)
53
+ def my_dataset(data): ...
54
+ """
41
55
 
42
- class filedownloader(SingleDownload):
43
56
  def __init__(
44
57
  self,
45
58
  filename: str,
46
59
  url: str,
47
- size: int = None,
48
- transforms: Optional[Transform] = None,
60
+ size: int | None = None,
61
+ transforms: Transform | None = None,
49
62
  checker=None,
63
+ *,
64
+ varname: str | None = None,
65
+ transient: bool = False,
50
66
  ):
51
- """Downloads a file given by a URL
52
-
53
- :param filename: The filename within the data folder; the variable name
54
- corresponds to the filename without the extension.
55
-
56
- :param url: The URL to download.
57
-
58
- :param transforms: Transform the file before storing it size: size in
59
- bytes (or None)
60
67
  """
61
- super().__init__(filename)
68
+ Args:
69
+ filename: The filename within the data folder; the variable
70
+ name corresponds to the filename without the extension.
71
+ url: The URL to download.
72
+ size: Expected size in bytes (or None).
73
+ transforms: Transform the file before storing it.
74
+ Auto-detected from URL path if None.
75
+ checker: File integrity checker.
76
+ varname: Explicit resource name.
77
+ transient: If True, data can be deleted after dependents
78
+ complete.
79
+ """
80
+ super().__init__(filename, varname=varname, transient=transient)
62
81
  self.url = url
63
82
  self.checker = checker
64
83
  self.size = size
@@ -67,8 +86,8 @@ class filedownloader(SingleDownload):
67
86
  path = Path(Path(p.path).name)
68
87
  self.transforms = transforms if transforms else Transform.createFromPath(path)
69
88
 
70
- def _download(self, destination):
71
- logging.info("Downloading %s into %s", self.url, destination)
89
+ def _download(self, destination: Path) -> None:
90
+ logger.info("Downloading %s into %s", self.url, destination)
72
91
 
73
92
  # Creates directory if needed
74
93
  dir = op.dirname(destination)
@@ -78,41 +97,69 @@ class filedownloader(SingleDownload):
78
97
  with self.context.downloadURL(self.url, size=self.size) as file:
79
98
  # Transform if need be
80
99
  if self.transforms:
81
- logging.info("Transforming file")
82
- with self.transforms(file.path.open("rb")) as stream, destination.open(
83
- "wb"
84
- ) as out:
100
+ logger.info("Transforming file")
101
+ with (
102
+ self.transforms(file.path.open("rb")) as stream,
103
+ destination.open("wb") as out,
104
+ ):
85
105
  if self.checker:
86
106
  copyfileobjs(stream, [out, self.checker])
87
107
  self.checker.close()
88
108
  else:
89
109
  shutil.copyfileobj(stream, out)
90
110
  else:
91
- logging.info("Keeping original downloaded file %s", file.path)
111
+ logger.info("Keeping original downloaded file %s", file.path)
92
112
  if self.checker:
93
113
  self.checker.check(file.path)
94
114
  (shutil.copy if file.keep else shutil.move)(file.path, destination)
95
115
 
96
- logging.info("Created file %s" % destination)
116
+ logger.info("Created file %s", destination)
97
117
 
98
118
 
99
- class concatdownload(SingleDownload):
100
- """Concatenate all files in an archive"""
119
+ # Factory alias for backward compat and convenient usage
120
+ filedownloader = FileDownloader.apply
101
121
 
102
- def __init__(self, filename: str, url: str, transforms=None):
103
- """Concat the files in an archive
104
122
 
123
+ class ConcatDownloader(FileResource):
124
+ """Concatenate all files from an archive into a single file.
125
+
126
+ Usage as class attribute (preferred)::
127
+
128
+ @dataset(url="...")
129
+ class MyDataset(Base):
130
+ DATA = ConcatDownloader.apply(
131
+ "data.txt", "http://example.com/data.tar.gz"
132
+ )
133
+ """
134
+
135
+ def __init__(
136
+ self,
137
+ filename: str,
138
+ url: str,
139
+ transforms=None,
140
+ *,
141
+ varname: str | None = None,
142
+ transient: bool = False,
143
+ ):
144
+ """
105
145
  Args:
106
- filename: The filename within the data folder; the variable name
107
- corresponds to the filename without the extension url: The URL to
108
- download transforms: Transform the file before storing it
146
+ filename: The filename within the data folder; the variable
147
+ name corresponds to the filename without the extension.
148
+ url: The URL to download.
149
+ transforms: Transform the file before storing it.
150
+ varname: Explicit resource name.
151
+ transient: If True, data can be deleted after dependents
152
+ complete.
109
153
  """
110
- super().__init__(filename)
154
+ super().__init__(filename, varname=varname, transient=transient)
111
155
  self.url = url
112
156
  self.transforms = transforms
113
157
 
114
- def _download(self, destination):
115
- with self.context.downloadURL(self.url) as dl, tarfile.open(dl.path) as archive:
158
+ def _download(self, destination: Path) -> None:
159
+ with (
160
+ self.context.downloadURL(self.url) as dl,
161
+ tarfile.open(dl.path) as archive,
162
+ ):
116
163
  destination.parent.mkdir(parents=True, exist_ok=True)
117
164
 
118
165
  with open(destination, "wb") as out:
@@ -121,6 +168,16 @@ class concatdownload(SingleDownload):
121
168
  transforms = self.transforms or Transform.createFromPath(
122
169
  Path(tarinfo.name)
123
170
  )
124
- logging.debug("Processing file %s", tarinfo.name)
171
+ logger.debug("Processing file %s", tarinfo.name)
125
172
  with transforms(archive.fileobject(archive, tarinfo)) as fp:
126
173
  shutil.copyfileobj(fp, out)
174
+
175
+
176
+ # Factory alias for backward compat
177
+ concatdownload = ConcatDownloader.apply
178
+
179
+
180
+ # --- Backward compat aliases ---
181
+ # Keep old class names importable but they now point to new classes
182
+
183
+ SingleDownload = FileDownloader
@@ -2,7 +2,6 @@ import logging
2
2
  from pathlib import Path
3
3
 
4
4
  from datamaestro.download import Download
5
- from datamaestro.definitions import AbstractDataset
6
5
 
7
6
  from subprocess import run
8
7
 
@@ -1,10 +1,15 @@
1
- from pathlib import Path
1
+ from datamaestro.download import Resource
2
2
 
3
- from datamaestro.download import Download
4
3
 
4
+ class Todo(Resource):
5
+ """Placeholder resource indicating download is not yet implemented."""
5
6
 
6
- class Todo(Download):
7
- def download(self, destination: Path):
7
+ def download(self, force=False):
8
8
  raise NotImplementedError(
9
9
  "Download method not defined - please edit the definition file"
10
10
  )
11
+
12
+ def prepare(self):
13
+ raise NotImplementedError(
14
+ "Prepare method not defined - please edit the definition file"
15
+ )
@@ -129,12 +129,12 @@ class wayback_documents(Resource):
129
129
  self.urls_fn = urls_fn
130
130
 
131
131
  def prepare(self):
132
- return self.definition.datapath / self.varname
132
+ return self.dataset.datapath / self.name
133
133
 
134
134
  def download(self, force=False):
135
135
  # Creates directory if needed
136
- destination: Path = self.definition.datapath / self.varname
137
- self.definition.datapath.mkdir(exist_ok=True)
136
+ destination: Path = self.dataset.datapath / self.name
137
+ self.dataset.datapath.mkdir(exist_ok=True)
138
138
 
139
139
  # Early exit
140
140
  done_path = destination.with_suffix(".done")
datamaestro/record.py CHANGED
@@ -1,5 +1,51 @@
1
+ """Record module for type-safe heterogeneous containers.
2
+
3
+ .. deprecated:: 2.0
4
+ This module will be removed in v2. Use :class:`typing.TypedDict` instead
5
+ for type-safe heterogeneous data structures. TypedDict provides better IDE
6
+ support, type checking, and is part of the standard library.
7
+
8
+ When using TypedDict, define key constants in classes (e.g., ``MyItem.ID``)
9
+ to avoid typos and enable IDE autocomplete. Prefix keys with package name
10
+ using underscore ``_`` as delimiter to avoid conflicts between different
11
+ data sources.
12
+
13
+ Example migration::
14
+
15
+ # Old way (deprecated)
16
+ @define
17
+ class MyItem(Item):
18
+ value: int
19
+
20
+ record = Record(MyItem(42))
21
+ print(record[MyItem].value)
22
+
23
+ # New way (recommended)
24
+ from typing import TypedDict
25
+
26
+ # Define key constants in classes
27
+ class MyItem:
28
+ ID = "mypackage_value"
29
+
30
+ class MyRecord(TypedDict):
31
+ mypackage_value: int
32
+
33
+ data: MyRecord = {MyItem.ID: 42}
34
+ print(data[MyItem.ID])
35
+ """
36
+
37
+ import warnings
1
38
  from typing import Type, TypeVar, Dict, Union, Optional
2
39
 
40
+ # Emit deprecation warning when module is imported
41
+ warnings.warn(
42
+ "The datamaestro.record module is deprecated and will be removed in v2. "
43
+ "Use typing.TypedDict instead (use class constants like MyItem.ID for keys, "
44
+ "prefixed with package name).",
45
+ DeprecationWarning,
46
+ stacklevel=2,
47
+ )
48
+
3
49
 
4
50
  class Item:
5
51
  """Base class for all item types"""
@@ -28,8 +74,8 @@ class RecordType:
28
74
  self.mapping = {item_type.__get_base__(): item_type for item_type in item_types}
29
75
 
30
76
  def __repr__(self):
31
- return f"""Record({",".join(item_type.__name__ for item_type in
32
- self.item_types)})"""
77
+ names = ",".join(item_type.__name__ for item_type in self.item_types)
78
+ return f"Record({names})"
33
79
 
34
80
  def contains(self, other: "RecordType"):
35
81
  """Checks that each item type in other has an item type of a compatible
datamaestro/settings.py CHANGED
@@ -1,7 +1,8 @@
1
1
  """Global and user settings utility classes"""
2
+
2
3
  import marshmallow as mm
3
4
  from typing import Dict, Any
4
- from experimaestro.utils.settings import JsonSettings, PathField
5
+ from experimaestro.utils.settings import JsonSettings
5
6
  from pathlib import Path
6
7
 
7
8
  # --- Global settings
datamaestro/sphinx.py CHANGED
@@ -125,9 +125,7 @@ class RepositoryDirective(DatasetsDirective):
125
125
  def run(self):
126
126
  (repository_id,) = self.arguments
127
127
  with mock(self.config.autodoc_mock_imports):
128
- repository = datamaestro.Context.instance().repository(
129
- repository_id
130
- ) # type: Optional[datamaestro.Repository]
128
+ repository = datamaestro.Context.instance().repository(repository_id) # type: Optional[datamaestro.Repository]
131
129
  assert repository is not None
132
130
 
133
131
  docnodes = []
@@ -42,12 +42,14 @@ class LineTransformStream(io.RawIOBase):
42
42
  return offset
43
43
 
44
44
  # How many bytes to read from current line
45
- l = min(lb, len(self.current) - self.offset)
46
-
47
- b[offset : (offset + l)] = self.current[self.offset : (self.offset + l)]
48
- lb -= l
49
- offset += l
50
- self.offset += l
45
+ chunk_len = min(lb, len(self.current) - self.offset)
46
+
47
+ b[offset : (offset + chunk_len)] = self.current[
48
+ self.offset : (self.offset + chunk_len)
49
+ ]
50
+ lb -= chunk_len
51
+ offset += chunk_len
52
+ self.offset += chunk_len
51
53
 
52
54
  return offset
53
55
 
@@ -1 +1,3 @@
1
- from .checks import *
1
+ from .checks import DatasetTests
2
+
3
+ __all__ = ["DatasetTests"]
@@ -3,7 +3,6 @@ from datamaestro import Repository, Context
3
3
  import shutil
4
4
  import logging
5
5
  import pytest
6
- import shutil
7
6
 
8
7
 
9
8
  class MyRepository(Repository):
@@ -23,7 +22,7 @@ def context(tmp_path_factory):
23
22
  context = Context(Path(dir))
24
23
  logging.info("Created datamaestro test directory %s", dir)
25
24
 
26
- repository = MyRepository(context)
25
+ _repository = MyRepository(context) # noqa: F841 - registered on creation
27
26
 
28
27
  yield context
29
28