datamaestro 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro/__init__.py +1 -2
- datamaestro/__main__.py +11 -7
- datamaestro/commands/site.py +16 -5
- datamaestro/context.py +32 -16
- datamaestro/data/ml.py +1 -0
- datamaestro/definitions.py +246 -20
- datamaestro/download/__init__.py +583 -40
- datamaestro/download/archive.py +120 -76
- datamaestro/download/custom.py +38 -6
- datamaestro/download/huggingface.py +46 -14
- datamaestro/download/links.py +106 -49
- datamaestro/download/multiple.py +27 -5
- datamaestro/download/single.py +111 -54
- datamaestro/download/sync.py +0 -1
- datamaestro/download/todo.py +9 -4
- datamaestro/download/wayback.py +3 -3
- datamaestro/record.py +48 -2
- datamaestro/settings.py +2 -1
- datamaestro/sphinx.py +1 -3
- datamaestro/stream/lines.py +8 -6
- datamaestro/test/__init__.py +3 -1
- datamaestro/test/conftest.py +1 -2
- datamaestro/test/test_resource.py +1388 -0
- datamaestro/utils.py +7 -6
- datamaestro/v2.md +301 -0
- datamaestro/version.py +4 -21
- {datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/METADATA +63 -94
- datamaestro-1.7.0.dist-info/RECORD +49 -0
- {datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
- datamaestro-1.5.0.dist-info/RECORD +0 -48
- datamaestro-1.5.0.dist-info/top_level.txt +0 -1
- {datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -0
- {datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,1388 @@
|
|
|
1
|
+
"""Tests for the new Resource interface.
|
|
2
|
+
|
|
3
|
+
Covers:
|
|
4
|
+
- ResourceState enum and metadata persistence
|
|
5
|
+
- Resource base class (bind, dependencies, state, cleanup)
|
|
6
|
+
- FileResource, FolderResource, ValueResource
|
|
7
|
+
- Topological sort and cycle detection
|
|
8
|
+
- Two-path download flow (transient_path -> path)
|
|
9
|
+
- Eager transient cleanup
|
|
10
|
+
- can_recover property behavior
|
|
11
|
+
- Both new class-based and legacy decorator-based dataset definitions
|
|
12
|
+
- Each concrete resource type
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import json
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from unittest.mock import MagicMock
|
|
20
|
+
|
|
21
|
+
import pytest
|
|
22
|
+
|
|
23
|
+
from datamaestro.definitions import (
|
|
24
|
+
AbstractDataset,
|
|
25
|
+
topological_sort,
|
|
26
|
+
_compute_dependents,
|
|
27
|
+
_bind_class_resources,
|
|
28
|
+
)
|
|
29
|
+
from datamaestro.download import (
|
|
30
|
+
Resource,
|
|
31
|
+
ResourceState,
|
|
32
|
+
ResourceStateFile,
|
|
33
|
+
FileResource,
|
|
34
|
+
FolderResource,
|
|
35
|
+
ValueResource,
|
|
36
|
+
Download,
|
|
37
|
+
reference,
|
|
38
|
+
)
|
|
39
|
+
from .conftest import MyRepository
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# ---- Helpers ----
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class SimpleDataset(AbstractDataset):
    """Bare-bones dataset harness for exercising the Resource machinery."""

    def __init__(self, repository, datapath: Path):
        super().__init__(repository)
        self._datapath = datapath

    @property
    def datapath(self):
        # Fixed path injected by the test instead of a computed location.
        return self._datapath

    def _prepare(self):
        # Hand back a mock standing in for a Base object so the prepare
        # flow has something with the expected __xpm__ attributes.
        mock_obj = MagicMock()
        mock_obj.__xpm__ = MagicMock()
        mock_obj.__xpm__.values = {}
        return mock_obj

    @property
    def description(self):
        return "test dataset"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class DummyFileResource(FileResource):
    """FileResource stub: _download writes a marker file and records the call."""

    def __init__(self, filename, url="http://example.com/test", **kw):
        super().__init__(filename, **kw)
        self.url = url
        # Flag inspected by tests to detect whether _download ran.
        self._download_called = False

    def _download(self, destination: Path):
        parent = destination.parent
        parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(f"downloaded from {self.url}")
        self._download_called = True
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class DummyFolderResource(FolderResource):
    """FolderResource stub: _download materializes a directory with one file."""

    def __init__(self, **kw):
        super().__init__(**kw)
        # Flag inspected by tests to detect whether _download ran.
        self._download_called = False

    def _download(self, destination: Path):
        destination.mkdir(parents=True, exist_ok=True)
        marker = destination / "file.txt"
        marker.write_text("content")
        self._download_called = True
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class DummyValueResource(ValueResource):
    """ValueResource stub that yields a fixed in-memory value."""

    def __init__(self, value, **kw):
        super().__init__(**kw)
        self._value = value

    def download(self, force=False):
        # Nothing to fetch: the value lives in memory.
        pass

    def prepare(self):
        return self._value
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class RecoverableResource(FileResource):
    """Resource that supports recovery from PARTIAL state.

    Overrides ``can_recover`` to opt in to the framework's recovery path:
    on failure, partial transient data is preserved rather than deleted.
    """

    # NOTE: the original pass-through __init__ (forwarding filename/**kw
    # unchanged to FileResource) was dead code and has been removed;
    # construction behavior is identical.

    @property
    def can_recover(self) -> bool:
        return True

    def _download(self, destination: Path):
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text("recovered")
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class FailingResource(FileResource):
    """Resource whose download always fails after writing partial data.

    Used to exercise the framework's failure path: partial bytes land in
    the transient location, then a RuntimeError is raised.
    """

    # NOTE: the original pass-through __init__ (forwarding filename/**kw
    # unchanged to FileResource) was dead code and has been removed;
    # construction behavior is identical.

    def _download(self, destination: Path):
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text("partial data")
        raise RuntimeError("Download failed")
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class DependentResource(FileResource):
    """FileResource that consumes the output of another resource."""

    def __init__(self, filename, source: Resource, **kw):
        super().__init__(filename, **kw)
        self._dependencies = [source]

    def _download(self, destination: Path):
        # Pull the upstream resource's content from its final path.
        upstream = self.dependencies[0]
        payload = upstream.path.read_text()
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(f"processed: {payload}")
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
# ---- Fixtures ----
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@pytest.fixture
def datapath(tmp_path):
    """Per-test data directory for the dataset under test."""
    return tmp_path / "dataset"
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@pytest.fixture
def dataset(context, datapath):
    """A minimal dataset attached to a fresh test repository."""
    return SimpleDataset(MyRepository(context), datapath)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# ==== ResourceState Tests ====
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class TestResourceState:
    """ResourceState enum: string values and round-tripping."""

    def test_values(self):
        # Members compare equal to their string values (str-backed enum).
        assert ResourceState.COMPLETE == "complete"
        assert ResourceState.PARTIAL == "partial"
        assert ResourceState.NONE == "none"

    def test_from_string(self):
        # Constructing from the raw string recovers the member.
        assert ResourceState("complete") == ResourceState.COMPLETE
        assert ResourceState("partial") == ResourceState.PARTIAL
        assert ResourceState("none") == ResourceState.NONE
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
# ==== ResourceStateFile Tests ====
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
class TestResourceStateFile:
    """Persistence of per-resource states via ResourceStateFile."""

    def test_read_nonexistent(self, datapath):
        # Missing state file reads back as NONE for any key.
        store = ResourceStateFile(datapath)
        assert store.read("TRAIN") == ResourceState.NONE

    def test_write_and_read(self, datapath):
        store = ResourceStateFile(datapath)
        store.write("TRAIN", ResourceState.COMPLETE)

        # Written key round-trips; untouched key stays NONE.
        assert store.read("TRAIN") == ResourceState.COMPLETE
        assert store.read("TEST") == ResourceState.NONE

    def test_multiple_resources(self, datapath):
        store = ResourceStateFile(datapath)
        expected = {
            "A": ResourceState.COMPLETE,
            "B": ResourceState.PARTIAL,
            "C": ResourceState.NONE,
        }
        for key, state in expected.items():
            store.write(key, state)

        for key, state in expected.items():
            assert store.read(key) == state

    def test_overwrite(self, datapath):
        store = ResourceStateFile(datapath)

        store.write("A", ResourceState.PARTIAL)
        assert store.read("A") == ResourceState.PARTIAL

        # Second write replaces the first.
        store.write("A", ResourceState.COMPLETE)
        assert store.read("A") == ResourceState.COMPLETE

    def test_file_format(self, datapath):
        # The on-disk representation is versioned JSON at .state.json.
        store = ResourceStateFile(datapath)
        store.write("TRAIN", ResourceState.COMPLETE)

        state_path = datapath / ".state.json"
        assert state_path.exists()

        payload = json.loads(state_path.read_text())

        assert payload["version"] == 1
        assert payload["resources"]["TRAIN"]["state"] == "complete"
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
# ==== Resource Base Class Tests ====
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
class TestResourceBase:
    """Resource base class: binding, state, dependencies, cleanup."""

    def test_bind(self, dataset):
        res = DummyFileResource("test.txt")
        res.bind("TEST", dataset)

        # Binding registers the resource under its name on the dataset.
        assert res.name == "TEST"
        assert res.dataset is dataset
        assert "TEST" in dataset.resources
        assert res in dataset.ordered_resources

    def test_bind_with_varname(self, dataset):
        res = DummyFileResource("test.txt", varname="my_var")
        res.bind("ATTR_NAME", dataset)

        # An explicit varname wins over the bound attribute name.
        assert res.name == "my_var"

    def test_bind_duplicate_raises(self, dataset):
        first = DummyFileResource("test1.txt")
        second = DummyFileResource("test2.txt")
        first.bind("TEST", dataset)

        # A second resource cannot claim an already-used name.
        with pytest.raises(AssertionError, match="already declared"):
            second.bind("TEST", dataset)

    def test_bind_already_bound_raises(self, dataset):
        res = DummyFileResource("test.txt")
        res.bind("TEST", dataset)

        # Rebinding the same resource to another dataset is rejected.
        other = SimpleDataset(None, dataset.datapath / "other")
        with pytest.raises(AssertionError, match="already bound"):
            res.bind("TEST2", other)

    def test_state_default_none(self, dataset):
        res = DummyFileResource("test.txt")
        res.bind("TEST", dataset)
        assert res.state == ResourceState.NONE

    def test_state_set_and_get(self, dataset):
        res = DummyFileResource("test.txt")
        res.bind("TEST", dataset)

        for state in (ResourceState.COMPLETE, ResourceState.PARTIAL):
            res.state = state
            assert res.state == state

    def test_dependencies_default_empty(self, dataset):
        res = DummyFileResource("test.txt")
        res.bind("TEST", dataset)
        assert res.dependencies == []

    def test_dependents_default_empty(self, dataset):
        res = DummyFileResource("test.txt")
        res.bind("TEST", dataset)
        assert res.dependents == []

    def test_can_recover_default_false(self, dataset):
        res = DummyFileResource("test.txt")
        res.bind("TEST", dataset)
        assert res.can_recover is False

    def test_can_recover_override(self, dataset):
        res = RecoverableResource("test.txt")
        res.bind("TEST", dataset)
        assert res.can_recover is True

    def test_has_files_default_true(self, dataset):
        res = DummyFileResource("test.txt")
        res.bind("TEST", dataset)
        assert res.has_files() is True

    def test_transient_flag(self, dataset):
        res = DummyFileResource("test.txt", transient=True)
        res.bind("TEST", dataset)
        assert res.transient is True

    def test_context_property(self, dataset):
        res = DummyFileResource("test.txt")
        res.bind("TEST", dataset)
        assert res.context is dataset.context

    def test_cleanup(self, dataset):
        res = DummyFileResource("test.txt")
        res.bind("TEST", dataset)

        # Populate both the final and the transient locations.
        for target, text in ((res.path, "final"), (res.transient_path, "temp")):
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(text)
        res.state = ResourceState.COMPLETE

        res.cleanup()

        # cleanup() removes all files and resets the state.
        assert not res.path.exists()
        assert not res.transient_path.exists()
        assert res.state == ResourceState.NONE
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
# ==== FileResource Tests ====
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
class TestFileResource:
    """FileResource path layout, prepare, download and naming."""

    def test_path(self, dataset):
        res = DummyFileResource("data.csv")
        res.bind("DATA", dataset)

        # Final path lives directly under the dataset data directory.
        assert res.path == dataset.datapath / "data.csv"

    def test_transient_path(self, dataset):
        res = DummyFileResource("data.csv")
        res.bind("DATA", dataset)

        # In-progress downloads go under the hidden .downloads folder.
        assert res.transient_path == dataset.datapath / ".downloads" / "data.csv"

    def test_prepare_returns_path(self, dataset):
        res = DummyFileResource("data.csv")
        res.bind("DATA", dataset)
        assert res.prepare() == res.path

    def test_download_writes_to_transient(self, dataset):
        res = DummyFileResource("data.csv")
        res.bind("DATA", dataset)
        res.download()

        # The raw download lands at the transient path, not the final one.
        assert res.transient_path.exists()
        assert "downloaded" in res.transient_path.read_text()
        assert res._download_called

    def test_stream_default_none(self, dataset):
        res = DummyFileResource("data.csv")
        res.bind("DATA", dataset)
        assert res.stream() is None

    def test_varname_from_filename(self):
        """Without explicit varname, name is derived from filename."""
        res = DummyFileResource("data.csv.gz")
        # All suffixes are stripped, leaving the stem.
        assert res.name == "data"
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
# ==== FolderResource Tests ====
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
class TestFolderResource:
    """FolderResource path layout, prepare and download."""

    def test_path(self, dataset):
        res = DummyFolderResource(varname="archive")
        res.bind("ARCHIVE", dataset)

        assert res.path == dataset.datapath / "archive"

    def test_transient_path(self, dataset):
        res = DummyFolderResource(varname="archive")
        res.bind("ARCHIVE", dataset)

        assert res.transient_path == dataset.datapath / ".downloads" / "archive"

    def test_prepare_returns_path(self, dataset):
        res = DummyFolderResource(varname="archive")
        res.bind("ARCHIVE", dataset)
        assert res.prepare() == res.path

    def test_download_creates_directory(self, dataset):
        res = DummyFolderResource(varname="archive")
        res.bind("ARCHIVE", dataset)
        res.download()

        # Download materializes a directory (with contents) at the
        # transient location.
        assert res.transient_path.is_dir()
        assert (res.transient_path / "file.txt").exists()
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
# ==== ValueResource Tests ====
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
class TestValueResource:
    """ValueResource: file-less, prepare() yields the wrapped value."""

    def test_has_files_false(self, dataset):
        res = DummyValueResource({"key": "value"}, varname="data")
        res.bind("DATA", dataset)
        # Value resources own no files on disk.
        assert res.has_files() is False

    def test_prepare_returns_value(self, dataset):
        payload = {"key": "value"}
        res = DummyValueResource(payload, varname="data")
        res.bind("DATA", dataset)
        assert res.prepare() == payload
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
# ==== Topological Sort Tests ====
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
class TestTopologicalSort:
    """topological_sort over resource dependency graphs."""

    def test_empty(self):
        assert topological_sort({}) == []

    def test_single(self, dataset):
        only = DummyFileResource("a.txt")
        only.bind("A", dataset)
        assert topological_sort(dataset.resources) == [only]

    def test_linear_chain(self, dataset):
        upstream = DummyFileResource("a.txt")
        upstream.bind("A", dataset)

        downstream = DependentResource("b.txt", source=upstream)
        downstream.bind("B", dataset)

        order = topological_sort(dataset.resources)
        assert order.index(upstream) < order.index(downstream)

    def test_diamond(self, dataset):
        # a -> {b, c} -> d
        a = DummyFileResource("a.txt")
        a.bind("A", dataset)

        b = DependentResource("b.txt", source=a)
        b.bind("B", dataset)

        c = DependentResource("c.txt", source=a)
        c.bind("C", dataset)

        d = DependentResource("d.txt", source=b)
        d._dependencies.append(c)
        d.bind("D", dataset)

        order = topological_sort(dataset.resources)
        pos = order.index
        assert pos(a) < pos(b)
        assert pos(a) < pos(c)
        assert pos(b) < pos(d)
        assert pos(c) < pos(d)

    def test_cycle_detection(self, dataset):
        a = DummyFileResource("a.txt")
        a.bind("A", dataset)

        b = DependentResource("b.txt", source=a)
        b.bind("B", dataset)

        # Close the loop: a now depends on b.
        a._dependencies = [b]

        with pytest.raises(ValueError, match="Cycle detected"):
            topological_sort(dataset.resources)

    def test_independent_resources(self, dataset):
        first = DummyFileResource("a.txt")
        first.bind("A", dataset)

        second = DummyFileResource("b.txt")
        second.bind("B", dataset)

        # Unrelated resources both appear, in any order.
        order = topological_sort(dataset.resources)
        assert len(order) == 2
        assert set(order) == {first, second}
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
# ==== Dependents Computation Tests ====
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
class TestComputeDependents:
    """_compute_dependents fills in reverse-dependency links."""

    def test_no_dependencies(self, dataset):
        lone = DummyFileResource("a.txt")
        lone.bind("A", dataset)

        _compute_dependents(dataset.resources)
        assert lone.dependents == []

    def test_linear_dependents(self, dataset):
        parent = DummyFileResource("a.txt")
        parent.bind("A", dataset)

        child = DependentResource("b.txt", source=parent)
        child.bind("B", dataset)

        _compute_dependents(dataset.resources)
        # The edge is directed: child depends on parent, not vice versa.
        assert child in parent.dependents
        assert parent not in child.dependents

    def test_multiple_dependents(self, dataset):
        parent = DummyFileResource("a.txt")
        parent.bind("A", dataset)

        left = DependentResource("b.txt", source=parent)
        left.bind("B", dataset)

        right = DependentResource("c.txt", source=parent)
        right.bind("C", dataset)

        _compute_dependents(dataset.resources)
        assert set(parent.dependents) == {left, right}
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
# ==== Two-Path Download Flow Tests ====
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
class TestTwoPathFlow:
    """Two-path download flow: transient_path -> path, failure handling,
    skipping, adoption, and locking."""

    @staticmethod
    def _wire(dataset, resource):
        # Shared setup: register the single resource for download.
        dataset.ordered_resources = [resource]
        _compute_dependents(dataset.resources)

    def test_download_moves_to_final_path(self, dataset):
        """Framework should move transient_path -> path on success."""
        res = DummyFileResource("data.txt")
        res.bind("DATA", dataset)
        self._wire(dataset, res)

        dataset.download()

        assert res.path.exists()
        assert res.state == ResourceState.COMPLETE

    def test_failure_no_recover_cleans_up(self, dataset):
        """On failure without can_recover, transient data is deleted."""
        res = FailingResource("data.txt")
        res.bind("DATA", dataset)
        self._wire(dataset, res)

        outcome = dataset.download()

        assert outcome is False
        assert not res.transient_path.exists()
        assert res.state == ResourceState.NONE

    def test_failure_with_recover_preserves(self, dataset):
        """On failure with can_recover, transient data is preserved."""

        class FailRecoverable(RecoverableResource):
            def _download(self, destination):
                destination.parent.mkdir(parents=True, exist_ok=True)
                destination.write_text("partial")
                raise RuntimeError("partial failure")

        res = FailRecoverable("data.txt")
        res.bind("DATA", dataset)
        self._wire(dataset, res)

        outcome = dataset.download()

        assert outcome is False
        assert res.transient_path.exists()
        assert res.state == ResourceState.PARTIAL

    def test_skip_complete_resources(self, dataset):
        """Resources already COMPLETE are skipped unless force=True."""
        res = DummyFileResource("data.txt")
        res.bind("DATA", dataset)
        self._wire(dataset, res)

        # Simulate a finished earlier run: state COMPLETE, file on disk.
        res.state = ResourceState.COMPLETE
        res.path.parent.mkdir(parents=True, exist_ok=True)
        res.path.write_text("existing")

        dataset.download()

        # The downloader must not have been invoked again.
        assert res._download_called is False

    def test_redownload_when_files_missing(self, dataset):
        """COMPLETE resource with missing files is re-downloaded."""
        res = DummyFileResource("data.txt")
        res.bind("DATA", dataset)
        self._wire(dataset, res)

        # COMPLETE in metadata, but the file itself is absent.
        res.state = ResourceState.COMPLETE
        assert not res.path.exists()

        dataset.download()

        # Stale state triggers a fresh download.
        assert res._download_called is True
        assert res.path.exists()
        assert res.state == ResourceState.COMPLETE

    def test_adopt_preexisting_files(self, dataset):
        """Files already on disk (old downloads) are adopted as COMPLETE."""
        res = DummyFileResource("data.txt")
        res.bind("DATA", dataset)
        self._wire(dataset, res)

        # A file from a pre-Resource-era download sits at the final path.
        res.path.parent.mkdir(parents=True, exist_ok=True)
        res.path.write_text("old data")

        # No .state.json entry yet.
        assert res.state == ResourceState.NONE

        dataset.download()

        # Adopted in place: no download, state flipped to COMPLETE.
        assert res._download_called is False
        assert res.state == ResourceState.COMPLETE
        assert res.path.read_text() == "old data"

    def test_downloads_dir_cleaned_after_success(self, dataset):
        """The .downloads/ directory is removed after all succeed."""
        res = DummyFileResource("data.txt")
        res.bind("DATA", dataset)
        self._wire(dataset, res)

        outcome = dataset.download()

        assert outcome is True
        assert not (dataset.datapath / ".downloads").exists()

    def test_downloads_dir_kept_on_failure(self, dataset):
        """The .downloads/ directory is kept if a download fails."""
        res = FailingResource("data.txt")
        res.bind("DATA", dataset)
        self._wire(dataset, res)

        # Seed the transient area as if a download were in flight.
        res.transient_path.parent.mkdir(parents=True, exist_ok=True)
        res.transient_path.write_text("partial")

        outcome = dataset.download()

        assert outcome is False
        # .downloads/ itself survives the failure; the transient file is
        # removed because can_recover is False for FailingResource.

    def test_lock_prevents_concurrent_download(self, dataset):
        """A second download blocks while the first holds the lock."""
        import fcntl
        import threading

        res = DummyFileResource("data.txt")
        res.bind("DATA", dataset)
        self._wire(dataset, res)

        # Grab the lock from "outside" to impersonate another process.
        dataset.datapath.mkdir(parents=True, exist_ok=True)
        lock_file = (dataset.datapath / ".state.lock").open("w")
        fcntl.flock(lock_file, fcntl.LOCK_EX)

        outcome = {}

        def run_download():
            outcome["result"] = dataset.download()

        worker = threading.Thread(target=run_download)
        worker.start()
        # Allow the worker to reach (and block on) the lock.
        worker.join(timeout=0.2)
        assert worker.is_alive()

        # Let it through.
        fcntl.flock(lock_file, fcntl.LOCK_UN)
        lock_file.close()

        worker.join(timeout=5)
        assert not worker.is_alive()
        assert outcome["result"] is True
|
|
707
|
+
|
|
708
|
+
|
|
709
|
+
# ==== Eager Transient Cleanup Tests ====
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
class TestTransientCleanup:
    """Eager cleanup of transient resources once dependents finish."""

    def test_transient_cleaned_after_dependents_complete(self, dataset):
        """Transient resources are cleaned up when all dependents
        are COMPLETE."""
        upstream = DummyFileResource("a.txt", transient=True)
        upstream.bind("A", dataset)

        downstream = DependentResource("b.txt", source=upstream)
        downstream.bind("B", dataset)

        _compute_dependents(dataset.resources)
        dataset.ordered_resources = topological_sort(dataset.resources)

        dataset.download()

        # Downstream finished normally.
        assert downstream.state == ResourceState.COMPLETE
        assert downstream.path.exists()

        # Upstream was transient and its only dependent is done,
        # so it has been removed.
        assert upstream.state == ResourceState.NONE
        assert not upstream.path.exists()

    def test_non_transient_not_cleaned(self, dataset):
        """Non-transient resources are NOT cleaned up."""
        upstream = DummyFileResource("a.txt", transient=False)
        upstream.bind("A", dataset)

        downstream = DependentResource("b.txt", source=upstream)
        downstream.bind("B", dataset)

        _compute_dependents(dataset.resources)
        dataset.ordered_resources = topological_sort(dataset.resources)

        dataset.download()

        assert upstream.state == ResourceState.COMPLETE
        assert upstream.path.exists()

    def test_transient_not_cleaned_if_dependent_incomplete(self, dataset):
        """Transient resources are NOT cleaned if a dependent
        hasn't completed yet."""
        upstream = DummyFileResource("a.txt", transient=True)
        upstream.bind("A", dataset)

        first = DependentResource("b.txt", source=upstream)
        first.bind("B", dataset)

        second = DependentResource("c.txt", source=upstream)
        second.bind("C", dataset)

        _compute_dependents(dataset.resources)
        dataset.ordered_resources = topological_sort(dataset.resources)

        # The framework processes resources in order: cleanup of the
        # transient upstream must wait until BOTH dependents finish.
        # A full download() drives everything to completion.
        dataset.download()

        # All dependents done -> transient upstream cleaned.
        assert upstream.state == ResourceState.NONE
|
|
774
|
+
|
|
775
|
+
|
|
776
|
+
# ==== Legacy Decorator-Based Dataset Tests ====
|
|
777
|
+
|
|
778
|
+
|
|
779
|
+
class TestLegacyDecoratorDataset:
    """Legacy decorator-style dataset definitions keep working."""

    def test_filedownloader_decorator(self, context):
        """Legacy decorator-based filedownloader still works."""
        import warnings
        from datamaestro.download.single import filedownloader

        repository = MyRepository(context)
        legacy_ds = SimpleDataset(repository, context.datapath / "legacy")

        # The legacy API is deprecated; silence the warning for this test.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            downloader = filedownloader("test.html", "http://httpbin.org/html")
            downloader(legacy_ds)

        assert "test" in legacy_ds.resources
        assert legacy_ds.resources["test"] is downloader

    def test_reference_resource(self, context):
        """reference resource still works."""
        repository = MyRepository(context)
        ref_ds = SimpleDataset(repository, context.datapath / "ref_test")

        mock_ref = MagicMock()
        mock_ref.prepare.return_value = "prepared_value"

        ref = reference(varname="ref", reference=mock_ref)
        ref.bind("ref", ref_ds)

        # A reference owns no files and delegates prepare().
        assert ref.has_files() is False
        assert ref.prepare() == "prepared_value"
|
|
810
|
+
|
|
811
|
+
|
|
812
|
+
# ==== New Class-Based Dataset Tests ====
|
|
813
|
+
|
|
814
|
+
|
|
815
|
+
class TestClassBasedDataset:
    """New class-based dataset definitions via _bind_class_resources."""

    def test_bind_class_resources(self, dataset):
        """_bind_class_resources detects Resource attributes."""
        from datamaestro.data import Base

        class MyData(Base):
            A = DummyFileResource("a.txt")
            B = DummyFileResource("b.txt")

        _bind_class_resources(MyData, dataset)

        assert "A" in dataset.resources
        assert "B" in dataset.resources
        assert len(dataset.ordered_resources) == 2

    def test_bind_with_dependencies(self, dataset):
        """Resources with dependencies are properly ordered."""
        from datamaestro.data import Base

        src = DummyFileResource("src.txt")

        class MyData(Base):
            SRC = src
            PROCESSED = DependentResource("proc.txt", source=src)

        _bind_class_resources(MyData, dataset)

        # Dependency ordering: SRC before PROCESSED.
        ordering = dataset.ordered_resources
        assert ordering.index(MyData.SRC) < ordering.index(MyData.PROCESSED)

        # Reverse links were filled in too.
        assert MyData.PROCESSED in MyData.SRC.dependents

    def test_non_resource_attributes_ignored(self, dataset):
        """Non-Resource class attributes are not bound."""
        from datamaestro.data import Base

        class MyData(Base):
            A = DummyFileResource("a.txt")
            NOT_A_RESOURCE = "just a string"
            ALSO_NOT = 42

        _bind_class_resources(MyData, dataset)

        assert "A" in dataset.resources
        assert "NOT_A_RESOURCE" not in dataset.resources
        assert "ALSO_NOT" not in dataset.resources
|
|
864
|
+
|
|
865
|
+
|
|
866
|
+
# ==== Backward Compatibility Tests ====
|
|
867
|
+
|
|
868
|
+
|
|
869
|
+
class TestBackwardCompat:
    """Deprecated entry points keep working while warning."""

    def test_hasfiles_deprecated(self, dataset):
        """hasfiles() is still callable (deprecated spelling of has_files)."""
        import warnings

        res = DummyFileResource("test.txt")
        res.bind("TEST", dataset)

        # NOTE(review): the emitted DeprecationWarning itself is not asserted
        # here — only that the call still returns the right value.
        with warnings.catch_warnings(record=True):
            warnings.simplefilter("always")
            outcome = res.hasfiles()

        assert outcome is True

    def test_definition_property_deprecated(self, dataset):
        """The deprecated .definition property still resolves to the dataset."""
        import warnings

        from datamaestro.download import _deprecation_warned

        res = DummyFileResource("test.txt")
        res.bind("TEST", dataset)

        # Reset the once-per-key warning cache so the property warns again.
        _deprecation_warned.discard("definition")

        with warnings.catch_warnings(record=True):
            warnings.simplefilter("always")
            owner = res.definition

        assert owner is dataset

    def test_download_subclass_deprecated(self):
        """Deriving from Download goes through the deprecation machinery."""
        import warnings

        from datamaestro.download import _deprecation_warned

        # Clear any earlier warning recorded for this subclass name.
        _deprecation_warned.discard("Download-TestSub")

        with warnings.catch_warnings(record=True):
            warnings.simplefilter("always")

            class TestSub(Download):
                def download(self, force=False):
                    pass

                def prepare(self):
                    pass

    def test_apply_classmethod(self):
        """Resource.apply is a factory returning a configured instance."""
        res = DummyFileResource.apply("test.txt")
        assert isinstance(res, DummyFileResource)
        assert res.filename == "test.txt"
|
|
924
|
+
|
|
925
|
+
|
|
926
|
+
# ==== Concrete Resource Tests ====
|
|
927
|
+
|
|
928
|
+
|
|
929
|
+
class TestFileDownloader:
    """Construction-level checks for FileDownloader."""

    def test_construction(self):
        """A FileDownloader records filename, URL and a derived name."""
        from datamaestro.download.single import FileDownloader

        dl = FileDownloader("data.csv", "http://example.com/data.csv")
        assert dl.filename == "data.csv"
        assert dl.url == "http://example.com/data.csv"
        # The resource name is the filename without its extension.
        assert dl.name == "data"

    def test_factory_alias(self):
        """The filedownloader factory produces FileDownloader instances."""
        from datamaestro.download.single import FileDownloader, filedownloader

        dl = filedownloader("data.csv", "http://example.com/data.csv")
        assert isinstance(dl, FileDownloader)

    def test_transient_flag(self):
        """The transient keyword is stored on the instance."""
        from datamaestro.download.single import FileDownloader

        dl = FileDownloader(
            "data.csv", "http://example.com/data.csv", transient=True
        )
        assert dl.transient is True

    def test_backward_compat_alias(self):
        """SingleDownload remains an alias of FileDownloader."""
        from datamaestro.download.single import FileDownloader, SingleDownload

        assert SingleDownload is FileDownloader
|
|
968
|
+
|
|
969
|
+
|
|
970
|
+
class TestConcatDownloader:
    """Construction-level checks for ConcatDownloader."""

    def test_construction(self):
        """Filename and URL are stored as given."""
        from datamaestro.download.single import ConcatDownloader

        dl = ConcatDownloader("data.txt", "http://example.com/data.tar.gz")
        assert dl.filename == "data.txt"
        assert dl.url == "http://example.com/data.tar.gz"

    def test_factory_alias(self):
        """The concatdownload factory yields ConcatDownloader instances."""
        from datamaestro.download.single import ConcatDownloader, concatdownload

        dl = concatdownload("data.txt", "http://example.com/data.tar.gz")
        assert isinstance(dl, ConcatDownloader)
|
|
986
|
+
|
|
987
|
+
|
|
988
|
+
class TestArchiveDownloaders:
    """Construction and factory checks for the archive downloaders."""

    def test_zip_construction(self):
        from datamaestro.download.archive import ZipDownloader

        dl = ZipDownloader("archive", "http://example.com/data.zip")
        assert dl.url == "http://example.com/data.zip"
        assert dl.name == "archive"

    def test_tar_construction(self):
        from datamaestro.download.archive import TarDownloader

        dl = TarDownloader("archive", "http://example.com/data.tar.gz")
        assert dl.url == "http://example.com/data.tar.gz"

    def test_zip_factory_alias(self):
        from datamaestro.download.archive import ZipDownloader, zipdownloader

        dl = zipdownloader("archive", "http://example.com/data.zip")
        assert isinstance(dl, ZipDownloader)

    def test_tar_factory_alias(self):
        from datamaestro.download.archive import TarDownloader, tardownloader

        dl = tardownloader("archive", "http://example.com/data.tar.gz")
        assert isinstance(dl, TarDownloader)
|
|
1019
|
+
|
|
1020
|
+
|
|
1021
|
+
class TestCustomDownload:
    """custom_download wraps a user callable as a resource."""

    def test_construction(self):
        from datamaestro.download.custom import custom_download

        hook = MagicMock()
        res = custom_download("data", hook)
        # The callable is kept as-is under .downloader.
        assert res.name == "data"
        assert res.downloader is hook
|
|
1029
|
+
|
|
1030
|
+
|
|
1031
|
+
class TestHFDownloader:
    """HuggingFace downloader construction and prepare()."""

    def test_construction(self):
        from datamaestro.download.huggingface import HFDownloader

        dl = HFDownloader("hf", repo_id="user/dataset")
        assert dl.repo_id == "user/dataset"
        assert dl.name == "hf"

    def test_factory_alias(self):
        from datamaestro.download.huggingface import HFDownloader, hf_download

        dl = hf_download("hf", repo_id="user/dataset")
        assert isinstance(dl, HFDownloader)

    def test_prepare(self):
        """prepare() returns the stored HuggingFace parameters as a dict."""
        from datamaestro.download.huggingface import HFDownloader

        dl = HFDownloader(
            "hf",
            repo_id="user/dataset",
            data_files="train.csv",
            split="train",
        )
        assert dl.prepare() == {
            "repo_id": "user/dataset",
            "data_files": "train.csv",
            "split": "train",
        }
|
|
1063
|
+
|
|
1064
|
+
|
|
1065
|
+
class TestTodoResource:
    """Todo is a placeholder resource: every operation is unimplemented."""

    def test_raises_not_implemented(self):
        from datamaestro.download.todo import Todo

        todo = Todo(varname="test")

        with pytest.raises(NotImplementedError):
            todo.download()
        with pytest.raises(NotImplementedError):
            todo.prepare()
|
|
1075
|
+
|
|
1076
|
+
|
|
1077
|
+
class TestReferenceResource:
    """Behavior of the reference resource wrapper."""

    def test_has_files_false(self, dataset):
        target = MagicMock()
        target.prepare.return_value = "value"

        ref = reference(varname="ref", reference=target)
        ref.bind("ref", dataset)

        # A reference never owns files itself.
        assert ref.has_files() is False

    def test_prepare_delegates(self, dataset):
        target = MagicMock()
        target.prepare.return_value = "prepared"

        ref = reference(varname="ref", reference=target)
        ref.bind("ref", dataset)

        # prepare() is forwarded to the referenced object.
        assert ref.prepare() == "prepared"

    def test_download_delegates(self, dataset):
        target = MagicMock()
        target.__datamaestro__ = MagicMock()

        ref = reference(varname="ref", reference=target)
        ref.bind("ref", dataset)

        # download() is forwarded to the referenced dataset definition.
        ref.download(force=True)
        target.__datamaestro__.download.assert_called_once_with(True)

    def test_requires_reference(self):
        # A None target is rejected at construction time.
        with pytest.raises(AssertionError, match="cannot be null"):
            reference(varname="ref", reference=None)
|
|
1110
|
+
|
|
1111
|
+
|
|
1112
|
+
# ==== Links Resource Tests ====
|
|
1113
|
+
|
|
1114
|
+
|
|
1115
|
+
class TestLinksResource:
    """The links resource exposes the dataset path directly."""

    def test_construction(self):
        from datamaestro.download.links import links

        target = MagicMock()
        res = links("data", ref1=target)
        assert res.name == "data"

    def test_has_files_false(self, dataset):
        from datamaestro.download.links import links

        target = MagicMock()
        res = links("data", ref1=target)
        res.bind("data", dataset)

        assert res.has_files() is False

    def test_path_is_datapath(self, dataset):
        from datamaestro.download.links import links

        target = MagicMock()
        res = links("data", ref1=target)
        res.bind("data", dataset)

        # links uses the dataset's data path itself, not a subdirectory.
        assert res.path == dataset.datapath

    def test_prepare_returns_path(self, dataset):
        from datamaestro.download.links import links

        target = MagicMock()
        res = links("data", ref1=target)
        res.bind("data", dataset)

        assert res.prepare() == dataset.datapath
|
|
1149
|
+
|
|
1150
|
+
|
|
1151
|
+
class TestLinkFolder:
    """linkfolder links a user-provided directory into the dataset."""

    def test_construction(self):
        from datamaestro.download.links import linkfolder

        res = linkfolder("data", proposals=["/tmp/test"])
        assert res.name == "data"

    def test_check_is_dir(self, dataset, tmp_path):
        from datamaestro.download.links import linkfolder

        res = linkfolder("data", proposals=[])
        res.bind("data", dataset)

        # check() accepts existing directories and rejects missing paths.
        assert res.check(tmp_path) is True
        assert res.check(tmp_path / "nonexistent") is False

    def test_path(self, dataset):
        from datamaestro.download.links import linkfolder

        res = linkfolder("data", proposals=[])
        res.bind("data", dataset)

        # The link lives under the dataset path, named after the resource.
        assert res.path == dataset.datapath / "data"

    def test_prepare_returns_path(self, dataset):
        from datamaestro.download.links import linkfolder

        res = linkfolder("data", proposals=[])
        res.bind("data", dataset)

        assert res.prepare() == res.path
|
|
1184
|
+
|
|
1185
|
+
|
|
1186
|
+
class TestLinkFile:
    """linkfile links a user-provided file into the dataset."""

    def test_construction(self):
        from datamaestro.download.links import linkfile

        res = linkfile("data", proposals=["/tmp/test.txt"])
        assert res.name == "data"

    def test_check_is_file(self, dataset, tmp_path):
        from datamaestro.download.links import linkfile

        res = linkfile("data", proposals=[])
        res.bind("data", dataset)

        # Only an existing regular file passes check().
        candidate = tmp_path / "test.txt"
        candidate.write_text("hello")

        assert res.check(candidate) is True
        assert res.check(tmp_path) is False  # directory is rejected
        assert res.check(tmp_path / "nonexistent") is False  # missing path

    def test_path(self, dataset):
        from datamaestro.download.links import linkfile

        res = linkfile("data", proposals=[])
        res.bind("data", dataset)

        assert res.path == dataset.datapath / "data"
|
|
1216
|
+
|
|
1217
|
+
|
|
1218
|
+
# ==== Wayback Resource Tests ====
|
|
1219
|
+
|
|
1220
|
+
|
|
1221
|
+
class TestWaybackDocuments:
    """wayback_documents resource construction and preparation."""

    def test_construction(self):
        from datamaestro.download.wayback import wayback_documents

        def urls_fn():
            return iter(["http://example.com"])

        res = wayback_documents("20200101", urls_fn, name="wb")
        assert res.name == "wb"
        assert res.timestamp == "20200101"

    def test_prepare_returns_path(self, dataset):
        from datamaestro.download.wayback import wayback_documents

        def urls_fn():
            return iter([])

        res = wayback_documents("20200101", urls_fn, name="wb")
        res.bind("wb", dataset)

        # prepare() points at the per-resource directory under the dataset.
        assert res.prepare() == dataset.datapath / "wb"
|
|
1243
|
+
|
|
1244
|
+
|
|
1245
|
+
# ==== Custom Download Functional Tests ====
|
|
1246
|
+
|
|
1247
|
+
|
|
1248
|
+
class TestCustomDownloadFunctional:
    """Functional behavior of custom_download once bound to a dataset."""

    def test_download_delegates(self, dataset):
        from datamaestro.download.custom import custom_download

        hook = MagicMock()
        res = custom_download("data", hook)
        res.bind("data", dataset)

        res.download(force=True)

        # The user callable receives the context, target path and force flag.
        hook.assert_called_once_with(dataset.context, dataset.datapath, force=True)

    def test_prepare_returns_datapath(self, dataset):
        from datamaestro.download.custom import custom_download

        hook = MagicMock()
        res = custom_download("data", hook)
        res.bind("data", dataset)

        assert res.prepare() == dataset.datapath
|
|
1268
|
+
|
|
1269
|
+
|
|
1270
|
+
# ==== Archive Downloader Base Tests ====
|
|
1271
|
+
|
|
1272
|
+
|
|
1273
|
+
class TestArchiveDownloaderBase:
    """Shared behavior of the archive downloader base class."""

    def test_zip_path_with_postinit(self, dataset):
        from datamaestro.download.archive import ZipDownloader

        dl = ZipDownloader("archive", "http://example.com/data.zip")
        dl.bind("archive", dataset)

        # Accessing .path runs the lazy post-initialization.
        assert isinstance(dl.path, Path)

    def test_tar_path_with_postinit(self, dataset):
        from datamaestro.download.archive import TarDownloader

        dl = TarDownloader("archive", "http://example.com/data.tar.gz")
        dl.bind("archive", dataset)

        assert isinstance(dl.path, Path)

    def test_extractall_default(self):
        from datamaestro.download.archive import ZipDownloader

        dl = ZipDownloader("archive", "http://example.com/data.zip")
        # With no filters, the whole archive is extracted.
        assert dl.extractall is True

    def test_extractall_with_subpath(self):
        from datamaestro.download.archive import ZipDownloader

        dl = ZipDownloader(
            "archive",
            "http://example.com/data.zip",
            subpath="subdir",
        )
        # Restricting to a subpath disables full extraction.
        assert dl.extractall is False

    def test_extractall_with_files(self):
        from datamaestro.download.archive import ZipDownloader

        dl = ZipDownloader(
            "archive",
            "http://example.com/data.zip",
            files={"file1.txt"},
        )
        # An explicit file list also disables full extraction.
        assert dl.extractall is False

    def test_subpath_trailing_slash(self):
        from datamaestro.download.archive import ZipDownloader

        dl = ZipDownloader(
            "archive",
            "http://example.com/data.zip",
            subpath="subdir",
        )
        # The subpath is normalized to end with a slash.
        assert dl.subpath == "subdir/"

    def test_transient_flag(self):
        from datamaestro.download.archive import ZipDownloader

        dl = ZipDownloader(
            "archive",
            "http://example.com/data.zip",
            transient=True,
        )
        assert dl.transient is True
|
|
1338
|
+
|
|
1339
|
+
|
|
1340
|
+
# ==== gsync (legacy) Tests ====
|
|
1341
|
+
|
|
1342
|
+
|
|
1343
|
+
class TestGsync:
    """The legacy gsync downloader remains importable."""

    def test_import(self):
        """gsync is still a Download subclass."""
        from datamaestro.download.sync import gsync

        assert issubclass(gsync, Download)
|
|
1349
|
+
|
|
1350
|
+
|
|
1351
|
+
# ==== manual.py (deprecated re-export) Tests ====
|
|
1352
|
+
|
|
1353
|
+
|
|
1354
|
+
class TestManual:
    """datamaestro.download.manual keeps its deprecated re-exports."""

    def test_import_linkfolder(self):
        """linkfolder can still be imported from the deprecated module."""
        import warnings

        # The deprecated module warns on import; silence it here.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            from datamaestro.download.manual import linkfolder

        assert linkfolder is not None
|
|
1364
|
+
|
|
1365
|
+
|
|
1366
|
+
# ==== multiple.py (legacy) Tests ====
|
|
1367
|
+
|
|
1368
|
+
|
|
1369
|
+
class TestMultiple:
    """Legacy Download subclasses in datamaestro.download.multiple."""

    def test_import_list(self):
        """List is still importable and remains a Download subclass."""
        import warnings

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            from datamaestro.download.multiple import List

        assert issubclass(List, Download)

    def test_import_datasets(self):
        """Datasets is still importable and remains a Download subclass."""
        import warnings

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            from datamaestro.download.multiple import Datasets

        assert issubclass(Datasets, Download)
|