datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. datamaestro/__init__.py +11 -7
  2. datamaestro/__main__.py +29 -8
  3. datamaestro/annotations/__init__.py +1 -1
  4. datamaestro/annotations/agreement.py +9 -3
  5. datamaestro/commands/site.py +27 -15
  6. datamaestro/context.py +143 -87
  7. datamaestro/data/__init__.py +23 -11
  8. datamaestro/data/csv.py +12 -12
  9. datamaestro/data/huggingface.py +25 -0
  10. datamaestro/data/ml.py +19 -10
  11. datamaestro/data/tensor.py +32 -24
  12. datamaestro/definitions.py +492 -131
  13. datamaestro/download/__init__.py +610 -24
  14. datamaestro/download/archive.py +129 -77
  15. datamaestro/download/custom.py +53 -0
  16. datamaestro/download/huggingface.py +77 -0
  17. datamaestro/download/links.py +106 -50
  18. datamaestro/download/multiple.py +27 -5
  19. datamaestro/download/single.py +114 -51
  20. datamaestro/download/sync.py +0 -1
  21. datamaestro/download/todo.py +9 -4
  22. datamaestro/download/wayback.py +164 -0
  23. datamaestro/record.py +232 -0
  24. datamaestro/registry.py +1 -0
  25. datamaestro/search.py +1 -1
  26. datamaestro/settings.py +3 -1
  27. datamaestro/sphinx.py +224 -0
  28. datamaestro/stream/__init__.py +0 -2
  29. datamaestro/stream/lines.py +10 -7
  30. datamaestro/templates/dataset.py +5 -4
  31. datamaestro/test/__init__.py +3 -1
  32. datamaestro/test/checks.py +1 -5
  33. datamaestro/test/conftest.py +1 -6
  34. datamaestro/test/test_annotations.py +2 -2
  35. datamaestro/test/test_download_handlers.py +3 -4
  36. datamaestro/test/test_record.py +72 -0
  37. datamaestro/test/test_resource.py +1388 -0
  38. datamaestro/utils.py +15 -9
  39. datamaestro/v2.md +301 -0
  40. datamaestro/version.py +4 -0
  41. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
  42. datamaestro-1.7.0.dist-info/RECORD +49 -0
  43. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
  44. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
  45. datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
  46. datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
  47. datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
  48. datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
  49. datamaestro/__pycache__/context.cpython-38.pyc +0 -0
  50. datamaestro/__pycache__/context.cpython-39.pyc +0 -0
  51. datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
  52. datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
  53. datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
  54. datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
  55. datamaestro/__pycache__/search.cpython-38.pyc +0 -0
  56. datamaestro/__pycache__/search.cpython-39.pyc +0 -0
  57. datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
  58. datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
  59. datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
  60. datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
  61. datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
  62. datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
  63. datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
  64. datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
  65. datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
  66. datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
  67. datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
  68. datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
  69. datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
  70. datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
  71. datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
  72. datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
  73. datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
  74. datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
  75. datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
  76. datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
  77. datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
  78. datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
  79. datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
  80. datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
  81. datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
  82. datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
  83. datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
  84. datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
  85. datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
  86. datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
  87. datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
  88. datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
  89. datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
  90. datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
  91. datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
  92. datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
  93. datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
  94. datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
  95. datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
  96. datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
  97. datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
  98. datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
  99. datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
  100. datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
  101. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
  102. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
  103. datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
  104. datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
  105. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
  106. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
  107. datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
  108. datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
  109. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
  110. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
  111. datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
  112. datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
  113. datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
  114. datamaestro-0.8.1.dist-info/RECORD +0 -109
  115. datamaestro-0.8.1.dist-info/top_level.txt +0 -1
  116. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
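Of the changes listed above, the largest additions are the reworked download layer (datamaestro/download/__init__.py, +610 lines) and its new test suite, datamaestro/test/test_resource.py (+1388 lines), reproduced in full below. As a rough sketch of the dataset-definition style those tests exercise (class names and signatures here are inferred from the test code below, not from the package's documented API), a dataset now declares its downloads as Resource attributes on a data class:

    # Hypothetical sketch inferred from the tests below, not the released API.
    # Resource attributes on a Base subclass are bound by attribute name,
    # ordered by their dependencies, and downloaded into a transient path
    # before being moved to their final location.
    from datamaestro.data import Base
    from datamaestro.download.single import FileDownloader

    class MyCorpus(Base):
        TRAIN = FileDownloader("train.csv", "http://example.com/train.csv")
        # transient resources are deleted once every dependent is COMPLETE
        ARCHIVE = FileDownloader(
            "raw.tar.gz", "http://example.com/raw.tar.gz", transient=True
        )

The tests pin down the machinery behind this sketch: _bind_class_resources for the binding, topological_sort and _compute_dependents for the ordering, and ResourceState with a per-dataset .state.json file for persistence.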
datamaestro/test/test_resource.py
@@ -0,0 +1,1388 @@
+ """Tests for the new Resource interface.
+ 
+ Covers:
+ - ResourceState enum and metadata persistence
+ - Resource base class (bind, dependencies, state, cleanup)
+ - FileResource, FolderResource, ValueResource
+ - Topological sort and cycle detection
+ - Two-path download flow (transient_path -> path)
+ - Eager transient cleanup
+ - can_recover property behavior
+ - Both new class-based and legacy decorator-based dataset definitions
+ - Each concrete resource type
+ """
+ 
+ from __future__ import annotations
+ 
+ import json
+ from pathlib import Path
+ from unittest.mock import MagicMock
+ 
+ import pytest
+ 
+ from datamaestro.definitions import (
+     AbstractDataset,
+     topological_sort,
+     _compute_dependents,
+     _bind_class_resources,
+ )
+ from datamaestro.download import (
+     Resource,
+     ResourceState,
+     ResourceStateFile,
+     FileResource,
+     FolderResource,
+     ValueResource,
+     Download,
+     reference,
+ )
+ from .conftest import MyRepository
+ 
+ 
+ # ---- Helpers ----
+ 
+ 
+ class SimpleDataset(AbstractDataset):
+     """Minimal dataset for testing."""
+ 
+     def __init__(self, repository, datapath: Path):
+         super().__init__(repository)
+         self._datapath = datapath
+ 
+     @property
+     def datapath(self):
+         return self._datapath
+ 
+     def _prepare(self):
+         # Return a mock Base-like object for the prepare flow
+         obj = MagicMock()
+         obj.__xpm__ = MagicMock()
+         obj.__xpm__.values = {}
+         return obj
+ 
+     @property
+     def description(self):
+         return "test dataset"
+ 
+ 
+ class DummyFileResource(FileResource):
+     """Concrete FileResource for testing."""
+ 
+     def __init__(self, filename, url="http://example.com/test", **kw):
+         super().__init__(filename, **kw)
+         self.url = url
+         self._download_called = False
+ 
+     def _download(self, destination: Path):
+         destination.parent.mkdir(parents=True, exist_ok=True)
+         destination.write_text(f"downloaded from {self.url}")
+         self._download_called = True
+ 
+ 
+ class DummyFolderResource(FolderResource):
+     """Concrete FolderResource for testing."""
+ 
+     def __init__(self, **kw):
+         super().__init__(**kw)
+         self._download_called = False
+ 
+     def _download(self, destination: Path):
+         destination.mkdir(parents=True, exist_ok=True)
+         (destination / "file.txt").write_text("content")
+         self._download_called = True
+ 
+ 
+ class DummyValueResource(ValueResource):
+     """Concrete ValueResource for testing."""
+ 
+     def __init__(self, value, **kw):
+         super().__init__(**kw)
+         self._value = value
+ 
+     def download(self, force=False):
+         pass
+ 
+     def prepare(self):
+         return self._value
+ 
+ 
+ class RecoverableResource(FileResource):
+     """Resource that supports recovery from PARTIAL state."""
+ 
+     @property
+     def can_recover(self) -> bool:
+         return True
+ 
+     def __init__(self, filename, **kw):
+         super().__init__(filename, **kw)
+ 
+     def _download(self, destination: Path):
+         destination.parent.mkdir(parents=True, exist_ok=True)
+         destination.write_text("recovered")
+ 
+ 
+ class FailingResource(FileResource):
+     """Resource that fails during download."""
+ 
+     def __init__(self, filename, **kw):
+         super().__init__(filename, **kw)
+ 
+     def _download(self, destination: Path):
+         destination.parent.mkdir(parents=True, exist_ok=True)
+         destination.write_text("partial data")
+         raise RuntimeError("Download failed")
+ 
+ 
+ class DependentResource(FileResource):
+     """Resource that depends on another resource."""
+ 
+     def __init__(self, filename, source: Resource, **kw):
+         super().__init__(filename, **kw)
+         self._dependencies = [source]
+ 
+     def _download(self, destination: Path):
+         # Read from dependency's path
+         source = self.dependencies[0]
+         data = source.path.read_text()
+         destination.parent.mkdir(parents=True, exist_ok=True)
+         destination.write_text(f"processed: {data}")
+ 
+ 
+ # ---- Fixtures ----
+ 
+ 
+ @pytest.fixture
+ def datapath(tmp_path):
+     """Temporary dataset data path."""
+     return tmp_path / "dataset"
+ 
+ 
+ @pytest.fixture
+ def dataset(context, datapath):
+     """A minimal dataset bound to a repository."""
+     repository = MyRepository(context)
+     ds = SimpleDataset(repository, datapath)
+     return ds
+ 
+ 
+ # ==== ResourceState Tests ====
+ 
+ 
+ class TestResourceState:
+     def test_values(self):
+         assert ResourceState.NONE == "none"
+         assert ResourceState.PARTIAL == "partial"
+         assert ResourceState.COMPLETE == "complete"
+ 
+     def test_from_string(self):
+         assert ResourceState("none") == ResourceState.NONE
+         assert ResourceState("partial") == ResourceState.PARTIAL
+         assert ResourceState("complete") == ResourceState.COMPLETE
+ 
+ 
+ # ==== ResourceStateFile Tests ====
+ 
+ 
+ class TestResourceStateFile:
+     def test_read_nonexistent(self, datapath):
+         sf = ResourceStateFile(datapath)
+         assert sf.read("TRAIN") == ResourceState.NONE
+ 
+     def test_write_and_read(self, datapath):
+         sf = ResourceStateFile(datapath)
+         sf.write("TRAIN", ResourceState.COMPLETE)
+ 
+         assert sf.read("TRAIN") == ResourceState.COMPLETE
+         assert sf.read("TEST") == ResourceState.NONE
+ 
+     def test_multiple_resources(self, datapath):
+         sf = ResourceStateFile(datapath)
+         sf.write("A", ResourceState.COMPLETE)
+         sf.write("B", ResourceState.PARTIAL)
+         sf.write("C", ResourceState.NONE)
+ 
+         assert sf.read("A") == ResourceState.COMPLETE
+         assert sf.read("B") == ResourceState.PARTIAL
+         assert sf.read("C") == ResourceState.NONE
+ 
+     def test_overwrite(self, datapath):
+         sf = ResourceStateFile(datapath)
+         sf.write("A", ResourceState.PARTIAL)
+         assert sf.read("A") == ResourceState.PARTIAL
+ 
+         sf.write("A", ResourceState.COMPLETE)
+         assert sf.read("A") == ResourceState.COMPLETE
+ 
+     def test_file_format(self, datapath):
+         sf = ResourceStateFile(datapath)
+         sf.write("TRAIN", ResourceState.COMPLETE)
+ 
+         state_path = datapath / ".state.json"
+         assert state_path.exists()
+ 
+         with state_path.open() as f:
+             data = json.load(f)
+ 
+         assert data["version"] == 1
+         assert data["resources"]["TRAIN"]["state"] == "complete"
+ 
+ 
+ # ==== Resource Base Class Tests ====
+ 
+ 
+ class TestResourceBase:
+     def test_bind(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+ 
+         assert r.name == "TEST"
+         assert r.dataset is dataset
+         assert "TEST" in dataset.resources
+         assert r in dataset.ordered_resources
+ 
+     def test_bind_with_varname(self, dataset):
+         r = DummyFileResource("test.txt", varname="my_var")
+         r.bind("ATTR_NAME", dataset)
+ 
+         # varname takes precedence
+         assert r.name == "my_var"
+ 
+     def test_bind_duplicate_raises(self, dataset):
+         r1 = DummyFileResource("test1.txt")
+         r2 = DummyFileResource("test2.txt")
+         r1.bind("TEST", dataset)
+ 
+         with pytest.raises(AssertionError, match="already declared"):
+             r2.bind("TEST", dataset)
+ 
+     def test_bind_already_bound_raises(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+ 
+         ds2 = SimpleDataset(None, dataset.datapath / "other")
+         with pytest.raises(AssertionError, match="already bound"):
+             r.bind("TEST2", ds2)
+ 
+     def test_state_default_none(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+         assert r.state == ResourceState.NONE
+ 
+     def test_state_set_and_get(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+ 
+         r.state = ResourceState.COMPLETE
+         assert r.state == ResourceState.COMPLETE
+ 
+         r.state = ResourceState.PARTIAL
+         assert r.state == ResourceState.PARTIAL
+ 
+     def test_dependencies_default_empty(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+         assert r.dependencies == []
+ 
+     def test_dependents_default_empty(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+         assert r.dependents == []
+ 
+     def test_can_recover_default_false(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+         assert r.can_recover is False
+ 
+     def test_can_recover_override(self, dataset):
+         r = RecoverableResource("test.txt")
+         r.bind("TEST", dataset)
+         assert r.can_recover is True
+ 
+     def test_has_files_default_true(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+         assert r.has_files() is True
+ 
+     def test_transient_flag(self, dataset):
+         r = DummyFileResource("test.txt", transient=True)
+         r.bind("TEST", dataset)
+         assert r.transient is True
+ 
+     def test_context_property(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+         assert r.context is dataset.context
+ 
+     def test_cleanup(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+ 
+         # Create files at both paths
+         r.path.parent.mkdir(parents=True, exist_ok=True)
+         r.path.write_text("final")
+         r.transient_path.parent.mkdir(parents=True, exist_ok=True)
+         r.transient_path.write_text("temp")
+         r.state = ResourceState.COMPLETE
+ 
+         r.cleanup()
+ 
+         assert not r.path.exists()
+         assert not r.transient_path.exists()
+         assert r.state == ResourceState.NONE
+ 
+ 
+ # ==== FileResource Tests ====
+ 
+ 
+ class TestFileResource:
+     def test_path(self, dataset):
+         r = DummyFileResource("data.csv")
+         r.bind("DATA", dataset)
+ 
+         expected = dataset.datapath / "data.csv"
+         assert r.path == expected
+ 
+     def test_transient_path(self, dataset):
+         r = DummyFileResource("data.csv")
+         r.bind("DATA", dataset)
+ 
+         expected = dataset.datapath / ".downloads" / "data.csv"
+         assert r.transient_path == expected
+ 
+     def test_prepare_returns_path(self, dataset):
+         r = DummyFileResource("data.csv")
+         r.bind("DATA", dataset)
+         assert r.prepare() == r.path
+ 
+     def test_download_writes_to_transient(self, dataset):
+         r = DummyFileResource("data.csv")
+         r.bind("DATA", dataset)
+         r.download()
+ 
+         assert r.transient_path.exists()
+         assert "downloaded" in r.transient_path.read_text()
+         assert r._download_called
+ 
+     def test_stream_default_none(self, dataset):
+         r = DummyFileResource("data.csv")
+         r.bind("DATA", dataset)
+         assert r.stream() is None
+ 
+     def test_varname_from_filename(self):
+         """Without explicit varname, name is derived from filename."""
+         r = DummyFileResource("data.csv.gz")
+         assert r.name == "data"
+ 
+ 
+ # ==== FolderResource Tests ====
+ 
+ 
+ class TestFolderResource:
+     def test_path(self, dataset):
+         r = DummyFolderResource(varname="archive")
+         r.bind("ARCHIVE", dataset)
+ 
+         expected = dataset.datapath / "archive"
+         assert r.path == expected
+ 
+     def test_transient_path(self, dataset):
+         r = DummyFolderResource(varname="archive")
+         r.bind("ARCHIVE", dataset)
+ 
+         expected = dataset.datapath / ".downloads" / "archive"
+         assert r.transient_path == expected
+ 
+     def test_prepare_returns_path(self, dataset):
+         r = DummyFolderResource(varname="archive")
+         r.bind("ARCHIVE", dataset)
+         assert r.prepare() == r.path
+ 
+     def test_download_creates_directory(self, dataset):
+         r = DummyFolderResource(varname="archive")
+         r.bind("ARCHIVE", dataset)
+         r.download()
+ 
+         assert r.transient_path.is_dir()
+         assert (r.transient_path / "file.txt").exists()
+ 
+ 
+ # ==== ValueResource Tests ====
+ 
+ 
+ class TestValueResource:
+     def test_has_files_false(self, dataset):
+         r = DummyValueResource({"key": "value"}, varname="data")
+         r.bind("DATA", dataset)
+         assert r.has_files() is False
+ 
+     def test_prepare_returns_value(self, dataset):
+         val = {"key": "value"}
+         r = DummyValueResource(val, varname="data")
+         r.bind("DATA", dataset)
+         assert r.prepare() == val
+ 
+ 
+ # ==== Topological Sort Tests ====
+ 
+ 
+ class TestTopologicalSort:
+     def test_empty(self):
+         assert topological_sort({}) == []
+ 
+     def test_single(self, dataset):
+         r = DummyFileResource("a.txt")
+         r.bind("A", dataset)
+         result = topological_sort(dataset.resources)
+         assert result == [r]
+ 
+     def test_linear_chain(self, dataset):
+         a = DummyFileResource("a.txt")
+         a.bind("A", dataset)
+ 
+         b = DependentResource("b.txt", source=a)
+         b.bind("B", dataset)
+ 
+         result = topological_sort(dataset.resources)
+         assert result.index(a) < result.index(b)
+ 
+     def test_diamond(self, dataset):
+         a = DummyFileResource("a.txt")
+         a.bind("A", dataset)
+ 
+         b = DependentResource("b.txt", source=a)
+         b.bind("B", dataset)
+ 
+         c = DependentResource("c.txt", source=a)
+         c.bind("C", dataset)
+ 
+         d = DependentResource("d.txt", source=b)
+         d._dependencies.append(c)
+         d.bind("D", dataset)
+ 
+         result = topological_sort(dataset.resources)
+         assert result.index(a) < result.index(b)
+         assert result.index(a) < result.index(c)
+         assert result.index(b) < result.index(d)
+         assert result.index(c) < result.index(d)
+ 
+     def test_cycle_detection(self, dataset):
+         a = DummyFileResource("a.txt")
+         a.bind("A", dataset)
+ 
+         b = DependentResource("b.txt", source=a)
+         b.bind("B", dataset)
+ 
+         # Create cycle: a depends on b
+         a._dependencies = [b]
+ 
+         with pytest.raises(ValueError, match="Cycle detected"):
+             topological_sort(dataset.resources)
+ 
+     def test_independent_resources(self, dataset):
+         a = DummyFileResource("a.txt")
+         a.bind("A", dataset)
+ 
+         b = DummyFileResource("b.txt")
+         b.bind("B", dataset)
+ 
+         result = topological_sort(dataset.resources)
+         assert len(result) == 2
+         assert set(result) == {a, b}
+ 
+ 
+ # ==== Dependents Computation Tests ====
+ 
+ 
+ class TestComputeDependents:
+     def test_no_dependencies(self, dataset):
+         a = DummyFileResource("a.txt")
+         a.bind("A", dataset)
+ 
+         _compute_dependents(dataset.resources)
+         assert a.dependents == []
+ 
+     def test_linear_dependents(self, dataset):
+         a = DummyFileResource("a.txt")
+         a.bind("A", dataset)
+ 
+         b = DependentResource("b.txt", source=a)
+         b.bind("B", dataset)
+ 
+         _compute_dependents(dataset.resources)
+         assert b in a.dependents
+         assert a not in b.dependents
+ 
+     def test_multiple_dependents(self, dataset):
+         a = DummyFileResource("a.txt")
+         a.bind("A", dataset)
+ 
+         b = DependentResource("b.txt", source=a)
+         b.bind("B", dataset)
+ 
+         c = DependentResource("c.txt", source=a)
+         c.bind("C", dataset)
+ 
+         _compute_dependents(dataset.resources)
+         assert set(a.dependents) == {b, c}
+ 
+ 
+ # ==== Two-Path Download Flow Tests ====
+ 
+ 
+ class TestTwoPathFlow:
+     def test_download_moves_to_final_path(self, dataset):
+         """Framework should move transient_path -> path on success."""
+         r = DummyFileResource("data.txt")
+         r.bind("DATA", dataset)
+ 
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+ 
+         dataset.download()
+ 
+         assert r.path.exists()
+         assert r.state == ResourceState.COMPLETE
+ 
+     def test_failure_no_recover_cleans_up(self, dataset):
+         """On failure without can_recover, transient data is deleted."""
+         r = FailingResource("data.txt")
+         r.bind("DATA", dataset)
+ 
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+ 
+         result = dataset.download()
+ 
+         assert result is False
+         assert not r.transient_path.exists()
+         assert r.state == ResourceState.NONE
+ 
+     def test_failure_with_recover_preserves(self, dataset):
+         """On failure with can_recover, transient data is preserved."""
+ 
+         class FailRecoverable(RecoverableResource):
+             def _download(self, destination):
+                 destination.parent.mkdir(parents=True, exist_ok=True)
+                 destination.write_text("partial")
+                 raise RuntimeError("partial failure")
+ 
+         r = FailRecoverable("data.txt")
+         r.bind("DATA", dataset)
+ 
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+ 
+         result = dataset.download()
+ 
+         assert result is False
+         assert r.transient_path.exists()
+         assert r.state == ResourceState.PARTIAL
+ 
+     def test_skip_complete_resources(self, dataset):
+         """Resources already COMPLETE are skipped unless force=True."""
+         r = DummyFileResource("data.txt")
+         r.bind("DATA", dataset)
+ 
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+ 
+         # Mark as complete
+         r.state = ResourceState.COMPLETE
+         r.path.parent.mkdir(parents=True, exist_ok=True)
+         r.path.write_text("existing")
+ 
+         dataset.download()
+ 
+         # download should not have been called
+         assert r._download_called is False
+ 
+     def test_redownload_when_files_missing(self, dataset):
+         """COMPLETE resource with missing files is re-downloaded."""
+         r = DummyFileResource("data.txt")
+         r.bind("DATA", dataset)
+ 
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+ 
+         # Mark as complete but do NOT create the file
+         r.state = ResourceState.COMPLETE
+         assert not r.path.exists()
+ 
+         dataset.download()
+ 
+         # Should have re-downloaded
+         assert r._download_called is True
+         assert r.path.exists()
+         assert r.state == ResourceState.COMPLETE
+ 
+     def test_adopt_preexisting_files(self, dataset):
+         """Files already on disk (old downloads) are adopted as COMPLETE."""
+         r = DummyFileResource("data.txt")
+         r.bind("DATA", dataset)
+ 
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+ 
+         # Pre-create the file at the final path (simulating old download)
+         r.path.parent.mkdir(parents=True, exist_ok=True)
+         r.path.write_text("old data")
+ 
+         # State is NONE (no .state.json entry)
+         assert r.state == ResourceState.NONE
+ 
+         dataset.download()
+ 
+         # Should NOT have re-downloaded — just marked COMPLETE
+         assert r._download_called is False
+         assert r.state == ResourceState.COMPLETE
+         assert r.path.read_text() == "old data"
+ 
+     def test_downloads_dir_cleaned_after_success(self, dataset):
+         """The .downloads/ directory is removed after all succeed."""
+         r = DummyFileResource("data.txt")
+         r.bind("DATA", dataset)
+ 
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+ 
+         result = dataset.download()
+ 
+         assert result is True
+         downloads_dir = dataset.datapath / ".downloads"
+         assert not downloads_dir.exists()
+ 
+     def test_downloads_dir_kept_on_failure(self, dataset):
+         """The .downloads/ directory is kept if a download fails."""
+         r = FailingResource("data.txt")
+         r.bind("DATA", dataset)
+ 
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+ 
+         # Pre-create .downloads/ with transient data
+         r.transient_path.parent.mkdir(parents=True, exist_ok=True)
+         r.transient_path.write_text("partial")
+ 
+         result = dataset.download()
+ 
+         assert result is False
+         # .downloads/ should still exist (failure, no cleanup)
+         # (transient data itself is deleted because can_recover=False)
+ 
+     def test_lock_prevents_concurrent_download(self, dataset):
+         """A second download blocks while the first holds the lock."""
+         import fcntl
+         import threading
+ 
+         r = DummyFileResource("data.txt")
+         r.bind("DATA", dataset)
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+ 
+         # Acquire the lock externally to simulate a concurrent download
+         dataset.datapath.mkdir(parents=True, exist_ok=True)
+         lock_path = dataset.datapath / ".state.lock"
+         lock_file = lock_path.open("w")
+         fcntl.flock(lock_file, fcntl.LOCK_EX)
+ 
+         result_holder = {}
+ 
+         def try_download():
+             result_holder["result"] = dataset.download()
+ 
+         t = threading.Thread(target=try_download)
+         t.start()
+         # Give thread time to hit the lock
+         t.join(timeout=0.2)
+         # Thread should still be alive (blocked on lock)
+         assert t.is_alive()
+ 
+         # Release the lock
+         fcntl.flock(lock_file, fcntl.LOCK_UN)
+         lock_file.close()
+ 
+         t.join(timeout=5)
+         assert not t.is_alive()
+         assert result_holder["result"] is True
+ 
+ 
+ # ==== Eager Transient Cleanup Tests ====
+ 
+ 
+ class TestTransientCleanup:
+     def test_transient_cleaned_after_dependents_complete(self, dataset):
+         """Transient resources are cleaned up when all dependents
+         are COMPLETE."""
+         a = DummyFileResource("a.txt", transient=True)
+         a.bind("A", dataset)
+ 
+         b = DependentResource("b.txt", source=a)
+         b.bind("B", dataset)
+ 
+         _compute_dependents(dataset.resources)
+         dataset.ordered_resources = topological_sort(dataset.resources)
+ 
+         dataset.download()
+ 
+         # b should be complete
+         assert b.state == ResourceState.COMPLETE
+         assert b.path.exists()
+ 
+         # a should be cleaned up (transient)
+         assert a.state == ResourceState.NONE
+         assert not a.path.exists()
+ 
+     def test_non_transient_not_cleaned(self, dataset):
+         """Non-transient resources are NOT cleaned up."""
+         a = DummyFileResource("a.txt", transient=False)
+         a.bind("A", dataset)
+ 
+         b = DependentResource("b.txt", source=a)
+         b.bind("B", dataset)
+ 
+         _compute_dependents(dataset.resources)
+         dataset.ordered_resources = topological_sort(dataset.resources)
+ 
+         dataset.download()
+ 
+         assert a.state == ResourceState.COMPLETE
+         assert a.path.exists()
+ 
+     def test_transient_not_cleaned_if_dependent_incomplete(self, dataset):
+         """Transient resources are NOT cleaned if a dependent
+         hasn't completed yet."""
+         a = DummyFileResource("a.txt", transient=True)
+         a.bind("A", dataset)
+ 
+         b = DependentResource("b.txt", source=a)
+         b.bind("B", dataset)
+ 
+         c = DependentResource("c.txt", source=a)
+         c.bind("C", dataset)
+ 
+         _compute_dependents(dataset.resources)
+         dataset.ordered_resources = topological_sort(dataset.resources)
+ 
+         # Download only processes in order, so after B completes,
+         # C hasn't yet — a should not be cleaned up until C completes.
+         # The full download() handles this correctly.
+         dataset.download()
+ 
+         # After full download, all dependents are complete
+         # so transient should be cleaned
+         assert a.state == ResourceState.NONE
+ 
+ 
+ # ==== Legacy Decorator-Based Dataset Tests ====
+ 
+ 
+ class TestLegacyDecoratorDataset:
+     def test_filedownloader_decorator(self, context):
+         """Legacy decorator-based filedownloader still works."""
+         import warnings
+         from datamaestro.download.single import filedownloader
+ 
+         repository = MyRepository(context)
+         ds = SimpleDataset(repository, context.datapath / "legacy")
+ 
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore", DeprecationWarning)
+             downloader = filedownloader("test.html", "http://httpbin.org/html")
+             downloader(ds)
+ 
+         assert "test" in ds.resources
+         assert ds.resources["test"] is downloader
+ 
+     def test_reference_resource(self, context):
+         """reference resource still works."""
+         repository = MyRepository(context)
+         ds = SimpleDataset(repository, context.datapath / "ref_test")
+ 
+         mock_ref = MagicMock()
+         mock_ref.prepare.return_value = "prepared_value"
+ 
+         ref = reference(varname="ref", reference=mock_ref)
+         ref.bind("ref", ds)
+ 
+         assert ref.has_files() is False
+         result = ref.prepare()
+         assert result == "prepared_value"
+ 
+ 
+ # ==== New Class-Based Dataset Tests ====
+ 
+ 
+ class TestClassBasedDataset:
+     def test_bind_class_resources(self, dataset):
+         """_bind_class_resources detects Resource attributes."""
+         from datamaestro.data import Base
+ 
+         class MyData(Base):
+             A = DummyFileResource("a.txt")
+             B = DummyFileResource("b.txt")
+ 
+         _bind_class_resources(MyData, dataset)
+ 
+         assert "A" in dataset.resources
+         assert "B" in dataset.resources
+         assert len(dataset.ordered_resources) == 2
+ 
+     def test_bind_with_dependencies(self, dataset):
+         """Resources with dependencies are properly ordered."""
+         from datamaestro.data import Base
+ 
+         src = DummyFileResource("src.txt")
+ 
+         class MyData(Base):
+             SRC = src
+             PROCESSED = DependentResource("proc.txt", source=src)
+ 
+         _bind_class_resources(MyData, dataset)
+ 
+         # SRC should come before PROCESSED in ordered_resources
+         src_idx = dataset.ordered_resources.index(MyData.SRC)
+         proc_idx = dataset.ordered_resources.index(MyData.PROCESSED)
+         assert src_idx < proc_idx
+ 
+         # Check dependents were computed
+         assert MyData.PROCESSED in MyData.SRC.dependents
+ 
+     def test_non_resource_attributes_ignored(self, dataset):
+         """Non-Resource class attributes are not bound."""
+         from datamaestro.data import Base
+ 
+         class MyData(Base):
+             A = DummyFileResource("a.txt")
+             NOT_A_RESOURCE = "just a string"
+             ALSO_NOT = 42
+ 
+         _bind_class_resources(MyData, dataset)
+ 
+         assert "A" in dataset.resources
+         assert "NOT_A_RESOURCE" not in dataset.resources
+         assert "ALSO_NOT" not in dataset.resources
+ 
+ 
+ # ==== Backward Compatibility Tests ====
+ 
+ 
+ class TestBackwardCompat:
+     def test_hasfiles_deprecated(self, dataset):
+         """hasfiles() still works but emits deprecation."""
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+ 
+         import warnings
+ 
+         with warnings.catch_warnings(record=True):
+             warnings.simplefilter("always")
+             result = r.hasfiles()
+ 
+         assert result is True
+ 
+     def test_definition_property_deprecated(self, dataset):
+         """definition property still works but emits deprecation."""
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+ 
+         import warnings
+ 
+         # Clear the one-time warning cache
+         from datamaestro.download import _deprecation_warned
+ 
+         _deprecation_warned.discard("definition")
+ 
+         with warnings.catch_warnings(record=True):
+             warnings.simplefilter("always")
+             result = r.definition
+ 
+         assert result is dataset
+ 
+     def test_download_subclass_deprecated(self):
+         """Subclassing Download emits deprecation."""
+         from datamaestro.download import _deprecation_warned
+ 
+         _deprecation_warned.discard("Download-TestSub")
+ 
+         import warnings
+ 
+         with warnings.catch_warnings(record=True):
+             warnings.simplefilter("always")
+ 
+             class TestSub(Download):
+                 def download(self, force=False):
+                     pass
+ 
+                 def prepare(self):
+                     pass
+ 
+     def test_apply_classmethod(self):
+         """Resource.apply creates instances."""
+         r = DummyFileResource.apply("test.txt")
+         assert isinstance(r, DummyFileResource)
+         assert r.filename == "test.txt"
+ 
+ 
+ # ==== Concrete Resource Tests ====
+ 
+ 
+ class TestFileDownloader:
+     def test_construction(self):
+         """FileDownloader can be constructed."""
+         from datamaestro.download.single import FileDownloader
+ 
+         r = FileDownloader("data.csv", "http://example.com/data.csv")
+         assert r.filename == "data.csv"
+         assert r.url == "http://example.com/data.csv"
+         assert r.name == "data"  # derived from filename
+ 
+     def test_factory_alias(self):
+         """filedownloader is an alias for FileDownloader.apply."""
+         from datamaestro.download.single import (
+             filedownloader,
+             FileDownloader,
+         )
+ 
+         r = filedownloader("data.csv", "http://example.com/data.csv")
+         assert isinstance(r, FileDownloader)
+ 
+     def test_transient_flag(self):
+         """FileDownloader accepts transient flag."""
+         from datamaestro.download.single import FileDownloader
+ 
+         r = FileDownloader(
+             "data.csv",
+             "http://example.com/data.csv",
+             transient=True,
+         )
+         assert r.transient is True
+ 
+     def test_backward_compat_alias(self):
+         """SingleDownload is an alias for FileDownloader."""
+         from datamaestro.download.single import (
+             SingleDownload,
+             FileDownloader,
+         )
+ 
+         assert SingleDownload is FileDownloader
+ 
+ 
+ class TestConcatDownloader:
+     def test_construction(self):
+         from datamaestro.download.single import ConcatDownloader
+ 
+         r = ConcatDownloader("data.txt", "http://example.com/data.tar.gz")
+         assert r.filename == "data.txt"
+         assert r.url == "http://example.com/data.tar.gz"
+ 
+     def test_factory_alias(self):
+         from datamaestro.download.single import (
+             concatdownload,
+             ConcatDownloader,
+         )
+ 
+         r = concatdownload("data.txt", "http://example.com/data.tar.gz")
+         assert isinstance(r, ConcatDownloader)
+ 
+ 
+ class TestArchiveDownloaders:
+     def test_zip_construction(self):
+         from datamaestro.download.archive import ZipDownloader
+ 
+         r = ZipDownloader("archive", "http://example.com/data.zip")
+         assert r.url == "http://example.com/data.zip"
+         assert r.name == "archive"
+ 
+     def test_tar_construction(self):
+         from datamaestro.download.archive import TarDownloader
+ 
+         r = TarDownloader("archive", "http://example.com/data.tar.gz")
+         assert r.url == "http://example.com/data.tar.gz"
+ 
+     def test_zip_factory_alias(self):
+         from datamaestro.download.archive import (
+             zipdownloader,
+             ZipDownloader,
+         )
+ 
+         r = zipdownloader("archive", "http://example.com/data.zip")
+         assert isinstance(r, ZipDownloader)
+ 
+     def test_tar_factory_alias(self):
+         from datamaestro.download.archive import (
+             tardownloader,
+             TarDownloader,
+         )
+ 
+         r = tardownloader("archive", "http://example.com/data.tar.gz")
+         assert isinstance(r, TarDownloader)
+ 
+ 
+ class TestCustomDownload:
+     def test_construction(self):
+         from datamaestro.download.custom import custom_download
+ 
+         fn = MagicMock()
+         r = custom_download("data", fn)
+         assert r.name == "data"
+         assert r.downloader is fn
+ 
+ 
+ class TestHFDownloader:
+     def test_construction(self):
+         from datamaestro.download.huggingface import HFDownloader
+ 
+         r = HFDownloader("hf", repo_id="user/dataset")
+         assert r.repo_id == "user/dataset"
+         assert r.name == "hf"
+ 
+     def test_factory_alias(self):
+         from datamaestro.download.huggingface import (
+             hf_download,
+             HFDownloader,
+         )
+ 
+         r = hf_download("hf", repo_id="user/dataset")
+         assert isinstance(r, HFDownloader)
+ 
+     def test_prepare(self):
+         from datamaestro.download.huggingface import HFDownloader
+ 
+         r = HFDownloader(
+             "hf",
+             repo_id="user/dataset",
+             data_files="train.csv",
+             split="train",
+         )
+         result = r.prepare()
+         assert result == {
+             "repo_id": "user/dataset",
+             "data_files": "train.csv",
+             "split": "train",
+         }
+ 
+ 
+ class TestTodoResource:
+     def test_raises_not_implemented(self):
+         from datamaestro.download.todo import Todo
+ 
+         r = Todo(varname="test")
+         with pytest.raises(NotImplementedError):
+             r.download()
+ 
+         with pytest.raises(NotImplementedError):
+             r.prepare()
+ 
+ 
+ class TestReferenceResource:
+     def test_has_files_false(self, dataset):
+         mock_ref = MagicMock()
+         mock_ref.prepare.return_value = "value"
+ 
+         r = reference(varname="ref", reference=mock_ref)
+         r.bind("ref", dataset)
+ 
+         assert r.has_files() is False
+ 
+     def test_prepare_delegates(self, dataset):
+         mock_ref = MagicMock()
+         mock_ref.prepare.return_value = "prepared"
+ 
+         r = reference(varname="ref", reference=mock_ref)
+         r.bind("ref", dataset)
+ 
+         result = r.prepare()
+         assert result == "prepared"
+ 
+     def test_download_delegates(self, dataset):
+         mock_ref = MagicMock()
+         mock_ref.__datamaestro__ = MagicMock()
+ 
+         r = reference(varname="ref", reference=mock_ref)
+         r.bind("ref", dataset)
+ 
+         r.download(force=True)
+         mock_ref.__datamaestro__.download.assert_called_once_with(True)
+ 
+     def test_requires_reference(self):
+         with pytest.raises(AssertionError, match="cannot be null"):
+             reference(varname="ref", reference=None)
+ 
+ 
+ # ==== Links Resource Tests ====
+ 
+ 
+ class TestLinksResource:
+     def test_construction(self):
+         from datamaestro.download.links import links
+ 
+         mock_ds = MagicMock()
+         r = links("data", ref1=mock_ds)
+         assert r.name == "data"
+ 
+     def test_has_files_false(self, dataset):
+         from datamaestro.download.links import links
+ 
+         mock_ds = MagicMock()
+         r = links("data", ref1=mock_ds)
+         r.bind("data", dataset)
+ 
+         assert r.has_files() is False
+ 
+     def test_path_is_datapath(self, dataset):
+         from datamaestro.download.links import links
+ 
+         mock_ds = MagicMock()
+         r = links("data", ref1=mock_ds)
+         r.bind("data", dataset)
+ 
+         assert r.path == dataset.datapath
+ 
+     def test_prepare_returns_path(self, dataset):
+         from datamaestro.download.links import links
+ 
+         mock_ds = MagicMock()
+         r = links("data", ref1=mock_ds)
+         r.bind("data", dataset)
+ 
+         assert r.prepare() == dataset.datapath
+ 
+ 
+ class TestLinkFolder:
+     def test_construction(self):
+         from datamaestro.download.links import linkfolder
+ 
+         r = linkfolder("data", proposals=["/tmp/test"])
+         assert r.name == "data"
+ 
+     def test_check_is_dir(self, dataset, tmp_path):
+         from datamaestro.download.links import linkfolder
+ 
+         r = linkfolder("data", proposals=[])
+         r.bind("data", dataset)
+ 
+         # A directory should pass
+         assert r.check(tmp_path) is True
+         # A non-existent path should fail
+         assert r.check(tmp_path / "nonexistent") is False
+ 
+     def test_path(self, dataset):
+         from datamaestro.download.links import linkfolder
+ 
+         r = linkfolder("data", proposals=[])
+         r.bind("data", dataset)
+ 
+         assert r.path == dataset.datapath / "data"
+ 
+     def test_prepare_returns_path(self, dataset):
+         from datamaestro.download.links import linkfolder
+ 
+         r = linkfolder("data", proposals=[])
+         r.bind("data", dataset)
+ 
+         assert r.prepare() == r.path
+ 
+ 
+ class TestLinkFile:
+     def test_construction(self):
+         from datamaestro.download.links import linkfile
+ 
+         r = linkfile("data", proposals=["/tmp/test.txt"])
+         assert r.name == "data"
+ 
+     def test_check_is_file(self, dataset, tmp_path):
+         from datamaestro.download.links import linkfile
+ 
+         r = linkfile("data", proposals=[])
+         r.bind("data", dataset)
+ 
+         # Create a real file to check
+         test_file = tmp_path / "test.txt"
+         test_file.write_text("hello")
+ 
+         assert r.check(test_file) is True
+         # A directory should fail
+         assert r.check(tmp_path) is False
+         # A non-existent path should fail
+         assert r.check(tmp_path / "nonexistent") is False
+ 
+     def test_path(self, dataset):
+         from datamaestro.download.links import linkfile
+ 
+         r = linkfile("data", proposals=[])
+         r.bind("data", dataset)
+ 
+         assert r.path == dataset.datapath / "data"
+ 
+ 
+ # ==== Wayback Resource Tests ====
+ 
+ 
+ class TestWaybackDocuments:
+     def test_construction(self):
+         from datamaestro.download.wayback import wayback_documents
+ 
+         def urls_fn():
+             return iter(["http://example.com"])
+ 
+         r = wayback_documents("20200101", urls_fn, name="wb")
+         assert r.name == "wb"
+         assert r.timestamp == "20200101"
+ 
+     def test_prepare_returns_path(self, dataset):
+         from datamaestro.download.wayback import wayback_documents
+ 
+         def urls_fn():
+             return iter([])
+ 
+         r = wayback_documents("20200101", urls_fn, name="wb")
+         r.bind("wb", dataset)
+ 
+         expected = dataset.datapath / "wb"
+         assert r.prepare() == expected
+ 
+ 
+ # ==== Custom Download Functional Tests ====
+ 
+ 
+ class TestCustomDownloadFunctional:
+     def test_download_delegates(self, dataset):
+         from datamaestro.download.custom import custom_download
+ 
+         fn = MagicMock()
+         r = custom_download("data", fn)
+         r.bind("data", dataset)
+ 
+         r.download(force=True)
+ 
+         fn.assert_called_once_with(dataset.context, dataset.datapath, force=True)
+ 
+     def test_prepare_returns_datapath(self, dataset):
+         from datamaestro.download.custom import custom_download
+ 
+         fn = MagicMock()
+         r = custom_download("data", fn)
+         r.bind("data", dataset)
+ 
+         assert r.prepare() == dataset.datapath
+ 
+ 
+ # ==== Archive Downloader Base Tests ====
+ 
+ 
+ class TestArchiveDownloaderBase:
+     def test_zip_path_with_postinit(self, dataset):
+         from datamaestro.download.archive import ZipDownloader
+ 
+         r = ZipDownloader("archive", "http://example.com/data.zip")
+         r.bind("archive", dataset)
+ 
+         # path should trigger postinit
+         p = r.path
+         assert isinstance(p, Path)
+ 
+     def test_tar_path_with_postinit(self, dataset):
+         from datamaestro.download.archive import TarDownloader
+ 
+         r = TarDownloader("archive", "http://example.com/data.tar.gz")
+         r.bind("archive", dataset)
+ 
+         p = r.path
+         assert isinstance(p, Path)
+ 
+     def test_extractall_default(self):
+         from datamaestro.download.archive import ZipDownloader
+ 
+         r = ZipDownloader("archive", "http://example.com/data.zip")
+         assert r.extractall is True
+ 
+     def test_extractall_with_subpath(self):
+         from datamaestro.download.archive import ZipDownloader
+ 
+         r = ZipDownloader(
+             "archive",
+             "http://example.com/data.zip",
+             subpath="subdir",
+         )
+         assert r.extractall is False
+ 
+     def test_extractall_with_files(self):
+         from datamaestro.download.archive import ZipDownloader
+ 
+         r = ZipDownloader(
+             "archive",
+             "http://example.com/data.zip",
+             files={"file1.txt"},
+         )
+         assert r.extractall is False
+ 
+     def test_subpath_trailing_slash(self):
+         from datamaestro.download.archive import ZipDownloader
+ 
+         r = ZipDownloader(
+             "archive",
+             "http://example.com/data.zip",
+             subpath="subdir",
+         )
+         assert r.subpath == "subdir/"
+ 
+     def test_transient_flag(self):
+         from datamaestro.download.archive import ZipDownloader
+ 
+         r = ZipDownloader(
+             "archive",
+             "http://example.com/data.zip",
+             transient=True,
+         )
+         assert r.transient is True
+ 
+ 
+ # ==== gsync (legacy) Tests ====
+ 
+ 
+ class TestGsync:
+     def test_import(self):
+         """gsync can be imported (legacy Download subclass)."""
+         from datamaestro.download.sync import gsync
+ 
+         assert issubclass(gsync, Download)
+ 
+ 
+ # ==== manual.py (deprecated re-export) Tests ====
+ 
+ 
+ class TestManual:
+     def test_import_linkfolder(self):
+         """manual.linkfolder is a deprecated re-export."""
+         import warnings
+ 
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore", DeprecationWarning)
+             from datamaestro.download.manual import linkfolder
+ 
+         assert linkfolder is not None
+ 
+ 
+ # ==== multiple.py (legacy) Tests ====
+ 
+ 
+ class TestMultiple:
+     def test_import_list(self):
+         """List can be imported (legacy Download subclass)."""
+         import warnings
+ 
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore", DeprecationWarning)
+             from datamaestro.download.multiple import List
+ 
+         assert issubclass(List, Download)
+ 
+     def test_import_datasets(self):
+         """Datasets can be imported (legacy Download subclass)."""
+         import warnings
+ 
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore", DeprecationWarning)
+             from datamaestro.download.multiple import Datasets
+ 
+         assert issubclass(Datasets, Download)
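Taken together, the TestTwoPathFlow cases above specify the download state machine introduced in this release: downloads are staged under <datapath>/.downloads/, moved to their final path on success, and tracked in .state.json. What follows is a simplified behavioral sketch of the per-resource flow those tests describe, for illustration only; the shipped implementation in datamaestro/download/__init__.py also handles locking via .state.lock and eager cleanup of transient resources.

    # Behavioral sketch of one resource's download step, per the tests above;
    # not the released code.
    import shutil
    from datamaestro.download import ResourceState

    def process(resource, force=False):
        if resource.state == ResourceState.COMPLETE and not force:
            # Skip unless the files went missing since the last run
            if not resource.has_files() or resource.path.exists():
                return True
        if resource.has_files() and resource.path.exists():
            # Adopt files left on disk by an older datamaestro version
            resource.state = ResourceState.COMPLETE
            return True
        try:
            resource.download(force=force)  # writes under .downloads/
            if resource.has_files():
                resource.path.parent.mkdir(parents=True, exist_ok=True)
                shutil.move(str(resource.transient_path), str(resource.path))
            resource.state = ResourceState.COMPLETE  # persisted to .state.json
            return True
        except Exception:
            if resource.can_recover:
                resource.state = ResourceState.PARTIAL  # keep partial data
            else:
                resource.cleanup()  # drop partial data, back to NONE
            return False

On a fully successful run, dataset.download() returns True and removes the .downloads/ staging directory; on any failure it returns False and keeps .downloads/ so that resources with can_recover=True can resume from their PARTIAL state.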