datamaestro 1.6.2__py3-none-any.whl → 1.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1657 @@
+ """Tests for the new Resource interface.
+
+ Covers:
+ - ResourceState enum and metadata persistence
+ - Resource base class (bind, dependencies, state, cleanup)
+ - FileResource, FolderResource, ValueResource
+ - Topological sort and cycle detection
+ - Two-path download flow (transient_path -> path)
+ - Eager transient cleanup
+ - can_recover property behavior
+ - Both new class-based and legacy decorator-based dataset definitions
+ - Each concrete resource type
+ """
+
+ from __future__ import annotations
+
+ import json
+ from pathlib import Path
+ from unittest.mock import MagicMock
+
+ import pytest
+
+ from datamaestro.definitions import (
+     AbstractDataset,
+     DataDefinition,
+     DatasetWrapper,
+     topological_sort,
+     _compute_dependents,
+     _bind_class_resources,
+ )
+ from datamaestro.download import (
+     Resource,
+     ResourceState,
+     ResourceStateFile,
+     FileResource,
+     FolderResource,
+     ValueResource,
+     Download,
+     reference,
+ )
+ from .conftest import MyRepository
+
+
+ # ---- Helpers ----
+
+
+ class SimpleDataset(AbstractDataset):
+     """Minimal dataset for testing."""
+
+     def __init__(self, repository, datapath: Path):
+         super().__init__(repository)
+         self._datapath = datapath
+
+     @property
+     def datapath(self):
+         return self._datapath
+
+     def _prepare(self):
+         # Return a mock Base-like object for the prepare flow
+         obj = MagicMock()
+         obj.__xpm__ = MagicMock()
+         obj.__xpm__.values = {}
+         return obj
+
+     @property
+     def description(self):
+         return "test dataset"
+
+
+ class DummyFileResource(FileResource):
+     """Concrete FileResource for testing."""
+
+     def __init__(self, filename, url="http://example.com/test", **kw):
+         super().__init__(filename, **kw)
+         self.url = url
+         self._download_called = False
+
+     def _download(self, destination: Path):
+         destination.parent.mkdir(parents=True, exist_ok=True)
+         destination.write_text(f"downloaded from {self.url}")
+         self._download_called = True
+
+
+ class DummyFolderResource(FolderResource):
+     """Concrete FolderResource for testing."""
+
+     def __init__(self, **kw):
+         super().__init__(**kw)
+         self._download_called = False
+
+     def _download(self, destination: Path):
+         destination.mkdir(parents=True, exist_ok=True)
+         (destination / "file.txt").write_text("content")
+         self._download_called = True
+
+
+ class DummyValueResource(ValueResource):
+     """Concrete ValueResource for testing."""
+
+     def __init__(self, value, **kw):
+         super().__init__(**kw)
+         self._value = value
+
+     def download(self, force=False):
+         pass
+
+     def prepare(self):
+         return self._value
+
+
+ class RecoverableResource(FileResource):
+     """Resource that supports recovery from PARTIAL state."""
+
+     def __init__(self, filename, **kw):
+         super().__init__(filename, **kw)
+
+     @property
+     def can_recover(self) -> bool:
+         return True
+
+     def _download(self, destination: Path):
+         destination.parent.mkdir(parents=True, exist_ok=True)
+         destination.write_text("recovered")
+
+
+ class FailingResource(FileResource):
+     """Resource that fails during download."""
+
+     def __init__(self, filename, **kw):
+         super().__init__(filename, **kw)
+
+     def _download(self, destination: Path):
+         destination.parent.mkdir(parents=True, exist_ok=True)
+         destination.write_text("partial data")
+         raise RuntimeError("Download failed")
+
+
+ class DependentResource(FileResource):
+     """Resource that depends on another resource."""
+
+     def __init__(self, filename, source: Resource, **kw):
+         super().__init__(filename, **kw)
+         self._dependencies = [source]
+
+     def _download(self, destination: Path):
+         # Read from dependency's path
+         source = self.dependencies[0]
+         data = source.path.read_text()
+         destination.parent.mkdir(parents=True, exist_ok=True)
+         destination.write_text(f"processed: {data}")
+
+
+ # ---- Fixtures ----
+
+
+ @pytest.fixture
+ def datapath(tmp_path):
+     """Temporary dataset data path."""
+     return tmp_path / "dataset"
+
+
+ @pytest.fixture
+ def dataset(context, datapath):
+     """A minimal dataset bound to a repository."""
+     repository = MyRepository(context)
+     ds = SimpleDataset(repository, datapath)
+     return ds
+
+
+ # ==== ResourceState Tests ====
+
+
+ class TestResourceState:
+     def test_values(self):
+         assert ResourceState.NONE == "none"
+         assert ResourceState.PARTIAL == "partial"
+         assert ResourceState.COMPLETE == "complete"
+
+     def test_from_string(self):
+         assert ResourceState("none") == ResourceState.NONE
+         assert ResourceState("partial") == ResourceState.PARTIAL
+         assert ResourceState("complete") == ResourceState.COMPLETE
+
+
+ # ==== ResourceStateFile Tests ====
+
+
+ class TestResourceStateFile:
+     def test_read_nonexistent(self, datapath):
+         sf = ResourceStateFile(datapath)
+         assert sf.read("TRAIN") == ResourceState.NONE
+
+     def test_write_and_read(self, datapath):
+         sf = ResourceStateFile(datapath)
+         sf.write("TRAIN", ResourceState.COMPLETE)
+
+         assert sf.read("TRAIN") == ResourceState.COMPLETE
+         assert sf.read("TEST") == ResourceState.NONE
+
+     def test_multiple_resources(self, datapath):
+         sf = ResourceStateFile(datapath)
+         sf.write("A", ResourceState.COMPLETE)
+         sf.write("B", ResourceState.PARTIAL)
+         sf.write("C", ResourceState.NONE)
+
+         assert sf.read("A") == ResourceState.COMPLETE
+         assert sf.read("B") == ResourceState.PARTIAL
+         assert sf.read("C") == ResourceState.NONE
+
+     def test_overwrite(self, datapath):
+         sf = ResourceStateFile(datapath)
+         sf.write("A", ResourceState.PARTIAL)
+         assert sf.read("A") == ResourceState.PARTIAL
+
+         sf.write("A", ResourceState.COMPLETE)
+         assert sf.read("A") == ResourceState.COMPLETE
+
+     def test_file_format(self, datapath):
+         sf = ResourceStateFile(datapath)
+         sf.write("TRAIN", ResourceState.COMPLETE)
+
+         state_path = datapath / ".state.json"
+         assert state_path.exists()
+
+         with state_path.open() as f:
+             data = json.load(f)
+
+         assert data["version"] == 1
+         assert data["resources"]["TRAIN"]["state"] == "complete"
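+
+         # For reference, the resulting .state.json, reconstructed from the
+         # assertions above (the real file may carry extra bookkeeping
+         # fields):
+         #   {"version": 1, "resources": {"TRAIN": {"state": "complete"}}}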
+
+
+ # ==== Resource Base Class Tests ====
+
+
+ class TestResourceBase:
+     def test_bind(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+
+         assert r.name == "TEST"
+         assert r.dataset is dataset
+         assert "TEST" in dataset.resources
+         assert r in dataset.ordered_resources
+
+     def test_bind_with_varname(self, dataset):
+         r = DummyFileResource("test.txt", varname="my_var")
+         r.bind("ATTR_NAME", dataset)
+
+         # varname takes precedence
+         assert r.name == "my_var"
+
+     def test_bind_duplicate_raises(self, dataset):
+         r1 = DummyFileResource("test1.txt")
+         r2 = DummyFileResource("test2.txt")
+         r1.bind("TEST", dataset)
+
+         with pytest.raises(AssertionError, match="already declared"):
+             r2.bind("TEST", dataset)
+
+     def test_bind_already_bound_raises(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+
+         ds2 = SimpleDataset(None, dataset.datapath / "other")
+         with pytest.raises(AssertionError, match="already bound"):
+             r.bind("TEST2", ds2)
+
+     def test_state_default_none(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+         assert r.state == ResourceState.NONE
+
+     def test_state_set_and_get(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+
+         r.state = ResourceState.COMPLETE
+         assert r.state == ResourceState.COMPLETE
+
+         r.state = ResourceState.PARTIAL
+         assert r.state == ResourceState.PARTIAL
+
+     def test_dependencies_default_empty(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+         assert r.dependencies == []
+
+     def test_dependents_default_empty(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+         assert r.dependents == []
+
+     def test_can_recover_default_false(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+         assert r.can_recover is False
+
+     def test_can_recover_override(self, dataset):
+         r = RecoverableResource("test.txt")
+         r.bind("TEST", dataset)
+         assert r.can_recover is True
+
+     def test_has_files_default_true(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+         assert r.has_files() is True
+
+     def test_transient_flag(self, dataset):
+         r = DummyFileResource("test.txt", transient=True)
+         r.bind("TEST", dataset)
+         assert r.transient is True
+
+     def test_context_property(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+         assert r.context is dataset.context
+
+     def test_cleanup(self, dataset):
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+
+         # Create files at both paths
+         r.path.parent.mkdir(parents=True, exist_ok=True)
+         r.path.write_text("final")
+         r.transient_path.parent.mkdir(parents=True, exist_ok=True)
+         r.transient_path.write_text("temp")
+         r.state = ResourceState.COMPLETE
+
+         r.cleanup()
+
+         assert not r.path.exists()
+         assert not r.transient_path.exists()
+         assert r.state == ResourceState.NONE
+
+
+ # ==== FileResource Tests ====
+
+
+ class TestFileResource:
+     def test_path(self, dataset):
+         r = DummyFileResource("data.csv")
+         r.bind("DATA", dataset)
+
+         expected = dataset.datapath / "data.csv"
+         assert r.path == expected
+
+     def test_transient_path(self, dataset):
+         r = DummyFileResource("data.csv")
+         r.bind("DATA", dataset)
+
+         expected = dataset.datapath / ".downloads" / "data.csv"
+         assert r.transient_path == expected
+
+     def test_prepare_returns_path(self, dataset):
+         r = DummyFileResource("data.csv")
+         r.bind("DATA", dataset)
+         assert r.prepare() == r.path
+
+     def test_download_writes_to_transient(self, dataset):
+         r = DummyFileResource("data.csv")
+         r.bind("DATA", dataset)
+         r.download()
+
+         assert r.transient_path.exists()
+         assert "downloaded" in r.transient_path.read_text()
+         assert r._download_called
+
+     def test_stream_default_none(self, dataset):
+         r = DummyFileResource("data.csv")
+         r.bind("DATA", dataset)
+         assert r.stream() is None
+
+     def test_varname_from_filename(self):
+         """Without explicit varname, name is derived from filename."""
+         r = DummyFileResource("data.csv.gz")
+         assert r.name == "data"
+
+
+ # ==== FolderResource Tests ====
+
+
+ class TestFolderResource:
+     def test_path(self, dataset):
+         r = DummyFolderResource(varname="archive")
+         r.bind("ARCHIVE", dataset)
+
+         expected = dataset.datapath / "archive"
+         assert r.path == expected
+
+     def test_transient_path(self, dataset):
+         r = DummyFolderResource(varname="archive")
+         r.bind("ARCHIVE", dataset)
+
+         expected = dataset.datapath / ".downloads" / "archive"
+         assert r.transient_path == expected
+
+     def test_prepare_returns_path(self, dataset):
+         r = DummyFolderResource(varname="archive")
+         r.bind("ARCHIVE", dataset)
+         assert r.prepare() == r.path
+
+     def test_download_creates_directory(self, dataset):
+         r = DummyFolderResource(varname="archive")
+         r.bind("ARCHIVE", dataset)
+         r.download()
+
+         assert r.transient_path.is_dir()
+         assert (r.transient_path / "file.txt").exists()
+
+
+ # ==== ValueResource Tests ====
+
+
+ class TestValueResource:
+     def test_has_files_false(self, dataset):
+         r = DummyValueResource({"key": "value"}, varname="data")
+         r.bind("DATA", dataset)
+         assert r.has_files() is False
+
+     def test_prepare_returns_value(self, dataset):
+         val = {"key": "value"}
+         r = DummyValueResource(val, varname="data")
+         r.bind("DATA", dataset)
+         assert r.prepare() == val
+
+
+ # ==== Topological Sort Tests ====
+
+
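+ # Contract pinned down by the tests below: topological_sort takes the
+ # dataset's name -> Resource mapping and returns a list in which every
+ # resource appears after all of its dependencies; cyclic graphs raise
+ # ValueError with a "Cycle detected" message.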
+ class TestTopologicalSort:
+     def test_empty(self):
+         assert topological_sort({}) == []
+
+     def test_single(self, dataset):
+         r = DummyFileResource("a.txt")
+         r.bind("A", dataset)
+         result = topological_sort(dataset.resources)
+         assert result == [r]
+
+     def test_linear_chain(self, dataset):
+         a = DummyFileResource("a.txt")
+         a.bind("A", dataset)
+
+         b = DependentResource("b.txt", source=a)
+         b.bind("B", dataset)
+
+         result = topological_sort(dataset.resources)
+         assert result.index(a) < result.index(b)
+
+     def test_diamond(self, dataset):
+         a = DummyFileResource("a.txt")
+         a.bind("A", dataset)
+
+         b = DependentResource("b.txt", source=a)
+         b.bind("B", dataset)
+
+         c = DependentResource("c.txt", source=a)
+         c.bind("C", dataset)
+
+         d = DependentResource("d.txt", source=b)
+         d._dependencies.append(c)
+         d.bind("D", dataset)
+
+         result = topological_sort(dataset.resources)
+         assert result.index(a) < result.index(b)
+         assert result.index(a) < result.index(c)
+         assert result.index(b) < result.index(d)
+         assert result.index(c) < result.index(d)
+
+     def test_cycle_detection(self, dataset):
+         a = DummyFileResource("a.txt")
+         a.bind("A", dataset)
+
+         b = DependentResource("b.txt", source=a)
+         b.bind("B", dataset)
+
+         # Create cycle: a depends on b
+         a._dependencies = [b]
+
+         with pytest.raises(ValueError, match="Cycle detected"):
+             topological_sort(dataset.resources)
+
+     def test_independent_resources(self, dataset):
+         a = DummyFileResource("a.txt")
+         a.bind("A", dataset)
+
+         b = DummyFileResource("b.txt")
+         b.bind("B", dataset)
+
+         result = topological_sort(dataset.resources)
+         assert len(result) == 2
+         assert set(result) == {a, b}
+
+
+ # ==== Dependents Computation Tests ====
+
+
+ class TestComputeDependents:
+     def test_no_dependencies(self, dataset):
+         a = DummyFileResource("a.txt")
+         a.bind("A", dataset)
+
+         _compute_dependents(dataset.resources)
+         assert a.dependents == []
+
+     def test_linear_dependents(self, dataset):
+         a = DummyFileResource("a.txt")
+         a.bind("A", dataset)
+
+         b = DependentResource("b.txt", source=a)
+         b.bind("B", dataset)
+
+         _compute_dependents(dataset.resources)
+         assert b in a.dependents
+         assert a not in b.dependents
+
+     def test_multiple_dependents(self, dataset):
+         a = DummyFileResource("a.txt")
+         a.bind("A", dataset)
+
+         b = DependentResource("b.txt", source=a)
+         b.bind("B", dataset)
+
+         c = DependentResource("c.txt", source=a)
+         c.bind("C", dataset)
+
+         _compute_dependents(dataset.resources)
+         assert set(a.dependents) == {b, c}
+
+
+ # ==== Two-Path Download Flow Tests ====
+
+
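+ # These tests pin down the two-path contract: _download() writes into
+ # transient_path (datapath/.downloads/<name>), and on success the framework
+ # moves the result to path and records COMPLETE in .state.json. On failure,
+ # the transient data is kept (state PARTIAL) only when can_recover is True;
+ # otherwise it is deleted and the state falls back to NONE.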
+ class TestTwoPathFlow:
+     def test_download_moves_to_final_path(self, dataset):
+         """Framework should move transient_path -> path on success."""
+         r = DummyFileResource("data.txt")
+         r.bind("DATA", dataset)
+
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+
+         dataset.download()
+
+         assert r.path.exists()
+         assert r.state == ResourceState.COMPLETE
+
+     def test_failure_no_recover_cleans_up(self, dataset):
+         """On failure without can_recover, transient data is deleted."""
+         r = FailingResource("data.txt")
+         r.bind("DATA", dataset)
+
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+
+         result = dataset.download()
+
+         assert result is False
+         assert not r.transient_path.exists()
+         assert r.state == ResourceState.NONE
+
+     def test_failure_with_recover_preserves(self, dataset):
+         """On failure with can_recover, transient data is preserved."""
+
+         class FailRecoverable(RecoverableResource):
+             def _download(self, destination):
+                 destination.parent.mkdir(parents=True, exist_ok=True)
+                 destination.write_text("partial")
+                 raise RuntimeError("partial failure")
+
+         r = FailRecoverable("data.txt")
+         r.bind("DATA", dataset)
+
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+
+         result = dataset.download()
+
+         assert result is False
+         assert r.transient_path.exists()
+         assert r.state == ResourceState.PARTIAL
+
+     def test_skip_complete_resources(self, dataset):
+         """Resources already COMPLETE are skipped unless force=True."""
+         r = DummyFileResource("data.txt")
+         r.bind("DATA", dataset)
+
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+
+         # Mark as complete
+         r.state = ResourceState.COMPLETE
+         r.path.parent.mkdir(parents=True, exist_ok=True)
+         r.path.write_text("existing")
+
+         dataset.download()
+
+         # download should not have been called
+         assert r._download_called is False
+
+     def test_redownload_when_files_missing(self, dataset):
+         """COMPLETE resource with missing files is re-downloaded."""
+         r = DummyFileResource("data.txt")
+         r.bind("DATA", dataset)
+
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+
+         # Mark as complete but do NOT create the file
+         r.state = ResourceState.COMPLETE
+         assert not r.path.exists()
+
+         dataset.download()
+
+         # Should have re-downloaded
+         assert r._download_called is True
+         assert r.path.exists()
+         assert r.state == ResourceState.COMPLETE
+
+     def test_adopt_preexisting_files(self, dataset):
+         """Files already on disk (old downloads) are adopted as COMPLETE."""
+         r = DummyFileResource("data.txt")
+         r.bind("DATA", dataset)
+
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+
+         # Pre-create the file at the final path (simulating old download)
+         r.path.parent.mkdir(parents=True, exist_ok=True)
+         r.path.write_text("old data")
+
+         # State is NONE (no .state.json entry)
+         assert r.state == ResourceState.NONE
+
+         dataset.download()
+
+         # Should NOT have re-downloaded — just marked COMPLETE
+         assert r._download_called is False
+         assert r.state == ResourceState.COMPLETE
+         assert r.path.read_text() == "old data"
+
+     def test_downloads_dir_cleaned_after_success(self, dataset):
+         """The .downloads/ directory is removed after all succeed."""
+         r = DummyFileResource("data.txt")
+         r.bind("DATA", dataset)
+
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+
+         result = dataset.download()
+
+         assert result is True
+         downloads_dir = dataset.datapath / ".downloads"
+         assert not downloads_dir.exists()
+
+     def test_downloads_dir_kept_on_failure(self, dataset):
+         """The .downloads/ directory is kept if a download fails."""
+         r = FailingResource("data.txt")
+         r.bind("DATA", dataset)
+
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+
+         # Pre-create .downloads/ with transient data
+         r.transient_path.parent.mkdir(parents=True, exist_ok=True)
+         r.transient_path.write_text("partial")
+
+         result = dataset.download()
+
+         assert result is False
+         # .downloads/ should still exist (failure, no cleanup)
+         # (transient data itself is deleted because can_recover=False)
+
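+     # The test below exercises the advisory locking that dataset.download()
+     # is assumed to use for serialization: an exclusive fcntl.flock on
+     # datapath/.state.lock, taken here externally to simulate a concurrent
+     # run. fcntl is POSIX-only, so this test cannot run on Windows.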
+     def test_lock_prevents_concurrent_download(self, dataset):
+         """A second download blocks while the first holds the lock."""
+         import fcntl
+         import threading
+
+         r = DummyFileResource("data.txt")
+         r.bind("DATA", dataset)
+         dataset.ordered_resources = [r]
+         _compute_dependents(dataset.resources)
+
+         # Acquire the lock externally to simulate a concurrent download
+         dataset.datapath.mkdir(parents=True, exist_ok=True)
+         lock_path = dataset.datapath / ".state.lock"
+         lock_file = lock_path.open("w")
+         fcntl.flock(lock_file, fcntl.LOCK_EX)
+
+         result_holder = {}
+
+         def try_download():
+             result_holder["result"] = dataset.download()
+
+         t = threading.Thread(target=try_download)
+         t.start()
+         # Give thread time to hit the lock
+         t.join(timeout=0.2)
+         # Thread should still be alive (blocked on lock)
+         assert t.is_alive()
+
+         # Release the lock
+         fcntl.flock(lock_file, fcntl.LOCK_UN)
+         lock_file.close()
+
+         t.join(timeout=5)
+         assert not t.is_alive()
+         assert result_holder["result"] is True
+
+
+ # ==== Eager Transient Cleanup Tests ====
+
+
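+ # Eager cleanup rule exercised below (inferred from the assertions): once
+ # every dependent of a transient resource reaches COMPLETE, the framework
+ # deletes the transient resource's files and resets its state to NONE;
+ # non-transient resources are never cleaned up eagerly.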
+ class TestTransientCleanup:
+     def test_transient_cleaned_after_dependents_complete(self, dataset):
+         """Transient resources are cleaned up when all dependents
+         are COMPLETE."""
+         a = DummyFileResource("a.txt", transient=True)
+         a.bind("A", dataset)
+
+         b = DependentResource("b.txt", source=a)
+         b.bind("B", dataset)
+
+         _compute_dependents(dataset.resources)
+         dataset.ordered_resources = topological_sort(dataset.resources)
+
+         dataset.download()
+
+         # b should be complete
+         assert b.state == ResourceState.COMPLETE
+         assert b.path.exists()
+
+         # a should be cleaned up (transient)
+         assert a.state == ResourceState.NONE
+         assert not a.path.exists()
+
+     def test_non_transient_not_cleaned(self, dataset):
+         """Non-transient resources are NOT cleaned up."""
+         a = DummyFileResource("a.txt", transient=False)
+         a.bind("A", dataset)
+
+         b = DependentResource("b.txt", source=a)
+         b.bind("B", dataset)
+
+         _compute_dependents(dataset.resources)
+         dataset.ordered_resources = topological_sort(dataset.resources)
+
+         dataset.download()
+
+         assert a.state == ResourceState.COMPLETE
+         assert a.path.exists()
+
+     def test_transient_not_cleaned_if_dependent_incomplete(self, dataset):
+         """A transient resource is cleaned only once ALL of its dependents
+         have completed."""
+         a = DummyFileResource("a.txt", transient=True)
+         a.bind("A", dataset)
+
+         b = DependentResource("b.txt", source=a)
+         b.bind("B", dataset)
+
+         c = DependentResource("c.txt", source=a)
+         c.bind("C", dataset)
+
+         _compute_dependents(dataset.resources)
+         dataset.ordered_resources = topological_sort(dataset.resources)
+
+         # Resources are processed in order, so right after B completes C has
+         # not run yet; `a` must not be cleaned up until C also completes.
+         # A full download() exercises this; we check the end state below.
+         dataset.download()
+
+         # After full download, all dependents are complete
+         # so transient should be cleaned
+         assert a.state == ResourceState.NONE
+
+
+ # ==== Legacy Decorator-Based Dataset Tests ====
+
+
+ class TestLegacyDecoratorDataset:
+     def test_filedownloader_decorator(self, context):
+         """Legacy decorator-based filedownloader still works."""
+         import warnings
+         from datamaestro.download.single import filedownloader
+
+         repository = MyRepository(context)
+         ds = SimpleDataset(repository, context.datapath / "legacy")
+
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore", DeprecationWarning)
+             downloader = filedownloader("test.html", "http://httpbin.org/html")
+             downloader(ds)
+
+         assert "test" in ds.resources
+         assert ds.resources["test"] is downloader
+
+     def test_reference_resource(self, context):
+         """reference resource still works."""
+         repository = MyRepository(context)
+         ds = SimpleDataset(repository, context.datapath / "ref_test")
+
+         mock_ref = MagicMock()
+         mock_ref.prepare.return_value = "prepared_value"
+
+         ref = reference(varname="ref", reference=mock_ref)
+         ref.bind("ref", ds)
+
+         assert ref.has_files() is False
+         result = ref.prepare()
+         assert result == "prepared_value"
+
+
+ # ==== New Class-Based Dataset Tests ====
+
+
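+ # The class-based style under test, as a sketch (DummyFileResource is this
+ # file's helper; a real dataset would use a concrete downloader):
+ #
+ #     class MyData(Base):
+ #         DATA = DummyFileResource("data.txt")
+ #
+ # _bind_class_resources(MyData, dataset) binds each Resource attribute to
+ # the dataset, orders them topologically, and computes dependents.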
+ class TestClassBasedDataset:
+     def test_bind_class_resources(self, dataset):
+         """_bind_class_resources detects Resource attributes."""
+         from datamaestro.data import Base
+
+         class MyData(Base):
+             A = DummyFileResource("a.txt")
+             B = DummyFileResource("b.txt")
+
+         _bind_class_resources(MyData, dataset)
+
+         assert "A" in dataset.resources
+         assert "B" in dataset.resources
+         assert len(dataset.ordered_resources) == 2
+
+     def test_bind_with_dependencies(self, dataset):
+         """Resources with dependencies are properly ordered."""
+         from datamaestro.data import Base
+
+         src = DummyFileResource("src.txt")
+
+         class MyData(Base):
+             SRC = src
+             PROCESSED = DependentResource("proc.txt", source=src)
+
+         _bind_class_resources(MyData, dataset)
+
+         # SRC should come before PROCESSED in ordered_resources
+         src_idx = dataset.ordered_resources.index(MyData.SRC)
+         proc_idx = dataset.ordered_resources.index(MyData.PROCESSED)
+         assert src_idx < proc_idx
+
+         # Check dependents were computed
+         assert MyData.PROCESSED in MyData.SRC.dependents
+
+     def test_non_resource_attributes_ignored(self, dataset):
+         """Non-Resource class attributes are not bound."""
+         from datamaestro.data import Base
+
+         class MyData(Base):
+             A = DummyFileResource("a.txt")
+             NOT_A_RESOURCE = "just a string"
+             ALSO_NOT = 42
+
+         _bind_class_resources(MyData, dataset)
+
+         assert "A" in dataset.resources
+         assert "NOT_A_RESOURCE" not in dataset.resources
+         assert "ALSO_NOT" not in dataset.resources
+
+
+ # ==== Backward Compatibility Tests ====
+
+
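+ # These tests assume deprecation warnings are de-duplicated through the
+ # module-level _deprecation_warned set, which is why some of them discard an
+ # entry from that set before triggering the warning again.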
+ class TestBackwardCompat:
+     def test_hasfiles_deprecated(self, dataset):
+         """hasfiles() still works but emits a DeprecationWarning."""
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+
+         import warnings
+
+         with warnings.catch_warnings(record=True):
+             warnings.simplefilter("always")
+             result = r.hasfiles()
+
+         assert result is True
+
+     def test_definition_property_deprecated(self, dataset):
+         """The definition property still works but emits a DeprecationWarning."""
+         r = DummyFileResource("test.txt")
+         r.bind("TEST", dataset)
+
+         import warnings
+
+         # Clear the one-time warning cache
+         from datamaestro.download import _deprecation_warned
+
+         _deprecation_warned.discard("definition")
+
+         with warnings.catch_warnings(record=True):
+             warnings.simplefilter("always")
+             result = r.definition
+
+         assert result is dataset
+
+     def test_download_subclass_deprecated(self):
+         """Subclassing Download emits a DeprecationWarning."""
+         from datamaestro.download import _deprecation_warned
+
+         _deprecation_warned.discard("Download-TestSub")
+
+         import warnings
+
+         with warnings.catch_warnings(record=True):
+             warnings.simplefilter("always")
+
+             class TestSub(Download):
+                 def download(self, force=False):
+                     pass
+
+                 def prepare(self):
+                     pass
+
+     def test_apply_classmethod(self):
+         """Resource.apply creates instances."""
+         r = DummyFileResource.apply("test.txt")
+         assert isinstance(r, DummyFileResource)
+         assert r.filename == "test.txt"
+
+
+ # ==== Concrete Resource Tests ====
+
+
+ class TestFileDownloader:
+     def test_construction(self):
+         """FileDownloader can be constructed."""
+         from datamaestro.download.single import FileDownloader
+
+         r = FileDownloader("data.csv", "http://example.com/data.csv")
+         assert r.filename == "data.csv"
+         assert r.url == "http://example.com/data.csv"
+         assert r.name == "data"  # derived from filename
+
+     def test_factory_alias(self):
+         """filedownloader is an alias for FileDownloader.apply."""
+         from datamaestro.download.single import (
+             filedownloader,
+             FileDownloader,
+         )
+
+         r = filedownloader("data.csv", "http://example.com/data.csv")
+         assert isinstance(r, FileDownloader)
+
+     def test_transient_flag(self):
+         """FileDownloader accepts transient flag."""
+         from datamaestro.download.single import FileDownloader
+
+         r = FileDownloader(
+             "data.csv",
+             "http://example.com/data.csv",
+             transient=True,
+         )
+         assert r.transient is True
+
+     def test_backward_compat_alias(self):
+         """SingleDownload is an alias for FileDownloader."""
+         from datamaestro.download.single import (
+             SingleDownload,
+             FileDownloader,
+         )
+
+         assert SingleDownload is FileDownloader
+
+
+ class TestConcatDownloader:
+     def test_construction(self):
+         from datamaestro.download.single import ConcatDownloader
+
+         r = ConcatDownloader("data.txt", "http://example.com/data.tar.gz")
+         assert r.filename == "data.txt"
+         assert r.url == "http://example.com/data.tar.gz"
+
+     def test_factory_alias(self):
+         from datamaestro.download.single import (
+             concatdownload,
+             ConcatDownloader,
+         )
+
+         r = concatdownload("data.txt", "http://example.com/data.tar.gz")
+         assert isinstance(r, ConcatDownloader)
+
+
+ class TestArchiveDownloaders:
+     def test_zip_construction(self):
+         from datamaestro.download.archive import ZipDownloader
+
+         r = ZipDownloader("archive", "http://example.com/data.zip")
+         assert r.url == "http://example.com/data.zip"
+         assert r.name == "archive"
+
+     def test_tar_construction(self):
+         from datamaestro.download.archive import TarDownloader
+
+         r = TarDownloader("archive", "http://example.com/data.tar.gz")
+         assert r.url == "http://example.com/data.tar.gz"
+
+     def test_zip_factory_alias(self):
+         from datamaestro.download.archive import (
+             zipdownloader,
+             ZipDownloader,
+         )
+
+         r = zipdownloader("archive", "http://example.com/data.zip")
+         assert isinstance(r, ZipDownloader)
+
+     def test_tar_factory_alias(self):
+         from datamaestro.download.archive import (
+             tardownloader,
+             TarDownloader,
+         )
+
+         r = tardownloader("archive", "http://example.com/data.tar.gz")
+         assert isinstance(r, TarDownloader)
+
+
+ class TestCustomDownload:
+     def test_construction(self):
+         from datamaestro.download.custom import custom_download
+
+         fn = MagicMock()
+         r = custom_download("data", fn)
+         assert r.name == "data"
+         assert r.downloader is fn
+
+
+ class TestHFDownloader:
+     def test_construction(self):
+         from datamaestro.download.huggingface import HFDownloader
+
+         r = HFDownloader("hf", repo_id="user/dataset")
+         assert r.repo_id == "user/dataset"
+         assert r.name == "hf"
+
+     def test_factory_alias(self):
+         from datamaestro.download.huggingface import (
+             hf_download,
+             HFDownloader,
+         )
+
+         r = hf_download("hf", repo_id="user/dataset")
+         assert isinstance(r, HFDownloader)
+
+     def test_prepare(self):
+         from datamaestro.download.huggingface import HFDownloader
+
+         r = HFDownloader(
+             "hf",
+             repo_id="user/dataset",
+             data_files="train.csv",
+             split="train",
+         )
+         result = r.prepare()
+         assert result == {
+             "repo_id": "user/dataset",
+             "data_files": "train.csv",
+             "split": "train",
+         }
+
+
+ class TestTodoResource:
+     def test_raises_not_implemented(self):
+         from datamaestro.download.todo import Todo
+
+         r = Todo(varname="test")
+         with pytest.raises(NotImplementedError):
+             r.download()
+
+         with pytest.raises(NotImplementedError):
+             r.prepare()
+
+
+ class TestReferenceResource:
+     def test_has_files_false(self, dataset):
+         mock_ref = MagicMock()
+         mock_ref.prepare.return_value = "value"
+
+         r = reference(varname="ref", reference=mock_ref)
+         r.bind("ref", dataset)
+
+         assert r.has_files() is False
+
+     def test_prepare_delegates(self, dataset):
+         mock_ref = MagicMock()
+         mock_ref.prepare.return_value = "prepared"
+
+         r = reference(varname="ref", reference=mock_ref)
+         r.bind("ref", dataset)
+
+         result = r.prepare()
+         assert result == "prepared"
+
+     def test_download_delegates(self, dataset):
+         mock_ref = MagicMock()
+         mock_ref.__datamaestro__ = MagicMock()
+
+         r = reference(varname="ref", reference=mock_ref)
+         r.bind("ref", dataset)
+
+         r.download(force=True)
+         mock_ref.__datamaestro__.download.assert_called_once_with(True)
+
+     def test_requires_reference(self):
+         with pytest.raises(AssertionError, match="cannot be null"):
+             reference(varname="ref", reference=None)
+
+
+ # ==== Reference with Class-Based Datasets ====
+
+
+ class TestReferenceClassBased:
+     """Tests for `reference` used with class-based datasets.
+
+     When a class-based dataset is decorated with @dataset, the class
+     gets a __dataset__ attribute pointing to the DatasetWrapper.
+     The reference resource must resolve through that attribute.
+     """
+
+     def _make_base_dataset(self, context):
+         """Create a minimal class-based dataset to use as a reference target."""
+         from datamaestro.data import Base
+         from datamaestro.definitions import dataset as dataset_dec
+
+         class BaseData(Base):
+             """Base test dataset."""
+
+             DATA = DummyFileResource("base.txt")
+
+             @classmethod
+             def __create_dataset__(cls, dataset: AbstractDataset):
+                 return cls.C(id="test.base")
+
+         BaseData.__module__ = "datamaestro.config.test"
+
+         # Apply the @dataset decorator (sets __dataset__ on the class)
+         dataset_dec(base=BaseData, url="http://test.com")(BaseData)
+         return BaseData
+
+     def test_resolve_via_dataset_attr(self, context):
+         """_resolve_reference follows __dataset__ for class-based targets."""
+         BaseData = self._make_base_dataset(context)
+
+         ref = reference(varname="base", reference=BaseData)
+         resolved = ref._resolve_reference()
+
+         assert resolved is BaseData.__dataset__
+
+     def test_prepare_delegates_to_class_dataset(self, context):
+         """prepare() calls _prepare() on the referenced DatasetWrapper."""
+         BaseData = self._make_base_dataset(context)
+
+         ref = reference(varname="base", reference=BaseData)
+
+         # Mock the DatasetWrapper._prepare to avoid full experimaestro
+         # Config construction (which rejects classes defined in functions)
+         sentinel = object()
+         BaseData.__dataset__._prepare = MagicMock(return_value=sentinel)
+
+         result = ref.prepare()
+         BaseData.__dataset__._prepare.assert_called_once()
+         assert result is sentinel
+
+     def test_download_delegates_to_class_dataset(self, context):
+         """download() calls download() on the referenced DatasetWrapper."""
+         BaseData = self._make_base_dataset(context)
+
+         ref = reference(varname="base", reference=BaseData)
+
+         # Mock the DatasetWrapper.download to verify delegation
+         BaseData.__dataset__.download = MagicMock()
+
+         ref.download(force=True)
+         BaseData.__dataset__.download.assert_called_once_with(True)
+
+     def test_download_no_force(self, context):
+         """download(force=False) passes force=False to the target."""
+         BaseData = self._make_base_dataset(context)
+
+         ref = reference(varname="base", reference=BaseData)
+         BaseData.__dataset__.download = MagicMock()
+
+         ref.download(force=False)
+         BaseData.__dataset__.download.assert_called_once_with(False)
+
+     def test_has_files_false(self, context):
+         """reference has_files() is always False."""
+         BaseData = self._make_base_dataset(context)
+
+         ref = reference(varname="base", reference=BaseData)
+         assert ref.has_files() is False
+
+     def test_bound_in_class_based_dataset(self, context):
+         """reference works as a class attribute bound via
+         _bind_class_resources."""
+         BaseData = self._make_base_dataset(context)
+
+         repository = MyRepository(context)
+         ds = SimpleDataset(repository, context.datapath / "derived_test")
+
+         ref = reference(varname="base", reference=BaseData)
+         ref.bind("BASE", ds)
+
+         assert "base" in ds.resources
+         assert ds.resources["base"] is ref
+         assert ref.has_files() is False
+
+     def test_full_class_attribute_integration(self, context):
+         """reference as a class attribute in a full class-based dataset."""
+         from datamaestro.data import Base
+
+         BaseData = self._make_base_dataset(context)
+
+         class DerivedData(Base):
+             """Derived dataset referencing the base."""
+
+             BASE = reference(varname="base", reference=BaseData)
+
+             @classmethod
+             def __create_dataset__(cls, dataset: AbstractDataset):
+                 cls.BASE.prepare()
+                 return cls.C(id="test.derived")
+
+         repository = MyRepository(context)
+         ds = SimpleDataset(repository, context.datapath / "derived_full")
+
+         _bind_class_resources(DerivedData, ds)
+
+         assert "base" in ds.resources
+         assert isinstance(ds.resources["base"], reference)
+
+         # The reference should resolve to the base dataset
+         resolved = ds.resources["base"]._resolve_reference()
+         assert resolved is BaseData.__dataset__
+
+
+ # ==== Links Resource Tests ====
+
+
+ class TestLinksResource:
+     def test_construction(self):
+         from datamaestro.download.links import links
+
+         mock_ds = MagicMock()
+         r = links("data", ref1=mock_ds)
+         assert r.name == "data"
+
+     def test_has_files_false(self, dataset):
+         from datamaestro.download.links import links
+
+         mock_ds = MagicMock()
+         r = links("data", ref1=mock_ds)
+         r.bind("data", dataset)
+
+         assert r.has_files() is False
+
+     def test_path_is_datapath(self, dataset):
+         from datamaestro.download.links import links
+
+         mock_ds = MagicMock()
+         r = links("data", ref1=mock_ds)
+         r.bind("data", dataset)
+
+         assert r.path == dataset.datapath
+
+     def test_prepare_returns_path(self, dataset):
+         from datamaestro.download.links import links
+
+         mock_ds = MagicMock()
+         r = links("data", ref1=mock_ds)
+         r.bind("data", dataset)
+
+         assert r.prepare() == dataset.datapath
+
+
+ class TestLinkFolder:
+     def test_construction(self):
+         from datamaestro.download.links import linkfolder
+
+         r = linkfolder("data", proposals=["/tmp/test"])
+         assert r.name == "data"
+
+     def test_check_is_dir(self, dataset, tmp_path):
+         from datamaestro.download.links import linkfolder
+
+         r = linkfolder("data", proposals=[])
+         r.bind("data", dataset)
+
+         # A directory should pass
+         assert r.check(tmp_path) is True
+         # A non-existent path should fail
+         assert r.check(tmp_path / "nonexistent") is False
+
+     def test_path(self, dataset):
+         from datamaestro.download.links import linkfolder
+
+         r = linkfolder("data", proposals=[])
+         r.bind("data", dataset)
+
+         assert r.path == dataset.datapath / "data"
+
+     def test_prepare_returns_path(self, dataset):
+         from datamaestro.download.links import linkfolder
+
+         r = linkfolder("data", proposals=[])
+         r.bind("data", dataset)
+
+         assert r.prepare() == r.path
+
+
+ class TestLinkFile:
+     def test_construction(self):
+         from datamaestro.download.links import linkfile
+
+         r = linkfile("data", proposals=["/tmp/test.txt"])
+         assert r.name == "data"
+
+     def test_check_is_file(self, dataset, tmp_path):
+         from datamaestro.download.links import linkfile
+
+         r = linkfile("data", proposals=[])
+         r.bind("data", dataset)
+
+         # Create a real file to check
+         test_file = tmp_path / "test.txt"
+         test_file.write_text("hello")
+
+         assert r.check(test_file) is True
+         # A directory should fail
+         assert r.check(tmp_path) is False
+         # A non-existent path should fail
+         assert r.check(tmp_path / "nonexistent") is False
+
+     def test_path(self, dataset):
+         from datamaestro.download.links import linkfile
+
+         r = linkfile("data", proposals=[])
+         r.bind("data", dataset)
+
+         assert r.path == dataset.datapath / "data"
+
+
+ # ==== Wayback Resource Tests ====
+
+
+ class TestWaybackDocuments:
+     def test_construction(self):
+         from datamaestro.download.wayback import wayback_documents
+
+         def urls_fn():
+             return iter(["http://example.com"])
+
+         r = wayback_documents("20200101", urls_fn, name="wb")
+         assert r.name == "wb"
+         assert r.timestamp == "20200101"
+
+     def test_prepare_returns_path(self, dataset):
+         from datamaestro.download.wayback import wayback_documents
+
+         def urls_fn():
+             return iter([])
+
+         r = wayback_documents("20200101", urls_fn, name="wb")
+         r.bind("wb", dataset)
+
+         expected = dataset.datapath / "wb"
+         assert r.prepare() == expected
+
+
+ # ==== Custom Download Functional Tests ====
+
+
+ class TestCustomDownloadFunctional:
+     def test_download_delegates(self, dataset):
+         from datamaestro.download.custom import custom_download
+
+         fn = MagicMock()
+         r = custom_download("data", fn)
+         r.bind("data", dataset)
+
+         r.download(force=True)
+
+         fn.assert_called_once_with(dataset.context, dataset.datapath, force=True)
+
+     def test_prepare_returns_datapath(self, dataset):
+         from datamaestro.download.custom import custom_download
+
+         fn = MagicMock()
+         r = custom_download("data", fn)
+         r.bind("data", dataset)
+
+         assert r.prepare() == dataset.datapath
+
+
+ # ==== Archive Downloader Base Tests ====
+
+
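+ # Rule pinned down below: extractall defaults to True and flips to False as
+ # soon as a subpath or an explicit file set restricts extraction; a subpath
+ # is normalized to end with a trailing slash.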
+ class TestArchiveDownloaderBase:
+     def test_zip_path_with_postinit(self, dataset):
+         from datamaestro.download.archive import ZipDownloader
+
+         r = ZipDownloader("archive", "http://example.com/data.zip")
+         r.bind("archive", dataset)
+
+         # path should trigger postinit
+         p = r.path
+         assert isinstance(p, Path)
+
+     def test_tar_path_with_postinit(self, dataset):
+         from datamaestro.download.archive import TarDownloader
+
+         r = TarDownloader("archive", "http://example.com/data.tar.gz")
+         r.bind("archive", dataset)
+
+         p = r.path
+         assert isinstance(p, Path)
+
+     def test_extractall_default(self):
+         from datamaestro.download.archive import ZipDownloader
+
+         r = ZipDownloader("archive", "http://example.com/data.zip")
+         assert r.extractall is True
+
+     def test_extractall_with_subpath(self):
+         from datamaestro.download.archive import ZipDownloader
+
+         r = ZipDownloader(
+             "archive",
+             "http://example.com/data.zip",
+             subpath="subdir",
+         )
+         assert r.extractall is False
+
+     def test_extractall_with_files(self):
+         from datamaestro.download.archive import ZipDownloader
+
+         r = ZipDownloader(
+             "archive",
+             "http://example.com/data.zip",
+             files={"file1.txt"},
+         )
+         assert r.extractall is False
+
+     def test_subpath_trailing_slash(self):
+         from datamaestro.download.archive import ZipDownloader
+
+         r = ZipDownloader(
+             "archive",
+             "http://example.com/data.zip",
+             subpath="subdir",
+         )
+         assert r.subpath == "subdir/"
+
+     def test_transient_flag(self):
+         from datamaestro.download.archive import ZipDownloader
+
+         r = ZipDownloader(
+             "archive",
+             "http://example.com/data.zip",
+             transient=True,
+         )
+         assert r.transient is True
+
+
+ # ==== gsync (legacy) Tests ====
+
+
+ class TestGsync:
+     def test_import(self):
+         """gsync can be imported (legacy Download subclass)."""
+         from datamaestro.download.sync import gsync
+
+         assert issubclass(gsync, Download)
+
+
+ # ==== manual.py (deprecated re-export) Tests ====
+
+
+ class TestManual:
+     def test_import_linkfolder(self):
+         """manual.linkfolder is a deprecated re-export."""
+         import warnings
+
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore", DeprecationWarning)
+             from datamaestro.download.manual import linkfolder
+
+         assert linkfolder is not None
+
+
+ # ==== multiple.py (legacy) Tests ====
+
+
+ class TestMultiple:
+     def test_import_list(self):
+         """List can be imported (legacy Download subclass)."""
+         import warnings
+
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore", DeprecationWarning)
+             from datamaestro.download.multiple import List
+
+         assert issubclass(List, Download)
+
+     def test_import_datasets(self):
+         """Datasets can be imported (legacy Download subclass)."""
+         import warnings
+
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore", DeprecationWarning)
+             from datamaestro.download.multiple import Datasets
+
+         assert issubclass(Datasets, Download)
+
+
+ # ==== Dataset ID Inference Tests ====
+
+
+ class TestDatasetIDInference:
+     """Integration tests for dataset ID inference stability.
+
+     Verifies that DataDefinition.repository_relpath correctly derives
+     path components from type modules and names, including the
+     CamelCase → snake_case conversion for the final component
+     (class/function name).
+     """
+
+     @staticmethod
+     def _make_type(module, name):
+         """Create a mock type with given __module__ and __name__."""
+         t = type(name, (), {})
+         t.__module__ = module
+         return t
+
+     def test_all_caps_class(self, context):
+         """All-caps class name (e.g. MNIST) becomes lowercase."""
+         t = self._make_type("datamaestro.config.lecun", "MNIST")
+         _, parts = DataDefinition.repository_relpath(t)
+         assert parts == ["config", "lecun", "mnist"]
+
+     def test_camel_case_class(self, context):
+         """CamelCase class name becomes snake_case."""
+         t = self._make_type("datamaestro.config.lecun", "ProcessedMNIST")
+         _, parts = DataDefinition.repository_relpath(t)
+         assert parts == ["config", "lecun", "processed_mnist"]
+
+     def test_multi_word_camel_case(self, context):
+         """Multi-word CamelCase is split with underscores."""
+         t = self._make_type("datamaestro.config.data", "ImageClassification")
+         _, parts = DataDefinition.repository_relpath(t)
+         assert parts == ["config", "data", "image_classification"]
+
+     def test_lowercase_function_name(self, context):
+         """Lowercase function names stay as-is."""
+         t = self._make_type("datamaestro.config.lecun", "mnist")
+         _, parts = DataDefinition.repository_relpath(t)
+         assert parts == ["config", "lecun", "mnist"]
+
+     def test_name_with_digits(self, context):
+         """Names with trailing digits are handled correctly."""
+         t = self._make_type("datamaestro.config.trec", "Robust2005")
+         _, parts = DataDefinition.repository_relpath(t)
+         assert parts == ["config", "trec", "robust2005"]
+
+     def test_acronym_then_word(self, context):
+         """Acronym followed by word splits correctly."""
+         t = self._make_type("datamaestro.config.web", "HTTPSConnection")
+         _, parts = DataDefinition.repository_relpath(t)
+         assert parts == ["config", "web", "https_connection"]
+
+     def test_digit_to_upper_boundary(self, context):
+         """Digit-to-uppercase boundary inserts underscore."""
+         t = self._make_type("datamaestro.config.data", "V2Data")
+         _, parts = DataDefinition.repository_relpath(t)
+         assert parts == ["config", "data", "v2_data"]
+
+     def test_snake_case_passthrough(self, context):
+         """Already snake_case names are unchanged."""
+         t = self._make_type("datamaestro.config.lecun", "my_data")
+         _, parts = DataDefinition.repository_relpath(t)
+         assert parts == ["config", "lecun", "my_data"]
+
+     def test_module_components_lowercased(self, context):
+         """Module path components are lowercased, not snake_cased."""
+         t = self._make_type("datamaestro.config.LeCun.SubDir", "MNIST")
+         _, parts = DataDefinition.repository_relpath(t)
+         assert parts == ["config", "lecun", "subdir", "mnist"]
+
+     def test_only_last_component_snake_cased(self, context):
+         """Only the last component gets CamelCase→snake_case;
+         module components are simply lowercased."""
+         t = self._make_type("datamaestro.config.MyModule.SubPkg", "ProcessedData")
+         _, parts = DataDefinition.repository_relpath(t)
+         # MyModule/SubPkg → lowercased; ProcessedData → snake_cased
+         assert parts == [
+             "config",
+             "mymodule",
+             "subpkg",
+             "processed_data",
+         ]
+
+     def test_full_id_class_based(self, context):
+         """Full dataset ID for a class-based dataset."""
+         from datamaestro.data import Base
+         from datamaestro.definitions import dataset as dataset_dec
+
+         class ProcessedMNIST(Base):
+             """Test dataset."""
+
+             pass
+
+         ProcessedMNIST.__module__ = "datamaestro.config.lecun"
+
+         ann = dataset_dec(base=ProcessedMNIST, url="http://test.com")
+         dw = DatasetWrapper(ann, ProcessedMNIST)
+         assert dw.id == "lecun.processed_mnist"
+
+     def test_full_id_function_based(self, context):
+         """Full dataset ID for a function-based (lowercase) dataset."""
+         from datamaestro.data import Base
+         from datamaestro.definitions import dataset as dataset_dec
+
+         class MyData(Base):
+             pass
+
+         def mnist() -> MyData:
+             pass
+
+         mnist.__module__ = "datamaestro.config.lecun"
+
+         ann = dataset_dec(url="http://test.com")
+         # Set the base explicitly (stands in for inference from the
+         # return annotation)
+         ann.base = MyData
+         dw = DatasetWrapper(ann, mnist)
+         assert dw.id == "lecun.mnist"
+
+     def test_full_id_nested_module(self, context):
+         """Full dataset ID with nested module path."""
+         from datamaestro.data import Base
+         from datamaestro.definitions import dataset as dataset_dec
+
+         class Squad(Base):
+             """Test dataset."""
+
+             pass
+
+         Squad.__module__ = "datamaestro.config.stanford.qa"
+
+         ann = dataset_dec(base=Squad, url="http://test.com")
+         dw = DatasetWrapper(ann, Squad)
+         assert dw.id == "stanford.qa.squad"