datamaestro 1.7.3__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
datamaestro/__init__.py CHANGED
@@ -7,6 +7,6 @@ from .context import (
7
7
  prepare_dataset,
8
8
  )
9
9
 
10
- from .definitions import dataset, metadata
10
+ from .definitions import dataset, metadata, Dataset
11
11
  from .data import Base
12
12
  from .version import __version__
datamaestro/context.py CHANGED
@@ -282,7 +282,7 @@ class Datasets(Iterable["AbstractDataset"]):
282
282
  self._description = "\n".join(description)
283
283
 
284
284
  def __iter__(self) -> Iterable["AbstractDataset"]:
285
- from .definitions import DatasetWrapper
285
+ from .definitions import DatasetWrapper, Dataset
286
286
  from datamaestro.data import Base
287
287
 
288
288
  # Iterates over defined symbols
@@ -294,7 +294,7 @@ class Datasets(Iterable["AbstractDataset"]):
294
294
  yield value
295
295
  elif (
296
296
  inspect.isclass(value)
297
- and issubclass(value, Base)
297
+ and (issubclass(value, Base) or issubclass(value, Dataset))
298
298
  and hasattr(value, "__dataset__")
299
299
  ):
300
300
  if self.module.__name__ == value.__module__:
@@ -445,6 +445,9 @@ def prepare_dataset(
445
445
 
446
446
  if isinstance(dataset_id, DatasetWrapper):
447
447
  ds = dataset_id
448
+ elif hasattr(dataset_id, "__dataset__"):
449
+ # Class-based dataset decorated with @dataset
450
+ ds = dataset_id.__dataset__
448
451
  elif isinstance(dataset_id, Config):
449
452
  ds = dataset_id.__datamaestro_dataset__
450
453
  else:
@@ -546,10 +546,16 @@ class DatasetWrapper(AbstractDataset):
546
546
  # There is nothing, use the full path
547
547
  path = ".".join(components[1:])
548
548
  else:
549
- # Replace
549
+ # Replace the class name with the provided suffix
550
550
  path = ".".join(components[1:-1])
551
551
  if annotation.id != "":
552
- path = f"{path}.{annotation.id}"
552
+ # Strip leading dot if present (e.g., ".8.topics" -> "8.topics")
553
+ suffix = (
554
+ annotation.id[1:]
555
+ if annotation.id.startswith(".")
556
+ else annotation.id
557
+ )
558
+ path = f"{path}.{suffix}"
553
559
 
554
560
  self.id = path
555
561
  else:
@@ -611,8 +617,13 @@ class DatasetWrapper(AbstractDataset):
611
617
  if self.config is not None:
612
618
  return self.config
613
619
 
620
+ # Dataset subclass with config() method
621
+ if inspect.isclass(self.t) and issubclass(self.t, Dataset):
622
+ instance = self.t()
623
+ self.config = instance.config()
624
+
614
625
  # Direct creation of the dataset
615
- if self.base is self.t:
626
+ elif self.base is self.t:
616
627
  self.config = self.base.__create_dataset__(self)
617
628
 
618
629
  elif hasattr(self.t, "__create_dataset__"):
@@ -719,6 +730,9 @@ class DataAnnotation:
719
730
  else:
720
731
  if "__datamaestro__" in object.__dict__:
721
732
  self.annotate(object.__datamaestro__)
733
+ elif "__dataset__" in object.__dict__:
734
+ # Dataset subclass decorated with @dataset
735
+ self.annotate(object.__dataset__)
722
736
  else:
723
737
  # With configuration objects, add a __datamaestro__ member to the class
724
738
  assert issubclass(object, Config), (
@@ -815,9 +829,10 @@ class dataset:
815
829
 
816
830
  :param base: The base type (or None if inferred from type annotation).
817
831
  :param timestamp: If the dataset evolves, specify its timestamp.
818
- :param id: Gives the full ID of the dataset if it contains a '.',
819
- the last component if not containing a '.', or the last components
820
- if starting with '.'
832
+ :param id: Dataset ID override. Behavior depends on format:
833
+ - Full ID (e.g., "com.example.data"): used as-is if it has 3+ components
834
+ - Suffix with dot prefix (e.g., ".8.topics"): appended to module path
835
+ - Single component (e.g., "mnist"): replaces the class name in the path
821
836
  :param url: The URL associated with the dataset.
822
837
  :param size: The size of the dataset (should be a parsable format).
823
838
  :param doi: The DOI of the corresponding paper.
@@ -857,6 +872,17 @@ class dataset:
857
872
  if self.base is None:
858
873
  if inspect.isclass(t) and issubclass(t, Base):
859
874
  self.base = t
875
+ elif inspect.isclass(t) and issubclass(t, Dataset):
876
+ # Infer base from config() return annotation
877
+ try:
878
+ config_method = t.config
879
+ return_type = config_method.__annotations__["return"]
880
+ if isinstance(return_type, _GenericAlias):
881
+ return_type = return_type.__origin__
882
+ self.base = return_type
883
+ except (KeyError, AttributeError):
884
+ logging.warning("No return annotation on config() in %s", t)
885
+ raise
860
886
  else:
861
887
  try:
862
888
  # Get type from return annotation
@@ -875,12 +901,48 @@ class dataset:
875
901
  t.__dataset__ = dw
876
902
 
877
903
  # For class-based datasets, scan for Resource class attributes
878
- if inspect.isclass(t) and issubclass(t, Base):
904
+ if inspect.isclass(t) and (issubclass(t, Base) or issubclass(t, Dataset)):
879
905
  _bind_class_resources(t, dw)
880
906
  return t
881
907
  return dw
882
908
 
883
909
 
910
+ class Dataset(ABC):
911
+ """Base class for simplified dataset definitions.
912
+
913
+ Inherit from this class and use the ``@dataset`` decorator.
914
+ Resources are defined as class attributes and accessed via ``self``.
915
+
916
+ Example::
917
+
918
+ @dataset(url="http://yann.lecun.com/exdb/mnist/")
919
+ class MNIST(Dataset):
920
+ \"\"\"The MNIST database of handwritten digits.\"\"\"
921
+
922
+ TRAIN_IMAGES = FileDownloader("train.idx", "http://...")
923
+ TEST_IMAGES = FileDownloader("test.idx", "http://...")
924
+
925
+ def config(self) -> ImageClassification:
926
+ return ImageClassification.C(
927
+ train=IDX(path=self.TRAIN_IMAGES.path),
928
+ test=IDX(path=self.TEST_IMAGES.path),
929
+ )
930
+ """
931
+
932
+ @abstractmethod
933
+ def config(self) -> "Base":
934
+ """Create and return the dataset configuration.
935
+
936
+ Override this method to construct and return the data object.
937
+ Resources are accessible via ``self.RESOURCE_NAME.path`` or
938
+ ``self.RESOURCE_NAME.prepare()``.
939
+
940
+ Returns:
941
+ A Config instance (typically created via ``SomeType.C(...)``).
942
+ """
943
+ ...
944
+
945
+
884
946
  class metadataset(AbstractDataset):
885
947
  """Annotation for object/functions which are abstract dataset definitions
886
948
 
datamaestro/version.py CHANGED
@@ -1,4 +1,4 @@
1
1
  # This file is auto-generated by Hatchling. As such, do not:
2
2
  # - modify
3
3
  # - track in version control e.g. be sure to add to .gitignore
4
- __version__ = VERSION = '1.7.3'
4
+ __version__ = VERSION = '1.8.0'
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro
3
- Version: 1.7.3
3
+ Version: 1.8.0
4
4
  Summary: Add your description here
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License-File: LICENSE
@@ -141,11 +141,11 @@ and handles downloads with two-path safety and state tracking.
141
141
  from datamaestro_image.data import ImageClassification, LabelledImages
142
142
  from datamaestro.data.tensor import IDX
143
143
  from datamaestro.download.single import FileDownloader
144
- from datamaestro.definitions import AbstractDataset, dataset
144
+ from datamaestro.definitions import Dataset, dataset
145
145
 
146
146
 
147
147
  @dataset(url="http://yann.lecun.com/exdb/mnist/")
148
- class MNIST(ImageClassification):
148
+ class MNIST(Dataset):
149
149
  """The MNIST database of handwritten digits."""
150
150
 
151
151
  TRAIN_IMAGES = FileDownloader(
@@ -165,16 +165,15 @@ class MNIST(ImageClassification):
165
165
  "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz",
166
166
  )
167
167
 
168
- @classmethod
169
- def __create_dataset__(cls, dataset: AbstractDataset):
170
- return cls.C(
168
+ def config(self) -> ImageClassification:
169
+ return ImageClassification.C(
171
170
  train=LabelledImages(
172
- images=IDX(path=cls.TRAIN_IMAGES.path),
173
- labels=IDX(path=cls.TRAIN_LABELS.path),
171
+ images=IDX(path=self.TRAIN_IMAGES.path),
172
+ labels=IDX(path=self.TRAIN_LABELS.path),
174
173
  ),
175
174
  test=LabelledImages(
176
- images=IDX(path=cls.TEST_IMAGES.path),
177
- labels=IDX(path=cls.TEST_LABELS.path),
175
+ images=IDX(path=self.TEST_IMAGES.path),
176
+ labels=IDX(path=self.TEST_LABELS.path),
178
177
  ),
179
178
  )
180
179
  ```
@@ -1,7 +1,7 @@
1
- datamaestro/__init__.py,sha256=oh9M4VODuvTc9EFHKirtDxpCJkLUANzpzBOIwzHc_mw,246
1
+ datamaestro/__init__.py,sha256=ZquS8JzBV8aUYjiU758IIdoepk3o2uAqQqI1KRgwQ7Q,255
2
2
  datamaestro/__main__.py,sha256=22v54rQoO2umL1frFO2FOQuuRljr-Jw-ER-OATTpVxw,9218
3
- datamaestro/context.py,sha256=AL2BTi6dLA8rDGBE0PFyfV9ua29JHvBgx6_w6hDj9Dg,13977
4
- datamaestro/definitions.py,sha256=xo-MhpQHcUPNFJtkdWOEp1jC-7pbv0TREJKVS0iDVh8,27979
3
+ datamaestro/context.py,sha256=KFStZf4z1eJT4A47uvDalTTHkBgv5l4KXHDY-amqKf0,14153
4
+ datamaestro/definitions.py,sha256=DvinFtwPN_V7oaD3SILOnYTT_VKyrvDRupJPXT9qOGQ,30611
5
5
  datamaestro/record.py,sha256=e5fjRV3ni7ZxXwYH45bVDB_jpD-n9quvh4ie4uI-MM4,7140
6
6
  datamaestro/registry.py,sha256=M7QJkcWJP_cxAoqIioLQ01ou2Zg9RqGQvW0XGVspYFE,1421
7
7
  datamaestro/search.py,sha256=bRT-91-2VJJ2JSfNaS1mzaVfqq_HMVBVs-RBj0w-ypM,2906
@@ -9,7 +9,7 @@ datamaestro/settings.py,sha256=NuUbe_C31GDlzdio2ryz7tPzuo4hsmmdCM5Cyuhqbzs,1294
9
9
  datamaestro/sphinx.py,sha256=WWXB63gd0ZgEwFr_YwO2Hmuly5OoiFlu9mDvJSHFYuY,6966
10
10
  datamaestro/utils.py,sha256=JUrvtVYnjNKRo0_ZypmXSQ9R4uOyImDjW1GZ14MYzKM,6547
11
11
  datamaestro/v2.md,sha256=pLCxQUdfVkd4CM9Ie0ZxCnxUntqoA7k_0m7x1etcr7Y,9801
12
- datamaestro/version.py,sha256=qbNPC0YY-TxmiwHVzSS5pF2Eykd0T9k-YiM7b0ut3i4,171
12
+ datamaestro/version.py,sha256=7vKzAvyE5qa683r17tWnXJ25jKqnqX-fQ76AkxrXL_E,171
13
13
  datamaestro/annotations/__init__.py,sha256=jLprrxSBa5QIqc--vqycEcxU4CR9WjVNRaqR5lH0EuE,39
14
14
  datamaestro/annotations/agreement.py,sha256=xEH0ddZxdJ_oG_150PoOa-WjY_OaeQja3FzMzY5IB6k,955
15
15
  datamaestro/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -42,8 +42,8 @@ datamaestro/test/test_annotations.py,sha256=XUjDWb3FJimSD91wcItJ0lLwTBmvN4wVu_Eg
42
42
  datamaestro/test/test_download_handlers.py,sha256=-Gofr89zqIyeI8C4rZqfYR3JfiZVImdcSz9s6q361zQ,641
43
43
  datamaestro/test/test_record.py,sha256=hNZ3uo2i5FZ0VsOHRwvLO1Z6Zce92PdipAF65UptPB8,1156
44
44
  datamaestro/test/test_resource.py,sha256=QbwmZkGv_8O_jI0CKcatJSUs3IKbMfBrk0T_aTC1KcE,51124
45
- datamaestro-1.7.3.dist-info/METADATA,sha256=9EfCsWxYzjfZgcNi2_1dNoD8s-MijuQVdeK_7L6TEZY,7433
46
- datamaestro-1.7.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
47
- datamaestro-1.7.3.dist-info/entry_points.txt,sha256=8qMhwSRvFG2iBqtJYVD22Zd4s4c3YkODtcp0Ajw1knw,133
48
- datamaestro-1.7.3.dist-info/licenses/LICENSE,sha256=WJ7YI-moTFb-uVrFjnzzhGJrnL9P2iqQe8NuED3hutI,35141
49
- datamaestro-1.7.3.dist-info/RECORD,,
45
+ datamaestro-1.8.0.dist-info/METADATA,sha256=mHgkI5X1um_WLTvCyjsFH3HTv_O-5SdkGuPysfCMsrg,7402
46
+ datamaestro-1.8.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
47
+ datamaestro-1.8.0.dist-info/entry_points.txt,sha256=8qMhwSRvFG2iBqtJYVD22Zd4s4c3YkODtcp0Ajw1knw,133
48
+ datamaestro-1.8.0.dist-info/licenses/LICENSE,sha256=WJ7YI-moTFb-uVrFjnzzhGJrnL9P2iqQe8NuED3hutI,35141
49
+ datamaestro-1.8.0.dist-info/RECORD,,