datamaestro 1.0.4__tar.gz → 1.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. {datamaestro-1.0.4 → datamaestro-1.0.6}/.readthedocs.yml +4 -1
  2. {datamaestro-1.0.4 → datamaestro-1.0.6}/PKG-INFO +1 -1
  3. datamaestro-1.0.6/docs/requirements.txt +5 -0
  4. {datamaestro-1.0.4 → datamaestro-1.0.6}/docs/source/api/index.md +2 -0
  5. datamaestro-1.0.6/docs/source/api/records.rst +59 -0
  6. datamaestro-1.0.6/src/datamaestro/record.py +312 -0
  7. datamaestro-1.0.6/src/datamaestro/test/test_record.py +151 -0
  8. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/version.py +2 -2
  9. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro.egg-info/PKG-INFO +1 -1
  10. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro.egg-info/SOURCES.txt +1 -0
  11. datamaestro-1.0.4/docs/requirements.txt +0 -2
  12. datamaestro-1.0.4/src/datamaestro/record.py +0 -196
  13. datamaestro-1.0.4/src/datamaestro/test/test_record.py +0 -81
  14. {datamaestro-1.0.4 → datamaestro-1.0.6}/.coverage +0 -0
  15. {datamaestro-1.0.4 → datamaestro-1.0.6}/.github/workflows/pytest.yml +0 -0
  16. {datamaestro-1.0.4 → datamaestro-1.0.6}/.github/workflows/python-publish.yml +0 -0
  17. {datamaestro-1.0.4 → datamaestro-1.0.6}/.gitignore +0 -0
  18. {datamaestro-1.0.4 → datamaestro-1.0.6}/.pre-commit-config.yaml +0 -0
  19. {datamaestro-1.0.4 → datamaestro-1.0.6}/CHANGELOG.md +0 -0
  20. {datamaestro-1.0.4 → datamaestro-1.0.6}/LICENSE +0 -0
  21. {datamaestro-1.0.4 → datamaestro-1.0.6}/MANIFEST.in +0 -0
  22. {datamaestro-1.0.4 → datamaestro-1.0.6}/README.md +0 -0
  23. {datamaestro-1.0.4 → datamaestro-1.0.6}/TODO.md +0 -0
  24. {datamaestro-1.0.4 → datamaestro-1.0.6}/docs/Makefile +0 -0
  25. {datamaestro-1.0.4 → datamaestro-1.0.6}/docs/make.bat +0 -0
  26. {datamaestro-1.0.4 → datamaestro-1.0.6}/docs/source/api/data.md +0 -0
  27. {datamaestro-1.0.4 → datamaestro-1.0.6}/docs/source/api/download.rst +0 -0
  28. {datamaestro-1.0.4 → datamaestro-1.0.6}/docs/source/conf.py +0 -0
  29. {datamaestro-1.0.4 → datamaestro-1.0.6}/docs/source/datasets.rst +0 -0
  30. {datamaestro-1.0.4 → datamaestro-1.0.6}/docs/source/developping.md +0 -0
  31. {datamaestro-1.0.4 → datamaestro-1.0.6}/docs/source/index.md +0 -0
  32. {datamaestro-1.0.4 → datamaestro-1.0.6}/docs/source/style.css +0 -0
  33. {datamaestro-1.0.4 → datamaestro-1.0.6}/mkdocs.yml +0 -0
  34. {datamaestro-1.0.4 → datamaestro-1.0.6}/pyproject.toml +0 -0
  35. {datamaestro-1.0.4 → datamaestro-1.0.6}/pytest.ini +0 -0
  36. {datamaestro-1.0.4 → datamaestro-1.0.6}/requirements-dev.txt +0 -0
  37. {datamaestro-1.0.4 → datamaestro-1.0.6}/requirements.txt +0 -0
  38. {datamaestro-1.0.4 → datamaestro-1.0.6}/schema.yaml +0 -0
  39. {datamaestro-1.0.4 → datamaestro-1.0.6}/setup.cfg +0 -0
  40. {datamaestro-1.0.4 → datamaestro-1.0.6}/setup.py +0 -0
  41. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/__init__.py +0 -0
  42. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/__main__.py +0 -0
  43. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/annotations/__init__.py +0 -0
  44. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/annotations/agreement.py +0 -0
  45. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/commands/__init__.py +0 -0
  46. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/commands/mainstyle.css +0 -0
  47. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/commands/site.py +0 -0
  48. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/context.py +0 -0
  49. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/data/__init__.py +0 -0
  50. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/data/csv.py +0 -0
  51. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/data/huggingface.py +0 -0
  52. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/data/ml.py +0 -0
  53. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/data/tensor.py +0 -0
  54. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/definitions.py +0 -0
  55. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/download/__init__.py +0 -0
  56. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/download/archive.py +0 -0
  57. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/download/huggingface.py +0 -0
  58. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/download/links.py +0 -0
  59. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/download/manual.py +0 -0
  60. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/download/multiple.py +0 -0
  61. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/download/single.py +0 -0
  62. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/download/sync.py +0 -0
  63. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/download/todo.py +0 -0
  64. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/registry.py +0 -0
  65. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/search.py +0 -0
  66. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/settings.py +0 -0
  67. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/sphinx.py +0 -0
  68. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/stream/__init__.py +0 -0
  69. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/stream/compress.py +0 -0
  70. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/stream/lines.py +0 -0
  71. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/templates/dataset.py +0 -0
  72. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/test/__init__.py +0 -0
  73. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/test/checks.py +0 -0
  74. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/test/conftest.py +0 -0
  75. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/test/test_annotations.py +0 -0
  76. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/test/test_download_handlers.py +0 -0
  77. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro/utils.py +0 -0
  78. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro.egg-info/dependency_links.txt +0 -0
  79. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro.egg-info/entry_points.txt +0 -0
  80. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro.egg-info/not-zip-safe +0 -0
  81. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro.egg-info/requires.txt +0 -0
  82. {datamaestro-1.0.4 → datamaestro-1.0.6}/src/datamaestro.egg-info/top_level.txt +0 -0
  83. {datamaestro-1.0.4 → datamaestro-1.0.6}/tox.ini +0 -0
@@ -8,10 +8,13 @@ version: 2
8
8
  sphinx:
9
9
  configuration: docs/source/conf.py
10
10
 
11
+ build:
12
+ os: "ubuntu-20.04"
13
+ tools:
14
+ python: "3.9"
11
15
 
12
16
  # Install the package
13
17
  python:
14
- version: 3.8
15
18
  install:
16
19
  - method: pip
17
20
  path: .
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro
3
- Version: 1.0.4
3
+ Version: 1.0.6
4
4
  Summary: "Dataset management command line and API"
5
5
  Home-page: https://github.com/experimaestro/datamaestro
6
6
  Author: Benjamin Piwowarski
@@ -0,0 +1,5 @@
1
+ sphinx>=6
2
+ myst_parser > 0.18
3
+ sphinx >= 4.2
4
+ sphinx-rtd-theme==1.2.2
5
+ sphinx-toolbox
@@ -7,9 +7,11 @@ caption: "Contents:"
7
7
  ---
8
8
  download
9
9
  data
10
+ records
10
11
  ```
11
12
 
12
13
  The API is composed of:
13
14
 
14
15
  - [Specify what to download](download/)
15
16
  - [Describing data](data/)
17
+ - [Records](records/)
@@ -0,0 +1,59 @@
1
+ Records
2
+ =======
3
+
4
+ Records can hold arbitrary information. They are quite useful when processing data, since
5
+ information can be easily added to a record.
6
+
7
+ .. code-block:: python
8
+
9
+ @define
10
+ class AItem(Item):
11
+ a: int
12
+
13
+
14
+ @define
15
+ class A1Item(AItem):
16
+ a1: int
17
+
18
+
19
+ @define
20
+ class BItem(Item):
21
+ b: int
22
+
23
+
24
+ @define
25
+ class CItem(Item):
26
+ c: int
27
+
28
+
29
+ @recordtypes(A1Item)
30
+ class ARecord(Record):
31
+ ...
32
+
33
+
34
+ @recordtypes(BItem)
35
+ class ABRecord(ARecord):
36
+ ...
37
+
38
+
39
+ record = ABRecord(A1Item(1, 2), BItem(2))
40
+ print(record[AItem].a) # 1
41
+
42
+
43
+ record = record.update(BItem(3))
44
+ print(record[BItem].b) # 3
45
+
46
+
47
+ .. autoclass:: datamaestro.record.Item
48
+ :members:
49
+
50
+ .. autoclass:: datamaestro.record.Record
51
+ :members: update, has, get, from_types, from_record
52
+
53
+ .. autofunction:: datamaestro.record.recordtypes
54
+
55
+ .. autoclass:: datamaestro.record.RecordTypesCache
56
+ :members: __init__, update
57
+
58
+ .. autoclass:: datamaestro.record.SingleRecordTypeCache
59
+ :members: __init__, update
@@ -0,0 +1,312 @@
1
+ import logging
2
+ from typing import ClassVar, Type, TypeVar, Dict, List, Union, Optional, FrozenSet
3
+
4
+
5
+ class Item:
6
+ """Base class for all item types"""
7
+
8
+ @classmethod
9
+ def __get_base__(cls: Type) -> Type:
10
+ """Get the most generic superclass for this type of item"""
11
+ if base := getattr(cls, "__base__cache__", None):
12
+ return base
13
+
14
+ base = cls
15
+ for supercls in cls.__mro__:
16
+ if issubclass(supercls, Item) and supercls is not Item:
17
+ base = supercls
18
+ setattr(cls, "__base__cache__", base)
19
+ return base
20
+
21
+
22
+ T = TypeVar("T", bound=Item)
23
+ Items = Dict[Type[T], T]
24
+
25
+
26
+ class Record:
27
+ """Associate types with entries
28
+
29
+ A record is a composition of items; each item base class is unique.
30
+ """
31
+
32
+ items: Items
33
+
34
+ def __init__(self, *items: Union[Items, T], override=False):
35
+ self.items = {}
36
+
37
+ if len(items) == 1 and isinstance(items[0], dict):
38
+ # Just copy the dictionary
39
+ self.items = items[0]
40
+ else:
41
+ for entry in items:
42
+ # Returns a new record if the item exists
43
+ base = entry.__get_base__()
44
+ if not override and base in self.items:
45
+ raise RuntimeError(
46
+ f"The item type {base} ({entry.__class__})"
47
+ " is already in the record"
48
+ )
49
+ self.items[base] = entry
50
+
51
+ self.validate()
52
+
53
+ @classmethod
54
+ def from_record(cls, record: "Record", *items: T, override=True):
55
+ """Build from another record"""
56
+ return cls({**record.items, **{item.__get_base__(): item for item in items}})
57
+
58
+ def __str__(self):
59
+ return (
60
+ "{"
61
+ + ", ".join(f"{key}: {value}" for key, value in self.items.items())
62
+ + "}"
63
+ )
64
+
65
+ def __reduce__(self):
66
+ cls = self.__class__
67
+ if cls.__trueclass__ is None:
68
+ return (cls.__new__, (cls.__trueclass__ or cls,), {"items": self.items})
69
+
70
+ return (
71
+ cls.__new__,
72
+ (cls.__trueclass__ or cls,),
73
+ {"items": self.items, "itemtypes": self.itemtypes},
74
+ )
75
+
76
+ def __setstate__(self, state):
77
+ self.items = state["items"]
78
+ self.itemtypes = None
79
+
80
+ def validate(self, cls: Type["Record"] = None):
81
+ """Validate the record"""
82
+ cls = cls if cls is not None else self.__class__
83
+
84
+ if cls.itemtypes:
85
+ for itemtype in cls.itemtypes:
86
+ try:
87
+ self.__getitem__(itemtype)
88
+ except KeyError:
89
+ raise KeyError(f"Item of type {itemtype} is missing")
90
+
91
+ if len(self.items) != len(cls.itemtypes):
92
+ unregistered = [
93
+ item
94
+ for item in self.items.values()
95
+ if all(
96
+ not issubclass(item.__get_base__(), itemtype)
97
+ for itemtype in cls.itemtypes
98
+ )
99
+ ]
100
+ raise KeyError(
101
+ f"The record {cls} contains unregistered items: {unregistered}"
102
+ )
103
+
104
+ def get(self, key: Type[T]) -> Optional[T]:
105
+ """Get a given item or None if it does not exist"""
106
+ try:
107
+ return self[key]
108
+ except KeyError:
109
+ return None
110
+
111
+ def has(self, key: Type[T]) -> bool:
112
+ """Returns True if the record has the given item type"""
113
+ return key.__get_base__() in self.items
114
+
115
+ def __getitem__(self, key: Type[T]) -> T:
116
+ """Get an item given its type"""
117
+ base = key.__get_base__()
118
+ entry = self.items[base]
119
+
120
+ # Check if this matches the expected class
121
+ if not isinstance(entry, key):
122
+ raise KeyError(f"No entry with type {key}")
123
+ return entry
124
+
125
+ def is_pickled(self):
126
+ return self.itemtypes is None
127
+
128
+ def update(self, *items: T) -> "Record":
129
+ """Update some items"""
130
+ # Create our new dictionary
131
+ item_dict = {**self.items}
132
+ for item in items:
133
+ item_dict[item.__get_base__()] = item
134
+
135
+ return self.__class__(item_dict)
136
+
137
+ # --- Class methods and variables
138
+
139
+ itemtypes: ClassVar[Optional[FrozenSet[Type[T]]]] = []
140
+ """For specific records, this is the list of types. The value is null when
141
+ no validation is used (e.g. pickled records created on the fly)"""
142
+
143
+ __trueclass__: ClassVar[Optional[Type["Record"]]] = None
144
+ """The last class in the type hierarchy corresponding to an actual type,
145
+ i.e. not created on the fly (only defined when the record is pickled)"""
146
+
147
+ @classmethod
148
+ def has_type(cls, itemtype: Type[T]):
149
+ return any(issubclass(cls_itemtype, itemtype) for cls_itemtype in cls.itemtypes)
150
+
151
+ @classmethod
152
+ def _subclass(cls, *itemtypes: Type[T]):
153
+ cls_itemtypes = [x for x in getattr(cls, "itemtypes", [])]
154
+ mapping = {
155
+ itemtype.__get_base__(): ix for ix, itemtype in enumerate(cls_itemtypes)
156
+ }
157
+
158
+ for itemtype in itemtypes:
159
+ if (ix := mapping.get(itemtype.__get_base__(), -1)) >= 0:
160
+ cls_itemtypes[ix] = itemtype
161
+ else:
162
+ cls_itemtypes.append(itemtype)
163
+
164
+ return frozenset(cls_itemtypes)
165
+
166
+ @classmethod
167
+ def from_types(cls, name: str, *itemtypes: Type[T], module: str = None):
168
+ """Construct a new sub-record type
169
+
170
+ :param name: The name of the subrecord
171
+ :param module: The module name, defaults to None
172
+ :return: A new Record type
173
+ """
174
+ extra_dict = {}
175
+ if module:
176
+ extra_dict["__module__"] = module
177
+
178
+ return type(
179
+ name,
180
+ (cls,),
181
+ {
182
+ **extra_dict,
183
+ "itemtypes": frozenset(cls._subclass(*itemtypes)),
184
+ "__trueclass__": cls.__trueclass__ or cls,
185
+ },
186
+ )
187
+
188
+ __RECORD_TYPES_CACHE__: Dict[frozenset, Type["Record"]] = {}
189
+
190
+ @staticmethod
191
+ def fromitemtypes(itemtypes: FrozenSet[T]):
192
+ if recordtype := Record.__RECORD_TYPES_CACHE__.get(itemtypes, None):
193
+ return recordtype
194
+
195
+ recordtype = Record.from_types(
196
+ "_".join(itemtype.__name__ for itemtype in itemtypes), *itemtypes
197
+ )
198
+ Record.__RECORD_TYPES_CACHE__[itemtypes] = recordtype
199
+ return recordtype
200
+
201
+
202
+ def recordtypes(*types: List[Type[T]]):
203
+ """Adds types for a new record class"""
204
+
205
+ def decorate(cls: Type[Record]):
206
+ (base_cls,) = [base for base in cls.__bases__ if issubclass(base, Record)]
207
+
208
+ setattr(cls, "itemtypes", base_cls._subclass(*types))
209
+ return cls
210
+
211
+ return decorate
212
+
213
+
214
+ class RecordTypesCacheBase:
215
+ def __init__(self, name: str, *itemtypes: Type[T], module: str = None):
216
+ self._module = module
217
+ self._name = name
218
+ self._itemtypes = itemtypes
219
+
220
+ def _compute(self, record_type: Type[Record]):
221
+ updated_type = record_type.from_types(
222
+ f"{self._name}_{record_type.__name__}",
223
+ *self._itemtypes,
224
+ module=self._module,
225
+ )
226
+ return updated_type
227
+
228
+
229
+ class RecordTypesCache(RecordTypesCacheBase):
230
+ """Class to use when new record types need to be created on the fly by
231
+ adding new items"""
232
+
233
+ def __init__(self, name: str, *itemtypes: Type[T], module: str = None):
234
+ """Creates a new cache
235
+
236
+ :param name: Base name for new record types
237
+ :param module: The module name for new types, defaults to None
238
+ """
239
+ super().__init__(name, *itemtypes, module=module)
240
+ self._cache: Dict[Type[Record], Type[Record]] = {}
241
+ self._warning = False
242
+
243
+ def __call__(self, record_type: Type[Record]):
244
+ if (updated_type := self._cache.get(record_type, None)) is None:
245
+ self._cache[record_type] = updated_type = self._compute(record_type)
246
+ return updated_type
247
+
248
+ def update(self, record: Record, *items: Item, cls=None):
249
+ """Update the record with the given items
250
+
251
+ :param record: The record to which we add items
252
+ :param cls: The class of the record, useful if the record has been
253
+ pickled, defaults to None
254
+ :return: A new record with the extra items
255
+ """
256
+ if cls is None:
257
+ cls = record.__class__
258
+ if record.is_pickled() and not self._warning:
259
+ logging.warning(
260
+ "Updating unpickled records is not recommended"
261
+ " (speed issues): use the pickle record class as the cls input"
262
+ )
263
+ itemtypes = frozenset(type(item) for item in record.items.values())
264
+ cls = Record.fromitemtypes(itemtypes)
265
+ else:
266
+ assert (
267
+ record.is_pickled()
268
+ ), "cls can be used only when the record as been pickled"
269
+
270
+ return self(cls)(*record.items.values(), *items, override=True)
271
+
272
+
273
+ class SingleRecordTypeCache(RecordTypesCacheBase):
274
+ """Class to use when new record types need to be created on the fly by
275
+ adding new items
276
+
277
+ This class supposes that the input record type is always the same (no check
278
+ is done to ensure this)"""
279
+
280
+ def __init__(self, name: str, *itemtypes: Type[T], module: str = None):
281
+ """Creates a new cache
282
+
283
+ :param name: Base name for new record types
284
+ :param module: The module name for new types, defaults to None
285
+ """
286
+ super().__init__(name, *itemtypes, module=module)
287
+ self._cache: Optional[Type[Record]] = None
288
+
289
+ def __call__(self, record_type: Type[Record]):
290
+ if self._cache is None:
291
+ self._cache = self._compute(record_type)
292
+ return self._cache
293
+
294
+ def update(self, record: Record, *items: Item, cls=None):
295
+ """Update the record with the given items
296
+
297
+ :param record: The record to which we add items
298
+ :param cls: The class of the record, useful if the record has been
299
+ pickled, defaults to None
300
+ :return: A new record with the extra items
301
+ """
302
+ if self._cache is None:
303
+ if cls is None:
304
+ cls = record.__class__
305
+ itemtypes = frozenset(type(item) for item in record.items.values())
306
+ cls = Record.fromitemtypes(itemtypes)
307
+ else:
308
+ assert (
309
+ record.is_pickled()
310
+ ), "cls can be used only when the record as been pickled"
311
+
312
+ return self(cls)(*record.items.values(), *items, override=True)
@@ -0,0 +1,151 @@
1
+ import pickle
2
+ from datamaestro.record import (
3
+ Record,
4
+ Item,
5
+ RecordTypesCache,
6
+ recordtypes,
7
+ SingleRecordTypeCache,
8
+ )
9
+ from attrs import define
10
+ import pytest
11
+
12
+
13
+ @define
14
+ class AItem(Item):
15
+ a: int
16
+
17
+
18
+ @define
19
+ class A1Item(AItem):
20
+ a1: int
21
+
22
+
23
+ @define
24
+ class BItem(Item):
25
+ b: int
26
+
27
+
28
+ @define
29
+ class B1Item(BItem):
30
+ b1: int
31
+
32
+
33
+ @define
34
+ class CItem(Item):
35
+ c: int
36
+
37
+
38
+ @recordtypes(A1Item)
39
+ class BaseRecord(Record):
40
+ ...
41
+
42
+
43
+ @recordtypes(BItem)
44
+ class MyRecord(BaseRecord):
45
+ ...
46
+
47
+
48
+ @recordtypes(CItem)
49
+ class MyRecord2(MyRecord):
50
+ pass
51
+
52
+
53
+ def test_record_simple():
54
+ a = A1Item(1, 2)
55
+ b = BItem(4)
56
+ r = MyRecord(a, b)
57
+ assert r[AItem] is a
58
+ assert r[A1Item] is a
59
+ assert r[BItem] is b
60
+
61
+
62
+ def test_record_missing_init():
63
+ with pytest.raises(KeyError):
64
+ # A1Item is missing
65
+ MyRecord(AItem(1), BItem(2))
66
+
67
+ with pytest.raises(KeyError):
68
+ MyRecord(A1Item(1, 2))
69
+
70
+
71
+ def test_record_update():
72
+ a = A1Item(1, 2)
73
+ b = BItem(4)
74
+ r = MyRecord(a, b)
75
+
76
+ r2 = r.update(BItem(3))
77
+ assert r is not r2
78
+ assert r2[BItem] is not b
79
+
80
+ r3 = MyRecord2.from_record(r, CItem(2), BItem(5))
81
+ assert r[BItem].b == 4
82
+ assert r3[BItem].b == 5
83
+
84
+
85
+ def test_record_decorator():
86
+ MyRecord2(A1Item(1, 2), BItem(2), CItem(3))
87
+
88
+
89
+ def test_record_type_update():
90
+ itemtypes = MyRecord2.from_types("Test", B1Item).itemtypes
91
+ assert itemtypes == frozenset((A1Item, B1Item, CItem))
92
+
93
+
94
+ def test_record_onthefly():
95
+ cache = RecordTypesCache("OnTheFly", CItem)
96
+
97
+ MyRecord2 = cache(MyRecord)
98
+ MyRecord2(A1Item(1, 2), BItem(2), CItem(3))
99
+
100
+ assert cache(MyRecord) is MyRecord2
101
+
102
+ r = MyRecord(A1Item(1, 2), BItem(2))
103
+ assert cache(r.__class__) is MyRecord2
104
+
105
+ r = cache.update(r, CItem(3))
106
+
107
+ # Same record type
108
+ cache2 = RecordTypesCache("OnTheFly", CItem)
109
+
110
+ cache2.update(r, CItem(4))
111
+
112
+
113
+ def test_record_pickled():
114
+ # First,
115
+ MyRecord2 = BaseRecord.from_types("MyRecordBis", BItem)
116
+ r = MyRecord2(A1Item(1, 2), BItem(2))
117
+ r = pickle.loads(pickle.dumps(r))
118
+
119
+ assert isinstance(r, BaseRecord) and not isinstance(r, MyRecord2)
120
+ cache = RecordTypesCache("OnTheFly", CItem)
121
+
122
+ assert r.is_pickled()
123
+
124
+ r2 = cache.update(r, CItem(4))
125
+ assert not r2.is_pickled()
126
+
127
+ # Test with cls update
128
+ with pytest.raises(KeyError):
129
+ cache.update(r, CItem(4), cls=BaseRecord)
130
+
131
+ # This is OK
132
+ cache.update(r, CItem(4), cls=MyRecord)
133
+
134
+ # --- Test when we update a pickled record with an item of a sub-class
135
+ cache = RecordTypesCache("OnTheFly", B1Item)
136
+ r2 = cache.update(r, B1Item(1, 2))
137
+
138
+
139
+ def test_record_pickled_single():
140
+ MyRecord2 = BaseRecord.from_types("MyRecordBis", BItem)
141
+ r = MyRecord2(A1Item(1, 2), BItem(2))
142
+ r = pickle.loads(pickle.dumps(r))
143
+
144
+ cache = SingleRecordTypeCache("OnTheFly", CItem)
145
+
146
+ updated = cache.update(r, CItem(4))
147
+
148
+ assert updated.itemtypes == frozenset((A1Item, BItem, CItem))
149
+
150
+ # Even with the wrong record, no change now
151
+ assert cache(BaseRecord).itemtypes == frozenset((A1Item, BItem, CItem))
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '1.0.4'
16
- __version_tuple__ = version_tuple = (1, 0, 4)
15
+ __version__ = version = '1.0.6'
16
+ __version_tuple__ = version_tuple = (1, 0, 6)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro
3
- Version: 1.0.4
3
+ Version: 1.0.6
4
4
  Summary: "Dataset management command line and API"
5
5
  Home-page: https://github.com/experimaestro/datamaestro
6
6
  Author: Benjamin Piwowarski
@@ -29,6 +29,7 @@ docs/source/style.css
29
29
  docs/source/api/data.md
30
30
  docs/source/api/download.rst
31
31
  docs/source/api/index.md
32
+ docs/source/api/records.rst
32
33
  src/datamaestro/__init__.py
33
34
  src/datamaestro/__main__.py
34
35
  src/datamaestro/context.py
@@ -1,2 +0,0 @@
1
- myst_parser > 0.18
2
- sphinx >= 4.2
@@ -1,196 +0,0 @@
1
- from typing import ClassVar, Type, TypeVar, Dict, List, Union, Optional
2
-
3
-
4
- class Item:
5
- """Base class for all item types"""
6
-
7
- @classmethod
8
- def __get_base__(cls: Type) -> Type:
9
- if base := getattr(cls, "__base__cache__", None):
10
- return base
11
-
12
- base = cls
13
- for supercls in cls.__mro__:
14
- if issubclass(supercls, Item) and supercls is not Item:
15
- base = supercls
16
- setattr(cls, "__base__cache__", base)
17
- return base
18
-
19
-
20
- T = TypeVar("T", bound=Item)
21
- Items = Dict[Type[T], T]
22
-
23
-
24
- class Record:
25
- """Associate types with entries"""
26
-
27
- items: Items
28
-
29
- def __init__(self, *items: Union[Items, T], no_check=False):
30
- self.items = {}
31
-
32
- if len(items) == 1 and isinstance(items[0], dict):
33
- self.items = items[0]
34
- else:
35
- for item in items:
36
- self.add(item, update_only=True)
37
-
38
- # Check if the record is constructured
39
- if not no_check:
40
- self.validate()
41
-
42
- def __new__(cls, *items: Union[Items, T], no_check=False):
43
- # Without this, impossible to pickle objects
44
- if cls.__trueclass__ is not None:
45
- record = object.__new__(cls.__trueclass__)
46
- record.__init__(*items, no_check=True)
47
- if not no_check:
48
- record.validate(cls=cls)
49
- return record
50
-
51
- return object.__new__(cls)
52
-
53
- def __str__(self):
54
- return (
55
- "{"
56
- + ", ".join(f"{key}: {value}" for key, value in self.items.items())
57
- + "}"
58
- )
59
-
60
- def validate(self, cls: Type["Record"] = None):
61
- """Validate the record"""
62
- cls = cls if cls is not None else self.__class__
63
-
64
- if cls.itemtypes:
65
- for itemtype in cls.itemtypes:
66
- try:
67
- self.__getitem__(itemtype)
68
- except KeyError:
69
- raise KeyError(f"Item of type {itemtype} is missing")
70
-
71
- if len(self.items) != len(cls.itemtypes):
72
- unregistered = [
73
- item
74
- for item in self.items.values()
75
- if all(
76
- not issubclass(item.__get_base__(), itemtype)
77
- for itemtype in cls.itemtypes
78
- )
79
- ]
80
- raise RuntimeError(
81
- f"The record {cls} contains unregistered items: {unregistered}"
82
- )
83
-
84
- def get(self, key: Type[T]) -> Optional[T]:
85
- try:
86
- return self[key]
87
- except KeyError:
88
- return None
89
-
90
- def has(self, key: Type[T]) -> bool:
91
- return key.__get_base__() in self.items
92
-
93
- def __getitem__(self, key: Type[T]) -> T:
94
- """Get an item given its type"""
95
- base = key.__get_base__()
96
- entry = self.items[base]
97
-
98
- # Check if this matches the expected class
99
- if not isinstance(entry, key):
100
- raise KeyError(f"No entry with type {key}")
101
- return entry
102
-
103
- def add(self, *entries: T, update_only=False, no_check=False) -> "Record":
104
- """Update the record with this new entry, returns a new record if
105
- it exists"""
106
-
107
- for entry in entries:
108
- # Returns a new record if the item exists
109
- base = entry.__get_base__()
110
- if base in self.items:
111
- if update_only:
112
- raise RuntimeError(
113
- f"The item type {base} ({entry.__class__})"
114
- " is already in the record"
115
- )
116
- return self.__class__({**self.items, base: entry}, no_check=no_check)
117
-
118
- # No, just update
119
- self.items[base] = entry
120
- return self
121
-
122
- # --- Class methods and variables
123
-
124
- itemtypes: ClassVar[List[Type[T]]] = []
125
- """For specific records, this is the list of types"""
126
-
127
- __trueclass__: ClassVar[Optional[Type["Record"]]] = None
128
- """True when the class is defined in a module"""
129
-
130
- @classmethod
131
- def has_type(cls, itemtype: Type[T]):
132
- return any(issubclass(cls_itemtype, itemtype) for cls_itemtype in cls.itemtypes)
133
-
134
- @classmethod
135
- def _subclass(cls, *itemtypes: Type[T]):
136
- cls_itemtypes = [x for x in getattr(cls, "itemtypes", [])]
137
- mapping = {
138
- ix: itemtype.__get_base__() for ix, itemtype in enumerate(cls_itemtypes)
139
- }
140
-
141
- for itemtype in itemtypes:
142
- if ix := mapping.get(itemtype.__get_base__(), None):
143
- cls_itemtypes[ix] = itemtype
144
- else:
145
- cls_itemtypes.append(itemtype)
146
- return cls_itemtypes
147
-
148
- @classmethod
149
- def from_types(cls, name: str, *itemtypes: Type[T], module: str = None):
150
- extra_dict = {}
151
- if module:
152
- extra_dict["__module__"] = module
153
- return type(
154
- name,
155
- (cls,),
156
- {
157
- **extra_dict,
158
- "__trueclass__": cls.__trueclass__ or cls,
159
- "itemtypes": cls._subclass(*itemtypes),
160
- },
161
- )
162
-
163
-
164
- def recordtypes(*types: List[Type[T]]):
165
- """Adds types for a new record class"""
166
-
167
- def decorate(cls: Type[Record]):
168
- (base_cls,) = [base for base in cls.__bases__ if issubclass(base, Record)]
169
-
170
- setattr(cls, "itemtypes", base_cls._subclass(*types))
171
- return cls
172
-
173
- return decorate
174
-
175
-
176
- class RecordTypesCache:
177
- """Class to use when new record types need to be created on the fly by
178
- adding new items"""
179
-
180
- def __init__(self, name: str, *itemtypes: Type[T], module: str = None):
181
- self._module = module
182
- self._name = name
183
- self._itemtypes = itemtypes
184
- self._cache: Dict[Type[Record], Type[Record]] = {}
185
-
186
- def __getitem__(self, record_type: Type[Record]):
187
- if updated_type := self._cache.get(record_type, None):
188
- return updated_type
189
-
190
- updated_type = record_type.from_types(
191
- f"{self._name}_{record_type.__name__}",
192
- *self._itemtypes,
193
- module=self._module,
194
- )
195
- self._cache[record_type] = updated_type
196
- return updated_type
@@ -1,81 +0,0 @@
1
- from datamaestro.record import Record, Item, RecordTypesCache, recordtypes
2
- from attrs import define
3
- import pytest
4
-
5
-
6
- @define
7
- class AItem(Item):
8
- a: int
9
-
10
-
11
- @define
12
- class A1Item(AItem):
13
- a1: int
14
-
15
-
16
- @define
17
- class BItem(Item):
18
- b: int
19
-
20
-
21
- @define
22
- class CItem(Item):
23
- c: int
24
-
25
-
26
- class MyRecord(Record):
27
- itemtypes = [A1Item, BItem]
28
-
29
-
30
- @recordtypes(CItem)
31
- class MyRecord2(MyRecord):
32
- pass
33
-
34
-
35
- def test_record_simple():
36
- a = A1Item(1, 2)
37
- b = BItem(4)
38
- r = MyRecord(a, b)
39
- assert r[AItem] is a
40
- assert r[A1Item] is a
41
- assert r[BItem] is b
42
-
43
-
44
- def test_record_missing_init():
45
- with pytest.raises(KeyError):
46
- MyRecord(AItem(1), BItem(2))
47
-
48
- with pytest.raises(KeyError):
49
- MyRecord(A1Item(1, 2))
50
-
51
-
52
- def test_record_update():
53
- a = A1Item(1, 2)
54
- b = BItem(4)
55
- r = MyRecord(a, b)
56
-
57
- r2 = r.add(BItem(3))
58
- assert r is not r2
59
- assert r2[BItem] is not b
60
-
61
-
62
- def test_record_decorator():
63
- MyRecord2(A1Item(1, 2), BItem(2), CItem(3))
64
-
65
-
66
- def test_record_newtype():
67
- MyRecord2 = MyRecord.from_types("MyRecord2", CItem)
68
- r = MyRecord2(A1Item(1, 2), BItem(2), CItem(3))
69
-
70
- # For a dynamic class, we should have the same MyRecord type
71
- assert r.__class__ is MyRecord
72
-
73
-
74
- def test_record_onthefly():
75
- cache = RecordTypesCache("OnTheFly", CItem)
76
-
77
- MyRecord2 = cache[MyRecord]
78
- r = MyRecord2(A1Item(1, 2), BItem(2), CItem(3))
79
- assert r.__class__ is MyRecord
80
-
81
- assert cache[MyRecord] is MyRecord2
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes