datamaestro 1.0.6__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. {datamaestro-1.0.6 → datamaestro-1.2.0}/PKG-INFO +1 -1
  2. datamaestro-1.2.0/docs/source/api/records.rst +112 -0
  3. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/annotations/agreement.py +9 -3
  4. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/context.py +18 -9
  5. datamaestro-1.2.0/src/datamaestro/data/ml.py +27 -0
  6. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/definitions.py +58 -18
  7. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/__init__.py +31 -2
  8. datamaestro-1.2.0/src/datamaestro/record.py +177 -0
  9. datamaestro-1.2.0/src/datamaestro/test/test_record.py +72 -0
  10. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/version.py +2 -2
  11. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/PKG-INFO +1 -1
  12. datamaestro-1.0.6/docs/source/api/records.rst +0 -59
  13. datamaestro-1.0.6/src/datamaestro/data/ml.py +0 -19
  14. datamaestro-1.0.6/src/datamaestro/record.py +0 -312
  15. datamaestro-1.0.6/src/datamaestro/test/test_record.py +0 -151
  16. {datamaestro-1.0.6 → datamaestro-1.2.0}/.coverage +0 -0
  17. {datamaestro-1.0.6 → datamaestro-1.2.0}/.github/workflows/pytest.yml +0 -0
  18. {datamaestro-1.0.6 → datamaestro-1.2.0}/.github/workflows/python-publish.yml +0 -0
  19. {datamaestro-1.0.6 → datamaestro-1.2.0}/.gitignore +0 -0
  20. {datamaestro-1.0.6 → datamaestro-1.2.0}/.pre-commit-config.yaml +0 -0
  21. {datamaestro-1.0.6 → datamaestro-1.2.0}/.readthedocs.yml +0 -0
  22. {datamaestro-1.0.6 → datamaestro-1.2.0}/CHANGELOG.md +0 -0
  23. {datamaestro-1.0.6 → datamaestro-1.2.0}/LICENSE +0 -0
  24. {datamaestro-1.0.6 → datamaestro-1.2.0}/MANIFEST.in +0 -0
  25. {datamaestro-1.0.6 → datamaestro-1.2.0}/README.md +0 -0
  26. {datamaestro-1.0.6 → datamaestro-1.2.0}/TODO.md +0 -0
  27. {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/Makefile +0 -0
  28. {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/make.bat +0 -0
  29. {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/requirements.txt +0 -0
  30. {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/api/data.md +0 -0
  31. {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/api/download.rst +0 -0
  32. {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/api/index.md +0 -0
  33. {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/conf.py +0 -0
  34. {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/datasets.rst +0 -0
  35. {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/developping.md +0 -0
  36. {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/index.md +0 -0
  37. {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/style.css +0 -0
  38. {datamaestro-1.0.6 → datamaestro-1.2.0}/mkdocs.yml +0 -0
  39. {datamaestro-1.0.6 → datamaestro-1.2.0}/pyproject.toml +0 -0
  40. {datamaestro-1.0.6 → datamaestro-1.2.0}/pytest.ini +0 -0
  41. {datamaestro-1.0.6 → datamaestro-1.2.0}/requirements-dev.txt +0 -0
  42. {datamaestro-1.0.6 → datamaestro-1.2.0}/requirements.txt +0 -0
  43. {datamaestro-1.0.6 → datamaestro-1.2.0}/schema.yaml +0 -0
  44. {datamaestro-1.0.6 → datamaestro-1.2.0}/setup.cfg +0 -0
  45. {datamaestro-1.0.6 → datamaestro-1.2.0}/setup.py +0 -0
  46. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/__init__.py +0 -0
  47. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/__main__.py +0 -0
  48. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/annotations/__init__.py +0 -0
  49. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/commands/__init__.py +0 -0
  50. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/commands/mainstyle.css +0 -0
  51. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/commands/site.py +0 -0
  52. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/data/__init__.py +0 -0
  53. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/data/csv.py +0 -0
  54. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/data/huggingface.py +0 -0
  55. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/data/tensor.py +0 -0
  56. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/archive.py +0 -0
  57. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/huggingface.py +0 -0
  58. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/links.py +0 -0
  59. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/manual.py +0 -0
  60. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/multiple.py +0 -0
  61. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/single.py +0 -0
  62. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/sync.py +0 -0
  63. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/todo.py +0 -0
  64. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/registry.py +0 -0
  65. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/search.py +0 -0
  66. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/settings.py +0 -0
  67. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/sphinx.py +0 -0
  68. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/stream/__init__.py +0 -0
  69. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/stream/compress.py +0 -0
  70. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/stream/lines.py +0 -0
  71. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/templates/dataset.py +0 -0
  72. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/test/__init__.py +0 -0
  73. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/test/checks.py +0 -0
  74. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/test/conftest.py +0 -0
  75. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/test/test_annotations.py +0 -0
  76. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/test/test_download_handlers.py +0 -0
  77. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/utils.py +0 -0
  78. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/SOURCES.txt +0 -0
  79. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/dependency_links.txt +0 -0
  80. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/entry_points.txt +0 -0
  81. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/not-zip-safe +0 -0
  82. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/requires.txt +0 -0
  83. {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/top_level.txt +0 -0
  84. {datamaestro-1.0.6 → datamaestro-1.2.0}/tox.ini +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro
3
- Version: 1.0.6
3
+ Version: 1.2.0
4
4
  Summary: "Dataset management command line and API"
5
5
  Home-page: https://github.com/experimaestro/datamaestro
6
6
  Author: Benjamin Piwowarski
@@ -0,0 +1,112 @@
1
+ Records
2
+ =======
3
+
4
+ Records are flexible ways to compose information coming from various sources. For instance,
5
+ your processing chain can produce records only containing an ID. Later, you can retrieve
6
+ the item content and add it to the record. Further in the processing, you would want to add
7
+ some transformation of the item content.
8
+
9
+ Records make this kind of transformation possible by holding a set of **items**. Record types
10
+ form a lattice of types so that checking that some item types are present in a record is easy.
11
+
12
+ .. code-block:: python
13
+ @define
14
+ class AItem(Item):
15
+ a: int
16
+
17
+
18
+ @define
19
+ class A1Item(AItem):
20
+ a1: int
21
+
22
+
23
+ @define
24
+ class BItem(Item):
25
+ b: int
26
+
27
+
28
+ @define
29
+ class CItem(Item):
30
+ c: int
31
+
32
+
33
+
34
+ record = Record(AItem(1), BItem(2))
35
+ print(record[AItem].a) # 1
36
+ print(record[BItem].b) # 2
37
+
38
+ # records types are only defined by their item types
39
+ other_record = Record(A1Item(1), BItem(2))
40
+
41
+ # records can be updated
42
+ new_record = record.update(BItem(3), CItem(4))
43
+ print(new_record[BItem].b) # 3
44
+ print(new_record[CItem].c) # 4
45
+
46
+ # records only hold one instance of a given item
47
+ # base type
48
+ new_record_a1 = record.update(A1Item(3, 4))
49
+ print(new_record_a1[AItem].a) # 3
50
+ print(new_record_a1[A1Item].a) # 3
51
+ print(new_record_a1[A1Item].a1) # 4
52
+
53
+
54
+ Working with record types
55
+ *************************
56
+
57
+ Record types form a lattice of types that can be used to check
58
+ record properties beforehand.
59
+
60
+ .. code-block:: python
61
+
62
+ ABRecord = record_type(AItem, BItem)
63
+ AB1Record = record_type(AItem, B1Item)
64
+
65
+ # Hierarchy-based check
66
+ assert ABRecord.contains(AB1Record)
67
+
68
+ # Checks for specific types
69
+ assert ABRecord.has(AItem, BItem)
70
+
71
+ Validating
72
+ **********
73
+
74
+ To ensure that a record fulfills the required properties,
75
+ one can use record types
76
+
77
+ .. code-block:: python
78
+
79
+ ABRecord = record_type(AItem, BItem)
80
+
81
+ # OK
82
+ ABRecord(AItem(1), BItem(2))
83
+
84
+ # Fails: A1Item is not AItem
85
+ ABRecord(A1Item(1), BItem(2))
86
+
87
+ # Fails: AItem is not present
88
+ ABRecord(BItem(2))
89
+
90
+ When updating, it is also possible to validate
91
+
92
+ .. code-block:: python
93
+
94
+ A1BRecord = record_type(A1Item, BItem)
95
+ record = Record(AItem(1), BItem(2))
96
+
97
+ # Update the ABRecord into a A1/B one
98
+ record.update(A1Item(1, 2), target=A1BRecord)
99
+
100
+
101
+ API
102
+ ***
103
+
104
+ .. autoclass:: datamaestro.record.Item
105
+
106
+ .. autoclass:: datamaestro.record.RecordType
107
+ :members: __call__, validate, sub
108
+
109
+ .. autoclass:: datamaestro.record.Record
110
+ :members: update, has, get
111
+
112
+ .. autofunction:: datamaestro.record.record_type
@@ -1,9 +1,15 @@
1
- import logging
2
- from datamaestro.definitions import DatasetAnnotation, AbstractDataset, hook
1
+ from typing import Optional
2
+ from datamaestro.definitions import AbstractDataset, hook
3
3
 
4
4
 
5
5
  @hook("pre-use")
6
- def useragreement(definition: AbstractDataset, message, id=None):
6
+ def useragreement(definition: AbstractDataset, message: str, id: Optional[str] = None):
7
+ """Asks for a user-agreement
8
+
9
+ :param definition: The dataset for which the agreement is asked
10
+ :param message: The agreement text
11
+ :param id: The ID of the agreement (default to the dataset ID)
12
+ """
7
13
  # Skip agreement when testing
8
14
  if definition.context.running_test:
9
15
  return
@@ -110,19 +110,20 @@ class Context:
110
110
  if repositoryid is None:
111
111
  return None
112
112
 
113
- l = [
113
+ entry_points = [
114
114
  x
115
115
  for x in pkg_resources.iter_entry_points(
116
116
  "datamaestro.repositories", repositoryid
117
117
  )
118
118
  ]
119
- if not l:
119
+ if not entry_points:
120
120
  raise Exception("No datasets repository named %s", repositoryid)
121
- if len(l) > 1:
121
+ if len(entry_points) > 1:
122
122
  raise Exception(
123
- "Too many datasets repository named %s (%d)" % (repositoryid, len(l))
123
+ "Too many datasets repository named %s (%d)"
124
+ % (repositoryid, len(entry_points))
124
125
  )
125
- return l[0].load()(self)
126
+ return entry_points[0].load()(self)
126
127
 
127
128
  @property
128
129
  def running_test(self):
@@ -175,7 +176,6 @@ class Context:
175
176
  if dlpath.is_file():
176
177
  logging.debug("Using cached file %s for %s", dlpath, url)
177
178
  else:
178
-
179
179
  logging.info("Downloading %s", url)
180
180
  tmppath = dlpath.with_suffix(".tmp")
181
181
 
@@ -188,7 +188,7 @@ class Context:
188
188
 
189
189
  def ask(self, question: str, options: Dict[str, str]):
190
190
  """Ask a question to the user"""
191
- print(question)
191
+ print(question) # noqa: T201
192
192
  answer = None
193
193
  while answer not in options:
194
194
  answer = input().strip().lower()
@@ -268,6 +268,7 @@ class Datasets(Iterable["AbstractDataset"]):
268
268
 
269
269
  def __iter__(self) -> Iterable["AbstractDataset"]:
270
270
  from .definitions import DatasetWrapper
271
+ from datamaestro.data import Base
271
272
 
272
273
  # Iterates over defined symbols
273
274
  for key, value in self.module.__dict__.items():
@@ -276,10 +277,18 @@ class Datasets(Iterable["AbstractDataset"]):
276
277
  # Ensure it comes from the module
277
278
  if self.module.__name__ == value.t.__module__:
278
279
  yield value
280
+ elif (
281
+ inspect.isclass(value)
282
+ and issubclass(value, Base)
283
+ and hasattr(value, "__dataset__")
284
+ ):
285
+ if self.module.__name__ == value.__module__:
286
+ yield value.__dataset__
279
287
 
280
288
 
281
289
  class Repository:
282
- """A repository regroup a set of datasets and their corresponding specific handlers (downloading, filtering, etc.)"""
290
+ """A repository regroup a set of datasets and their corresponding specific
291
+ handlers (downloading, filtering, etc.)"""
283
292
 
284
293
  def __init__(self, context: Context):
285
294
  """Initialize a new repository
@@ -315,7 +324,7 @@ class Repository:
315
324
  try:
316
325
  return get_distribution(cls.__module__).version
317
326
  except DistributionNotFound:
318
- __version__ = None
327
+ return None
319
328
 
320
329
  def __repr__(self):
321
330
  return "Repository(%s)" % self.basedir
@@ -0,0 +1,27 @@
1
+ """Machine learning generic data formats"""
2
+ from typing import Generic, TypeVar, Optional
3
+ from pathlib import Path
4
+ from experimaestro import Param, Meta, argument
5
+ from . import Base
6
+
7
+ Train = TypeVar("Train", bound=Base)
8
+ Validation = TypeVar("Validation", bound=Base)
9
+ Test = TypeVar("Test", bound=Base)
10
+
11
+
12
class Supervised(Base, Generic[Train, Validation, Test]):
    """A dataset made of a training split and optional validation and test
    splits"""

    train: Param[Base]
    """The training dataset"""

    validation: Param[Optional[Base]] = None
    """The validation dataset (optional)"""

    test: Param[Optional[Base]] = None
    """The test dataset (optional)"""
21
+
22
+
23
@argument("classes")
class FolderBased(Base):
    """Classification dataset where folders give the basis"""

    # Location of the dataset; presumably the root folder whose sub-folders
    # correspond to the classes -- TODO confirm against the callers
    path: Meta[Path]
@@ -127,6 +127,13 @@ class AbstractDataset(AbstractData):
127
127
  """
128
128
 
129
129
  name: Optional[str] = None
130
+ """The name of the dataset"""
131
+
132
+ url: Optional[str] = None
133
+ """The URL of the dataset"""
134
+
135
+ doi: Optional[str] = None
136
+ """The DOI of this dataset"""
130
137
 
131
138
  def __init__(self, repository: Optional["Repository"]):
132
139
  super().__init__()
@@ -136,6 +143,7 @@ class AbstractDataset(AbstractData):
136
143
 
137
144
  # Associated resources
138
145
  self.resources: Dict[str, "Download"] = {}
146
+ self.ordered_resources = []
139
147
 
140
148
  # Hooks
141
149
  # pre-use: before returning the dataset object
@@ -194,13 +202,15 @@ class AbstractDataset(AbstractData):
194
202
  def download(self, force=False):
195
203
  """Download all the necessary resources"""
196
204
  success = True
197
- for key, resource in self.resources.items():
205
+ logging.info("Materializing %d resources", len(self.ordered_resources))
206
+ for resource in self.ordered_resources:
198
207
  try:
199
208
  resource.download(force)
200
209
  except Exception:
201
- logging.error("Could not download resource %s", key)
210
+ logging.error("Could not download resource %s", resource)
202
211
  traceback.print_exc()
203
212
  success = False
213
+ break
204
214
  return success
205
215
 
206
216
  @staticmethod
@@ -249,6 +259,7 @@ class DatasetWrapper(AbstractDataset):
249
259
  def __init__(self, annotation, t: type):
250
260
  self.t = t
251
261
  self.base = annotation.base
262
+ self.config = None
252
263
  assert self.base is not None, f"Could not set the Config type for {t}"
253
264
 
254
265
  repository, components = DataDefinition.repository_relpath(t)
@@ -256,6 +267,7 @@ class DatasetWrapper(AbstractDataset):
256
267
 
257
268
  # Set some variables
258
269
  self.url = annotation.url
270
+ self.doi = annotation.doi
259
271
 
260
272
  # Builds the ID:
261
273
  # Removes module_name.config prefix
@@ -322,7 +334,18 @@ class DatasetWrapper(AbstractDataset):
322
334
  """Returns a pointer to a potential attribute"""
323
335
  return FutureAttr(self, [key])
324
336
 
337
    def download(self, force=False):
        """Download the resources of this dataset

        :param force: forwarded to the parent class ``download``
        :return: the value returned by the parent class ``download``
        """
        # When the decorated type is itself the base type, run _prepare()
        # before downloading (presumably so that the resources get registered
        # -- TODO confirm)
        if self.base is self.t:
            self._prepare()
        return super().download(force=force)
341
+
325
342
  def _prepare(self, download=False) -> "Base":
343
+ if self.config is not None:
344
+ return self.config
345
+
346
+ if self.base is self.t:
347
+ self.config = self.base.__create_dataset__(self)
348
+
326
349
  if download:
327
350
  for hook in self.hooks["pre-download"]:
328
351
  hook(self)
@@ -332,23 +355,23 @@ class DatasetWrapper(AbstractDataset):
332
355
  for hook in self.hooks["pre-use"]:
333
356
  hook(self)
334
357
 
335
- resources = {key: value.prepare() for key, value in self.resources.items()}
336
- dict = self.t(**resources)
337
- if dict is None:
338
- name = self.t.__name__
339
- filename = inspect.getfile(self.t)
340
- raise Exception(
341
- f"The dataset method {name} defined in "
342
- f"{filename} returned a null object"
343
- )
344
-
345
358
  # Construct the object
346
- data = self.base(**dict)
359
+ if self.config is None:
360
+ resources = {key: value.prepare() for key, value in self.resources.items()}
361
+ dict = self.t(**resources)
362
+ if dict is None:
363
+ name = self.t.__name__
364
+ filename = inspect.getfile(self.t)
365
+ raise Exception(
366
+ f"The dataset method {name} defined in "
367
+ f"{filename} returned a null object"
368
+ )
369
+ self.config = self.base(**dict)
347
370
 
348
371
  # Set the ids
349
- self.setDataIDs(data, self.id)
372
+ self.setDataIDs(self.config, self.id)
350
373
 
351
- return data
374
+ return self.config
352
375
 
353
376
  @property
354
377
  def _path(self) -> Path:
@@ -455,7 +478,9 @@ datatasks = DataTagging(lambda d: d.tasks)
455
478
 
456
479
 
457
480
  class dataset:
458
- def __init__(self, base=None, *, timestamp=None, id=None, url=None, size=None):
481
+ def __init__(
482
+ self, base=None, *, timestamp=None, id=None, url=None, size=None, doi=None
483
+ ):
459
484
  """Creates a new (meta)dataset
460
485
 
461
486
  Meta-datasets are not associated with any base type
@@ -473,6 +498,8 @@ class dataset:
473
498
  url {[type]} -- [description] (default: {None})
474
499
 
475
500
  size {str} -- The size (should be a parsable format)
501
+
502
+ doi {str} -- The DOI of the corresponding paper
476
503
  """
477
504
  if hasattr(base, "__datamaestro__") and isinstance(
478
505
  base.__datamaestro__, metadataset
@@ -486,18 +513,31 @@ class dataset:
486
513
  self.meta = False
487
514
  self.timestamp = timestamp
488
515
  self.size = size
516
+ self.doi = doi
489
517
 
490
518
  def __call__(self, t):
491
519
  try:
492
520
  if self.base is None:
493
- # Get type from return annotation
494
- self.base = t.__annotations__["return"]
521
+ from datamaestro.data import Base
522
+
523
+ if inspect.isclass(t) and issubclass(t, Base):
524
+ self.base = t
525
+ else:
526
+ # Get type from return annotation
527
+ try:
528
+ self.base = t.__annotations__["return"]
529
+ except KeyError:
530
+ logging.warning("No return annotation in %s", t)
531
+ raise
495
532
  object.__getattribute__(t, "__datamaestro__")
496
533
  raise AssertionError("@data should only be called once")
497
534
  except AttributeError:
498
535
  pass
499
536
 
500
537
  dw = DatasetWrapper(self, t)
538
+ t.__dataset__ = dw
539
+ if inspect.isclass(t) and issubclass(t, Base):
540
+ return t
501
541
  return dw
502
542
 
503
543
 
@@ -1,6 +1,8 @@
1
+ from typing import Union
1
2
  from abc import ABC, abstractmethod
2
3
  from datamaestro.definitions import AbstractDataset, DatasetAnnotation
3
4
  from datamaestro.utils import deprecated
5
+ from attrs import define
4
6
 
5
7
 
6
8
  def initialized(method):
@@ -15,7 +17,12 @@ def initialized(method):
15
17
  return wrapper
16
18
 
17
19
 
18
- class Download(DatasetAnnotation, ABC):
20
@define(kw_only=True)
class SetupOptions:
    """Options that can be given to ``setup``; no field is defined yet"""

    pass
23
+
24
+
25
+ class Resource(DatasetAnnotation, ABC):
19
26
  """
20
27
  Base class for all download handlers
21
28
  """
@@ -24,13 +31,16 @@ class Download(DatasetAnnotation, ABC):
24
31
  self.varname = varname
25
32
  # Ensures that the object is initialized
26
33
  self._post = False
34
+ self.definition = None
27
35
 
28
36
  def annotate(self, dataset: AbstractDataset):
37
+ assert self.definition is None
29
38
  # Register has a resource download
30
39
  if self.varname in dataset.resources:
31
40
  raise AssertionError("Name %s already declared as a resource", self.varname)
32
41
 
33
42
  dataset.resources[self.varname] = self
43
+ dataset.ordered_resources.append(self)
34
44
  self.definition = dataset
35
45
 
36
46
  @property
@@ -53,10 +63,29 @@ class Download(DatasetAnnotation, ABC):
53
63
  """Prepares the dataset"""
54
64
  ...
55
65
 
66
    def setup(
        self,
        dataset: Union[AbstractDataset],
        options: Union[SetupOptions, None] = None,
    ):
        """Direct way to setup the resource (no annotation)

        :param dataset: the dataset this resource is applied to
        :param options: setup options (not used by this implementation)
        :return: the value returned by ``prepare``
        """
        # Apply the resource to the dataset by calling it (presumably this
        # goes through the annotation mechanism -- TODO confirm), then prepare
        self(dataset)
        return self.prepare()
74
+
75
+
76
+ # Keeps backward compatibility: `Download` is the former name of `Resource`
77
+ Download = Resource
78
+
56
79
 
57
80
  class reference(Download):
58
- def __init__(self, varname, reference):
81
+ def __init__(self, varname=None, reference=None):
82
+ """References another dataset
83
+
84
+ :param varname: The name of the variable
85
+ :param reference: Another dataset
86
+ """
59
87
  super().__init__(varname)
88
+ assert reference is not None, "Reference cannot be null"
60
89
  self.reference = reference
61
90
 
62
91
  def prepare(self):
@@ -0,0 +1,177 @@
1
+ from typing import Type, TypeVar, Dict, Union, Optional
2
+
3
+
4
class Item:
    """Base class for all item types"""

    @classmethod
    def __get_base__(cls: Type) -> Type:
        """Get the most generic superclass for this type of item

        This is the last class of the MRO that is a strict subclass of
        :class:`Item` (or ``cls`` itself if there is none). The result is
        cached on the class.
        """
        # Only trust a cache stored on this very class. A cache inherited from
        # a parent (e.g. set by ``Item.__get_base__()``, or by one of several
        # parents) would make the answer depend on which class was queried
        # first.
        if base := cls.__dict__.get("__base__cache__", None):
            return base

        base = cls
        for supercls in cls.__mro__:
            if issubclass(supercls, Item) and supercls is not Item:
                base = supercls
        setattr(cls, "__base__cache__", base)
        return base
19
+
20
+
21
+ T = TypeVar("T", bound=Item)
22
+ Items = Dict[Type[T], T]
23
+
24
+
25
class RecordType:
    """Describes a type of record by the item types it holds

    Item types are indexed by their most generic item superclass (as given
    by ``__get_base__``).
    """

    def __init__(self, *item_types: Type[T]):
        self.item_types = frozenset(item_types)
        self.mapping = {item_type.__get_base__(): item_type for item_type in item_types}

    def __repr__(self):
        names = ",".join(item_type.__name__ for item_type in self.item_types)
        return f"Record({names})"

    def contains(self, other: "RecordType"):
        """Checks that each item type in other has an item type of a compatible
        type in self

        Both record types must have the same number of item types, and each
        item type of ``other`` must be a superclass of (or the same as) the
        item type registered in ``self`` for the same base.
        """
        if len(self.item_types) != len(other.item_types):
            return False

        for item_type in other.item_types:
            if matching_type := self.mapping.get(item_type.__get_base__(), None):
                if not issubclass(matching_type, item_type):
                    return False
            else:
                return False

        return True

    def sub(self, *item_types: Type[T]):
        """Returns a new record type based on self and new item types

        An item type sharing its base with an existing one replaces it; the
        other ones are added.
        """
        cls_itemtypes = list(self.item_types)
        mapping = {
            itemtype.__get_base__(): ix for ix, itemtype in enumerate(cls_itemtypes)
        }

        for itemtype in item_types:
            if (ix := mapping.get(itemtype.__get_base__(), -1)) >= 0:
                cls_itemtypes[ix] = itemtype
            else:
                cls_itemtypes.append(itemtype)

        return record_type(*cls_itemtypes)

    def __call__(self, *items: T):
        """Creates a record from the items and validates it

        :raises KeyError: if the record does not match this record type
        """
        record = Record(*items)
        self.validate(record)
        return record

    def has(self, itemtype: Type[T], *itemtypes: Type[T]) -> bool:
        """Returns True if this record type provides all the given item types

        An item type is provided when the type registered for its base is
        that type or one of its subclasses. Unknown item types give False
        (rather than raising a KeyError).
        """
        for candidate in (itemtype, *itemtypes):
            registered = self.mapping.get(candidate.__get_base__(), None)
            if registered is None or not issubclass(registered, candidate):
                return False
        return True

    def validate(self, record: "Record"):
        """Validates that a record matches this record type

        No check is performed when this record type has no item type.

        :param record: the record to check
        :return: the record itself
        :raises KeyError: if an item is missing, or if the record holds
            items that are not part of this record type
        """
        if self.item_types:
            for item_type in self.item_types:
                try:
                    record.__getitem__(item_type)
                except KeyError:
                    raise KeyError(f"Item of type {item_type} is missing")

            if len(record.items) != len(self.item_types):
                # Items are matched through their base type: comparing the
                # base with the registered (possibly more specific) types
                # would wrongly report valid items
                unregistered = [
                    item
                    for item in record.items.values()
                    if item.__get_base__() not in self.mapping
                ]
                raise KeyError(
                    f"The record of type {self} contains unregistered items: {unregistered}"
                )

        return record
96
+
97
+
98
def record_type(*item_types: Type[T]) -> RecordType:
    """Returns a new record type

    :param item_types: the item types of the record type
    :return: a :class:`RecordType` built from ``item_types``
    """
    return RecordType(*item_types)
101
+
102
+
103
class Record:
    """Associate types with entries

    A record is a composition of items; each item base class is unique.
    """

    #: Items for this record
    items: Items

    def __init__(self, *items: Union[Items, T], override=False):
        """Creates a new record

        :param items: either the items of the record, or a single dictionary
            mapping item base types to items
        :param override: when True, an item silently replaces a previous one
            that has the same base type
        :raises RuntimeError: if two items share the same base type and
            ``override`` is False
        """
        self.items = {}

        if len(items) == 1 and isinstance(items[0], dict):
            # Copy the dictionary so that the record does not share its
            # state with the caller
            self.items = dict(items[0])
        else:
            for entry in items:
                base = entry.__get_base__()
                if not override and base in self.items:
                    raise RuntimeError(
                        f"The item type {base} ({entry.__class__})"
                        " is already in the record"
                    )
                self.items[base] = entry

    def __str__(self):
        return (
            "{"
            + ", ".join(
                f"{key.__module__}.{key.__qualname__}: {value}"
                for key, value in self.items.items()
            )
            + "}"
        )

    def __repr__(self):
        return (
            "{"
            + ", ".join(
                f"{key.__module__}.{key.__qualname__}: {repr(value)}"
                for key, value in self.items.items()
            )
            + "}"
        )

    def get(self, key: Type[T]) -> Optional[T]:
        """Get a given item or None if it does not exist"""
        try:
            return self[key]
        except KeyError:
            return None

    def has(self, key: Type[T]) -> bool:
        """Returns True if the record has an item with the same base type"""
        return key.__get_base__() in self.items

    def __getitem__(self, key: Type[T]) -> T:
        """Get an item given its type

        :raises KeyError: if there is no item for the base type of ``key``,
            or if the stored item is not an instance of ``key``
        """
        base = key.__get_base__()
        entry = self.items[base]

        # Check if this matches the expected class
        if not isinstance(entry, key):
            raise KeyError(f"No entry with type {key}")
        return entry

    def update(self, *items: T, target: Optional[RecordType] = None) -> "Record":
        """Returns a new record where some items are added or replaced

        :param items: the new items; each one replaces the item that has the
            same base type, if any
        :param target: when given, the new record is validated against this
            record type
        :return: a new record (this record is left untouched)
        :raises KeyError: if ``target`` is given and the new record does not
            match it
        """
        # Create our new dictionary
        item_dict = {**self.items}
        for item in items:
            item_dict[item.__get_base__()] = item

        record = Record(item_dict)
        if target is not None:
            target.validate(record)
        return record