datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. datamaestro/__init__.py +11 -7
  2. datamaestro/__main__.py +29 -8
  3. datamaestro/annotations/__init__.py +1 -1
  4. datamaestro/annotations/agreement.py +9 -3
  5. datamaestro/commands/site.py +27 -15
  6. datamaestro/context.py +143 -87
  7. datamaestro/data/__init__.py +23 -11
  8. datamaestro/data/csv.py +12 -12
  9. datamaestro/data/huggingface.py +25 -0
  10. datamaestro/data/ml.py +19 -10
  11. datamaestro/data/tensor.py +32 -24
  12. datamaestro/definitions.py +492 -131
  13. datamaestro/download/__init__.py +610 -24
  14. datamaestro/download/archive.py +129 -77
  15. datamaestro/download/custom.py +53 -0
  16. datamaestro/download/huggingface.py +77 -0
  17. datamaestro/download/links.py +106 -50
  18. datamaestro/download/multiple.py +27 -5
  19. datamaestro/download/single.py +114 -51
  20. datamaestro/download/sync.py +0 -1
  21. datamaestro/download/todo.py +9 -4
  22. datamaestro/download/wayback.py +164 -0
  23. datamaestro/record.py +232 -0
  24. datamaestro/registry.py +1 -0
  25. datamaestro/search.py +1 -1
  26. datamaestro/settings.py +3 -1
  27. datamaestro/sphinx.py +224 -0
  28. datamaestro/stream/__init__.py +0 -2
  29. datamaestro/stream/lines.py +10 -7
  30. datamaestro/templates/dataset.py +5 -4
  31. datamaestro/test/__init__.py +3 -1
  32. datamaestro/test/checks.py +1 -5
  33. datamaestro/test/conftest.py +1 -6
  34. datamaestro/test/test_annotations.py +2 -2
  35. datamaestro/test/test_download_handlers.py +3 -4
  36. datamaestro/test/test_record.py +72 -0
  37. datamaestro/test/test_resource.py +1388 -0
  38. datamaestro/utils.py +15 -9
  39. datamaestro/v2.md +301 -0
  40. datamaestro/version.py +4 -0
  41. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
  42. datamaestro-1.7.0.dist-info/RECORD +49 -0
  43. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
  44. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
  45. datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
  46. datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
  47. datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
  48. datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
  49. datamaestro/__pycache__/context.cpython-38.pyc +0 -0
  50. datamaestro/__pycache__/context.cpython-39.pyc +0 -0
  51. datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
  52. datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
  53. datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
  54. datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
  55. datamaestro/__pycache__/search.cpython-38.pyc +0 -0
  56. datamaestro/__pycache__/search.cpython-39.pyc +0 -0
  57. datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
  58. datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
  59. datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
  60. datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
  61. datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
  62. datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
  63. datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
  64. datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
  65. datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
  66. datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
  67. datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
  68. datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
  69. datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
  70. datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
  71. datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
  72. datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
  73. datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
  74. datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
  75. datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
  76. datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
  77. datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
  78. datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
  79. datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
  80. datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
  81. datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
  82. datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
  83. datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
  84. datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
  85. datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
  86. datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
  87. datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
  88. datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
  89. datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
  90. datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
  91. datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
  92. datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
  93. datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
  94. datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
  95. datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
  96. datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
  97. datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
  98. datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
  99. datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
  100. datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
  101. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
  102. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
  103. datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
  104. datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
  105. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
  106. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
  107. datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
  108. datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
  109. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
  110. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
  111. datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
  112. datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
  113. datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
  114. datamaestro-0.8.1.dist-info/RECORD +0 -109
  115. datamaestro-0.8.1.dist-info/top_level.txt +0 -1
  116. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
datamaestro/record.py ADDED
@@ -0,0 +1,232 @@
1
+ """Record module for type-safe heterogeneous containers.
2
+
3
+ .. deprecated:: 2.0
4
+ This module will be removed in v2. Use :class:`typing.TypedDict` instead
5
+ for type-safe heterogeneous data structures. TypedDict provides better IDE
6
+ support, type checking, and is part of the standard library.
7
+
8
+ When using TypedDict, define key constants in classes (e.g., ``MyItem.ID``)
9
+ to avoid typos and enable IDE autocomplete. Prefix keys with package name
10
+ using underscore ``_`` as delimiter to avoid conflicts between different
11
+ data sources.
12
+
13
+ Example migration::
14
+
15
+ # Old way (deprecated)
16
+ @define
17
+ class MyItem(Item):
18
+ value: int
19
+
20
+ record = Record(MyItem(42))
21
+ print(record[MyItem].value)
22
+
23
+ # New way (recommended)
24
+ from typing import TypedDict
25
+
26
+ # Define key constants in classes
27
+ class MyItem:
28
+ ID = "mypackage_value"
29
+
30
+ class MyRecord(TypedDict):
31
+ mypackage_value: int
32
+
33
+ data: MyRecord = {MyItem.ID: 42}
34
+ print(data[MyItem.ID])
35
+ """
36
+
37
+ import warnings
38
+ from typing import Type, TypeVar, Dict, Union, Optional
39
+
40
+ # Emit deprecation warning when module is imported
41
+ warnings.warn(
42
+ "The datamaestro.record module is deprecated and will be removed in v2. "
43
+ "Use typing.TypedDict instead (use class constants like MyItem.ID for keys, "
44
+ "prefixed with package name).",
45
+ DeprecationWarning,
46
+ stacklevel=2,
47
+ )
48
+
49
+
50
class Item:
    """Base class for all item types stored in a :class:`Record`"""

    @classmethod
    def __get_base__(cls: Type) -> Type:
        """Get the most generic superclass for this type of item.

        The base is the most generic ancestor below :class:`Item` in the
        MRO. The result is cached per class.

        Fix: the cache is looked up in ``cls.__dict__`` rather than with
        ``getattr`` — attribute lookup inherits ``__base__cache__`` from
        superclasses, so a value cached on ``Item`` itself (``Item``)
        would previously have been returned for every subclass.
        """
        base = cls.__dict__.get("__base__cache__")
        if base is not None:
            return base

        # Walk the MRO (most derived first); the last matching class is
        # the most generic strict subclass of Item
        base = cls
        for supercls in cls.__mro__:
            if issubclass(supercls, Item) and supercls is not Item:
                base = supercls
        setattr(cls, "__base__cache__", base)
        return base
65
+
66
+
67
+ T = TypeVar("T", bound=Item)
68
+ Items = Dict[Type[T], T]
69
+
70
+
71
class RecordType:
    """Describes the set of item types composing a :class:`Record`.

    Each item type is indexed by its base type (see
    ``Item.__get_base__``), so a record type holds at most one item type
    per base.
    """

    def __init__(self, *item_types: "Type[T]"):
        #: The set of item types for this record type
        self.item_types = frozenset(item_types)
        #: Maps each base type to its registered (most specific) item type
        self.mapping = {item_type.__get_base__(): item_type for item_type in item_types}

    def __repr__(self):
        names = ",".join(item_type.__name__ for item_type in self.item_types)
        return f"Record({names})"

    def contains(self, other: "RecordType"):
        """Checks that each item type in other has an item type of a compatible
        type in self (and that both have the same number of item types)"""
        if len(self.item_types) != len(other.item_types):
            return False

        for item_type in other.item_types:
            if matching_type := self.mapping.get(item_type.__get_base__(), None):
                if not issubclass(matching_type, item_type):
                    return False
            else:
                # No item type registered for this base
                return False

        return True

    def sub(self, *item_types: "Type[T]"):
        """Returns a new record type based on self and new item types.

        A new item type sharing a base with an existing one replaces it;
        otherwise it is appended.
        """
        cls_itemtypes = [x for x in self.item_types]
        mapping = {
            itemtype.__get_base__(): ix for ix, itemtype in enumerate(cls_itemtypes)
        }

        for itemtype in item_types:
            if (ix := mapping.get(itemtype.__get_base__(), -1)) >= 0:
                cls_itemtypes[ix] = itemtype
            else:
                cls_itemtypes.append(itemtype)

        return record_type(*cls_itemtypes)

    def __call__(self, *items: "T"):
        """Builds a record from the given items and validates it"""
        record = Record(*items)
        self.validate(record)
        return record

    def has(self, itemtype: "Type[T]"):
        """Returns True if this record type has a compatible item type.

        Fix: an item type whose base is not registered now returns False
        instead of raising ``KeyError``.
        """
        registered = self.mapping.get(itemtype.__get_base__())
        return registered is not None and issubclass(registered, itemtype)

    def validate(self, record: "Record"):
        """Validates that the record matches this record type.

        :raises KeyError: when a required item is missing or the record
            contains items not registered in this type
        :returns: the (unchanged) record
        """
        if self.item_types:
            for item_type in self.item_types:
                try:
                    record.__getitem__(item_type)
                except KeyError:
                    # Suppress the inner KeyError context: the message
                    # below is the meaningful one
                    raise KeyError(f"Item of type {item_type} is missing") from None

            if len(record.items) != len(self.item_types):
                unregistered = [
                    item
                    for item in record.items.values()
                    if all(
                        not issubclass(item.__get_base__(), item_type)
                        for item_type in self.item_types
                    )
                ]
                raise KeyError(
                    f"The record of type {self} contains unregistered items: {unregistered}"
                )

        return record
142
+
143
+
144
def record_type(*item_types: Type[T]):
    """Build and return a new :class:`RecordType` from the given item types"""
    return RecordType(*item_types)
147
+
148
+
149
class Record:
    """Associate types with entries

    A record is a composition of items; each item base class is unique.
    """

    #: Items for this record, indexed by their base type
    items: "Items"

    def __init__(self, *items: "Union[Items, T]", override=False):
        """Build a record from items or from an items dictionary.

        :param items: either a single dictionary mapping base types to
            items, or any number of item instances
        :param override: when True, a later item silently replaces an
            earlier one sharing the same base type
        :raises RuntimeError: when two items share a base type and
            ``override`` is False
        """
        self.items = {}

        if len(items) == 1 and isinstance(items[0], dict):
            # Copy the dictionary (fix: the dict was previously aliased,
            # so later external mutations leaked into the record)
            self.items = dict(items[0])
        else:
            for entry in items:
                base = entry.__get_base__()
                if not override and base in self.items:
                    raise RuntimeError(
                        f"The item type {base} ({entry.__class__})"
                        " is already in the record"
                    )
                self.items[base] = entry

    def __str__(self):
        return (
            "{"
            + ", ".join(
                f"{key.__module__}.{key.__qualname__}: {value}"
                for key, value in self.items.items()
            )
            + "}"
        )

    def __repr__(self):
        return (
            "{"
            + ", ".join(
                f"{key.__module__}.{key.__qualname__}: {repr(value)}"
                for key, value in self.items.items()
            )
            + "}"
        )

    def get(self, key: "Type[T]") -> "Optional[T]":
        """Get a given item or None if it does not exist"""
        try:
            return self[key]
        except KeyError:
            return None

    def has(self, key: "Type[T]") -> bool:
        """Returns True if the record has the given item type"""
        return key.__get_base__() in self.items

    def __getitem__(self, key: "Type[T]") -> "T":
        """Get an item given its type.

        :raises KeyError: when no entry matches the base type, or when the
            stored entry is not an instance of ``key``
        """
        base = key.__get_base__()
        try:
            entry = self.items[base]
        except KeyError:
            # Suppress the raw dict KeyError in favor of a readable message
            raise KeyError(
                f"""No entry with type {key}: """
                f"""{",".join(str(s) for s in self.items.keys())}"""
            ) from None

        # Check if this matches the expected class
        if not isinstance(entry, key):
            raise KeyError(
                f"""No entry with type {key}: """
                f"""{",".join(str(s) for s in self.items.keys())}"""
            )
        return entry

    def update(self, *items: "T", target: "RecordType" = None) -> "Record":
        """Returns a new record with the given items replaced or added.

        :param target: when given, the resulting record is validated
            against this record type (fix: the parameter was previously
            accepted but silently ignored)
        """
        # Create our new dictionary
        item_dict = {**self.items}
        for item in items:
            item_dict[item.__get_base__()] = item

        result = Record(item_dict)
        if target is not None:
            target.validate(result)
        return result
datamaestro/registry.py CHANGED
@@ -1,6 +1,7 @@
1
1
  from pathlib import Path
2
2
  import shutil
3
3
  from tempfile import NamedTemporaryFile
4
+ import yaml
4
5
 
5
6
 
6
7
  class RegistryEntry:
datamaestro/search.py CHANGED
@@ -40,7 +40,7 @@ class AndCondition(Condition):
40
40
  return True
41
41
 
42
42
  def __repr__(self):
43
- return " AND ".join(self.conditions)
43
+ return " AND ".join([repr(x) for x in self.conditions])
44
44
 
45
45
 
46
46
  class OrCondition(Condition):
datamaestro/settings.py CHANGED
@@ -1,6 +1,8 @@
1
1
  """Global and user settings utility classes"""
2
+
2
3
  import marshmallow as mm
3
- from experimaestro.utils.settings import JsonSettings, PathField
4
+ from typing import Dict, Any
5
+ from experimaestro.utils.settings import JsonSettings
4
6
  from pathlib import Path
5
7
 
6
8
  # --- Global settings
datamaestro/sphinx.py ADDED
@@ -0,0 +1,224 @@
1
+ # Sphinx extension for datamaestro datasets
2
+
3
+ from typing import Any, Dict, Tuple
4
+ from sphinx.ext.autodoc.mock import mock
5
+
6
+ from docutils import nodes
7
+
8
+ from sphinx.application import Sphinx
9
+ from sphinx.domains import Domain, ObjType
10
+ from sphinx.roles import XRefRole
11
+ from sphinx.util.docutils import SphinxDirective
12
+ from sphinx.locale import _
13
+ from sphinx import addnodes
14
+ from sphinx.util.nodes import make_refnode
15
+ import datamaestro
16
+ from datamaestro.data import AbstractDataset
17
+ import logging
18
+ from myst_parser.config.main import MdParserConfig
19
+ from myst_parser.mdit_to_docutils.base import DocutilsRenderer
20
+ from myst_parser.parsers.mdit import create_md_parser
21
+
22
+
23
class DatasetNode(nodes.paragraph):
    """Docutils node marking a dataset description paragraph."""
25
+
26
+
27
def to_docutils(source: str):
    """Render a Markdown string into docutils nodes using MyST."""
    md_parser = create_md_parser(MdParserConfig(), DocutilsRenderer)
    return md_parser.render(source)
30
+
31
+
32
class DatasetsDirective(SphinxDirective):
    """Base directive providing the rendering of a single dataset."""

    def dataset_desc(self, ds: AbstractDataset):
        """Build and return the ``desc`` node documenting dataset *ds*.

        Also registers the dataset anchor in the ``dm`` domain so that
        cross-references can resolve to it.
        """
        dm = self.env.get_domain("dm")

        assert isinstance(dm, DatamaestroDomain)
        dm.add_dataset(ds.id)

        # indexnode = addnodes.index(entries=[])
        desc = addnodes.desc()
        desc["domain"] = DatamaestroDomain.name
        desc["objtype"] = desc["desctype"] = "dataset"
        desc["classes"].append(DatamaestroDomain.name)

        # Signature line: "Dataset <id>", anchored as "dataset-<id>"
        signodes = addnodes.desc_signature(ds.id, "", is_multiline=True)
        desc.append(signodes)

        signode = addnodes.desc_signature_line()
        signode += nodes.Text("Dataset ")
        signode += addnodes.desc_name(text=ds.id)
        signode["ids"].append("dataset" + "-" + ds.id)
        signodes.append(signode)

        content = addnodes.desc_content()
        desc.append(content)

        if ds.configtype:
            # Cross-reference to the experimaestro configuration class
            ctype = ds.configtype
            name = f"{ctype.__module__}.{ctype.__qualname__}"

            # NOTE(review): ``te`` is built but never attached to the
            # output tree — looks like dead code; confirm before removing
            te = nodes.paragraph()
            te.append(nodes.Text("Experimaestro type: "))

            p = nodes.paragraph()
            returns = addnodes.desc_returns()
            xref = addnodes.pending_xref(
                "",
                nodes.Text(name),
                refdomain="py",
                reftype="class",
                reftarget=name,
            )
            returns.append(xref)
            p.append(returns)

            content.append(p)

        # node.append(nodes.Text(ds.id))
        if ds.name:
            content.append(
                nodes.paragraph("", "", nodes.strong("", nodes.Text(ds.name)))
            )

        if ds.tags or ds.tasks:
            if ds.tags:
                content.append(
                    nodes.paragraph(
                        "",
                        "",
                        nodes.strong("", nodes.Text("Tags: ")),
                        nodes.Text(", ".join(ds.tags)),
                    )
                )
            if ds.tasks:
                content.append(
                    nodes.paragraph(
                        "",
                        "",
                        nodes.strong("", "Tasks: "),
                        nodes.Text(", ".join(ds.tasks)),
                    )
                )

        if ds.url:
            href = nodes.reference(refuri=ds.url)
            href.append(nodes.Text(ds.url))
            p = nodes.paragraph()
            p.append(nodes.Text("External link: "))
            p.append(href)
            content.append(p)

        if ds.description:
            # Dataset descriptions are Markdown; render through MyST
            content.extend(to_docutils(ds.description))

        return desc
116
+
117
+
118
class RepositoryDirective(DatasetsDirective):
    """Generates the document for a whole repository"""

    has_content = True
    required_arguments = 1  # the repository identifier
    optional_arguments = 0

    def run(self):
        (repository_id,) = self.arguments
        # Mock the autodoc-mocked imports while loading repository modules,
        # since importing dataset definitions may pull in heavy dependencies
        with mock(self.config.autodoc_mock_imports):
            repository = datamaestro.Context.instance().repository(repository_id)  # type: Optional[datamaestro.Repository]
            assert repository is not None

            docnodes = []
            # One section per dataset module of the repository
            for module in repository.modules():
                section = nodes.section(
                    ids=[f"dm-datasets-{repository_id}-{module.id}"]
                )
                docnodes.append(section)

                section += nodes.title("", nodes.Text(module.title))
                section += nodes.paragraph()
                if module.description:
                    # Module descriptions are Markdown
                    section += to_docutils(module.description).children

                for ds in iter(module):
                    section += self.dataset_desc(ds)

        return docnodes
147
+
148
+
149
class DatasetDirective(DatasetsDirective):
    """Documents the datasets of one module, optionally from a given repository.

    Arguments: the module name, and optionally a repository id (defaults
    to the ``datamaestro_repository`` configuration value).
    """

    has_content = True
    required_arguments = 1
    optional_arguments = 1

    def run(self):
        # --- Retrieve the datasets
        if len(self.arguments) == 2:
            module_name, repository_name = self.arguments
        else:
            (module_name,) = self.arguments
            repository_name = self.env.config["datamaestro_repository"]

        datasets = None
        # Mock autodoc-mocked imports while scanning repositories
        with mock(self.config.autodoc_mock_imports):
            for repository in datamaestro.Context.instance().repositories():
                # A None repository_name matches the first repository that
                # provides the module
                if repository_name is None or repository.id == repository_name:
                    datasets = repository.datasets(module_name)
                    if datasets is not None:
                        break

        assert datasets is not None

        # --- Start documenting

        docnodes = []
        # node.document = self.state.document
        if datasets.description:
            docnodes.extend(to_docutils(datasets.description))

        for ds in datasets:
            docnodes.append(self.dataset_desc(ds))
        return docnodes
182
+
183
+
184
class DatamaestroDomain(Domain):
    """Sphinx domain (``dm``) for datamaestro datasets."""

    name = "dm"
    object_types = {
        "dataset": ObjType(_("dataset"), "ds"),
    }
    directives = {
        "repository": RepositoryDirective,
        "datasets": DatasetDirective,
    }
    roles = {"ref": XRefRole()}
    indices = {
        # TODO: Add indices for tags and tasks
        # NOTE(review): Sphinx expects ``indices`` to be a list of Index
        # subclasses; switch to a list when indices are added
    }
    initial_data: Dict[str, Dict[str, Tuple[str, str]]] = {
        "datasets": {},  # dataset id -> (docname, anchor)
        "tags": {},  # tag -> list of datasets,
        "tasks": {},  # task name -> list of datasets
    }

    def add_dataset(self, dsid):
        """Register a dataset anchor in the current document"""
        self.data["datasets"][dsid] = (self.env.docname, f"dataset-{dsid}")

    def resolve_xref(self, env, fromdocname, builder, typ, target, node, contnode):
        """Resolve a ``dm:ref`` cross-reference to a registered dataset."""
        # Fix: the message previously had no %s placeholder, so ``target``
        # was passed as a stray lazy-formatting argument and never logged
        logging.debug("[dm/sphinx] Searching for %s", target)

        ref = self.data["datasets"].get(target, None)
        if ref:
            docname, targ = ref
            return make_refnode(builder, fromdocname, docname, targ, contnode, targ)
        return None
214
+
215
+
216
def setup(app: Sphinx) -> Dict[str, Any]:
    """Setup experimaestro for Sphinx documentation

    Registers the ``dm`` domain and the :class:`DatasetNode` node, plus
    the ``datamaestro_repository`` configuration value (default repository
    used by the ``dm:datasets`` directive when none is given).
    """

    app.add_domain(DatamaestroDomain)
    app.add_node(DatasetNode)

    # Default repository id; the third argument marks it as rebuild-triggering
    app.add_config_value("datamaestro_repository", None, True)

    return {"version": datamaestro.version, "parallel_read_safe": True}
@@ -1,5 +1,3 @@
1
- import io
2
- import logging
3
1
  from pathlib import Path
4
2
 
5
3
 
@@ -28,7 +28,8 @@ class LineTransformStream(io.RawIOBase):
28
28
  self.current = self.transform(line).encode("utf-8")
29
29
 
30
30
  def readinto(self, b):
31
- """Read bytes into a pre-allocated, writable bytes-like object b and return the number of bytes read"""
31
+ """Read bytes into a pre-allocated, writable bytes-like object b and
32
+ return the number of bytes read"""
32
33
  if self.current is None:
33
34
  return 0
34
35
 
@@ -41,12 +42,14 @@ class LineTransformStream(io.RawIOBase):
41
42
  return offset
42
43
 
43
44
  # How many bytes to read from current line
44
- l = min(lb, len(self.current) - self.offset)
45
-
46
- b[offset : (offset + l)] = self.current[self.offset : (self.offset + l)]
47
- lb -= l
48
- offset += l
49
- self.offset += l
45
+ chunk_len = min(lb, len(self.current) - self.offset)
46
+
47
+ b[offset : (offset + chunk_len)] = self.current[
48
+ self.offset : (self.offset + chunk_len)
49
+ ]
50
+ lb -= chunk_len
51
+ offset += chunk_len
52
+ self.offset += chunk_len
50
53
 
51
54
  return offset
52
55
 
@@ -1,4 +1,4 @@
1
- # See documentation on http://experimaestro.github.io/datamaestro/
1
+ # See documentation on https://datamaestro.readthedocs.io
2
2
 
3
3
  from datamaestro.definitions import datatasks, datatags, dataset
4
4
  from datamaestro.data import Base
@@ -7,11 +7,12 @@ from datamaestro.data import Base
7
7
  @datatags("tag1", "tag2")
8
8
  @datatasks("task1", "task2")
9
9
  @dataset(
10
- Base, url="__URL__",
10
+ Base,
11
+ url="__URL__",
11
12
  )
12
13
  def __IDENTIFIER__():
13
14
  """Line description
14
15
 
15
- Long description
16
- """
16
+ Long description
17
+ """
17
18
  return {}
@@ -1 +1,3 @@
1
- from .checks import *
1
+ from .checks import DatasetTests
2
+
3
+ __all__ = ["DatasetTests"]
@@ -1,11 +1,7 @@
1
1
  import logging
2
- import traceback
3
2
  import importlib
4
3
  import inspect
5
-
6
- from datamaestro.context import Context, Repository
7
-
8
- import unittest
4
+ from datamaestro.context import Context
9
5
 
10
6
 
11
7
  class DatasetTests:
@@ -1,13 +1,8 @@
1
1
  from pathlib import Path
2
- import contextlib
3
- import unittest
4
- import tempfile
5
2
  from datamaestro import Repository, Context
6
3
  import shutil
7
4
  import logging
8
5
  import pytest
9
- import os
10
- import shutil
11
6
 
12
7
 
13
8
  class MyRepository(Repository):
@@ -27,7 +22,7 @@ def context(tmp_path_factory):
27
22
  context = Context(Path(dir))
28
23
  logging.info("Created datamaestro test directory %s", dir)
29
24
 
30
- repository = MyRepository(context)
25
+ _repository = MyRepository(context) # noqa: F841 - registered on creation
31
26
 
32
27
  yield context
33
28
 
@@ -1,11 +1,11 @@
1
1
  from datamaestro.annotations.agreement import useragreement
2
2
  from datamaestro.definitions import AbstractDataset
3
- from .conftest import repository
4
3
 
5
4
 
6
5
  def test_useragreements(context):
7
6
  # Fake dataset
8
7
  class t(AbstractDataset):
9
- pass
8
+ def _prepare(self):
9
+ pass
10
10
 
11
11
  useragreement("test")(t(None))
@@ -1,9 +1,5 @@
1
- import unittest
2
- import logging
3
1
  from pathlib import Path
4
- import shutil
5
2
  import datamaestro.download.single as single
6
- from datamaestro import Repository, Context
7
3
  from datamaestro.definitions import AbstractDataset
8
4
  from .conftest import MyRepository
9
5
 
@@ -16,6 +12,9 @@ class Dataset(AbstractDataset):
16
12
  super().__init__(repository)
17
13
  self.datapath = Path(repository.context._path)
18
14
 
15
+ def _prepare(self):
16
+ pass
17
+
19
18
 
20
19
  def test_filedownloader(context):
21
20
  repository = MyRepository(context)
@@ -0,0 +1,72 @@
1
import pickle
from datamaestro.record import Item, record_type
from attrs import define
import pytest


@define
class AItem(Item):
    a: int


@define
class A1Item(AItem):
    a1: int


@define
class BItem(Item):
    b: int


@define
class B1Item(BItem):
    b1: int


@define
class CItem(Item):
    c: int


# Record types: ARecord requires an AItem; BaseRecord refines it to A1Item;
# MyRecord additionally requires a BItem
ARecord = record_type(AItem)
BaseRecord = ARecord.sub(A1Item)
MyRecord = BaseRecord.sub(BItem)


def test_record_simple():
    """Items are retrievable both by their own type and their base type"""
    a = A1Item(1, 2)
    b = BItem(4)
    r = MyRecord(a, b)
    assert r[AItem] is a
    assert r[A1Item] is a
    assert r[BItem] is b


def test_record_missing_init():
    """Building a typed record with missing or too-generic items fails"""
    with pytest.raises(KeyError):
        # A1Item is missing
        MyRecord(AItem(1), BItem(2))

    with pytest.raises(KeyError):
        # BItem is missing
        MyRecord(A1Item(1, 2))


def test_record_update():
    """update() returns a new record holding the replacement item"""
    a = A1Item(1, 2)
    b = BItem(4)
    r = MyRecord(a, b)

    r2 = r.update(BItem(3))
    assert r is not r2
    assert r2[BItem] is not b


def test_record_pickled():
    """Records survive a pickle round-trip"""
    # Build the record type dynamically to exercise pickling of sub-typed records
    MyRecord2 = BaseRecord.sub(BItem)
    r = MyRecord2(A1Item(1, 2), BItem(2))
    r = pickle.loads(pickle.dumps(r))

    assert r[A1Item].a == 1
    assert r[BItem].b == 2