syft-dataset 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ Metadata-Version: 2.4
2
+ Name: syft-dataset
3
+ Version: 0.1.3
4
+ Summary: Syft Datasets
5
+ Author-email: OpenMined <info@openmined.org>
6
+ License: Apache-2.0
7
+ Requires-Python: >=3.12
8
+ Requires-Dist: pyyaml>=6.0.3
9
+ Requires-Dist: syft-notebook-ui
@@ -0,0 +1,10 @@
1
+ syft_datasets/__init__.py,sha256=VisH6OXM2j-SCOZUCsPD0kBcZvrAQOvHz6QZAvcK4xI,85
2
+ syft_datasets/config.py,sha256=iObuEfNr5_uwxTXvKqQ1lt5lk-aYKe8gF10mbsmegcw,499
3
+ syft_datasets/dataset.py,sha256=7UNnX7EpSXcYg9cIYx37NnkLFD8L5YfMmdFK4AChrEI,6979
4
+ syft_datasets/dataset_manager.py,sha256=zCQktHvP3uYI0HT2KDvUnq8rO1G3wPhvN_6t0YPbP5Q,13658
5
+ syft_datasets/file_utils.py,sha256=IM9Kw9Vth3ghNqayVJ5_FvyPmXQEaqLdxLI4NDTEt2M,1468
6
+ syft_datasets/types.py,sha256=n7vfKW1BYn1_6223nlYPfeA4rDQoFlTMzM-2WUwCAtg,238
7
+ syft_datasets/url.py,sha256=662XabfCEygaURQtMyB9QW7izHccTjxnyPXyDUw6nSQ,4379
8
+ syft_dataset-0.1.3.dist-info/METADATA,sha256=l4hg1xotXEUk00z7w7P5lu4Lmt_nAJugul6CRTVDAEA,229
9
+ syft_dataset-0.1.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
10
+ syft_dataset-0.1.3.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.27.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,4 @@
1
+ from .config import SyftBoxConfig
2
+
3
+ __version__ = "0.1.3"
4
+ __all__ = ["SyftBoxConfig"]
@@ -0,0 +1,17 @@
1
+ from pydantic import BaseModel, Field
2
+ from pathlib import Path
3
+
4
+
5
class SyftBoxConfig(BaseModel):
    """Location and identity of a local SyftBox installation.

    Attributes:
        syftbox_folder: Root of the SyftBox folder on the local filesystem.
        email: Email associated with the SyftBox. It is also the name of the
            directory under ``syftbox_folder`` that holds the public data.
    """

    syftbox_folder: Path = Field(
        default=...,
        description="Path to the SyftBox folder on the local filesystem.",
    )
    email: str = Field(
        default=...,
        description="Email associated with the SyftBox.",
    )

    @property
    def private_dir(self) -> Path:
        """Return ``<syftbox_folder>/private``."""
        return self.syftbox_folder.joinpath("private")

    @property
    def public_dir(self) -> Path:
        """Return ``<syftbox_folder>/<email>/public``."""
        return self.syftbox_folder.joinpath(self.email, "public")
@@ -0,0 +1,228 @@
1
+ from datetime import datetime, timezone
2
+ from functools import cached_property
3
+ from pathlib import Path
4
+ from typing import ClassVar, Self
5
+ from uuid import UUID, uuid4
6
+
7
+ import yaml
8
+ from pydantic import BaseModel, Field
9
+ from syft_notebook_ui.formatter_mixin import (
10
+ ANSIPydanticFormatter,
11
+ PydanticFormatter,
12
+ PydanticFormatterMixin,
13
+ )
14
+
15
+ from .types import PathLike, to_path
16
+ from .url import SyftBoxURL
17
+ from .config import SyftBoxConfig
18
+
19
+
20
+ def _utcnow():
21
+ return datetime.now(tz=timezone.utc)
22
+
23
+
24
class DatasetBase(BaseModel):
    """Base model that adds YAML persistence and formatted display.

    Subclasses are saved to and loaded from ``.yaml`` files, and delegate their
    string, HTML and markdown representations to ``__display_formatter__``.
    """

    __display_formatter__: ClassVar[PydanticFormatter] = ANSIPydanticFormatter()
    # Attached by ``load`` (or by the creator of the object); underscore-prefixed
    # so that it is kept apart from the declared model fields.
    _syftbox_config: SyftBoxConfig | None = None

    def save(self, filepath: PathLike) -> None:
        """Serialize the model to a YAML file.

        Args:
            filepath: Destination path; must end in ``.yaml``. Missing parent
                directories are created.

        Raises:
            ValueError: If ``filepath`` does not have a ``.yaml`` suffix.
        """
        filepath = to_path(filepath)
        if filepath.suffix != ".yaml":
            raise ValueError("Model must be saved as a .yaml file.")

        if not filepath.parent.exists():
            filepath.parent.mkdir(parents=True, exist_ok=True)

        data = self.model_dump(mode="json")
        yaml_dump = yaml.safe_dump(data, indent=2, sort_keys=False)
        # Pin the encoding so files round-trip identically on every platform
        # instead of depending on the locale's default encoding.
        filepath.write_text(yaml_dump, encoding="utf-8")

    @classmethod
    def load(
        cls, filepath: PathLike, syftbox_config: SyftBoxConfig | None = None
    ) -> Self:
        """Load and validate a model from a YAML file.

        Args:
            filepath: Path of the YAML file to read.
            syftbox_config: Optional SyftBox configuration to attach to the
                loaded instance.

        Returns:
            The validated model with ``_syftbox_config`` set to
            ``syftbox_config``.

        Raises:
            FileNotFoundError: If ``filepath`` does not exist.
        """
        filepath = to_path(filepath)
        if not filepath.exists():
            raise FileNotFoundError(f"Config file not found: {filepath}")

        # Read as UTF-8 (matching ``save``) so hand-edited files containing
        # non-ASCII text load regardless of the platform default encoding.
        data = yaml.safe_load(filepath.read_text(encoding="utf-8"))
        res = cls.model_validate(data)
        res._syftbox_config = syftbox_config
        return res

    def __str__(self) -> str:
        return self.__display_formatter__.format_str(self)

    def __repr__(self) -> str:
        return self.__display_formatter__.format_repr(self)

    def _repr_html_(self) -> str:
        return self.__display_formatter__.format_html(self)

    def _repr_markdown_(self) -> str:
        return self.__display_formatter__.format_markdown(self)
64
+
65
+
66
class PrivateDatasetConfig(DatasetBase, PydanticFormatterMixin):
    """Private metadata of a dataset, stored outside of the sync folder.

    Attributes:
        uid: Identifier of the dataset this private config belongs to.
        data_dir: Directory that holds the dataset's private data.
    """

    uid: UUID  # id for this dataset
    data_dir: Path
71
+
72
+
73
class Dataset(DatasetBase, PydanticFormatterMixin):
    """Public dataset metadata plus helpers mapping it onto the local SyftBox.

    Path-returning properties need a SyftBox config (``_syftbox_config``) to be
    attached; without one they raise ``ValueError``. Private locations are only
    available when the configured email equals the dataset owner.
    """

    __table_extra_fields__ = [
        "name",
        "owner",
    ]

    uid: UUID = Field(default_factory=uuid4)
    created_at: datetime = Field(default_factory=_utcnow)
    updated_at: datetime = Field(default_factory=_utcnow)
    name: str
    summary: str | None = None
    tags: list[str] = []
    location: str | None = None

    mock_url: SyftBoxURL
    private_url: SyftBoxURL
    readme_url: SyftBoxURL | None = None

    # Absolute paths to uploaded files (excluding metadata files)
    mock_files_paths: list[Path] = Field(default_factory=list)
    private_files_paths: list[Path] = Field(default_factory=list)

    @property
    def owner(self) -> str:
        """Return the host part of ``mock_url``."""
        return self.mock_url.host

    @property
    def syftbox_config(self) -> SyftBoxConfig:
        """Return the attached SyftBox config.

        Raises:
            ValueError: If no config has been attached to this dataset.
        """
        if self._syftbox_config is None:
            raise ValueError("SyftBox config is not set.")
        return self._syftbox_config

    def _url_to_path(self, url: SyftBoxURL) -> Path:
        """Translate ``url`` into a path below the configured SyftBox folder."""
        return url.to_local_path(
            syftbox_folder=self.syftbox_config.syftbox_folder,
        )

    @property
    def readme_path(self) -> Path | None:
        """Return the local README path, or ``None`` when no README URL is set."""
        if self.readme_url is None:
            return None
        return self._url_to_path(self.readme_url)

    def get_readme(self) -> str | None:
        """Return the README content, or ``None`` if there is no README file."""
        # Resolve the property once instead of re-deriving the path per use.
        readme_path = self.readme_path
        if readme_path and readme_path.exists():
            # Markdown READMEs are read as UTF-8 rather than with the platform
            # default encoding.
            return readme_path.read_text(encoding="utf-8")
        return None

    @property
    def mock_dir(self) -> Path:
        """Return the local directory that ``mock_url`` points to."""
        return self._url_to_path(self.mock_url)

    @property
    def private_config_path(self) -> Path:
        """Return the path of ``private_metadata.yaml`` for this dataset.

        Raises:
            ValueError: If the configured email is not the dataset owner.
        """
        if self.syftbox_config.email != self.owner:
            raise ValueError(
                "Cannot access private config for a dataset owned by another user."
            )
        return self._private_metadata_dir / "private_metadata.yaml"

    @cached_property
    def private_config(self) -> PrivateDatasetConfig:
        """Load the private dataset config (cached after the first access).

        Raises:
            FileNotFoundError: If the private config file does not exist.
        """
        config_path = self.private_config_path
        if not config_path.exists():
            raise FileNotFoundError(
                f"Private dataset config not found at {config_path}"
            )

        return PrivateDatasetConfig.load(
            filepath=config_path, syftbox_config=self._syftbox_config
        )

    @property
    def private_dir(self) -> Path:
        """Return the private data directory recorded in the private config."""
        private_config = self.private_config
        return private_config.data_dir

    @property
    def _private_metadata_dir(self) -> Path:
        """Return ``<syftbox_folder>/private/syft_datasets/<name>``.

        Raises:
            ValueError: If the configured email is not the dataset owner.
        """
        if self.syftbox_config.email != self.owner:
            raise ValueError(
                "Cannot access private data for a dataset owned by another user."
            )

        # TODO add 'private' to sb workspace
        private_datasets_dir = (
            self.syftbox_config.syftbox_folder / "private" / "syft_datasets"
        )

        return private_datasets_dir / self.name

    @property
    def mock_files(self) -> list[Path]:
        """
        Get absolute paths to all mock files uploaded during dataset.create.
        Excludes dataset.yaml and readme.md files.
        """
        return self.mock_files_paths

    @property
    def private_files(self) -> list[Path]:
        """
        Get absolute paths to all private files uploaded during dataset.create.
        Excludes private_metadata.yaml file.
        """
        return self.private_files_paths

    @property
    def files(self) -> list[Path]:
        """
        Get absolute paths to all files (both mock and private) uploaded during dataset.create.
        """
        return self.mock_files + self.private_files

    def _generate_description_html(self) -> str:
        """Build the HTML description used by ``describe`` and ``_repr_html_``.

        Each location is shown as a local path when it can be resolved. The
        lookups are best-effort: a failure selects a fallback (or omits the
        entry) instead of breaking the representation.
        """
        from syft_notebook_ui.pydantic_html_repr import create_html_repr

        fields_to_include = ["name", "created_at", "summary", "tags", "location"]

        paths_to_include = []
        try:
            # Appending the name alone can never fail; resolving the property
            # is what raises (e.g. when no SyftBox config is attached), so do
            # that first to make the mock_url fallback reachable.
            _ = self.mock_dir
            paths_to_include.append("mock_dir")
        except Exception:
            fields_to_include.append("mock_url")

        try:
            private_dir = self.private_dir
            if private_dir.is_dir():
                paths_to_include.append("private_dir")
        except Exception:
            # Private data is unavailable (e.g. not the owner, or the private
            # config is missing); it is simply left out of the description.
            pass

        try:
            readme_path = self.readme_path
            if readme_path and readme_path.exists():
                paths_to_include.append("readme_path")
        except Exception:
            fields_to_include.append("readme_url")

        description = create_html_repr(
            obj=self,
            fields=fields_to_include,
            display_paths=paths_to_include,
        )

        return description

    def describe(self) -> None:
        """Display the HTML description through IPython."""
        from IPython.display import HTML, display

        description = self._generate_description_html()
        display(HTML(description))

    def _repr_html_(self) -> str:
        return self._generate_description_html()
@@ -0,0 +1,377 @@
1
+ import re
2
+ import shutil
3
+ from pathlib import Path
4
+ from typing import Self
5
+
6
+ from .types import PathLike, to_path
7
+ from syft_notebook_ui.types import TableList
8
+ from typing_extensions import Literal
9
+
10
+ from syft_datasets.dataset import Dataset, PrivateDatasetConfig
11
+ from syft_datasets.file_utils import copy_dir_contents, copy_paths, is_empty_dir
12
+
13
+ from .url import SyftBoxURL
14
+ from .config import SyftBoxConfig
15
+
16
+ FOLDER_NAME = "syft_datasets"
17
+ METADATA_FILENAME = "dataset.yaml"
18
+
19
+
20
class SyftDatasetManager:
    """Create, look up, list and delete datasets inside a local SyftBox folder.

    Mock data and ``dataset.yaml`` live under
    ``<syftbox_folder>/<datasite>/public/syft_datasets/<name>``. Private data
    and ``private_metadata.yaml`` live under
    ``<syftbox_folder>/private/syft_datasets/<name>``.
    """

    def __init__(self, syftbox_folder_path: PathLike, email: str):
        """Build the manager's ``SyftBoxConfig`` from a folder path and email."""
        self.syftbox_config = SyftBoxConfig(
            syftbox_folder=to_path(syftbox_folder_path), email=email
        )

    @classmethod
    def from_config(cls, config: SyftBoxConfig) -> Self:
        """Create a manager from an existing ``SyftBoxConfig``."""
        return cls(syftbox_folder_path=config.syftbox_folder, email=config.email)

    def public_dir_for_datasite(self, datasite: str) -> Path:
        """Return the public datasets directory of ``datasite``.

        Note: the directory is created if it does not exist yet.
        """
        datasets_dir = (
            self.syftbox_config.syftbox_folder / datasite / "public" / FOLDER_NAME
        )
        datasets_dir.mkdir(parents=True, exist_ok=True)
        return datasets_dir

    def get_mock_dataset_dir(self, dataset_name: str, datasite: str) -> Path:
        """Return the mock directory for ``dataset_name`` on ``datasite``."""
        return self.public_dir_for_datasite(datasite) / dataset_name

    def _validate_dataset_name(self, dataset_name: str) -> None:
        """Ensure the name is safe to use as a directory name.

        Raises:
            ValueError: If the name contains anything other than word
                characters and hyphens (this rules out path separators and
                ``..``).
        """
        # fullmatch: unlike match() with a trailing "$", this also rejects a
        # name that ends in a newline.
        if not re.fullmatch(r"^[\w-]+$", dataset_name):
            raise ValueError(
                f"Invalid dataset name '{dataset_name}'. Only alphanumeric characters, underscores, and hyphens are allowed."
            )

    def _prepare_mock_data(self, dataset: Dataset, src_path: Path) -> list[Path]:
        """Copy the mock data into the dataset's mock directory.

        Args:
            dataset: Dataset whose ``mock_dir`` receives the data.
            src_path: Existing file or directory holding the mock data.

        Returns:
            Absolute destination paths of the copied files.

        Raises:
            FileNotFoundError: If ``src_path`` does not exist.
            ValueError: If the mock data contains (or is) the reserved metadata
                file, or ``src_path`` is neither a file nor a directory.
            FileExistsError: If the mock directory exists and is not empty.
        """
        # Validate src data
        if not src_path.exists():
            raise FileNotFoundError(f"Could not find mock data at {src_path}")

        if (src_path / METADATA_FILENAME).exists():
            raise ValueError(
                f"Mock data at {src_path} contains reserved file {METADATA_FILENAME}. Please rename it and try again."
            )
        # A single mock file with the reserved name would be copied into the
        # mock dir and then overwritten when the dataset metadata is saved.
        if src_path.is_file() and src_path.name == METADATA_FILENAME:
            raise ValueError(
                f"Mock data file {src_path} uses reserved name {METADATA_FILENAME}. Please rename it and try again."
            )

        # Validate dir we're making on Syftbox
        mock_dir = dataset.mock_dir
        if mock_dir.exists() and not is_empty_dir(mock_dir):
            raise FileExistsError(
                f"Mock dir {mock_dir} already exists and is not empty."
            )
        mock_dir.mkdir(parents=True, exist_ok=True)

        if src_path.is_dir():
            return copy_dir_contents(
                src=src_path,
                dst=mock_dir,
                exists_ok=True,
            )
        if src_path.is_file():
            return copy_paths(
                files=[src_path],
                dst=mock_dir,
                exists_ok=True,
            )
        raise ValueError(
            f"Mock data path {src_path} must be an existing file or directory."
        )

    def _prepare_private_data(
        self,
        dataset: Dataset,
        src_path: Path,
    ) -> list[Path]:
        """Copy the private data into the dataset's private directory.

        Requires the private config to exist already, because
        ``dataset.private_dir`` is read from it.

        Returns:
            Absolute destination paths of the copied files.

        Raises:
            ValueError: If ``src_path`` is neither a file nor a directory.
        """
        private_dir = dataset.private_dir
        private_dir.mkdir(parents=True, exist_ok=True)

        if src_path.is_dir():
            # TODO: Implementing without copying private data to `SyftBox/private``
            return copy_dir_contents(
                src=src_path,
                dst=private_dir,
                exists_ok=True,
            )
        if src_path.is_file():
            return copy_paths(
                files=[src_path],
                dst=private_dir,
                exists_ok=True,
            )
        raise ValueError(
            f"Private data path {src_path} must be an existing file or directory."
        )

    def _prepare_private_config(
        self,
        dataset: Dataset,
        private_data_dir: Path,
        location: str | None = None,
    ) -> None:
        """
        The private dataset config is used to store private metadata separately from the public dataset metadata.

        Args:
            dataset: Dataset the private config belongs to.
            private_data_dir: Directory recorded as the private data location.
            location: Currently unused.

        Raises:
            FileExistsError: If the private metadata directory already exists
                and is not empty.
        """
        private_metadata_dir = dataset._private_metadata_dir
        if private_metadata_dir.exists() and not is_empty_dir(private_metadata_dir):
            # Report the directory that was actually checked. Going through
            # ``dataset.private_dir`` here would try to load the private config
            # and could raise an unrelated FileNotFoundError instead.
            raise FileExistsError(
                f"Private dir {private_metadata_dir} already exists and is not empty."
            )

        private_config = PrivateDatasetConfig(
            uid=dataset.uid,
            data_dir=private_data_dir,
        )

        private_config_path = dataset.private_config_path
        private_config_path.parent.mkdir(parents=True, exist_ok=True)
        private_config.save(filepath=private_config_path)

    def _prepare_readme(self, dataset: Dataset, src_file: Path | None) -> list[Path]:
        """Copy the optional markdown README into the mock directory.

        Returns:
            Absolute destination paths of the copied files (empty when
            ``src_file`` is ``None``).

        Raises:
            FileNotFoundError: If ``src_file`` is not an existing file.
            ValueError: If ``src_file`` is not a ``.md`` file.
        """
        if src_file is None:
            return []
        if not src_file.is_file():
            raise FileNotFoundError(f"Could not find README at {src_file}")
        if src_file.suffix.lower() != ".md":
            raise ValueError("readme file must be a markdown (.md) file.")
        return copy_paths(
            files=[src_file],
            dst=dataset.mock_dir,
            exists_ok=True,
        )

    def create(
        self,
        name: str,
        mock_path: PathLike,
        private_path: PathLike,
        summary: str | None = None,
        readme_path: PathLike | None = None,
        location: str | None = None,
        tags: list[str] | None = None,
        # copy_private_data: bool = True,  # TODO
    ) -> Dataset:
        """Create a dataset on the manager's own datasite.

        Copies the mock data (and optional README) into the public dataset
        directory, copies the private data into the private dataset directory,
        and writes both metadata files.

        Args:
            name (str): Unique name of the dataset to create. Only alphanumeric
                characters, underscores, and hyphens are allowed.
            mock_path (PathLike): Path to the existing mock data. This can be a file or a directory.
            private_path (PathLike): Path to the existing private data. This can be a file or a directory.
            summary (str | None, optional): Short summary of the dataset. Defaults to None.
            readme_path (PathLike | None, optional): Markdown README in the public dataset. Defaults to None.
            location (str | None, optional): Location identifier for the dataset, e.g. 'high-side-1234'.
                Only required for datasets that are hosted on a remote location and require manual syncing.
                Defaults to None.
            tags (list[str] | None, optional): Optional tags for the dataset. Defaults to None.

        Returns:
            Dataset: The created Dataset object.

        Raises:
            ValueError: If ``name`` is not a valid dataset name, or the source
                data is invalid.
            FileNotFoundError: If the mock data or README cannot be found.
            FileExistsError: If the mock or private directory already holds
                data.
        """
        # The name becomes a directory name in both the public and the private
        # tree, so reject anything that could escape or nest those directories
        # before touching the filesystem.
        self._validate_dataset_name(name)

        mock_path = to_path(mock_path)
        private_path = to_path(private_path)
        readme_path = to_path(readme_path) if readme_path else None
        tags = tags or []

        syftbox_folder = self.syftbox_config.syftbox_folder
        mock_dir = self.get_mock_dataset_dir(
            dataset_name=name,
            datasite=self.syftbox_config.email,
        )
        mock_url = SyftBoxURL.from_path(
            path=mock_dir,
            syftbox_folder=syftbox_folder,
        )
        readme_url = None
        if readme_path:
            readme_url = SyftBoxURL.from_path(
                path=mock_dir / readme_path.name,
                syftbox_folder=syftbox_folder,
            )

        # Generate private_url for the dataset
        # Private URLs use a simple path format
        private_url = SyftBoxURL(
            f"syft://private/syft_datasets/{name}",
        )

        dataset = Dataset(
            name=name,
            mock_url=mock_url,
            private_url=private_url,
            readme_url=readme_url,
            summary=summary,
            location=location,
            tags=tags,
        )
        dataset._syftbox_config = self.syftbox_config

        # Prepare mock data and collect file paths
        mock_data_files = self._prepare_mock_data(
            dataset=dataset,
            src_path=mock_path,
        )

        # Prepare readme and collect file paths
        readme_files = self._prepare_readme(
            dataset=dataset,
            src_file=readme_path,
        )

        # TODO enable adding private data without copying to SyftBox
        # e.g. private_data_dir = dataset._private_metadata_dir if copy_private_data else private_path
        self._prepare_private_config(
            dataset=dataset,
            private_data_dir=dataset._private_metadata_dir,
        )

        # Prepare private data and collect file paths
        private_data_files = self._prepare_private_data(
            dataset=dataset,
            src_path=private_path,
        )

        # Store file paths, excluding metadata files
        dataset_yaml_path = (mock_dir / METADATA_FILENAME).absolute()
        private_metadata_path = dataset.private_config_path.absolute()

        # Mock files exclude dataset.yaml and readme.md
        dataset.mock_files_paths = [
            f
            for f in mock_data_files
            if f != dataset_yaml_path and f not in readme_files
        ]

        # Private files exclude private_metadata.yaml
        dataset.private_files_paths = [
            f for f in private_data_files if f != private_metadata_path
        ]

        # Save dataset metadata
        dataset.save(filepath=dataset_yaml_path)
        return dataset

    def _load_dataset_from_dir(self, dataset_dir: Path) -> Dataset:
        """Load the dataset described by ``dataset.yaml`` inside ``dataset_dir``.

        Raises:
            FileNotFoundError: If the metadata file is missing.
        """
        metadata_path = dataset_dir / METADATA_FILENAME
        if not metadata_path.exists():
            raise FileNotFoundError(f"Dataset metadata not found at {metadata_path}")

        return Dataset.load(
            filepath=metadata_path,
            syftbox_config=self.syftbox_config,
        )

    def get(self, name: str, datasite: str | None = None) -> Dataset:
        """Return the dataset ``name`` from ``datasite`` (default: own datasite).

        Raises:
            FileNotFoundError: If the dataset directory or its metadata file
                does not exist.
        """
        datasite = datasite or self.syftbox_config.email
        mock_dir = self.get_mock_dataset_dir(
            dataset_name=name,
            datasite=datasite,
        )

        if not mock_dir.exists():
            raise FileNotFoundError(f"Dataset {name} not found in {mock_dir}")
        return self._load_dataset_from_dir(mock_dir)

    def __getitem__(self, key: str) -> Dataset:
        """Shorthand for ``get(name=key)`` on the manager's own datasite."""
        return self.get(name=key)

    def get_all(
        self,
        datasite: str | None = None,
        limit: int | None = None,
        offset: int | None = None,
        order_by: str | None = None,
        sort_order: Literal["asc", "desc"] = "asc",
    ) -> list[Dataset]:
        """List datasets of one datasite, or of every datasite in the SyftBox.

        Args:
            datasite: Datasite to list. When omitted, every directory in the
                SyftBox folder whose name contains ``"@"`` is scanned.
            limit: Maximum number of datasets to return.
            offset: Number of datasets to skip from the start.
            order_by: Name of the ``Dataset`` attribute to sort by.
            sort_order: ``"asc"`` or ``"desc"``; only used with ``order_by``.

        Returns:
            A ``TableList`` of the datasets that could be loaded. Directories
            that fail to load are skipped.
        """
        all_datasets = []

        if datasite:
            datasites_to_check = [datasite]
        else:
            syftbox_folder = self.syftbox_config.syftbox_folder
            # All directories with "@" in the name are peer/owner email directories
            datasites_to_check = [
                d.name for d in syftbox_folder.iterdir() if d.is_dir() and "@" in d.name
            ]

        # Distinct loop name so the ``datasite`` argument is not overwritten.
        for datasite_name in datasites_to_check:
            public_datasets_dir = self.public_dir_for_datasite(datasite_name)
            if not public_datasets_dir.exists():
                continue
            for dataset_dir in public_datasets_dir.iterdir():
                if dataset_dir.is_dir():
                    try:
                        dataset = self._load_dataset_from_dir(dataset_dir)
                        all_datasets.append(dataset)
                    except Exception:
                        # Best-effort listing: one unreadable or invalid
                        # dataset must not hide the others.
                        continue

        if order_by is not None:
            all_datasets.sort(
                key=lambda d: getattr(d, order_by),
                reverse=(sort_order.lower() == "desc"),
            )

        if offset is not None:
            all_datasets = all_datasets[offset:]
        if limit is not None:
            all_datasets = all_datasets[:limit]

        return TableList(all_datasets)

    def delete(
        self,
        name: str,
        datasite: str | None = None,
        require_confirmation: bool = True,
    ) -> None:
        """Delete a dataset's mock directory and private metadata directory.

        Args:
            name: Name of the dataset to delete.
            datasite: Must be the manager's own datasite (the default).
            require_confirmation: When true, prompt on stdin and only delete
                after the user answers ``yes``.

        Raises:
            ValueError: If ``datasite`` is not the manager's own datasite.
            FileNotFoundError: If the dataset does not exist.
        """
        datasite = datasite or self.syftbox_config.email

        if datasite != self.syftbox_config.email:
            # NOTE this check is easily bypassed, but bypassing does not have any effect.
            # When bypassed, the dataset will be restored because the user only has
            # read access to someone else's datasite.
            raise ValueError(
                "Cannot delete datasets from a datasite that is not your own."
            )

        try:
            dataset = self.get(
                name=name,
                datasite=datasite,
            )
        except FileNotFoundError as err:
            # Chain explicitly so the original lookup failure stays attached
            # as the cause of the friendlier message.
            raise FileNotFoundError(
                f"Dataset {name} not found in datasite {datasite}"
            ) from err

        if require_confirmation:
            msg = (
                "Deleting this dataset will remove the following folders:\n"
                f"Mock data: {dataset.mock_dir}\n"
                f"Private metadata: {dataset._private_metadata_dir}\n"
            )
            if (
                dataset._private_metadata_dir.exists()
                and dataset.private_dir.resolve().absolute()
                == dataset._private_metadata_dir.resolve().absolute()
            ):
                msg += (
                    "WARNING: this will also delete the private data from your system\n"
                )
            else:
                msg += "Private data will not be deleted from your system, it is not managed by SyftBox.\n"

            msg += "Are you sure you want to delete these folders? (yes/no): "
            confirmation = input(msg).strip().lower()
            if confirmation != "yes":
                print("Dataset deletion cancelled.")
                return

        # Delete the dataset directories
        if dataset.mock_dir.exists():
            shutil.rmtree(dataset.mock_dir)
        if dataset._private_metadata_dir.exists():
            shutil.rmtree(dataset._private_metadata_dir)
@@ -0,0 +1,44 @@
1
+ import shutil
2
+ from pathlib import Path
3
+ from typing import Iterator
4
+
5
+
6
def copy_dir_contents(src: Path, dst: Path, exists_ok: bool = False) -> list[Path]:
    """Copy every entry directly inside ``src`` into ``dst``.

    Args:
        src: Existing directory whose entries are copied.
        dst: Destination directory.
        exists_ok: Forwarded to ``copy_paths``.

    Returns:
        The list returned by ``copy_paths`` for the entries of ``src``.

    Raises:
        ValueError: If ``src`` is not a directory.
    """
    if src.is_dir():
        entries = src.iterdir()
        return copy_paths(entries, dst, exists_ok)
    raise ValueError(f"Source path {src} is not a directory.")
10
+
11
+
12
def copy_paths(files: Iterator[Path], dst: Path, exists_ok: bool = False) -> list[Path]:
    """Copy files and directory trees into the directory ``dst``.

    ``dst`` is created (including parents) when it does not exist. Each source
    keeps its own name inside ``dst``. Sources that are neither a file nor a
    directory are skipped.

    Args:
        files: Paths of the files/directories to copy.
        dst: Destination directory.
        exists_ok: Allow destinations that already exist; also passed to
            ``shutil.copytree`` as ``dirs_exist_ok``.

    Returns:
        list[Path]: Absolute destination paths. For a directory source, these
        are all files found under its destination directory after the copy.

    Raises:
        FileExistsError: If a destination exists and ``exists_ok`` is false.
    """
    if not dst.exists():
        dst.mkdir(parents=True, exist_ok=True)

    results: list[Path] = []
    for source in files:
        target = dst / source.name
        if target.exists() and not exists_ok:
            raise FileExistsError(f"Destination path {target} already exists.")

        if source.is_file():
            shutil.copy2(source, target)
            results.append(target.absolute())
            continue

        if source.is_dir():
            shutil.copytree(source, target, dirs_exist_ok=exists_ok)
            # Collect every file below the copied directory tree.
            results.extend(
                found.absolute() for found in target.rglob("*") if found.is_file()
            )

    return results
39
+
40
+
41
def is_empty_dir(path: Path) -> bool:
    """Return ``True`` only if ``path`` is a directory with no entries."""
    # A non-directory (including a missing path) short-circuits to False;
    # otherwise the directory is empty when its iterator yields nothing.
    return path.is_dir() and next(path.iterdir(), None) is None
syft_datasets/types.py ADDED
@@ -0,0 +1,11 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Union
4
+
5
+ from typing_extensions import TypeAlias
6
+
7
+ PathLike: TypeAlias = Union[str, Path, os.PathLike]
8
+
9
+
10
def to_path(path: PathLike) -> Path:
    """Convert a path-like value to a user-expanded, resolved ``Path``."""
    expanded = Path(path).expanduser()
    return expanded.resolve()
syft_datasets/url.py ADDED
@@ -0,0 +1,135 @@
1
+ import re
2
+ from pathlib import Path
3
+ from typing import Any, Dict, Union
4
+ from urllib.parse import urlencode, urlparse
5
+
6
+ from pydantic import GetCoreSchemaHandler, GetJsonSchemaHandler, ValidationInfo
7
+ from pydantic.json_schema import JsonSchemaValue
8
+ from pydantic_core import core_schema
9
+ from typing_extensions import Self
10
+
11
+ from .types import PathLike, to_path
12
+
13
+
14
+ class SyftBoxURL(str):
15
+ def __new__(cls, url: str):
16
+ instance = super().__new__(cls, url)
17
+ if not cls.is_valid(url):
18
+ raise ValueError(f"Invalid SyftBoxURL: {url}")
19
+ instance.parsed = urlparse(url)
20
+ return instance
21
+
22
+ @classmethod
23
+ def is_valid(cls, url: str) -> bool:
24
+ """Validates the given URL matches the syft:// protocol.
25
+
26
+ Supports two formats:
27
+ 1. Email-based: syft://user@domain.com/path (for public data)
28
+ 2. Simple path: syft://path (for private/local data)
29
+ """
30
+ # Pattern for email-based URLs (e.g., syft://user@domain.com/path)
31
+ email_pattern = (
32
+ r"^syft://([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)(/.*)?$"
33
+ )
34
+ # Pattern for simple path URLs (e.g., syft://private/path)
35
+ simple_pattern = r"^syft://([a-zA-Z0-9_.-]+)(/.*)?$"
36
+
37
+ return bool(re.match(email_pattern, url)) or bool(re.match(simple_pattern, url))
38
+
39
+ @property
40
+ def query(self) -> Dict[str, str]:
41
+ """Returns the query parameters as a dictionary."""
42
+ if not self.parsed.query:
43
+ return {}
44
+
45
+ return dict(
46
+ param.split("=", 1)
47
+ for param in self.parsed.query.split("&")
48
+ if "=" in param
49
+ )
50
+
51
+ @property
52
+ def protocol(self) -> str:
53
+ """Returns the protocol (syft://)."""
54
+ return self.parsed.scheme + "://"
55
+
56
+ @property
57
+ def host(self) -> str:
58
+ """Returns the host, which is the email part."""
59
+ return self.parsed.netloc
60
+
61
+ @property
62
+ def path(self) -> str:
63
+ """Returns the path component after the email."""
64
+ return self.parsed.path
65
+
66
+ def to_local_path(self, syftbox_folder: PathLike) -> Path:
67
+ """
68
+ Converts the SyftBoxURL to a local file system path.
69
+
70
+ Args:
71
+ syftbox_folder (Path): Base SyftBox directory.
72
+
73
+ Returns:
74
+ Path: Local file system path.
75
+ """
76
+ # Remove the protocol and prepend the syftbox_folder
77
+ local_path = to_path(syftbox_folder) / self.host / self.path.lstrip("/")
78
+ return local_path.resolve()
79
+
80
+ def as_http_params(self) -> Dict[str, str]:
81
+ return {
82
+ "method": "get",
83
+ "datasite": self.host,
84
+ "path": self.path,
85
+ }
86
+
87
+ def to_http_get(self, rpc_url: str) -> str:
88
+ rpc_url = rpc_url.split("//")[-1]
89
+ params = self.as_http_params()
90
+ url_params = urlencode(params)
91
+ http_url = f"http://{rpc_url}?{url_params}"
92
+ return http_url
93
+
94
+ @classmethod
95
+ def from_path(cls, path: PathLike, syftbox_folder: PathLike) -> Self:
96
+ rel_path = to_path(path).relative_to(to_path(syftbox_folder))
97
+ # convert to posix path to make it work on Windows OS
98
+ rel_path = rel_path.as_posix()
99
+ return cls(f"syft://{rel_path}")
100
+
101
+ @classmethod
102
+ def validate(
103
+ cls, value: Union["SyftBoxURL", str], info: ValidationInfo
104
+ ) -> "SyftBoxURL":
105
+ if type(value) not in (str, cls):
106
+ raise ValueError(
107
+ f"Invalid type for url: {type(value)}. Expected str or SyftBoxURL."
108
+ )
109
+ value = str(value)
110
+ if not cls.is_valid(value):
111
+ raise ValueError(f"Invalid SyftBoxURL: {value}")
112
+ return cls(value)
113
+
114
+ @classmethod
115
+ def __get_pydantic_core_schema__(
116
+ cls,
117
+ source_type: Any,
118
+ handler: GetCoreSchemaHandler,
119
+ ) -> core_schema.CoreSchema:
120
+ """Pydantic V2 core schema for custom type validation."""
121
+ return core_schema.with_info_after_validator_function(
122
+ cls.validate,
123
+ handler(str),
124
+ )
125
+
126
+ @classmethod
127
+ def __get_pydantic_json_schema__(
128
+ cls, schema_or_field: Any, schema_handler: GetJsonSchemaHandler
129
+ ) -> JsonSchemaValue:
130
+ """Define the JSON schema representation for Pydantic models."""
131
+ return {
132
+ "type": "string",
133
+ "format": "uri",
134
+ "description": "A SyftBox URL",
135
+ }