syft-dataset 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,175 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ credentials/
7
+
8
+ # C extensions
9
+ *.so
10
+
11
+ # Distribution / packaging
12
+ .Python
13
+ build/
14
+ develop-eggs/
15
+ dist/
16
+ downloads/
17
+ eggs/
18
+ .eggs/
19
+ lib/
20
+ lib64/
21
+ parts/
22
+ sdist/
23
+ var/
24
+ wheels/
25
+ share/python-wheels/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+ MANIFEST
30
+
31
+ # PyInstaller
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ .python-version
87
+
88
+ # pipenv
89
+ Pipfile.lock
90
+
91
+ # poetry
92
+ poetry.lock
93
+
94
+ # pdm
95
+ .pdm.toml
96
+
97
+ # PEP 582
98
+ __pypackages__/
99
+
100
+ # Celery stuff
101
+ celerybeat-schedule
102
+ celerybeat.pid
103
+
104
+ # SageMath parsed files
105
+ *.sage.py
106
+
107
+ # Environments
108
+ .env
109
+ .venv
110
+ env/
111
+ venv/
112
+ ENV/
113
+ env.bak/
114
+ venv.bak/
115
+
116
+ # Spyder project settings
117
+ .spyderproject
118
+ .spyproject
119
+
120
+ # Rope project settings
121
+ .ropeproject
122
+
123
+ # mkdocs documentation
124
+ /site
125
+
126
+ # mypy
127
+ .mypy_cache/
128
+ .dmypy.json
129
+ dmypy.json
130
+
131
+ # Pyre type checker
132
+ .pyre/
133
+
134
+ # pytype static type analyzer
135
+ .pytype/
136
+
137
+ # Cython debug symbols
138
+ cython_debug/
139
+
140
+ # PyCharm
141
+ .idea/
142
+
143
+ # VS Code
144
+ .vscode/
145
+
146
+ # macOS
147
+ .DS_Store
148
+
149
+ # Google Drive credentials
150
+ credentials.json
151
+ client_secret*.json
152
+ token.pickle
153
+ token.json
154
+
155
+ # SyftBox related
156
+ ~/SyftBox/
157
+ SyftBox/
158
+
159
+ # Temporary files
160
+ *.tmp
161
+ *.bak
162
+ *.swp
163
+ *~
164
+
165
+ # Claude AI settings
166
+ .claude/
167
+ # CI test outputs
168
+ test_outputs/
169
+ test_suite_output.log
170
+
171
+
172
+ # Notebooks
173
+ notebooks/e2e/sales_mock.csv
174
+ notebooks/e2e/sales_private.csv
175
+ notebooks/e2e/readme.md
@@ -0,0 +1,9 @@
1
+ Metadata-Version: 2.4
2
+ Name: syft-dataset
3
+ Version: 0.1.0
4
+ Summary: Syft Datasets
5
+ Author-email: OpenMined <info@openmined.org>
6
+ License: Apache-2.0
7
+ Requires-Python: >=3.12
8
+ Requires-Dist: pyyaml>=6.0.3
9
+ Requires-Dist: syft-notebook-ui
@@ -0,0 +1,25 @@
1
+ [project]
2
+ name = "syft-dataset"
3
+ version = "0.1.0"
4
+ description = "Syft Datasets"
5
+ authors = [{ name = "OpenMined", email = "info@openmined.org" }]
6
+ license = { text = "Apache-2.0" }
7
+ requires-python = ">=3.12"
8
+
9
+ dependencies = [
10
+ "pyyaml>=6.0.3",
11
+ "syft-notebook-ui",
12
+ ]
13
+
14
+
15
+ [build-system]
16
+ requires = ["hatchling"]
17
+ build-backend = "hatchling.build"
18
+
19
+ [tool.hatch.build.targets.wheel]
20
+ packages = ["src/syft_datasets"]
21
+
22
+ [dependency-groups]
23
+ dev = [
24
+ "ipykernel>=7.1.0",
25
+ ]
@@ -0,0 +1,3 @@
1
+ from .config import SyftBoxConfig
2
+
3
+ __all__ = ["SyftBoxConfig"]
@@ -0,0 +1,17 @@
1
+ from pydantic import BaseModel, Field
2
+ from pathlib import Path
3
+
4
+
5
class SyftBoxConfig(BaseModel):
    """Settings identifying a local SyftBox folder and the email that owns it."""

    syftbox_folder: Path = Field(
        ...,
        description="Path to the SyftBox folder on the local filesystem.",
    )
    email: str = Field(
        ...,
        description="Email associated with the SyftBox.",
    )

    @property
    def private_dir(self) -> Path:
        """Return the ``private`` directory directly under the SyftBox folder."""
        return self.syftbox_folder.joinpath("private")

    @property
    def public_dir(self) -> Path:
        """Return the ``public`` directory inside this email's folder."""
        return self.syftbox_folder.joinpath(self.email, "public")
@@ -0,0 +1,192 @@
1
+ from datetime import datetime, timezone
2
+ from functools import cached_property
3
+ from pathlib import Path
4
+ from typing import ClassVar, Self
5
+ from uuid import UUID, uuid4
6
+
7
+ import yaml
8
+ from pydantic import BaseModel, Field
9
+ from syft_notebook_ui.formatter_mixin import (
10
+ ANSIPydanticFormatter,
11
+ PydanticFormatter,
12
+ PydanticFormatterMixin,
13
+ )
14
+
15
+ from .types import PathLike, to_path
16
+ from .url import SyftBoxURL
17
+ from .config import SyftBoxConfig
18
+
19
+
20
def _utcnow():
    """Return the current time as a timezone-aware ``datetime`` in UTC."""
    utc = timezone.utc
    return datetime.now(utc)
22
+
23
+
24
class DatasetBase(BaseModel):
    """Common base for dataset models: YAML persistence and display hooks.

    ``_syftbox_config`` is attached after loading (it is not read from the
    YAML data) and is ``None`` until someone sets it.
    """

    __display_formatter__: ClassVar[PydanticFormatter] = ANSIPydanticFormatter()
    _syftbox_config: SyftBoxConfig | None = None

    def save(self, filepath: PathLike) -> None:
        """Write this model to ``filepath`` as YAML.

        Missing parent directories are created.

        Raises:
            ValueError: If ``filepath`` does not end in ``.yaml``.
        """
        target = to_path(filepath)
        if target.suffix != ".yaml":
            raise ValueError("Model must be saved as a .yaml file.")

        parent = target.parent
        if not parent.exists():
            parent.mkdir(parents=True, exist_ok=True)

        # Dump in JSON mode so every value is a plain type that safe_dump accepts.
        payload = self.model_dump(mode="json")
        target.write_text(yaml.safe_dump(payload, indent=2, sort_keys=False))

    @classmethod
    def load(
        cls, filepath: PathLike, syftbox_config: SyftBoxConfig | None = None
    ) -> Self:
        """Read a model from a YAML file and attach ``syftbox_config`` to it.

        Raises:
            FileNotFoundError: If ``filepath`` does not exist.
        """
        source = to_path(filepath)
        if not source.exists():
            raise FileNotFoundError(f"Config file not found: {source}")

        raw = yaml.safe_load(source.read_text())
        instance = cls.model_validate(raw)
        instance._syftbox_config = syftbox_config
        return instance

    def __str__(self) -> str:
        """Delegate ``str()`` to the class-level display formatter."""
        formatter = self.__display_formatter__
        return formatter.format_str(self)

    def __repr__(self) -> str:
        """Delegate ``repr()`` to the class-level display formatter."""
        formatter = self.__display_formatter__
        return formatter.format_repr(self)

    def _repr_html_(self) -> str:
        """Delegate the HTML representation to the class-level display formatter."""
        formatter = self.__display_formatter__
        return formatter.format_html(self)

    def _repr_markdown_(self) -> str:
        """Delegate the Markdown representation to the class-level display formatter."""
        formatter = self.__display_formatter__
        return formatter.format_markdown(self)
64
+
65
+
66
class PrivateDatasetConfig(DatasetBase, PydanticFormatterMixin):
    """Private metadata of a dataset; it is stored outside of the sync folder."""

    # Identifier of the dataset this private record belongs to.
    uid: UUID
    # Directory that holds the private data.
    data_dir: Path
71
+
72
+
73
class Dataset(DatasetBase, PydanticFormatterMixin):
    """Public metadata of a dataset plus helpers to locate its files on disk.

    The record stores ``syft://`` URLs for the mock data and the optional
    README. Local paths are derived from those URLs through the attached
    ``SyftBoxConfig``, so every path accessor raises ``ValueError`` while no
    config is set. Private-data accessors additionally require that the config
    email equals the dataset owner (the host of ``mock_url``).
    """

    __table_extra_fields__ = [
        "name",
        "owner",
    ]

    uid: UUID = Field(default_factory=uuid4)
    created_at: datetime = Field(default_factory=_utcnow)
    updated_at: datetime = Field(default_factory=_utcnow)
    name: str
    summary: str | None = None
    tags: list[str] = []
    location: str | None = None

    mock_url: SyftBoxURL
    readme_url: SyftBoxURL | None = None

    @property
    def owner(self) -> str:
        """Return the datasite (email) taken from the host part of ``mock_url``."""
        return self.mock_url.host

    @property
    def syftbox_config(self) -> SyftBoxConfig:
        """Return the attached SyftBox config.

        Raises:
            ValueError: If no config has been attached to this dataset.
        """
        if self._syftbox_config is None:
            raise ValueError("SyftBox config is not set.")
        return self._syftbox_config

    def _url_to_path(self, url: SyftBoxURL) -> Path:
        """Map a SyftBox URL to a path inside the configured SyftBox folder."""
        return url.to_local_path(
            syftbox_folder=self.syftbox_config.syftbox_folder,
        )

    @property
    def readme_path(self) -> Path | None:
        """Return the local README path, or ``None`` when no README URL is set."""
        if self.readme_url is None:
            return None
        return self._url_to_path(self.readme_url)

    def get_readme(self) -> str | None:
        """Get the content of the README file.

        Returns ``None`` when no README is configured or the file is missing.
        """
        # Resolve the path once instead of re-evaluating the property.
        readme_path = self.readme_path
        if readme_path and readme_path.exists():
            return readme_path.read_text()
        return None

    @property
    def mock_dir(self) -> Path:
        """Return the local directory that ``mock_url`` points to."""
        return self._url_to_path(self.mock_url)

    @property
    def private_config_path(self) -> Path:
        """Return the path of this dataset's ``private_metadata.yaml``.

        Raises:
            ValueError: If the configured email is not the dataset owner.
        """
        if self.syftbox_config.email != self.owner:
            raise ValueError(
                "Cannot access private config for a dataset owned by another user."
            )
        return self._private_metadata_dir / "private_metadata.yaml"

    @cached_property
    def private_config(self) -> PrivateDatasetConfig:
        """Load the private dataset config from disk (cached after first access).

        Raises:
            FileNotFoundError: If the private config file does not exist.
            ValueError: If the configured email is not the dataset owner.
        """
        config_path = self.private_config_path
        if not config_path.exists():
            raise FileNotFoundError(
                f"Private dataset config not found at {config_path}"
            )

        return PrivateDatasetConfig.load(
            filepath=config_path, syftbox_config=self._syftbox_config
        )

    @property
    def private_dir(self) -> Path:
        """Return the private data directory recorded in the private config."""
        return self.private_config.data_dir

    @property
    def _private_metadata_dir(self) -> Path:
        """Return ``<syftbox>/private/syft_datasets/<name>`` for this dataset.

        Raises:
            ValueError: If the configured email is not the dataset owner.
        """
        if self.syftbox_config.email != self.owner:
            raise ValueError(
                "Cannot access private data for a dataset owned by another user."
            )

        # TODO add 'private' to sb workspace
        # Reuse the config's own notion of the private folder so both stay in sync.
        private_datasets_dir = self.syftbox_config.private_dir / "syft_datasets"

        return private_datasets_dir / self.name

    def describe(self) -> None:
        """Display an HTML description of this dataset in IPython.

        Local paths are shown when they can be resolved; otherwise the
        corresponding URL field is shown instead. The private directory is
        listed only when it is accessible and exists.
        """
        from IPython.display import HTML, display
        from syft_notebook_ui.pydantic_html_repr import create_html_repr

        fields_to_include = ["name", "created_at", "summary", "tags", "location"]

        paths_to_include = []
        try:
            # Resolve the path here so that a failure (e.g. no SyftBox config
            # attached) actually reaches the URL fallback below.
            _ = self.mock_dir
            paths_to_include.append("mock_dir")
        except Exception:
            fields_to_include.append("mock_url")

        try:
            private_dir = self.private_dir
            if private_dir.is_dir():
                paths_to_include.append("private_dir")
        except Exception:
            # Best effort: non-owners and datasets without private metadata
            # simply do not get a private_dir entry.
            pass

        try:
            readme_path = self.readme_path
            if readme_path and readme_path.exists():
                paths_to_include.append("readme_path")
        except Exception:
            fields_to_include.append("readme_url")

        description = create_html_repr(
            obj=self,
            fields=fields_to_include,
            display_paths=paths_to_include,
        )

        display(HTML(description))
@@ -0,0 +1,341 @@
1
+ import re
2
+ import shutil
3
+ from pathlib import Path
4
+ from typing import Self
5
+
6
+ from .types import PathLike, to_path
7
+ from syft_notebook_ui.types import TableList
8
+ from typing_extensions import Literal
9
+
10
+ from syft_datasets.dataset import Dataset, PrivateDatasetConfig
11
+ from syft_datasets.file_utils import copy_dir_contents, copy_paths, is_empty_dir
12
+
13
+ from .url import SyftBoxURL
14
+ from .config import SyftBoxConfig
15
+
16
+ FOLDER_NAME = "syft_datasets"
17
+ METADATA_FILENAME = "dataset.yaml"
18
+
19
+
20
+ class SyftDatasetManager:
21
+ def __init__(self, syftbox_folder_path: PathLike, email: str):
22
+ self.syftbox_config = SyftBoxConfig(
23
+ syftbox_folder=to_path(syftbox_folder_path), email=email
24
+ )
25
+
26
+ @classmethod
27
+ def from_config(cls, config: SyftBoxConfig) -> Self:
28
+ return cls(syftbox_folder_path=config.syftbox_folder, email=config.email)
29
+
30
+ def public_dir_for_datasite(self, datasite: str) -> Path:
31
+ dir = self.syftbox_config.syftbox_folder / datasite / "public" / FOLDER_NAME
32
+ dir.mkdir(parents=True, exist_ok=True)
33
+ return dir
34
+
35
+ def get_mock_dataset_dir(self, dataset_name: str, datasite: str) -> Path:
36
+ return self.public_dir_for_datasite(datasite) / dataset_name
37
+
38
+ def _validate_dataset_name(self, dataset_name: str) -> None:
39
+ # Returns True if the dataset is a valid path name on unix or windows.
40
+ if not re.match(r"^[\w-]+$", dataset_name):
41
+ raise ValueError(
42
+ f"Invalid dataset name '{dataset_name}'. Only alphanumeric characters, underscores, and hyphens are allowed."
43
+ )
44
+
45
+ def _prepare_mock_data(self, dataset: Dataset, src_path: Path) -> None:
46
+ # Validate src data
47
+ if not src_path.exists():
48
+ raise FileNotFoundError(f"Could not find mock data at {src_path}")
49
+
50
+ if (src_path / METADATA_FILENAME).exists():
51
+ raise ValueError(
52
+ f"Mock data at {src_path} contains reserved file {METADATA_FILENAME}. Please rename it and try again."
53
+ )
54
+
55
+ # Validate dir we're making on Syftbox
56
+ if dataset.mock_dir.exists() and not is_empty_dir(dataset.mock_dir):
57
+ raise FileExistsError(
58
+ f"Mock dir {dataset.mock_dir} already exists and is not empty."
59
+ )
60
+ dataset.mock_dir.mkdir(parents=True, exist_ok=True)
61
+
62
+ if src_path.is_dir():
63
+ copy_dir_contents(
64
+ src=src_path,
65
+ dst=dataset.mock_dir,
66
+ exists_ok=True,
67
+ )
68
+ elif src_path.is_file():
69
+ copy_paths(
70
+ files=[src_path],
71
+ dst=dataset.mock_dir,
72
+ exists_ok=True,
73
+ )
74
+ else:
75
+ raise ValueError(
76
+ f"Mock data path {src_path} must be an existing file or directory."
77
+ )
78
+
79
+ def _prepare_private_data(
80
+ self,
81
+ dataset: Dataset,
82
+ src_path: Path,
83
+ ) -> None:
84
+ dataset.private_dir.mkdir(parents=True, exist_ok=True)
85
+
86
+ if src_path.is_dir():
87
+ # TODO: Implementing without copying private data to `SyftBox/private``
88
+ copy_dir_contents(
89
+ src=src_path,
90
+ dst=dataset.private_dir,
91
+ exists_ok=True,
92
+ )
93
+ elif src_path.is_file():
94
+ copy_paths(
95
+ files=[src_path],
96
+ dst=dataset.private_dir,
97
+ exists_ok=True,
98
+ )
99
+ else:
100
+ raise ValueError(
101
+ f"Private data path {src_path} must be an existing file or directory."
102
+ )
103
+
104
+ def _prepare_private_config(
105
+ self,
106
+ dataset: Dataset,
107
+ private_data_dir: Path,
108
+ location: str | None = None,
109
+ ) -> None:
110
+ """
111
+ The private dataset config is used to store private metadata separately from the public dataset metadata.
112
+ """
113
+ if dataset._private_metadata_dir.exists() and not is_empty_dir(
114
+ dataset._private_metadata_dir
115
+ ):
116
+ raise FileExistsError(
117
+ f"Private dir {dataset.private_dir} already exists and is not empty."
118
+ )
119
+
120
+ private_config = PrivateDatasetConfig(
121
+ uid=dataset.uid,
122
+ data_dir=private_data_dir,
123
+ location=location,
124
+ )
125
+
126
+ private_config_path = dataset.private_config_path
127
+ private_config_path.parent.mkdir(parents=True, exist_ok=True)
128
+ private_config.save(filepath=private_config_path)
129
+
130
+ def _prepare_readme(self, dataset: Dataset, src_file: Path | None) -> None:
131
+ if src_file is not None:
132
+ if not src_file.is_file():
133
+ raise FileNotFoundError(f"Could not find README at {src_file}")
134
+ if not src_file.suffix.lower() == ".md":
135
+ raise ValueError("readme file must be a markdown (.md) file.")
136
+ copy_paths(
137
+ files=[src_file],
138
+ dst=dataset.mock_dir,
139
+ exists_ok=True,
140
+ )
141
+
142
+ def create(
143
+ self,
144
+ name: str,
145
+ mock_path: PathLike,
146
+ private_path: PathLike,
147
+ summary: str | None = None,
148
+ readme_path: Path | None = None,
149
+ location: str | None = None,
150
+ tags: list[str] | None = None,
151
+ # copy_private_data: bool = True, # TODO
152
+ ) -> Dataset:
153
+ """_summary_
154
+
155
+ Args:
156
+ name (str): Unique of the dataset to create.
157
+ mock_path (PathLike): Path to the existing mock data. This can be a file or a directory.
158
+ private_path (PathLike): Path to the existing private data. This can be a file or a directory.
159
+ summary (str | None, optional): Short summary of the dataset. Defaults to None.
160
+ readme_path (Path | None, optional): Markdown README in the public dataset. Defaults to None.
161
+ location (str | None, optional): Location identifier for the dataset, e.g. 'high-side-1234'.
162
+ Only required for datasets that are hosted on a remote location and require manual syncing.
163
+ Defaults to None.
164
+ tags (list[str] | None, optional): Optional tags for the dataset. Defaults to None.
165
+
166
+ Returns:
167
+ Dataset: The created Dataset object.
168
+ """
169
+ mock_path = to_path(mock_path)
170
+ private_path = to_path(private_path)
171
+ readme_path = to_path(readme_path) if readme_path else None
172
+ tags = tags or []
173
+
174
+ mock_dir = self.get_mock_dataset_dir(
175
+ dataset_name=name,
176
+ datasite=self.syftbox_config.email,
177
+ )
178
+ mock_url = SyftBoxURL.from_path(
179
+ path=mock_dir,
180
+ syftbox_folder=self.syftbox_config.syftbox_folder,
181
+ )
182
+ readme_url = None
183
+ if readme_path:
184
+ readme_url = SyftBoxURL.from_path(
185
+ path=mock_dir / readme_path.name,
186
+ syftbox_folder=self.syftbox_config.syftbox_folder,
187
+ )
188
+
189
+ dataset = Dataset(
190
+ name=name,
191
+ mock_url=mock_url,
192
+ readme_url=readme_url,
193
+ summary=summary,
194
+ location=location,
195
+ tags=tags,
196
+ )
197
+ dataset._syftbox_config = self.syftbox_config
198
+
199
+ self._prepare_mock_data(
200
+ dataset=dataset,
201
+ src_path=mock_path,
202
+ )
203
+ self._prepare_readme(
204
+ dataset=dataset,
205
+ src_file=readme_path,
206
+ )
207
+
208
+ # TODO enable adding private data without copying to SyftBox
209
+ # e.g. private_data_dir = dataset._private_metadata_dir if copy_private_data else private_path
210
+ self._prepare_private_config(
211
+ dataset=dataset,
212
+ private_data_dir=dataset._private_metadata_dir,
213
+ )
214
+ self._prepare_private_data(
215
+ dataset=dataset,
216
+ src_path=private_path,
217
+ )
218
+
219
+ dataset_yaml_path = mock_dir / METADATA_FILENAME
220
+ dataset.save(filepath=dataset_yaml_path)
221
+ return dataset
222
+
223
+ def _load_dataset_from_dir(self, dataset_dir: Path) -> Dataset:
224
+ metadata_path = dataset_dir / METADATA_FILENAME
225
+ if not metadata_path.exists():
226
+ raise FileNotFoundError(f"Dataset metadata not found at {metadata_path}")
227
+
228
+ return Dataset.load(
229
+ filepath=metadata_path,
230
+ syftbox_config=self.syftbox_config,
231
+ )
232
+
233
+ def get(self, name: str, datasite: str | None = None) -> Dataset:
234
+ datasite = datasite or self.syftbox_config.email
235
+ mock_dir = self.get_mock_dataset_dir(
236
+ dataset_name=name,
237
+ datasite=datasite,
238
+ )
239
+
240
+ if not mock_dir.exists():
241
+ raise FileNotFoundError(f"Dataset {name} not found in {mock_dir}")
242
+ return self._load_dataset_from_dir(mock_dir)
243
+
244
+ def __getitem__(self, key: str) -> Dataset:
245
+ return self.get(name=key)
246
+
247
+ def get_all(
248
+ self,
249
+ datasite: str | None = None,
250
+ limit: int | None = None,
251
+ offset: int | None = None,
252
+ order_by: str | None = None,
253
+ sort_order: Literal["asc", "desc"] = "asc",
254
+ ) -> list[Dataset]:
255
+ all_datasets = []
256
+
257
+ if datasite:
258
+ datasites_to_check = [datasite]
259
+ else:
260
+ syftbox_folder = self.syftbox_config.syftbox_folder
261
+ # All directories with "@" in the name are peer/owner email directories
262
+ datasites_to_check = [
263
+ d.name for d in syftbox_folder.iterdir() if d.is_dir() and "@" in d.name
264
+ ]
265
+
266
+ for datasite in datasites_to_check:
267
+ public_datasets_dir = self.public_dir_for_datasite(datasite)
268
+ if not public_datasets_dir.exists():
269
+ continue
270
+ for dataset_dir in public_datasets_dir.iterdir():
271
+ if dataset_dir.is_dir():
272
+ try:
273
+ dataset = self._load_dataset_from_dir(dataset_dir)
274
+ all_datasets.append(dataset)
275
+ except Exception:
276
+ continue
277
+
278
+ if order_by is not None:
279
+ all_datasets.sort(
280
+ key=lambda d: getattr(d, order_by),
281
+ reverse=(sort_order.lower() == "desc"),
282
+ )
283
+
284
+ if offset is not None:
285
+ all_datasets = all_datasets[offset:]
286
+ if limit is not None:
287
+ all_datasets = all_datasets[:limit]
288
+
289
+ return TableList(all_datasets)
290
+
291
+ def delete(
292
+ self,
293
+ name: str,
294
+ datasite: str | None = None,
295
+ require_confirmation: bool = True,
296
+ ) -> None:
297
+ datasite = datasite or self.syftbox_config.email
298
+
299
+ if datasite != self.syftbox_config.email:
300
+ # NOTE this check is easily bypassed, but bypassing does not have any effect.
301
+ # When bypassed, the dataset will be restored because the user only has
302
+ # read access to someone else's datasite.
303
+ raise ValueError(
304
+ "Cannot delete datasets from a datasite that is not your own."
305
+ )
306
+
307
+ try:
308
+ dataset = self.get(
309
+ name=name,
310
+ datasite=datasite,
311
+ )
312
+ except FileNotFoundError:
313
+ raise FileNotFoundError(f"Dataset {name} not found in datasite {datasite}")
314
+
315
+ if require_confirmation:
316
+ msg = (
317
+ "Deleting this dataset will remove the following folders:\n"
318
+ f"Mock data: {dataset.mock_dir}\n"
319
+ f"Private metadata: {dataset._private_metadata_dir}\n"
320
+ )
321
+ if (
322
+ dataset.private_dir.resolve().absolute()
323
+ == dataset._private_metadata_dir.resolve().absolute()
324
+ ):
325
+ msg += (
326
+ "WARNING: this will also delete the private data from your system\n"
327
+ )
328
+ else:
329
+ msg += "Private data will not be deleted from your system, it is not managed by SyftBox.\n"
330
+
331
+ msg += "Are you sure you want to delete these folders? (yes/no): "
332
+ confirmation = input(msg).strip().lower()
333
+ if confirmation != "yes":
334
+ print("Dataset deletion cancelled.")
335
+ return
336
+
337
+ # Delete the dataset directories
338
+ if dataset.mock_dir.exists():
339
+ shutil.rmtree(dataset.mock_dir)
340
+ if dataset._private_metadata_dir.exists():
341
+ shutil.rmtree(dataset._private_metadata_dir)
@@ -0,0 +1,33 @@
1
+ import shutil
2
+ from pathlib import Path
3
+ from typing import Iterator
4
+
5
+
6
def copy_dir_contents(src: Path, dst: Path, exists_ok: bool = False) -> None:
    """Copy every direct child of directory ``src`` into ``dst``.

    Raises:
        ValueError: If ``src`` is not a directory.
    """
    if src.is_dir():
        return copy_paths(src.iterdir(), dst, exists_ok)
    raise ValueError(f"Source path {src} is not a directory.")
10
+
11
+
12
def copy_paths(files: Iterator[Path], dst: Path, exists_ok: bool = False) -> None:
    """
    Copy the given files and directories into the directory ``dst``.

    ``dst`` is created when it does not exist yet. Entries that are neither a
    file nor a directory are skipped. With ``exists_ok`` False, an already
    existing destination entry raises ``FileExistsError``.
    """
    if not dst.exists():
        dst.mkdir(parents=True, exist_ok=True)

    for source in files:
        target = dst / source.name
        if target.exists():
            if not exists_ok:
                raise FileExistsError(f"Destination path {target} already exists.")
        if source.is_file():
            shutil.copy2(source, target)
            continue
        if source.is_dir():
            shutil.copytree(source, target, dirs_exist_ok=exists_ok)
28
+
29
+
30
def is_empty_dir(path: Path) -> bool:
    """Return True only when ``path`` is a directory that has no entries."""
    if path.is_dir():
        for _ in path.iterdir():
            # Finding a single entry is enough to know it is not empty.
            return False
        return True
    return False
@@ -0,0 +1,11 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Union
4
+
5
+ from typing_extensions import TypeAlias
6
+
7
+ PathLike: TypeAlias = Union[str, Path, os.PathLike]
8
+
9
+
10
def to_path(path: PathLike) -> Path:
    """Convert ``path`` to a ``Path`` with ``~`` expanded and the result resolved."""
    expanded = Path(path).expanduser()
    return expanded.resolve()
@@ -0,0 +1,124 @@
1
+ import re
2
+ from pathlib import Path
3
+ from typing import Any, Dict, Union
4
+ from urllib.parse import urlencode, urlparse
5
+
6
+ from pydantic import GetCoreSchemaHandler, GetJsonSchemaHandler, ValidationInfo
7
+ from pydantic.json_schema import JsonSchemaValue
8
+ from pydantic_core import core_schema
9
+ from typing_extensions import Self
10
+
11
+ from .types import PathLike, to_path
12
+
13
+
14
class SyftBoxURL(str):
    """A ``syft://<email>/<path>`` URL, stored as a string with parsed accessors.

    Instances are validated on construction and keep the ``urlparse`` result
    in ``self.parsed``. The class also provides the hooks pydantic needs to
    use it as a field type.
    """

    def __new__(cls, url: str):
        """Create the URL string.

        Raises:
            ValueError: If ``url`` is not a valid SyftBox URL.
        """
        instance = super().__new__(cls, url)
        if not cls.is_valid(url):
            raise ValueError(f"Invalid SyftBoxURL: {url}")
        instance.parsed = urlparse(url)
        return instance

    @classmethod
    def is_valid(cls, url: str) -> bool:
        """Validates the given URL matches the syft:// protocol and email-based schema."""
        pattern = r"^syft://([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)(/.*)?$"
        return bool(re.match(pattern, url))

    @property
    def query(self) -> Dict[str, str]:
        """Returns the query parameters as a dictionary.

        Parameters without an ``=`` are dropped; values are not URL-decoded.
        """
        if not self.parsed.query:
            return {}

        return dict(
            param.split("=", 1)
            for param in self.parsed.query.split("&")
            if "=" in param
        )

    @property
    def protocol(self) -> str:
        """Returns the protocol (syft://)."""
        return self.parsed.scheme + "://"

    @property
    def host(self) -> str:
        """Returns the host, which is the email part."""
        return self.parsed.netloc

    @property
    def path(self) -> str:
        """Returns the path component after the email."""
        return self.parsed.path

    def to_local_path(self, syftbox_folder: PathLike) -> Path:
        """
        Converts the SyftBoxURL to a local file system path.

        Args:
            syftbox_folder (Path): Base SyftBox directory.

        Returns:
            Path: Local file system path.

        Raises:
            ValueError: If the URL path would point outside ``syftbox_folder``.
        """
        # URLs can come from metadata written by other datasites, so refuse
        # parent-directory segments (with either separator) before building
        # any filesystem path from them.
        if ".." in re.split(r"[/\\]", self.path):
            raise ValueError(
                f"SyftBoxURL path must not contain '..' segments: {self}"
            )

        # Remove the protocol and prepend the syftbox_folder
        base = to_path(syftbox_folder)
        local_path = base / self.host / self.path.lstrip("/")
        # An anchored segment (e.g. a Windows drive) would replace the base
        # when joined; this lexical check catches that case.
        if not local_path.is_relative_to(base):
            raise ValueError(f"SyftBoxURL path escapes the SyftBox folder: {self}")
        return local_path.resolve()

    def as_http_params(self) -> Dict[str, str]:
        """Return the query parameters that describe this URL as an HTTP GET."""
        return {
            "method": "get",
            "datasite": self.host,
            "path": self.path,
        }

    def to_http_get(self, rpc_url: str) -> str:
        """Build an ``http://`` URL on ``rpc_url`` carrying this URL as parameters.

        Any scheme prefix of ``rpc_url`` is dropped and replaced by ``http://``.
        """
        rpc_url = rpc_url.split("//")[-1]
        params = self.as_http_params()
        url_params = urlencode(params)
        http_url = f"http://{rpc_url}?{url_params}"
        return http_url

    @classmethod
    def from_path(cls, path: PathLike, syftbox_folder: PathLike) -> Self:
        """Build the URL for a local ``path`` located inside ``syftbox_folder``."""
        rel_path = to_path(path).relative_to(to_path(syftbox_folder))
        # convert to posix path to make it work on Windows OS
        rel_path = rel_path.as_posix()
        return cls(f"syft://{rel_path}")

    @classmethod
    def validate(
        cls, value: Union["SyftBoxURL", str], info: ValidationInfo
    ) -> "SyftBoxURL":
        """Validator used by pydantic: coerce a string into a ``SyftBoxURL``.

        Raises:
            ValueError: If ``value`` is not a string or not a valid SyftBox URL.
        """
        # SyftBoxURL is itself a str subclass, so one isinstance check covers
        # plain strings, SyftBoxURL instances and other str subclasses.
        if not isinstance(value, str):
            raise ValueError(
                f"Invalid type for url: {type(value)}. Expected str or SyftBoxURL."
            )
        value = str(value)
        if not cls.is_valid(value):
            raise ValueError(f"Invalid SyftBoxURL: {value}")
        return cls(value)

    @classmethod
    def __get_pydantic_core_schema__(
        cls,
        source_type: Any,
        handler: GetCoreSchemaHandler,
    ) -> core_schema.CoreSchema:
        """Pydantic V2 core schema for custom type validation."""
        # Validate as a plain string first, then run cls.validate on the result.
        return core_schema.with_info_after_validator_function(
            cls.validate,
            handler(str),
        )

    @classmethod
    def __get_pydantic_json_schema__(
        cls, schema_or_field: Any, schema_handler: GetJsonSchemaHandler
    ) -> JsonSchemaValue:
        """Define the JSON schema representation for Pydantic models."""
        return {
            "type": "string",
            "format": "uri",
            "description": "A SyftBox URL",
        }