syft-dataset 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syft_dataset-0.1.3.dist-info/METADATA +9 -0
- syft_dataset-0.1.3.dist-info/RECORD +10 -0
- syft_dataset-0.1.3.dist-info/WHEEL +4 -0
- syft_datasets/__init__.py +4 -0
- syft_datasets/config.py +17 -0
- syft_datasets/dataset.py +228 -0
- syft_datasets/dataset_manager.py +377 -0
- syft_datasets/file_utils.py +44 -0
- syft_datasets/types.py +11 -0
- syft_datasets/url.py +135 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
syft_datasets/__init__.py,sha256=VisH6OXM2j-SCOZUCsPD0kBcZvrAQOvHz6QZAvcK4xI,85
|
|
2
|
+
syft_datasets/config.py,sha256=iObuEfNr5_uwxTXvKqQ1lt5lk-aYKe8gF10mbsmegcw,499
|
|
3
|
+
syft_datasets/dataset.py,sha256=7UNnX7EpSXcYg9cIYx37NnkLFD8L5YfMmdFK4AChrEI,6979
|
|
4
|
+
syft_datasets/dataset_manager.py,sha256=zCQktHvP3uYI0HT2KDvUnq8rO1G3wPhvN_6t0YPbP5Q,13658
|
|
5
|
+
syft_datasets/file_utils.py,sha256=IM9Kw9Vth3ghNqayVJ5_FvyPmXQEaqLdxLI4NDTEt2M,1468
|
|
6
|
+
syft_datasets/types.py,sha256=n7vfKW1BYn1_6223nlYPfeA4rDQoFlTMzM-2WUwCAtg,238
|
|
7
|
+
syft_datasets/url.py,sha256=662XabfCEygaURQtMyB9QW7izHccTjxnyPXyDUw6nSQ,4379
|
|
8
|
+
syft_dataset-0.1.3.dist-info/METADATA,sha256=l4hg1xotXEUk00z7w7P5lu4Lmt_nAJugul6CRTVDAEA,229
|
|
9
|
+
syft_dataset-0.1.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
10
|
+
syft_dataset-0.1.3.dist-info/RECORD,,
|
syft_datasets/config.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from pydantic import BaseModel, Field
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class SyftBoxConfig(BaseModel):
    """Local SyftBox configuration: filesystem root plus the owning user's email.

    Used throughout the package to resolve syft:// URLs and to locate the
    private (unsynced) and public (synced) data directories.
    """

    syftbox_folder: Path = Field(
        ..., description="Path to the SyftBox folder on the local filesystem."
    )
    email: str = Field(..., description="Email associated with the SyftBox.")

    @property
    def private_dir(self) -> Path:
        # Private data lives directly under the SyftBox root, outside any
        # datasite folder.
        return self.syftbox_folder / "private"

    @property
    def public_dir(self) -> Path:
        # Public data lives under the user's own datasite folder
        # (<syftbox_folder>/<email>/public).
        return self.syftbox_folder / self.email / "public"
|
syft_datasets/dataset.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
from datetime import datetime, timezone
|
|
2
|
+
from functools import cached_property
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import ClassVar, Self
|
|
5
|
+
from uuid import UUID, uuid4
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
from syft_notebook_ui.formatter_mixin import (
|
|
10
|
+
ANSIPydanticFormatter,
|
|
11
|
+
PydanticFormatter,
|
|
12
|
+
PydanticFormatterMixin,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
from .types import PathLike, to_path
|
|
16
|
+
from .url import SyftBoxURL
|
|
17
|
+
from .config import SyftBoxConfig
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _utcnow():
|
|
21
|
+
return datetime.now(tz=timezone.utc)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DatasetBase(BaseModel):
    """Common base for dataset models: YAML (de)serialization plus rich reprs.

    Subclasses are pydantic models. `_syftbox_config` is attached after
    validation (it is a private attribute, not serialized) so that loaded
    models can resolve local filesystem paths.
    """

    # Formatter backing all of the repr/str/html/markdown methods below.
    __display_formatter__: ClassVar[PydanticFormatter] = ANSIPydanticFormatter()
    # Attached externally after load/create; None means paths cannot be resolved.
    _syftbox_config: SyftBoxConfig | None = None

    def save(self, filepath: PathLike) -> None:
        """Serialize this model as YAML to `filepath`, creating parent dirs.

        Args:
            filepath (PathLike): Destination path; must end in `.yaml`.

        Raises:
            ValueError: If `filepath` does not have a `.yaml` suffix.
        """
        filepath = to_path(filepath)
        if filepath.suffix != ".yaml":
            raise ValueError("Model must be saved as a .yaml file.")

        # mkdir with exist_ok=True already tolerates an existing directory,
        # so no pre-check is needed (and pre-checking was race-prone).
        filepath.parent.mkdir(parents=True, exist_ok=True)

        data = self.model_dump(mode="json")
        yaml_dump = yaml.safe_dump(data, indent=2, sort_keys=False)
        filepath.write_text(yaml_dump)

    @classmethod
    def load(
        cls, filepath: PathLike, syftbox_config: SyftBoxConfig | None = None
    ) -> Self:
        """Load a model from a YAML file and attach the SyftBox config.

        Args:
            filepath (PathLike): Path to the YAML file written by `save`.
            syftbox_config (SyftBoxConfig | None): Config used for path
                resolution; stored on the instance. Defaults to None.

        Raises:
            FileNotFoundError: If `filepath` does not exist.
        """
        filepath = to_path(filepath)
        if not filepath.exists():
            raise FileNotFoundError(f"Config file not found: {filepath}")

        data = yaml.safe_load(filepath.read_text())
        res = cls.model_validate(data)
        res._syftbox_config = syftbox_config
        return res

    def __str__(self) -> str:
        return self.__display_formatter__.format_str(self)

    def __repr__(self) -> str:
        return self.__display_formatter__.format_repr(self)

    def _repr_html_(self) -> str:
        return self.__display_formatter__.format_html(self)

    def _repr_markdown_(self) -> str:
        return self.__display_formatter__.format_markdown(self)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class PrivateDatasetConfig(DatasetBase, PydanticFormatterMixin):
    """Used to store private dataset metadata, outside of the sync folder."""

    uid: UUID  # id for this dataset; matches the public Dataset.uid
    data_dir: Path  # directory holding the private data files
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class Dataset(DatasetBase, PydanticFormatterMixin):
    """Public metadata for a dataset hosted on a SyftBox datasite.

    The public (mock) part lives in the owner's synced datasite folder and
    is described by `mock_url`. Private data and metadata are resolved via a
    separate `PrivateDatasetConfig` stored outside the synced folders and are
    only accessible to the dataset owner.
    """

    # Extra columns shown when a list of datasets is rendered as a table.
    __table_extra_fields__ = [
        "name",
        "owner",
    ]

    uid: UUID = Field(default_factory=uuid4)
    created_at: datetime = Field(default_factory=_utcnow)
    updated_at: datetime = Field(default_factory=_utcnow)
    name: str
    summary: str | None = None
    tags: list[str] = []
    location: str | None = None

    mock_url: SyftBoxURL
    private_url: SyftBoxURL
    readme_url: SyftBoxURL | None = None

    # Absolute paths to uploaded files (excluding metadata files)
    mock_files_paths: list[Path] = Field(default_factory=list)
    private_files_paths: list[Path] = Field(default_factory=list)

    @property
    def owner(self) -> str:
        """Email of the owning datasite, derived from the mock URL host."""
        return self.mock_url.host

    @property
    def syftbox_config(self) -> SyftBoxConfig:
        """The attached SyftBox config.

        Raises:
            ValueError: If no config was attached to this model.
        """
        if self._syftbox_config is None:
            raise ValueError("SyftBox config is not set.")
        return self._syftbox_config

    def _url_to_path(self, url: SyftBoxURL) -> Path:
        # Resolve a syft:// URL to a path inside the local SyftBox folder.
        return url.to_local_path(
            syftbox_folder=self.syftbox_config.syftbox_folder,
        )

    @property
    def readme_path(self) -> Path | None:
        """Local path of the README file, or None if the dataset has none."""
        if self.readme_url is None:
            return None
        return self._url_to_path(self.readme_url)

    def get_readme(self) -> str | None:
        """Get the content of the README file."""
        if self.readme_path and self.readme_path.exists():
            return self.readme_path.read_text()
        return None

    @property
    def mock_dir(self) -> Path:
        """Local directory containing the public (mock) data."""
        return self._url_to_path(self.mock_url)

    @property
    def private_config_path(self) -> Path:
        """Path to the private metadata YAML file; owner-only.

        Raises:
            ValueError: If the configured user does not own this dataset.
        """
        if self.syftbox_config.email != self.owner:
            raise ValueError(
                "Cannot access private config for a dataset owned by another user."
            )
        return self._private_metadata_dir / "private_metadata.yaml"

    @cached_property
    def private_config(self) -> PrivateDatasetConfig:
        """Load (and cache) the private metadata config from disk.

        Raises:
            FileNotFoundError: If the private config file does not exist.
        """
        config_path = self.private_config_path
        if not config_path.exists():
            raise FileNotFoundError(
                f"Private dataset config not found at {config_path}"
            )

        return PrivateDatasetConfig.load(
            filepath=config_path, syftbox_config=self._syftbox_config
        )

    @property
    def private_dir(self) -> Path:
        """Directory holding the private data, as recorded in the private config."""
        private_config = self.private_config
        return private_config.data_dir

    @property
    def _private_metadata_dir(self) -> Path:
        # Private metadata lives under <syftbox_folder>/private, outside the
        # synced datasite folders; only the owner may resolve it.
        if self.syftbox_config.email != self.owner:
            raise ValueError(
                "Cannot access private data for a dataset owned by another user."
            )

        # TODO add 'private' to sb workspace
        private_datasets_dir = (
            self.syftbox_config.syftbox_folder / "private" / "syft_datasets"
        )

        return private_datasets_dir / self.name

    @property
    def mock_files(self) -> list[Path]:
        """
        Get absolute paths to all mock files uploaded during dataset.create.
        Excludes dataset.yaml and readme.md files.
        """
        return self.mock_files_paths

    @property
    def private_files(self) -> list[Path]:
        """
        Get absolute paths to all private files uploaded during dataset.create.
        Excludes private_metadata.yaml file.
        """
        return self.private_files_paths

    @property
    def files(self) -> list[Path]:
        """
        Get absolute paths to all files (both mock and private) uploaded during dataset.create.
        """
        return self.mock_files + self.private_files

    def _generate_description_html(self) -> str:
        """Build the rich HTML description shown in notebooks.

        Paths that cannot be resolved locally (e.g. when no SyftBox config is
        attached) fall back to showing the corresponding raw URL fields.
        """
        from syft_notebook_ui.pydantic_html_repr import create_html_repr

        fields_to_include = ["name", "created_at", "summary", "tags", "location"]

        paths_to_include = []
        try:
            # BUGFIX: actually touch mock_dir so a missing SyftBox config
            # raises here. Previously the try-body only appended a string
            # literal (which can never fail), making the mock_url fallback
            # in the except branch unreachable.
            _ = self.mock_dir
            paths_to_include.append("mock_dir")
        except Exception:
            fields_to_include.append("mock_url")

        try:
            private_dir = self.private_dir
            if private_dir.is_dir():
                paths_to_include.append("private_dir")
        except Exception:
            # Not the owner, or private config missing: omit private paths.
            pass

        try:
            readme_path = self.readme_path
            if readme_path and readme_path.exists():
                paths_to_include.append("readme_path")
        except Exception:
            fields_to_include.append("readme_url")

        description = create_html_repr(
            obj=self,
            fields=fields_to_include,
            display_paths=paths_to_include,
        )

        return description

    def describe(self) -> None:
        """Display the HTML description in the current IPython environment."""
        from IPython.display import HTML, display

        description = self._generate_description_html()
        display(HTML(description))

    def _repr_html_(self) -> str:
        # Override DatasetBase._repr_html_ with the richer description.
        return self._generate_description_html()
|
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import shutil
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Self
|
|
5
|
+
|
|
6
|
+
from .types import PathLike, to_path
|
|
7
|
+
from syft_notebook_ui.types import TableList
|
|
8
|
+
from typing_extensions import Literal
|
|
9
|
+
|
|
10
|
+
from syft_datasets.dataset import Dataset, PrivateDatasetConfig
|
|
11
|
+
from syft_datasets.file_utils import copy_dir_contents, copy_paths, is_empty_dir
|
|
12
|
+
|
|
13
|
+
from .url import SyftBoxURL
|
|
14
|
+
from .config import SyftBoxConfig
|
|
15
|
+
|
|
16
|
+
# Name of the datasets folder inside each datasite's public directory.
FOLDER_NAME = "syft_datasets"
# Public metadata file written into each dataset's mock directory.
METADATA_FILENAME = "dataset.yaml"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class SyftDatasetManager:
    """Create, retrieve, list, and delete SyftBox datasets.

    Public (mock) data is written into the owner's synced datasite folder;
    private data and metadata are written under the SyftBox `private`
    folder, which is not synced.
    """

    def __init__(self, syftbox_folder_path: PathLike, email: str):
        """
        Args:
            syftbox_folder_path (PathLike): Root of the local SyftBox folder.
            email (str): Email of the local SyftBox user.
        """
        self.syftbox_config = SyftBoxConfig(
            syftbox_folder=to_path(syftbox_folder_path), email=email
        )

    @classmethod
    def from_config(cls, config: SyftBoxConfig) -> Self:
        """Alternate constructor from an existing SyftBoxConfig."""
        return cls(syftbox_folder_path=config.syftbox_folder, email=config.email)

    def public_dir_for_datasite(self, datasite: str) -> Path:
        """Return a datasite's public datasets dir, creating it if needed."""
        dir = self.syftbox_config.syftbox_folder / datasite / "public" / FOLDER_NAME
        dir.mkdir(parents=True, exist_ok=True)
        return dir

    def get_mock_dataset_dir(self, dataset_name: str, datasite: str) -> Path:
        """Return the public (mock) directory for the named dataset."""
        return self.public_dir_for_datasite(datasite) / dataset_name

    def _validate_dataset_name(self, dataset_name: str) -> None:
        # Raises ValueError unless the name is a safe path segment on both
        # unix and windows (alphanumerics, underscores, hyphens only).
        if not re.match(r"^[\w-]+$", dataset_name):
            raise ValueError(
                f"Invalid dataset name '{dataset_name}'. Only alphanumeric characters, underscores, and hyphens are allowed."
            )

    def _prepare_mock_data(self, dataset: Dataset, src_path: Path) -> list[Path]:
        """Copy mock data into the dataset's public dir; return copied file paths.

        Raises:
            FileNotFoundError: If `src_path` does not exist.
            ValueError: If `src_path` contains the reserved metadata file,
                or is neither a file nor a directory.
            FileExistsError: If the mock dir already exists and is non-empty.
        """
        # Validate src data
        if not src_path.exists():
            raise FileNotFoundError(f"Could not find mock data at {src_path}")

        if (src_path / METADATA_FILENAME).exists():
            raise ValueError(
                f"Mock data at {src_path} contains reserved file {METADATA_FILENAME}. Please rename it and try again."
            )

        # Validate dir we're making on Syftbox
        if dataset.mock_dir.exists() and not is_empty_dir(dataset.mock_dir):
            raise FileExistsError(
                f"Mock dir {dataset.mock_dir} already exists and is not empty."
            )
        dataset.mock_dir.mkdir(parents=True, exist_ok=True)

        copied_files = []
        if src_path.is_dir():
            copied_files = copy_dir_contents(
                src=src_path,
                dst=dataset.mock_dir,
                exists_ok=True,
            )
        elif src_path.is_file():
            copied_files = copy_paths(
                files=[src_path],
                dst=dataset.mock_dir,
                exists_ok=True,
            )
        else:
            raise ValueError(
                f"Mock data path {src_path} must be an existing file or directory."
            )

        return copied_files

    def _prepare_private_data(
        self,
        dataset: Dataset,
        src_path: Path,
    ) -> list[Path]:
        """Copy private data into the dataset's private dir; return copied file paths.

        Raises:
            ValueError: If `src_path` is neither a file nor a directory.
        """
        dataset.private_dir.mkdir(parents=True, exist_ok=True)

        copied_files = []
        if src_path.is_dir():
            # TODO: Implementing without copying private data to `SyftBox/private``
            copied_files = copy_dir_contents(
                src=src_path,
                dst=dataset.private_dir,
                exists_ok=True,
            )
        elif src_path.is_file():
            copied_files = copy_paths(
                files=[src_path],
                dst=dataset.private_dir,
                exists_ok=True,
            )
        else:
            raise ValueError(
                f"Private data path {src_path} must be an existing file or directory."
            )

        return copied_files

    def _prepare_private_config(
        self,
        dataset: Dataset,
        private_data_dir: Path,
        location: str | None = None,
    ) -> None:
        """
        The private dataset config is used to store private metadata separately from the public dataset metadata.

        NOTE: `location` is currently unused; kept for interface stability.
        """
        if dataset._private_metadata_dir.exists() and not is_empty_dir(
            dataset._private_metadata_dir
        ):
            # BUGFIX: report the directory that was actually checked.
            # Interpolating dataset.private_dir here loaded the private
            # config, which could itself raise FileNotFoundError while we
            # were trying to report a FileExistsError.
            raise FileExistsError(
                f"Private dir {dataset._private_metadata_dir} already exists and is not empty."
            )

        private_config = PrivateDatasetConfig(
            uid=dataset.uid,
            data_dir=private_data_dir,
        )

        private_config_path = dataset.private_config_path
        private_config_path.parent.mkdir(parents=True, exist_ok=True)
        private_config.save(filepath=private_config_path)

    def _prepare_readme(self, dataset: Dataset, src_file: Path | None) -> list[Path]:
        """Copy an optional markdown README into the mock dir; return copied paths.

        Raises:
            FileNotFoundError: If `src_file` is given but is not a file.
            ValueError: If `src_file` is not a markdown (.md) file.
        """
        copied_files = []
        if src_file is not None:
            if not src_file.is_file():
                raise FileNotFoundError(f"Could not find README at {src_file}")
            if src_file.suffix.lower() != ".md":
                raise ValueError("readme file must be a markdown (.md) file.")
            copied_files = copy_paths(
                files=[src_file],
                dst=dataset.mock_dir,
                exists_ok=True,
            )
        return copied_files

    def create(
        self,
        name: str,
        mock_path: PathLike,
        private_path: PathLike,
        summary: str | None = None,
        readme_path: Path | None = None,
        location: str | None = None,
        tags: list[str] | None = None,
        # copy_private_data: bool = True, # TODO
    ) -> Dataset:
        """Create a new dataset from existing mock and private data.

        Args:
            name (str): Unique name of the dataset to create.
            mock_path (PathLike): Path to the existing mock data. This can be a file or a directory.
            private_path (PathLike): Path to the existing private data. This can be a file or a directory.
            summary (str | None, optional): Short summary of the dataset. Defaults to None.
            readme_path (Path | None, optional): Markdown README in the public dataset. Defaults to None.
            location (str | None, optional): Location identifier for the dataset, e.g. 'high-side-1234'.
                Only required for datasets that are hosted on a remote location and require manual syncing.
                Defaults to None.
            tags (list[str] | None, optional): Optional tags for the dataset. Defaults to None.

        Returns:
            Dataset: The created Dataset object.

        Raises:
            ValueError: If `name` is not a valid dataset name.
        """
        # BUGFIX: the name validator existed but was never invoked, so
        # names containing path separators could write outside the
        # datasets directory.
        self._validate_dataset_name(name)

        mock_path = to_path(mock_path)
        private_path = to_path(private_path)
        readme_path = to_path(readme_path) if readme_path else None
        tags = tags or []

        mock_dir = self.get_mock_dataset_dir(
            dataset_name=name,
            datasite=self.syftbox_config.email,
        )
        mock_url = SyftBoxURL.from_path(
            path=mock_dir,
            syftbox_folder=self.syftbox_config.syftbox_folder,
        )
        readme_url = None
        if readme_path:
            readme_url = SyftBoxURL.from_path(
                path=mock_dir / readme_path.name,
                syftbox_folder=self.syftbox_config.syftbox_folder,
            )

        # Generate private_url for the dataset
        # Private URLs use a simple path format
        private_url = SyftBoxURL(
            f"syft://private/syft_datasets/{name}",
        )

        dataset = Dataset(
            name=name,
            mock_url=mock_url,
            private_url=private_url,
            readme_url=readme_url,
            summary=summary,
            location=location,
            tags=tags,
        )
        dataset._syftbox_config = self.syftbox_config

        # Prepare mock data and collect file paths
        mock_data_files = self._prepare_mock_data(
            dataset=dataset,
            src_path=mock_path,
        )

        # Prepare readme and collect file paths
        readme_files = self._prepare_readme(
            dataset=dataset,
            src_file=readme_path,
        )

        # TODO enable adding private data without copying to SyftBox
        # e.g. private_data_dir = dataset._private_metadata_dir if copy_private_data else private_path
        self._prepare_private_config(
            dataset=dataset,
            private_data_dir=dataset._private_metadata_dir,
        )

        # Prepare private data and collect file paths
        private_data_files = self._prepare_private_data(
            dataset=dataset,
            src_path=private_path,
        )

        # Store file paths, excluding metadata files
        dataset_yaml_path = (mock_dir / METADATA_FILENAME).absolute()
        private_metadata_path = dataset.private_config_path.absolute()

        # Mock files exclude dataset.yaml and readme.md
        dataset.mock_files_paths = [
            f
            for f in mock_data_files
            if f != dataset_yaml_path and f not in readme_files
        ]

        # Private files exclude private_metadata.yaml
        dataset.private_files_paths = [
            f for f in private_data_files if f != private_metadata_path
        ]

        # Save dataset metadata
        dataset.save(filepath=dataset_yaml_path)
        return dataset

    def _load_dataset_from_dir(self, dataset_dir: Path) -> Dataset:
        """Load a Dataset from the dataset.yaml inside `dataset_dir`.

        Raises:
            FileNotFoundError: If the metadata file is missing.
        """
        metadata_path = dataset_dir / METADATA_FILENAME
        if not metadata_path.exists():
            raise FileNotFoundError(f"Dataset metadata not found at {metadata_path}")

        return Dataset.load(
            filepath=metadata_path,
            syftbox_config=self.syftbox_config,
        )

    def get(self, name: str, datasite: str | None = None) -> Dataset:
        """Get a dataset by name; defaults to the local user's datasite.

        Raises:
            FileNotFoundError: If the dataset does not exist.
        """
        datasite = datasite or self.syftbox_config.email
        mock_dir = self.get_mock_dataset_dir(
            dataset_name=name,
            datasite=datasite,
        )

        if not mock_dir.exists():
            raise FileNotFoundError(f"Dataset {name} not found in {mock_dir}")
        return self._load_dataset_from_dir(mock_dir)

    def __getitem__(self, key: str) -> Dataset:
        """Dict-style access to the local user's datasets by name."""
        return self.get(name=key)

    def get_all(
        self,
        datasite: str | None = None,
        limit: int | None = None,
        offset: int | None = None,
        order_by: str | None = None,
        sort_order: Literal["asc", "desc"] = "asc",
    ) -> list[Dataset]:
        """List datasets, optionally filtered to one datasite, sorted and paginated.

        Datasets whose metadata fails to load are silently skipped.
        """
        all_datasets = []

        if datasite:
            datasites_to_check = [datasite]
        else:
            syftbox_folder = self.syftbox_config.syftbox_folder
            # All directories with "@" in the name are peer/owner email directories
            datasites_to_check = [
                d.name for d in syftbox_folder.iterdir() if d.is_dir() and "@" in d.name
            ]

        for datasite in datasites_to_check:
            public_datasets_dir = self.public_dir_for_datasite(datasite)
            if not public_datasets_dir.exists():
                continue
            for dataset_dir in public_datasets_dir.iterdir():
                if dataset_dir.is_dir():
                    try:
                        dataset = self._load_dataset_from_dir(dataset_dir)
                        all_datasets.append(dataset)
                    except Exception:
                        continue

        if order_by is not None:
            all_datasets.sort(
                key=lambda d: getattr(d, order_by),
                reverse=(sort_order.lower() == "desc"),
            )

        # Apply offset before limit so pagination composes correctly.
        if offset is not None:
            all_datasets = all_datasets[offset:]
        if limit is not None:
            all_datasets = all_datasets[:limit]

        # TableList renders as a table in notebooks; behaves like a list.
        return TableList(all_datasets)

    def delete(
        self,
        name: str,
        datasite: str | None = None,
        require_confirmation: bool = True,
    ) -> None:
        """Delete a dataset's public dir and private metadata dir.

        Args:
            name (str): Name of the dataset to delete.
            datasite (str | None, optional): Must be the local user's own
                datasite. Defaults to None (the local user).
            require_confirmation (bool, optional): Prompt before deleting.

        Raises:
            ValueError: If `datasite` is not the local user's datasite.
            FileNotFoundError: If the dataset does not exist.
        """
        datasite = datasite or self.syftbox_config.email

        if datasite != self.syftbox_config.email:
            # NOTE this check is easily bypassed, but bypassing does not have any effect.
            # When bypassed, the dataset will be restored because the user only has
            # read access to someone else's datasite.
            raise ValueError(
                "Cannot delete datasets from a datasite that is not your own."
            )

        try:
            dataset = self.get(
                name=name,
                datasite=datasite,
            )
        except FileNotFoundError:
            raise FileNotFoundError(f"Dataset {name} not found in datasite {datasite}")

        if require_confirmation:
            msg = (
                "Deleting this dataset will remove the following folders:\n"
                f"Mock data: {dataset.mock_dir}\n"
                f"Private metadata: {dataset._private_metadata_dir}\n"
            )
            if (
                dataset._private_metadata_dir.exists()
                and dataset.private_dir.resolve().absolute()
                == dataset._private_metadata_dir.resolve().absolute()
            ):
                msg += (
                    "WARNING: this will also delete the private data from your system\n"
                )
            else:
                msg += "Private data will not be deleted from your system, it is not managed by SyftBox.\n"

            msg += "Are you sure you want to delete these folders? (yes/no): "
            confirmation = input(msg).strip().lower()
            if confirmation != "yes":
                print("Dataset deletion cancelled.")
                return

        # Delete the dataset directories
        if dataset.mock_dir.exists():
            shutil.rmtree(dataset.mock_dir)
        if dataset._private_metadata_dir.exists():
            shutil.rmtree(dataset._private_metadata_dir)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import shutil
from pathlib import Path
from typing import Iterable, Iterator
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def copy_dir_contents(src: Path, dst: Path, exists_ok: bool = False) -> list[Path]:
    """Copy every entry of directory `src` into `dst`.

    Returns:
        list[Path]: Absolute destination paths of all copied files.

    Raises:
        ValueError: If `src` is not a directory.
    """
    if src.is_dir():
        return copy_paths(src.iterdir(), dst, exists_ok)
    raise ValueError(f"Source path {src} is not a directory.")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def copy_paths(files: Iterable[Path], dst: Path, exists_ok: bool = False) -> list[Path]:
    """
    Copy a list of files to a destination directory.
    If `dst` does not exist, it will be created.

    Args:
        files: Files and/or directories to copy into `dst`. Annotated as
            Iterable (not Iterator) since callers pass plain lists.
        dst: Destination directory.
        exists_ok: Whether existing destination paths may be overwritten.

    Returns:
        list[Path]: List of absolute paths to all copied files (destination paths).

    Raises:
        FileExistsError: If a destination path exists and `exists_ok` is False.
    """
    # exist_ok=True already tolerates an existing directory; no pre-check needed.
    dst.mkdir(parents=True, exist_ok=True)

    copied_files = []
    for file in files:
        dst_path = dst / file.name
        if dst_path.exists() and not exists_ok:
            raise FileExistsError(f"Destination path {dst_path} already exists.")
        if file.is_file():
            shutil.copy2(file, dst_path)
            copied_files.append(dst_path.absolute())
        elif file.is_dir():
            shutil.copytree(file, dst_path, dirs_exist_ok=exists_ok)
            # Record every regular file in the copied directory tree.
            copied_files.extend(
                p.absolute() for p in dst_path.rglob("*") if p.is_file()
            )

    return copied_files
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def is_empty_dir(path: Path) -> bool:
    """Return True only if `path` is a directory with no entries.

    Non-directories (including missing paths) yield False.
    """
    if not path.is_dir():
        return False
    # An empty directory yields nothing; returning on the first entry
    # avoids listing the whole directory.
    for _entry in path.iterdir():
        return False
    return True
|
syft_datasets/types.py
ADDED
syft_datasets/url.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, Dict, Union
|
|
4
|
+
from urllib.parse import urlencode, urlparse
|
|
5
|
+
|
|
6
|
+
from pydantic import GetCoreSchemaHandler, GetJsonSchemaHandler, ValidationInfo
|
|
7
|
+
from pydantic.json_schema import JsonSchemaValue
|
|
8
|
+
from pydantic_core import core_schema
|
|
9
|
+
from typing_extensions import Self
|
|
10
|
+
|
|
11
|
+
from .types import PathLike, to_path
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SyftBoxURL(str):
    """A syft:// URL string with helpers to resolve it against a local SyftBox folder.

    Subclasses str so instances serialize transparently; integrates with
    pydantic v2 via __get_pydantic_core_schema__ below.
    """

    def __new__(cls, url: str):
        instance = super().__new__(cls, url)
        if not cls.is_valid(url):
            raise ValueError(f"Invalid SyftBoxURL: {url}")
        # Cache the parsed form for the property accessors below.
        instance.parsed = urlparse(url)
        return instance

    @classmethod
    def is_valid(cls, url: str) -> bool:
        """Validates the given URL matches the syft:// protocol.

        Supports two formats:
        1. Email-based: syft://user@domain.com/path (for public data)
        2. Simple path: syft://path (for private/local data)
        """
        # Pattern for email-based URLs (e.g., syft://user@domain.com/path)
        email_pattern = (
            r"^syft://([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)(/.*)?$"
        )
        # Pattern for simple path URLs (e.g., syft://private/path)
        simple_pattern = r"^syft://([a-zA-Z0-9_.-]+)(/.*)?$"

        return bool(re.match(email_pattern, url)) or bool(re.match(simple_pattern, url))

    @property
    def query(self) -> Dict[str, str]:
        """Returns the query parameters as a dictionary."""
        if not self.parsed.query:
            return {}

        # NOTE(review): manual split — repeated keys keep only the last
        # value, and values are not URL-decoded.
        return dict(
            param.split("=", 1)
            for param in self.parsed.query.split("&")
            if "=" in param
        )

    @property
    def protocol(self) -> str:
        """Returns the protocol (syft://)."""
        return self.parsed.scheme + "://"

    @property
    def host(self) -> str:
        """Returns the host, which is the email part."""
        return self.parsed.netloc

    @property
    def path(self) -> str:
        """Returns the path component after the email."""
        return self.parsed.path

    def to_local_path(self, syftbox_folder: PathLike) -> Path:
        """
        Converts the SyftBoxURL to a local file system path.

        Args:
            syftbox_folder (Path): Base SyftBox directory.

        Returns:
            Path: Local file system path.
        """
        # Remove the protocol and prepend the syftbox_folder
        local_path = to_path(syftbox_folder) / self.host / self.path.lstrip("/")
        return local_path.resolve()

    def as_http_params(self) -> Dict[str, str]:
        # Query parameters describing this URL for an HTTP GET request.
        return {
            "method": "get",
            "datasite": self.host,
            "path": self.path,
        }

    def to_http_get(self, rpc_url: str) -> str:
        """Build an HTTP GET URL for this syft URL via the given RPC endpoint."""
        # Strip any scheme from rpc_url; the result always uses http://.
        rpc_url = rpc_url.split("//")[-1]
        params = self.as_http_params()
        url_params = urlencode(params)
        http_url = f"http://{rpc_url}?{url_params}"
        return http_url

    @classmethod
    def from_path(cls, path: PathLike, syftbox_folder: PathLike) -> Self:
        """Create a SyftBoxURL from a local path inside `syftbox_folder`.

        Raises:
            ValueError: If `path` is not relative to `syftbox_folder`.
        """
        rel_path = to_path(path).relative_to(to_path(syftbox_folder))
        # convert to posix path to make it work on Windows OS
        rel_path = rel_path.as_posix()
        return cls(f"syft://{rel_path}")

    @classmethod
    def validate(
        cls, value: Union["SyftBoxURL", str], info: ValidationInfo
    ) -> "SyftBoxURL":
        # Pydantic validator: accept exactly str or SyftBoxURL, reject
        # other types (deliberately uses type(), not isinstance()).
        if type(value) not in (str, cls):
            raise ValueError(
                f"Invalid type for url: {type(value)}. Expected str or SyftBoxURL."
            )
        value = str(value)
        if not cls.is_valid(value):
            raise ValueError(f"Invalid SyftBoxURL: {value}")
        return cls(value)

    @classmethod
    def __get_pydantic_core_schema__(
        cls,
        source_type: Any,
        handler: GetCoreSchemaHandler,
    ) -> core_schema.CoreSchema:
        """Pydantic V2 core schema for custom type validation."""
        return core_schema.with_info_after_validator_function(
            cls.validate,
            handler(str),
        )

    @classmethod
    def __get_pydantic_json_schema__(
        cls, schema_or_field: Any, schema_handler: GetJsonSchemaHandler
    ) -> JsonSchemaValue:
        """Define the JSON schema representation for Pydantic models."""
        return {
            "type": "string",
            "format": "uri",
            "description": "A SyftBox URL",
        }
|