everycure-datasets 0.2.33__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *.pyc
+
+ # Virtual environments
+ .venv/
+ venv/
+
+ # uv
+ .uv/
+
+ # Build artifacts
+ dist/
+ build/
+ *.egg-info/
+
+ # IDE (keep .vscode/settings.json for schema validation)
+ .vscode/*
+ !.vscode/settings.json
+ .idea/
+
+ # Environment variables
+ .env
+ .env.local
+
+ # OS
+ .DS_Store
@@ -0,0 +1,11 @@
+ Metadata-Version: 2.4
+ Name: everycure-datasets
+ Version: 0.2.33
+ Requires-Python: >=3.11
+ Requires-Dist: gitpython>=3.1.45
+ Requires-Dist: pydantic>=2.0.0
+ Requires-Dist: pyyaml>=6.0.0
+ Provides-Extra: kedro
+ Requires-Dist: kedro-datasets[pandas,spark]>=6.0.0; extra == 'kedro'
+ Requires-Dist: kedro>=0.19.1; extra == 'kedro'
+ Requires-Dist: semantic-version>=2.10.0; extra == 'kedro'
@@ -0,0 +1,52 @@
+ [project]
+ name = "everycure-datasets"
+ version = "0.2.33"
+ requires-python = ">=3.11"
+ scripts = {datasets = "everycure.datasets.cli:app"}
+ dependencies = [
+     "gitpython>=3.1.45",
+     "pydantic>=2.0.0",
+     "pyyaml>=6.0.0",
+ ]
+ [project.optional-dependencies]
+ kedro = [
+     "kedro>=0.19.1",
+     "kedro-datasets[pandas,spark]>=6.0.0",
+     "semantic-version>=2.10.0",
+ ]
+
+ [tool.hatch.build]
+ include = ["src/**"]
+
+ [tool.hatch.build.targets.sdist]
+ exclude = [
+     "datasets/**",
+ ]
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/everycure"]
+
+ # [tool.uv]
+ # keyring-provider = "subprocess"
+ #
+ # [[tool.uv.index]]
+ # name = "pypi"
+ # url = "https://pypi.org/simple"
+ #
+ # [[tool.uv.index]]
+ # name = "everycure"
+ # url = "https://oauth2accesstoken@us-central1-python.pkg.dev/core-422020/everycure/simple/"
+ # publish-url = "https://us-central1-python.pkg.dev/core-422020/everycure/"
+ # default = false
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [dependency-groups]
+ dev = [
+     "keyrings-google-artifactregistry-auth>=1.1.2",
+     "pytest>=9.0.1",
+     "pre-commit>=4.3.0",
+     "typer>=0.12.0",
+ ]
@@ -0,0 +1,3 @@
+ """Every Cure namespace package."""
+
+ __path__ = __import__("pkgutil").extend_path(__path__, __name__)
@@ -0,0 +1,6 @@
+ """Dataset Registry - Centralized dataset metadata management."""
+
+ from everycure.datasets.models.v1 import DatasetMetadataV1
+ from everycure.datasets.models.v2 import DatasetMetadataV2
+
+ __all__ = ["DatasetMetadataV1", "DatasetMetadataV2"]
@@ -0,0 +1,64 @@
+ """CLI for datasets registry operations."""
+
+ from pathlib import Path
+
+ import typer
+
+ from everycure.datasets.generate_schema import generate_all_schemas
+ from everycure.datasets.validate import validate_datasets
+
+ app = typer.Typer(
+     name="datasets",
+     help="Datasets registry management CLI",
+     add_completion=False,
+     no_args_is_help=True,
+ )
+
+ schema_app = typer.Typer(
+     name="schema",
+     help="Schema management commands",
+     add_completion=False,
+ )
+
+ app.add_typer(schema_app)
+
+
+ @app.callback(invoke_without_command=True)
+ def main(ctx: typer.Context) -> None:
+     """Datasets registry management CLI."""
+     if ctx.invoked_subcommand is None:
+         typer.echo(ctx.get_help())
+
+
+ @app.command()
+ def validate(
+     datasets_dir: Path | None = typer.Option(
+         None,
+         "--datasets-dir",
+         "-d",
+         help="Path to the datasets directory (default: auto-detect)",
+     ),
+ ) -> None:
+     """
+     Validate dataset YAML files and directory structure.
+
+     Checks:
+     - Dataset names are snake_case
+     - Version directories follow semantic versioning (MAJOR.MINOR.PATCH)
+     - Dataset metadata files validate against the v1 schema
+     """
+     # validate_datasets returns 0 on success and 1 on failure; surface it as the exit code.
+     raise typer.Exit(code=validate_datasets(datasets_dir))
+
+
+ @schema_app.command()
+ def generate() -> None:
+     """
+     Generate JSON schema(s) from the Pydantic models.
+     """
+     repo_root = Path.cwd()
+     schema_dir = repo_root / ".schema"
+     generate_all_schemas(schema_dir)
+     typer.echo(f"✓ Generated all JSON schemas in {schema_dir}")
+
+
+ if __name__ == "__main__":
+     app()
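
A minimal sketch of exercising this Typer app in-process with Typer's test runner (a documented `typer.testing` utility); the `datasets` directory path passed to `validate` below is purely illustrative.

```python
from typer.testing import CliRunner

from everycure.datasets.cli import app

runner = CliRunner()

# Print the top-level help (the callback echoes it when no subcommand is given).
result = runner.invoke(app, [])
print(result.output)

# Validate a hypothetical datasets directory; exit_code mirrors validate_datasets().
result = runner.invoke(app, ["validate", "--datasets-dir", "datasets"])
print(result.exit_code)
```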
@@ -0,0 +1,62 @@
+ """Generate JSON schema from Pydantic models."""
+
+ import json
+ from pathlib import Path
+
+ from everycure.datasets.models.v1 import DatasetMetadataV1
+ from everycure.datasets.models.v2 import DatasetMetadataV2
+
+
+ def generate_schema(model_class, output_path: Path, schema_id: str) -> None:
+     """
+     Generate JSON schema from a Pydantic model.
+
+     Args:
+         model_class: The Pydantic model class to generate schema from
+         output_path: Path where the JSON schema should be written
+         schema_id: The $id for the schema (FQDN)
+     """
+     # Get the JSON schema from the Pydantic model
+     schema = model_class.model_json_schema(
+         mode="serialization",
+         by_alias=True,
+     )
+
+     # Update the $id to use the provided FQDN
+     schema["$id"] = schema_id
+
+     # Remove fields with defaults from required list
+     # but keep them in properties so they're documented
+     fields_with_defaults = ["schema_version", "status", "created_at"]
+     if "required" in schema:
+         for field in fields_with_defaults:
+             if field in schema["required"]:
+                 schema["required"].remove(field)
+
+     # Write to file with proper formatting
+     with open(output_path, "w") as f:
+         json.dump(schema, f, indent=2)
+         f.write("\n")  # Add trailing newline
+
+     print(f"Generated JSON schema at {output_path}")
+
+
+ def generate_all_schemas(schema_dir: Path) -> None:
+     """Generate all schema versions."""
+     schema_dir.mkdir(parents=True, exist_ok=True)
+
+     # Generate v1 schema
+     v1_path = schema_dir / "dataset.v1.schema.json"
+     generate_schema(
+         DatasetMetadataV1,
+         v1_path,
+         "https://everycure.org/schemas/dataset.v1.schema.json",
+     )
+
+     # Generate v2 schema
+     v2_path = schema_dir / "dataset.v2.schema.json"
+     generate_schema(
+         DatasetMetadataV2,
+         v2_path,
+         "https://everycure.org/schemas/dataset.v2.schema.json",
+     )
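
A short sketch of what calling `generate_all_schemas` looks like, writing into a throwaway directory instead of the repository's `.schema/` folder:

```python
import json
import tempfile
from pathlib import Path

from everycure.datasets.generate_schema import generate_all_schemas

# Write both schema files into a temporary directory and inspect the v1 $id.
with tempfile.TemporaryDirectory() as tmp:
    schema_dir = Path(tmp)
    generate_all_schemas(schema_dir)
    v1 = json.loads((schema_dir / "dataset.v1.schema.json").read_text())
    print(v1["$id"])  # https://everycure.org/schemas/dataset.v1.schema.json
```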
@@ -0,0 +1,5 @@
+ from .catalog_dataset import DataCatalogDataset
+
+ __all__ = [
+     "DataCatalogDataset",
+ ]
@@ -0,0 +1,385 @@
+ import logging
+ from pathlib import Path
+ from typing import Any
+
+ import typer
+ import yaml
+ from kedro.io.core import AbstractDataset, DatasetError
+ from kedro_datasets import pandas, spark
+ from semantic_version import NpmSpec, Version
+
+ from everycure.datasets.kedro.storage import GitStorageService, is_uri
+ from everycure.datasets.models.v1 import (
+     ColumnSchema,
+     DatasetMetadataV1,
+     DatasetSchema,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ app = typer.Typer()
+
+
+ class DataCatalogDataset(AbstractDataset):
+     """Custom Kedro dataset that loads and saves versioned datasets from the registry.
+
+     Examples:
+
+     ```yaml
+     catalog_diseases:
+       type: everycure.datasets.kedro.catalog_dataset.DataCatalogDataset
+       dataset: disease_list
+       engine: spark
+
+       load_args:
+         version: ~0.2.0
+
+       # Arguments for the underlying Kedro engine (e.g., spark.SparkDataset)
+       save_args:
+         mode: overwrite
+
+         # Arguments for the DatasetMetadataV1 model
+         catalog_args:
+           description: "Dataset description"
+           message: "Optional message"
+           owner:
+             name: "John Doe"
+             email: "john@example.com"
+           location:
+             uri: "gs://path/to/{version}/data.parquet"
+             format: "parquet"
+           # ...etc
+     ```
+     """
+
+     def __init__(
+         self,
+         *,
+         dataset: str | dict[str, Any],
+         engine: str,
+         save_args: dict[str, Any] | None = None,
+         load_args: dict[str, Any] | None = None,
+         **kwargs,
+     ):
+         self._dataset = dataset
+         self._engine = engine
+         self._storage_service = GitStorageService.get_instance()
+         self._save_args = save_args or {}
+         self._catalog_args = self._save_args.pop("catalog_args", None)
+         self._load_args = load_args or {}
+
+     @property
+     def filepath(self) -> str:
+         semver_pattern = self._load_args.get("version")
+         version, _ = self.best_match(self.versions, semver_pattern)
+         content = self._storage_service.get(
+             Path(f"datasets/{self._dataset}/{version}/dataset.yaml")
+         )
+         if content is None:
+             raise DatasetError(
+                 f"Dataset metadata file not found for '{self._dataset}' version '{version}'"
+             )
+         dataset = DatasetMetadataV1.model_validate(yaml.safe_load(content))
+
+         return str(dataset.location.uri)
+
+     def load(self) -> Any:
+         """Dataset loading.
+
+         Loads the best matching version of the requested dataset
+         using the configured semver pattern.
+         """
+         # Make a copy to avoid modifying the original dict
+         engine_load_args = self._load_args.copy()
+         semver_pattern = engine_load_args.pop("version", None)
+         assert_latest = engine_load_args.pop("assert_latest", False)
+
+         version, is_latest = self.best_match(self.versions, semver_pattern)
+
+         if version is None:
+             raise DatasetError(
+                 f"No version matched for dataset '{self._dataset}', available versions: {','.join(self.versions)}"
+             )
+
+         if assert_latest and not is_latest:
+             raise DatasetError(
+                 f"Newer version for dataset '{self._dataset}' available!"
+             )
+
+         logger.info(f"Using version {version} for dataset '{self._dataset}'")
+         try:
+             content = self._storage_service.get(
+                 Path(f"datasets/{self._dataset}/{version}/dataset.yaml")
+             )
+             if content is None:
+                 raise DatasetError(
+                     f"Dataset metadata file not found for '{self._dataset}' version '{version}'"
+                 )
+             dataset = DatasetMetadataV1.model_validate(yaml.safe_load(content))
+
+             return self.get_dataset(
+                 dataset.location.format.value,
+                 str(dataset.location.uri),
+                 engine_load_args,
+                 {},  # save_args are not used in load
+             ).load()
+         except Exception as e:
+             raise DatasetError(
+                 f"Failed to load version for dataset '{self._dataset}': {e}"
+             ) from e
+
+     @staticmethod
+     def _uri_to_path(uri: str) -> str:
+         """Convert file:// URLs to file paths for local file access.
+
+         kedro_datasets expects file paths, not file:// URLs.
+         Other URI schemes (http, https, gs, s3, etc.) are passed through unchanged.
+         """
+         if uri.startswith("file://"):
+             from urllib.parse import unquote, urlparse
+
+             parsed = urlparse(uri)
+             return unquote(parsed.path)
+         return uri
+
+     def get_dataset(
+         self,
+         format_: str,
+         file_path: str,
+         load_args: dict[str, Any],
+         save_args: dict[str, Any],
+     ) -> AbstractDataset:
+         # Convert file:// URLs to paths for local file access
+         file_path = self._uri_to_path(file_path)
+
+         if self._engine == "spark":
+             if format_ == "tsv":
+                 return spark.SparkDataset(
+                     filepath=file_path,
+                     file_format="csv",
+                     load_args={
+                         **load_args,
+                         "sep": "\t",
+                         "header": True,
+                         "index": False,
+                     },
+                     save_args=save_args,
+                 )
+
+             return spark.SparkDataset(
+                 filepath=file_path,
+                 file_format=format_,
+                 load_args={**load_args, "header": True, "index": False},
+                 save_args=save_args,
+             )
+
+         if self._engine == "pandas":
+             if format_ == "csv":
+                 return pandas.CSVDataset(
+                     filepath=file_path,
+                     load_args=load_args,
+                     save_args=save_args,
+                 )
+
+             if format_ == "parquet":
+                 return pandas.ParquetDataset(
+                     filepath=file_path, load_args=load_args, save_args=save_args
+                 )
+
+         raise ValueError(f"Unsupported engine: {self._engine} and format {format_}")
+
+     def get_schema(self, data) -> DatasetSchema:
+         """Get dataset schema as DatasetSchema model."""
+         columns = None
+         row_count = None
+
+         if self._engine == "pandas":
+             type_map = {
+                 "int64": "int",
+                 "Int64": "int",
+                 "float64": "float",
+                 "object": "string",
+                 "bool": "bool",
+                 "datetime64[ns]": "datetime",
+             }
+             columns = [
+                 ColumnSchema(name=col, type=type_map.get(str(dtype), "unknown"))
+                 for col, dtype in data.dtypes.items()
+             ]
+             row_count = len(data)
+
+         elif self._engine == "spark":
+             spark_map = {
+                 "IntegerType()": "int",
+                 "LongType()": "int",
+                 "DoubleType()": "float",
+                 "FloatType()": "float",
+                 "StringType()": "string",
+                 "BooleanType()": "bool",
+                 "TimestampType()": "datetime",
+                 "DateType()": "date",
+             }
+             columns = [
+                 ColumnSchema(
+                     name=field.name,
+                     type=spark_map.get(str(field.dataType), "unknown"),
+                 )
+                 for field in data.schema.fields
+             ]
+             row_count = data.count()
+
+         else:
+             raise ValueError(f"Unsupported engine: {self._engine}")
+
+         return DatasetSchema(columns=columns, row_count=row_count)
+
+     def save(self, data: Any) -> None:
+         """Dataset saving.
+
+         The dataset is saved under the next relevant semantic version,
+         derived from the catalog arguments or an interactive prompt.
+         """
+         if not self._catalog_args:
+             raise DatasetError("Required 'catalog_args' missing in save_args.")
+
+         # 1. Calculate dynamic properties from _catalog_args
+         save_version = self._catalog_args.get("version")
+         message = self._catalog_args.get("message")
+         if not save_version:
+             save_version = self.prompt_version_bump()
+             if not save_version:  # User cancelled prompt
+                 logger.warning("Save cancelled by user.")
+                 return
+
+         if not message:
+             message = (
+                 typer.prompt("Optional message", default="", show_default=False) or None
+             )
+
+         # 2. Prepare the dictionary of dynamic/runtime arguments
+         dynamic_args = {
+             "name": self._dataset,
+             "version": save_version,
+             # "schema" is an alias in the model, hence writing to schema, not dataset_schema
+             "schema": self.get_schema(data).model_dump(exclude_none=True),
+         }
+         if message:
+             dynamic_args["message"] = message
+
+         metadata_dict = {**self._catalog_args, **dynamic_args}
+
+         # 3. Post-merge processing: handle the version placeholder in the URI
+         location = metadata_dict.get("location", {})
+         if "uri" in location:
+             # Format the template string first, then convert to URI if needed
+             formatted_filesystem_path_str = location["uri"].format(
+                 version=metadata_dict["version"]
+             )
+             if not is_uri(formatted_filesystem_path_str):
+                 path_obj = Path(formatted_filesystem_path_str)
+                 if not path_obj.is_absolute():
+                     path_obj = path_obj.resolve()
+                 location["uri"] = path_obj.as_uri()
+             else:
+                 location["uri"] = formatted_filesystem_path_str
+         else:
+             raise DatasetError("Required 'location.uri' missing in catalog_args.")
+
+         # 4. Validate the final dictionary and create the Pydantic object
+         try:
+             dataset_metadata = DatasetMetadataV1.model_validate(metadata_dict)
+         except Exception as e:  # Catches Pydantic's ValidationError
+             raise DatasetError(f"Invalid dataset metadata configuration: {e}") from e
+
+         # 5. Save the dataset file using the correct engine and self._save_args
+         self.get_dataset(
+             dataset_metadata.location.format.value,
+             str(dataset_metadata.location.uri),
+             {},  # load_args not used in save path
+             self._save_args,  # Pass engine-specific save_args directly
+         ).save(data)
+
+         # 6. Save the metadata YAML file
+         self._storage_service.save(
+             f"datasets/{dataset_metadata.name}/{dataset_metadata.version}/dataset.yaml",
+             yaml.dump(dataset_metadata.model_dump(mode="json", by_alias=True)),
+             commit_msg=f"🤖 Create version {dataset_metadata.version} for '{dataset_metadata.name}'",
+         )
+
+     @staticmethod
+     def best_match(versions: list[str], pattern: str | None) -> tuple[str | None, bool]:
+         """Find the best semver match.
+
+         Args:
+             versions: List of available versions
+             pattern: semver pattern to match (npm-style); None matches any version
+         Returns:
+             Best match, and a boolean indicating whether it is the latest version.
+         """
+         if not pattern:
+             spec = NpmSpec("*")
+         else:
+             spec = NpmSpec(pattern)
+         parsed_versions = [Version(v) for v in versions]
+
+         # Find versions that satisfy the pattern
+         matching = [v for v in parsed_versions if v in spec]
+         if not matching:
+             return None, False
+
+         best_version = max(matching)
+         latest_version = max(parsed_versions)
+         is_latest = best_version == latest_version
+
+         return str(best_version), is_latest
+
+     def prompt_version_bump(self) -> str | None:
+         """Prompt the user for version bump information."""
+         parsed = [Version(v) for v in self.versions]
+         current_version = max([*parsed, Version("0.0.0")])
+         typer.echo(f"Saving dataset: '{self._dataset}'")
+         typer.echo(f"Current version: '{current_version}'")
+
+         allowed = ["major", "minor", "patch"]
+         bump_type = typer.prompt("Which part to bump? (major/minor/patch)").lower()
+         while bump_type not in allowed:
+             bump_type = typer.prompt(
+                 "Invalid choice. Please choose major, minor, or patch"
+             ).lower()
+
+         new_version = {
+             "major": Version(major=current_version.major + 1, minor=0, patch=0),
+             "minor": Version(
+                 major=current_version.major, minor=current_version.minor + 1, patch=0
+             ),
+             "patch": Version(
+                 major=current_version.major,
+                 minor=current_version.minor,
+                 patch=current_version.patch + 1,
+             ),
+         }[bump_type]
+
+         if not typer.confirm(
+             f"Do you want to save dataset '{self._dataset}' with version '{new_version}'?"
+         ):
+             typer.echo("Save cancelled.")
+             return None
+
+         return str(new_version)
+
+     @property
+     def versions(self) -> list[str]:
+         """Get available versions for the dataset."""
+         paths = self._storage_service.ls(f"datasets/{self._dataset}/*")
+         return [
+             str(path.relative_to(Path(f"datasets/{self._dataset}"))) for path in paths
+         ]
+
+     def _describe(self) -> dict[str, Any]:
+         """Describe the dataset by returning its metadata."""
+         return {
+             "dataset": self._dataset,
+             "engine": self._engine,
+             "versions": self.versions,
+         }
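
A small sketch of the version-resolution behaviour of `best_match`, which is a static method and needs no storage service; importing the class assumes the `kedro` extra is installed. The version list below is illustrative.

```python
from everycure.datasets.kedro.catalog_dataset import DataCatalogDataset

versions = ["0.1.0", "0.2.0", "0.2.3", "0.3.0"]

# "~0.2.0" (npm-style) matches >=0.2.0 <0.3.0, so 0.2.3 is chosen but is not the latest.
print(DataCatalogDataset.best_match(versions, "~0.2.0"))  # ('0.2.3', False)

# No pattern means "*": the newest available version is selected.
print(DataCatalogDataset.best_match(versions, None))      # ('0.3.0', True)
```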
@@ -0,0 +1,69 @@
+ import logging
+ import shutil
+ from pathlib import Path
+
+ from git import InvalidGitRepositoryError, Repo
+ from kedro.framework.hooks import hook_impl
+
+ from everycure.datasets.kedro.storage import GitStorageService
+
+ logger = logging.getLogger(__name__)
+
+
+ class GitStorageHook:
+     """Kedro hook to clone or update a Git repository before running a pipeline."""
+
+     def __init__(
+         self,
+         repo_url: str,
+         target_dir: str = "data/external/data-catalog",
+         branch: str = "main",
+         force: bool = False,
+         pull: bool = True,
+     ):
+         self.repo_url = repo_url
+         self.target_dir = Path(target_dir)
+         self.branch = branch
+         self.force = force
+         self.pull = pull
+
+     @hook_impl
+     def after_context_created(self, context):
+         """Clone or update repo before the pipeline runs."""
+
+         if self.force and self.target_dir.exists():
+             shutil.rmtree(self.target_dir)
+
+         if not self.target_dir.exists():
+             repo = Repo.clone_from(self.repo_url, self.target_dir, branch=self.branch)
+
+         else:
+             try:
+                 repo = Repo(self.target_dir)
+                 logger.info(f"📁 Existing repo found: {repo.working_dir}")
+
+                 if self.pull:
+                     logger.info(f"⬇️ Pulling latest changes from {self.branch}")
+                     origin = repo.remotes.origin
+                     origin.fetch()
+                     origin.pull(self.branch)
+                     logger.info("✅ Repository updated.")
+                 else:
+                     logger.info("🔸 Skipping pull, using existing contents.")
+
+                 # Ensure branch consistency
+                 repo.git.checkout(self.branch)
+             except InvalidGitRepositoryError:
+                 logger.info(
+                     f"⚠️ {self.target_dir} exists but is not a Git repo. Re-cloning."
+                 )
+                 shutil.rmtree(self.target_dir)
+                 Repo.clone_from(self.repo_url, self.target_dir, branch=self.branch)
+
+         self.git_service = GitStorageService(
+             root_path=str(self.target_dir),
+             user="Kedro",
+             email="kedro@everycure.org",
+         )
+
+         logger.info(f"🔁 GitStorageService ready at {self.target_dir}")
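
A sketch of registering this hook in a Kedro project's `settings.py` (Kedro's standard `HOOKS` tuple); the repository URL and target directory below are hypothetical placeholders.

```python
# src/<your_kedro_package>/settings.py
from everycure.datasets.kedro.hooks import GitStorageHook

# Illustrative values only: point repo_url at your data-catalog repository.
HOOKS = (
    GitStorageHook(
        repo_url="https://github.com/example-org/data-catalog.git",
        target_dir="data/external/data-catalog",
        branch="main",
    ),
)
```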
@@ -0,0 +1,233 @@
+ import abc
+ import glob
+ import logging
+ import os
+ from pathlib import Path
+ from urllib.parse import urlparse
+
+ from git import InvalidGitRepositoryError, NoSuchPathError, Repo
+
+ logger = logging.getLogger(__name__)
+
+
+ def is_uri(path: str) -> bool:
+     """Check if a string is a valid URI (has a scheme)."""
+     parsed = urlparse(path)
+     return bool(parsed.scheme)
+
+
+ class StorageService(abc.ABC):
+     """
+     Abstract base class defining a storage service.
+     """
+
+     def __init__(self, root_path: str):
+         self._root_path = root_path
+
+     @abc.abstractmethod
+     def exists(self, file_path: Path) -> bool:
+         """
+         Verify whether the given path exists.
+
+         Args:
+             file_path: Path to the file
+         Returns:
+             boolean representing existence
+         """
+
+     @abc.abstractmethod
+     def ls(self, glob_path: str) -> list[Path]:
+         """
+         List files in the given path.
+
+         Args:
+             glob_path: Path to the directory
+         Returns:
+             list of paths to files in the directory
+         """
+
+     @abc.abstractmethod
+     def get(self, file_path: Path) -> str | None:
+         """
+         Retrieve the contents of the given path.
+
+         Args:
+             file_path: Path to the file
+         Returns:
+             string representing file contents
+         """
+
+     @abc.abstractmethod
+     def save(
+         self,
+         file_path: Path | list[Path],
+         contents: str,
+         overwrite: bool = False,
+         **kwargs,
+     ) -> Path:
+         """
+         Save data in the given location.
+
+         Args:
+             file_path: path or paths to files
+             contents: file contents
+             overwrite: boolean indicating file can be overwritten
+         Returns:
+             path to the materialized file
+         """
+
+
+ class LocalStorageService(StorageService):
+     """
+     Specific StorageService that materializes files locally.
+     """
+
+     def exists(self, file_path: Path) -> bool:
+         """
+         Verify whether the given path exists.
+
+         Args:
+             file_path: Path to the file
+         Returns:
+             boolean representing existence
+         """
+         full_path = Path(self._root_path) / file_path
+         return full_path.exists()
+
+     def ls(self, glob_path: str) -> list[Path]:
+         """
+         List files in the given path.
+
+         Args:
+             glob_path: Path to the directory
+         Returns:
+             list of paths to files in the directory
+         """
+         globs = glob.glob(
+             f"{self._root_path}/{glob_path}",
+             recursive=True,
+         )
+
+         return [Path(glob).relative_to(self._root_path) for glob in globs]
+
+     def get(self, file_path: Path) -> str | None:
+         """
+         Retrieve the contents of the given path.
+
+         Args:
+             file_path: Path to the file
+         Returns:
+             string representing file contents
+         """
+         full_path = Path(self._root_path) / file_path
+
+         if full_path.exists():
+             return full_path.open(encoding="utf-8").read()
+
+         return None
+
+     def save(
+         self, file_path: Path, contents: str, overwrite: bool = False, **kwargs
+     ) -> Path:
+         """
+         Save data in the given location.
+
+         Args:
+             file_path: file destination path
+             contents: file contents
+             overwrite: boolean indicating file can be overwritten
+         Returns:
+             path to the materialized file
+         """
+
+         full_path = Path(self._root_path) / file_path
+
+         if overwrite is False and full_path.exists():
+             raise FileExistsError()
+
+         os.makedirs(os.path.dirname(full_path), exist_ok=True)
+         with full_path.open("w+", encoding="utf-8") as file:
+             file.write(contents)
+
+         return full_path
+
+
+ class GitStorageService(LocalStorageService):
+     _instance = None
+
+     def __new__(cls, root_path: str, user: str, email: str, remote: str = "origin"):
+         if cls._instance is None:
+             cls._instance = super().__new__(cls)
+         return cls._instance
+
+     def __init__(self, root_path: str, user: str, email: str, remote: str = "origin"):
+         # prevent reinitialization
+         if getattr(self, "_initialized", False):
+             return
+
+         try:
+             self._root_path = root_path
+             self._remote = remote
+             self._repo = Repo(str(root_path))
+
+             with self._repo.config_writer() as git_config:
+                 git_config.set_value("user", "email", email)
+                 git_config.set_value("user", "name", user)
+
+             logger.info(f"✅ Initialized GitStorageService at {root_path}")
+         except (InvalidGitRepositoryError, NoSuchPathError) as err:
+             logger.error(f"❌ Git repo error: {err}")
+             raise err
+
+         self._initialized = True
+
+     def save(
+         self,
+         file_path: Path,
+         contents: str,
+         overwrite: bool = False,
+         fetch_latest: bool = True,
+         auto_commit: bool = True,
+         commit_msg: str | None = None,
+         **kwargs,
+     ) -> Path:
+         """
+         Save data in the given location.
+
+         Args:
+             file_path: path or paths to files
+             contents: file contents
+             fetch_latest: boolean indicating to fetch latest state before saving
+             overwrite: boolean indicating file can be overwritten
+             auto_commit: boolean indicating whether to auto-commit changes
+             commit_msg: commit message
+         Returns:
+             path to the materialized file
+         """
+         if fetch_latest:
+             self._repo.remote(self._remote).pull()
+
+         full_path = super().save(file_path, contents, overwrite)
+
+         if auto_commit and full_path is not None:
+             commit_msg = (
+                 f"add {os.path.basename(full_path)}" if not commit_msg else commit_msg
+             )
+             self.commit_and_push([file_path], commit_msg)
+
+         return full_path
+
+     def commit_and_push(self, file_paths: list[Path], msg: str):
+         """
+         Helper function to commit and push the given file(s).
+         """
+         self._repo.git.add(file_paths)
+         self._repo.index.commit(msg)
+         push = self._repo.remote(self._remote).push()
+         push.raise_if_error()
+
+     @staticmethod
+     def get_instance() -> "GitStorageService":
+         if GitStorageService._instance is None:
+             raise RuntimeError("GitStorageService has not been initialized yet.")
+         return GitStorageService._instance
1
+ """Dataset metadata models."""
2
+
3
+ from everycure.datasets.models.v1 import DatasetMetadataV1
4
+ from everycure.datasets.models.v2 import DatasetMetadataV2
5
+
6
+ __all__ = ["DatasetMetadataV1", "DatasetMetadataV2"]
@@ -0,0 +1,194 @@
+ """Dataset Metadata v1 - Initial version matching the JSON schema."""
+
+ from datetime import UTC, datetime
+ from enum import Enum
+ from typing import Any, Optional
+
+ from pydantic import AnyUrl, BaseModel, Field, HttpUrl, field_validator
+
+
+ class StorageType(str, Enum):
+     """Storage type for dataset location."""
+
+     GCS = "gcs"
+     S3 = "s3"
+     LOCAL = "local"
+     BIGQUERY = "bigquery"
+     POSTGRES = "postgres"
+
+
+ class FileFormat(str, Enum):
+     """File format for dataset."""
+
+     TSV = "tsv"
+     CSV = "csv"
+     PARQUET = "parquet"
+     JSON = "json"
+     JSONL = "jsonl"
+     AVRO = "avro"
+     ORC = "orc"
+
+
+ class DatasetStatus(str, Enum):
+     """Dataset status."""
+
+     ACTIVE = "active"
+     DEPRECATED = "deprecated"
+     ARCHIVED = "archived"
+
+
+ class Location(BaseModel):
+     """Dataset location information."""
+
+     type: StorageType = Field(..., description="Storage type")
+     uri: AnyUrl = Field(..., description="Full URI to the dataset")
+     format: FileFormat = Field(..., description="File format")
+
+     model_config = {"extra": "forbid"}
+
+
+ class Owner(BaseModel):
+     """Dataset owner information."""
+
+     name: str = Field(..., min_length=1, description="Owner name")
+     email: Optional[str] = Field(None, description="Owner email address")
+
+     model_config = {"extra": "forbid"}
+
+
+ class Origin(BaseModel):
+     """Dataset origin information."""
+
+     system: str = Field(..., description="Pipeline or system name")
+     url: HttpUrl = Field(..., description="GitHub URL to source code")
+     commit: Optional[str] = Field(
+         None,
+         pattern=r"^[a-f0-9]{7,40}$",
+         description="Git commit hash (7-40 hex characters)",
+     )
+     tag: Optional[str] = Field(None, description="Git tag")
+
+     model_config = {"extra": "forbid"}
+
+
+ class ColumnSchema(BaseModel):
+     """Schema definition for a single column."""
+
+     name: str = Field(..., description="Column name")
+     type: str = Field(..., description="Column data type")
+     description: Optional[str] = Field(None, description="Column description")
+
+     model_config = {"extra": "forbid"}
+
+
+ class DatasetSchema(BaseModel):
+     """Dataset schema information."""
+
+     row_count: Optional[int] = Field(None, ge=0, description="Number of rows")
+     columns: Optional[list[ColumnSchema]] = Field(
+         None, description="List of column definitions"
+     )
+
+     model_config = {"extra": "forbid"}
+
+
+ class DatasetMetadataV1(BaseModel):
+     """
+     Dataset Metadata v1.
+
+     This model represents the metadata for a dataset in the registry.
+     It matches the structure defined in dataset.schema.json.
+     """
+
+     # Schema version - tracks the version of this metadata definition itself
+     schema_version: str = Field(
+         default="1.0.0",
+         description="Version of the dataset metadata schema definition",
+     )
+     name: str = Field(
+         ...,
+         pattern=r"^[a-z][a-z0-9_]*$",
+         description="Dataset name in snake_case",
+     )
+     version: str = Field(
+         ...,
+         pattern=r"^\d+\.\d+\.\d+$",
+         description="Semantic version (e.g., 0.2.0)",
+     )
+     description: Optional[str] = Field(
+         None, min_length=10, description="Brief description of the dataset"
+     )
+     message: Optional[str] = Field(
+         None, description="Optional message about this dataset version"
+     )
+     location: Location = Field(..., description="Dataset location")
+     created_at: datetime = Field(
+         default_factory=lambda: datetime.now(UTC),
+         description="ISO 8601 timestamp",
+     )
+     owner: Owner = Field(..., description="Dataset owner")
+     origin: Origin = Field(..., description="Dataset origin")
+     status: DatasetStatus = Field(
+         default=DatasetStatus.ACTIVE, description="Dataset status"
+     )
+     lineage: Optional[dict[str, Any]] = Field(
+         default=None, description="Placeholder for future lineage tracking"
+     )
+     # Aliased as "schema" to avoid shadowing BaseModel.schema
+     dataset_schema: Optional[DatasetSchema] = Field(
+         default=None,
+         alias="schema",
+         description="Dataset schema information",
+     )
+     metadata: Optional[dict[str, Any]] = Field(
+         default=None, description="Additional metadata dictionary"
+     )
+     tags: Optional[list[str]] = Field(
+         default=None,
+         description="Tags for discoverability (lowercase with hyphens)",
+     )
+     related_docs: Optional[HttpUrl] = Field(
+         default=None, description="Link to documentation"
+     )
+     deprecated_by: Optional[str] = Field(
+         default=None, description="Version that replaces this dataset"
+     )
+     deprecation_date: Optional[datetime] = Field(
+         default=None, description="Date when dataset was deprecated"
+     )
+
+     @field_validator("description")
+     @classmethod
+     def validate_description(cls, v: Optional[str]) -> Optional[str]:
+         """Validate description length if provided."""
+         if v is not None and len(v) < 10:
+             raise ValueError("Description must be at least 10 characters long")
+         return v
+
+     @field_validator("tags")
+     @classmethod
+     def validate_tags(cls, v: Optional[list[str]]) -> Optional[list[str]]:
+         """Validate tag format."""
+         if v is None:
+             return v
+         for tag in v:
+             if not tag or not tag.replace("-", "").replace("_", "").isalnum():
+                 raise ValueError(
+                     f"Tag '{tag}' must contain only lowercase alphanumeric characters, hyphens, and underscores"
+                 )
+             if tag != tag.lower():
+                 raise ValueError(f"Tag '{tag}' must be lowercase")
+         # Ensure unique tags
+         if len(v) != len(set(v)):
+             raise ValueError("Tags must be unique")
+         return v
+
+     model_config = {
+         "extra": "forbid",
+         "json_schema_extra": {
+             "$schema": "http://json-schema.org/2020-12/schema#",
+             "$id": "https://everycure.org/schemas/dataset.v1.schema.json",
+             "title": "Dataset Metadata v1",
+             "description": "Schema for dataset registry metadata files (v1)",
+         },
+     }
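
A sketch of constructing `DatasetMetadataV1` directly; every value below is illustrative, and the dump uses `by_alias=True` so the `dataset_schema` field would be written under its `schema` alias, matching how the Kedro dataset serializes metadata.

```python
from everycure.datasets.models.v1 import DatasetMetadataV1

# Illustrative values only.
meta = DatasetMetadataV1(
    name="disease_list",
    version="0.2.0",
    description="Curated list of diseases used by the matrix pipeline.",
    location={
        "type": "gcs",
        "uri": "gs://example-bucket/disease_list/0.2.0/data.parquet",
        "format": "parquet",
    },
    owner={"name": "Jane Doe", "email": "jane@example.com"},
    origin={"system": "matrix", "url": "https://github.com/example-org/matrix"},
)

# schema_version, status, and created_at are filled with defaults.
print(meta.model_dump(mode="json", by_alias=True)["status"])  # active
```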
@@ -0,0 +1,31 @@
+ """Dataset Metadata v2 - Refined version (placeholder for future improvements)."""
+
+ from everycure.datasets.models.v1 import DatasetMetadataV1
+
+
+ class DatasetMetadataV2(DatasetMetadataV1):
+     """
+     Dataset Metadata v2.
+
+     This is a placeholder for a future refined version of the dataset metadata model.
+     When v2 is implemented, it will include improvements and refinements over v1.
+
+     For now, this class inherits from v1 to maintain compatibility.
+     """
+
+     # TODO: Add v2-specific fields and improvements here
+     # Examples of potential improvements:
+     # - Better validation
+     # - Additional metadata fields
+     # - Improved lineage tracking
+     # - Enhanced schema definitions
+
+     model_config = {
+         "extra": "forbid",
+         "json_schema_extra": {
+             "$schema": "http://json-schema.org/2020-12/schema#",
+             "$id": "https://everycure.org/schemas/dataset.v2.schema.json",
+             "title": "Dataset Metadata v2",
+             "description": "Schema for dataset registry metadata files (v2)",
+         },
+     }
@@ -0,0 +1,161 @@
+ """
+ Pre-commit validation script for dataset registry.
+
+ Checks:
+ 1. All version directories follow semantic versioning (MAJOR.MINOR.PATCH)
+ 2. Dataset folder names are snake_case
+ 3. No files are edited in datasets/ on main branch (immutability check)
+ """
+
+ import re
+ import sys
+ from logging import getLogger
+ from pathlib import Path
+
+ import yaml
+ from pydantic import ValidationError
+
+ from everycure.datasets.models.v1 import DatasetMetadataV1
+
+ logger = getLogger(__name__)
+
+ # Patterns
+ SNAKE_CASE_PATTERN = re.compile(r"^[a-z][a-z0-9_]*$")
+ SEMVER_PATTERN = re.compile(r"^\d+\.\d+\.\d+$")
+
+
+ def get_dataset_directories(datasets_dir: Path) -> list[Path]:
+     """Get all dataset directories that don't start with '.' or '_'."""
+     return [
+         item
+         for item in datasets_dir.iterdir()
+         if item.is_dir()
+         and not item.name.startswith(".")
+         and not item.name.startswith("_")
+     ]
+
+
+ def get_version_directories(dataset_dir: Path) -> list[Path]:
+     """Get all valid version directories in a dataset directory."""
+     versions = []
+     for item in dataset_dir.iterdir():
+         if item.is_dir() and SEMVER_PATTERN.match(item.name):
+             versions.append(item)
+     return versions
+
+
+ def get_dataset_specification_files(datasets_dir: Path) -> list[Path]:
+     """Get all dataset specification files (dataset.yaml) in the datasets directory."""
+     files = []
+
+     for dataset_dir in get_dataset_directories(datasets_dir):
+         for versioned_dataset in get_version_directories(dataset_dir):
+             if (versioned_dataset / "dataset.yaml").is_file():
+                 files.append(versioned_dataset / "dataset.yaml")
+     return files
+
+
+ def check_snake_case_names(datasets_dir: Path) -> list[str]:
+     """Check that all dataset names are snake_case."""
+     errors = []
+
+     for item in get_dataset_directories(datasets_dir):
+         if not SNAKE_CASE_PATTERN.match(item.name):
+             errors.append(
+                 f"Dataset name '{item.name}' is not snake_case. "
+                 f"Use lowercase letters, numbers, and underscores only."
+             )
+
+     return errors
+
+
+ def check_semver_directories(datasets_dir: Path) -> list[str]:
+     """Check that all version directories follow semantic versioning."""
+     errors = []
+
+     for dataset in get_dataset_directories(datasets_dir):
+         for item in dataset.iterdir():
+             # Skip hidden files
+             if item.name.startswith("."):
+                 continue
+
+             if item.is_dir() and not SEMVER_PATTERN.match(item.name):
+                 errors.append(
+                     f"Version directory '{dataset.name}/{item.name}' does not follow "
+                     f"semantic versioning (MAJOR.MINOR.PATCH). Example: 0.1.0"
+                 )
+
+     return errors
+
+
+ def check_schema_validity(datasets_dir: Path) -> list[str]:
+     """Check that all dataset metadata files validate against the v1 schema."""
+     errors = []
+
+     for dataset_file in get_dataset_specification_files(datasets_dir):
+         dataset_yaml = yaml.safe_load(dataset_file.read_text())
+         try:
+             DatasetMetadataV1.model_validate(dataset_yaml)
+         except ValidationError as e:
+             print(f"Schema file '{dataset_file.relative_to(datasets_dir)}' is not valid: {e}")
+             errors.append(f"Schema file '{dataset_file.relative_to(datasets_dir)}' is not valid.")
+
+     return errors
+
+
+ def _find_repo_root() -> Path:
+     """Find the repository root by walking up from the current directory or file location."""
+     # Start from the current working directory
+     current = Path.cwd()
+
+     # Walk up looking for pyproject.toml (repo marker)
+     for path in [current, *current.parents]:
+         if (path / "pyproject.toml").exists() and (path / "datasets").exists():
+             return path
+
+     # Fallback: use file location (we're in src/everycure/datasets/validate.py)
+     return Path(__file__).parent.parent.parent.parent
+
+
+ def validate_datasets(datasets_dir: Path | None = None) -> int:
+     """
+     Run all validation checks.
+
+     Args:
+         datasets_dir: Path to the datasets directory. If None, will try to find it
+             relative to the current working directory or repository root.
+
+     Returns:
+         0 if validation passes, 1 if it fails.
+     """
+     if datasets_dir is None:
+         repo_root = _find_repo_root()
+         datasets_dir = repo_root / "datasets"
+
+     if not datasets_dir.exists():
+         print(
+             f"Error: datasets/ directory not found at {datasets_dir}", file=sys.stderr
+         )
+         return 1
+
+     all_errors = []
+
+     # Run all checks
+     print("Checking dataset naming conventions...")
+     all_errors.extend(check_snake_case_names(datasets_dir))
+
+     print("Checking semantic versioning...")
+     all_errors.extend(check_semver_directories(datasets_dir))
+
+     print("Checking schema validity...")
+     all_errors.extend(check_schema_validity(datasets_dir))
+
+     # Report results
+     if all_errors:
+         print("\n❌ Validation failed with the following errors:\n", file=sys.stderr)
+         for error in all_errors:
+             print(f"  {error}", file=sys.stderr)
+         print(f"\nTotal errors: {len(all_errors)}", file=sys.stderr)
+         return 1
+     else:
+         print("\n✅ All validation checks passed!")
+         return 0
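
A minimal sketch of running the validator programmatically, e.g. from a CI or pre-commit entry point; the `datasets` path below is the conventional location, and passing `None` would auto-detect it instead.

```python
import sys
from pathlib import Path

from everycure.datasets.validate import validate_datasets

# Exit with the validator's status code (0 on success, 1 on failure).
sys.exit(validate_datasets(Path("datasets")))
```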