everycure-datasets 0.2.33__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- everycure_datasets-0.2.33/.gitignore +28 -0
- everycure_datasets-0.2.33/PKG-INFO +11 -0
- everycure_datasets-0.2.33/pyproject.toml +52 -0
- everycure_datasets-0.2.33/src/everycure/__init__.py +3 -0
- everycure_datasets-0.2.33/src/everycure/datasets/__init__.py +6 -0
- everycure_datasets-0.2.33/src/everycure/datasets/cli.py +64 -0
- everycure_datasets-0.2.33/src/everycure/datasets/generate_schema.py +62 -0
- everycure_datasets-0.2.33/src/everycure/datasets/kedro/__init__.py +5 -0
- everycure_datasets-0.2.33/src/everycure/datasets/kedro/catalog_dataset.py +385 -0
- everycure_datasets-0.2.33/src/everycure/datasets/kedro/hooks.py +69 -0
- everycure_datasets-0.2.33/src/everycure/datasets/kedro/storage.py +233 -0
- everycure_datasets-0.2.33/src/everycure/datasets/models/__init__.py +6 -0
- everycure_datasets-0.2.33/src/everycure/datasets/models/v1.py +194 -0
- everycure_datasets-0.2.33/src/everycure/datasets/models/v2.py +31 -0
- everycure_datasets-0.2.33/src/everycure/datasets/validate.py +161 -0

everycure_datasets-0.2.33/.gitignore
@@ -0,0 +1,28 @@
+# Python
+__pycache__/
+*.py[cod]
+*.pyc
+
+# Virtual environments
+.venv/
+venv/
+
+# uv
+.uv/
+
+# Build artifacts
+dist/
+build/
+*.egg-info/
+
+# IDE (keep .vscode/settings.json for schema validation)
+.vscode/*
+!.vscode/settings.json
+.idea/
+
+# Environment variables
+.env
+.env.local
+
+# OS
+.DS_Store

everycure_datasets-0.2.33/PKG-INFO
@@ -0,0 +1,11 @@
+Metadata-Version: 2.4
+Name: everycure-datasets
+Version: 0.2.33
+Requires-Python: >=3.11
+Requires-Dist: gitpython>=3.1.45
+Requires-Dist: pydantic>=2.0.0
+Requires-Dist: pyyaml>=6.0.0
+Provides-Extra: kedro
+Requires-Dist: kedro-datasets[pandas,spark]>=6.0.0; extra == 'kedro'
+Requires-Dist: kedro>=0.19.1; extra == 'kedro'
+Requires-Dist: semantic-version>=2.10.0; extra == 'kedro'

everycure_datasets-0.2.33/pyproject.toml
@@ -0,0 +1,52 @@
+[project]
+name = "everycure-datasets"
+version = "0.2.33"
+requires-python = ">=3.11"
+scripts = {datasets = "everycure.datasets.cli:app"}
+dependencies = [
+    "gitpython>=3.1.45",
+    "pydantic>=2.0.0",
+    "pyyaml>=6.0.0",
+]
+[project.optional-dependencies]
+kedro = [
+    "kedro>=0.19.1",
+    "kedro-datasets[pandas,spark]>=6.0.0",
+    "semantic-version>=2.10.0",
+]
+
+[tool.hatch.build]
+include = ["src/**"]
+
+[tool.hatch.build.targets.sdist]
+exclude = [
+    "datasets/**",
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/everycure"]
+
+# [tool.uv]
+# keyring-provider = "subprocess"
+#
+# [[tool.uv.index]]
+# name = "pypi"
+# url = "https://pypi.org/simple"
+#
+# [[tool.uv.index]]
+# name = "everycure"
+# url = "https://oauth2accesstoken@us-central1-python.pkg.dev/core-422020/everycure/simple/"
+# publish-url = "https://us-central1-python.pkg.dev/core-422020/everycure/"
+# default = false
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[dependency-groups]
+dev = [
+    "keyrings-google-artifactregistry-auth>=1.1.2",
+    "pytest>=9.0.1",
+    "pre-commit>=4.3.0",
+    "typer>=0.12.0",
+]

everycure_datasets-0.2.33/src/everycure/datasets/cli.py
@@ -0,0 +1,64 @@
+"""CLI for datasets registry operations."""
+
+from pathlib import Path
+
+import typer
+
+from everycure.datasets.generate_schema import generate_all_schemas
+from everycure.datasets.validate import validate_datasets
+
+app = typer.Typer(
+    name="datasets",
+    help="Datasets registry management CLI",
+    add_completion=False,
+    no_args_is_help=True,
+)
+
+schema_app = typer.Typer(
+    name="schema",
+    help="Schema management commands",
+    add_completion=False,
+)
+
+app.add_typer(schema_app)
+
+
+@app.callback(invoke_without_command=True)
+def main(ctx: typer.Context) -> None:
+    """Datasets registry management CLI."""
+    if ctx.invoked_subcommand is None:
+        typer.echo(ctx.get_help())
+
+
+@app.command()
+def validate(
+    datasets_dir: Path | None = typer.Option(
+        None,
+        "--datasets-dir",
+        "-d",
+        help="Path to the datasets directory (default: auto-detect)",
+    ),
+) -> None:
+    """
+    Validate dataset YAML files and directory structure.
+
+    Checks:
+    - Dataset names are snake_case
+    - Version directories follow semantic versioning (MAJOR.MINOR.PATCH)
+    """
+    return validate_datasets(datasets_dir)
+
+
+@schema_app.command()
+def generate() -> None:
+    """
+    Generate JSON schema(s) from the Pydantic models.
+    """
+    repo_root = Path.cwd()
+    schema_dir = repo_root / ".schema"
+    generate_all_schemas(schema_dir)
+    typer.echo(f"✓ Generated all JSON schemas in {schema_dir}")
+
+
+if __name__ == "__main__":
+    app()
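
The CLI above is a thin wrapper around `validate_datasets` and `generate_all_schemas`, so the same operations can be driven from Python. A minimal sketch; the `datasets/` and `.schema/` paths are assumptions about the caller's layout, not values from the package:

```python
# Programmatic equivalent of `datasets validate` and `datasets schema generate`.
# The "datasets/" and ".schema/" paths are assumed, not taken from the package.
from pathlib import Path

from everycure.datasets.generate_schema import generate_all_schemas
from everycure.datasets.validate import validate_datasets

exit_code = validate_datasets(Path("datasets"))  # returns 0 on success, 1 on failure
generate_all_schemas(Path(".schema"))            # writes dataset.v1/.v2 schema JSON files
print("OK" if exit_code == 0 else "validation failed")
```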

everycure_datasets-0.2.33/src/everycure/datasets/generate_schema.py
@@ -0,0 +1,62 @@
+"""Generate JSON schema from Pydantic models."""
+
+import json
+from pathlib import Path
+
+from everycure.datasets.models.v1 import DatasetMetadataV1
+from everycure.datasets.models.v2 import DatasetMetadataV2
+
+
+def generate_schema(model_class, output_path: Path, schema_id: str) -> None:
+    """
+    Generate JSON schema from a Pydantic model.
+
+    Args:
+        model_class: The Pydantic model class to generate schema from
+        output_path: Path where the JSON schema should be written
+        schema_id: The $id for the schema (FQDN)
+    """
+    # Get the JSON schema from the Pydantic model
+    schema = model_class.model_json_schema(
+        mode="serialization",
+        by_alias=True,
+    )
+
+    # Update the $id to use the provided FQDN
+    schema["$id"] = schema_id
+
+    # Remove fields with defaults from required list
+    # but keep them in properties so they're documented
+    fields_with_defaults = ["schema_version", "status", "created_at"]
+    if "required" in schema:
+        for field in fields_with_defaults:
+            if field in schema["required"]:
+                schema["required"].remove(field)
+
+    # Write to file with proper formatting
+    with open(output_path, "w") as f:
+        json.dump(schema, f, indent=2)
+        f.write("\n")  # Add trailing newline
+
+    print(f"Generated JSON schema at {output_path}")
+
+
+def generate_all_schemas(schema_dir: Path) -> None:
+    """Generate all schema versions."""
+    schema_dir.mkdir(parents=True, exist_ok=True)
+
+    # Generate v1 schema
+    v1_path = schema_dir / "dataset.v1.schema.json"
+    generate_schema(
+        DatasetMetadataV1,
+        v1_path,
+        "https://everycure.org/schemas/dataset.v1.schema.json",
+    )
+
+    # Generate v2 schema
+    v2_path = schema_dir / "dataset.v2.schema.json"
+    generate_schema(
+        DatasetMetadataV2,
+        v2_path,
+        "https://everycure.org/schemas/dataset.v2.schema.json",
+    )
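
`generate_schema` keeps defaulted fields in the generated schema's `properties` while dropping them from `required`. A small illustration of that behaviour with a throwaway model; the `Example` model and its fields are hypothetical, not part of the package:

```python
# Throwaway model (not from the package) to illustrate how defaulted fields are
# kept in "properties" but left out of "required" in the generated JSON schema.
from pydantic import BaseModel


class Example(BaseModel):
    name: str
    status: str = "active"


schema = Example.model_json_schema(mode="serialization", by_alias=True)

# Mirror the defensive loop in generate_schema(): drop defaulted fields from
# "required" if they appear there (Pydantic normally omits them already).
for field in ["status"]:
    if field in schema.get("required", []):
        schema["required"].remove(field)

print(schema["required"])            # ['name']
print(sorted(schema["properties"]))  # ['name', 'status']
```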

everycure_datasets-0.2.33/src/everycure/datasets/kedro/catalog_dataset.py
@@ -0,0 +1,385 @@
+import logging
+from pathlib import Path
+from typing import Any
+
+import typer
+import yaml
+from kedro.io.core import AbstractDataset, DatasetError
+from kedro_datasets import pandas, spark
+from semantic_version import NpmSpec, Version
+
+from everycure.datasets.kedro.storage import GitStorageService, is_uri
+from everycure.datasets.models.v1 import (
+    ColumnSchema,
+    DatasetMetadataV1,
+    DatasetSchema,
+)
+
+logger = logging.getLogger(__name__)
+
+app = typer.Typer()
+
+
+class DataCatalogDataset(AbstractDataset):
+    """Custom dataset to load and save registry resources
+
+    Examples:
+
+    ```yaml
+    catalog_diseases:
+      type: everycure.datasets.kedro.catalog_dataset.DataCatalogDataset
+      dataset: disease_list
+      engine: spark
+
+      load_args:
+        version: ~0.2.0
+
+      # Arguments for the underlying Kedro engine (e.g., spark.SparkDataset)
+      save_args:
+        mode: overwrite
+
+        # Arguments for the DatasetMetadataV1 model
+        catalog_args:
+          description: "Dataset description"
+          message: "Optional message"
+          owner:
+            name: "John Doe"
+            email: "john@example.com"
+          location:
+            uri: "gs://path/to/{version}/data.parquet"
+            format: "parquet"
+          # ...etc
+
+    ```
+    """
+
+    def __init__(
+        self,
+        *,
+        dataset: str | dict[str, Any],
+        engine: str,
+        save_args: dict[str, Any] = None,
+        load_args: dict[str, Any] = None,
+        **kwargs,
+    ):
+        self._dataset = dataset
+        self._engine = engine
+        self._storage_service = GitStorageService.get_instance()
+        self._save_args = save_args or {}
+        self._catalog_args = self._save_args.pop("catalog_args", None)
+        self._load_args = load_args or {}
+
+    @property
+    def filepath(self) -> str:
+        semver_pattern = self._load_args.get("version")
+        version, _ = self.best_match(self.versions, semver_pattern)
+        content = self._storage_service.get(
+            Path(f"datasets/{self._dataset}/{version}/dataset.yaml")
+        )
+        if content is None:
+            raise DatasetError(
+                f"Dataset metadata file not found for '{self._dataset}' version '{version}'"
+            )
+        dataset = DatasetMetadataV1.model_validate(yaml.safe_load(content))
+
+        return str(dataset.location.uri)
+
+    def load(self) -> Any:
+        """Dataset loading
+
+        Dataset loads the best matching version of the requested
+        dataset using the pattern.
+        """
+        # Make a copy to avoid modifying the original dict
+        engine_load_args = self._load_args.copy()
+        semver_pattern = engine_load_args.pop("version", None)
+        assert_latest = engine_load_args.pop("assert_latest", False)
+
+        version, is_latest = self.best_match(self.versions, semver_pattern)
+
+        if version is None:
+            raise DatasetError(
+                f"No version matched for dataset '{self._dataset}', available versions: {','.join(self.versions)}"
+            )
+
+        if assert_latest and not is_latest:
+            raise DatasetError(
+                f"Newer version for dataset '{self._dataset}' available!"
+            )
+
+        logger.info(f"Using version {version} for dataset '{self._dataset}'")
+        try:
+            content = self._storage_service.get(
+                Path(f"datasets/{self._dataset}/{version}/dataset.yaml")
+            )
+            if content is None:
+                raise DatasetError(
+                    f"Dataset metadata file not found for '{self._dataset}' version '{version}'"
+                )
+            dataset = DatasetMetadataV1.model_validate(yaml.safe_load(content))
+
+            return self.get_dataset(
+                dataset.location.format.value,
+                str(dataset.location.uri),
+                engine_load_args,
+                {},  # save_args are not used in load
+            ).load()
+        except Exception as e:
+            raise DatasetError(
+                f"Failed to load version for dataset '{self._dataset}': {e}"
+            ) from e
+
+    @staticmethod
+    def _uri_to_path(uri: str) -> str:
+        """Convert file:// URLs to file paths for local file access.
+
+        kedro_datasets expects file paths, not file:// URLs.
+        Other URI schemes (http, https, gs, s3, etc.) are passed through unchanged.
+        """
+        if uri.startswith("file://"):
+            from urllib.parse import unquote, urlparse
+
+            parsed = urlparse(uri)
+            return unquote(parsed.path)
+        return uri
+
+    def get_dataset(
+        self,
+        format_: str,
+        file_path: str,
+        load_args: dict[str, Any],
+        save_args: dict[str, Any],
+    ) -> AbstractDataset:
+        # Convert file:// URLs to paths for local file access
+        file_path = self._uri_to_path(file_path)
+
+        if self._engine == "spark":
+            if format_ == "tsv":
+                return spark.SparkDataset(
+                    filepath=file_path,
+                    file_format="csv",
+                    load_args={
+                        **load_args,
+                        "sep": "\t",
+                        "header": True,
+                        "index": False,
+                    },
+                    save_args=save_args,
+                )
+
+            return spark.SparkDataset(
+                filepath=file_path,
+                file_format=format_,
+                load_args={**load_args, "header": True, "index": False},
+                save_args=save_args,
+            )
+
+        if self._engine == "pandas":
+            if format_ == "csv":
+                return pandas.CSVDataset(
+                    filepath=file_path,
+                    load_args=load_args,
+                    save_args=save_args,
+                )
+
+            if format_ == "parquet":
+                return pandas.ParquetDataset(
+                    filepath=file_path, load_args=load_args, save_args=save_args
+                )
+
+        raise ValueError(f"Unsupported engine: {self._engine} and format {format_}")
+
+    def get_schema(self, data) -> DatasetSchema:
+        """Get dataset schema as DatasetSchema model."""
+        columns = None
+        row_count = None
+
+        if self._engine == "pandas":
+            type_map = {
+                "int64": "int",
+                "Int64": "int",
+                "float64": "float",
+                "object": "string",
+                "bool": "bool",
+                "datetime64[ns]": "datetime",
+            }
+            columns = [
+                ColumnSchema(name=col, type=type_map.get(str(dtype), "unknown"))
+                for col, dtype in data.dtypes.items()
+            ]
+            row_count = len(data)
+
+        elif self._engine == "spark":
+            spark_map = {
+                "IntegerType()": "int",
+                "LongType()": "int",
+                "DoubleType()": "float",
+                "FloatType()": "float",
+                "StringType()": "string",
+                "BooleanType()": "bool",
+                "TimestampType()": "datetime",
+                "DateType()": "date",
+            }
+            columns = [
+                ColumnSchema(
+                    name=field.name,
+                    type=spark_map.get(str(field.dataType), "unknown"),
+                )
+                for field in data.schema.fields
+            ]
+            row_count = data.count()
+
+        else:
+            raise ValueError(f"Unsupported engine: {self._engine}")
+
+        return DatasetSchema(columns=columns, row_count=row_count)
+
+    def save(self, data: Any) -> None:
+        """Dataset saving
+
+        Dataset is saved using the next relevant semversion based
+        on the catalog arguments.
+        """
+        if not self._catalog_args:
+            raise DatasetError("Required 'catalog_args' missing in save_args.")
+
+        # 1. Calculate dynamic properties from _catalog_args
+        save_version = self._catalog_args.get("version")
+        message = self._catalog_args.get("message")
+        if not save_version:
+            save_version = self.prompt_version_bump()
+            if not save_version:  # User cancelled prompt
+                logger.warning("Save cancelled by user.")
+                return
+
+        if not message:
+            message = (
+                typer.prompt("Optional message", default="", show_default=False) or None
+            )
+
+        # 2. Prepare the dictionary of dynamic/runtime arguments
+        dynamic_args = {
+            "name": self._dataset,
+            "version": save_version,
+            # is aliased in original, hence writing to schema, not dataset_schema
+            "schema": self.get_schema(data).model_dump(exclude_none=True),
+        }
+        if message:
+            dynamic_args["message"] = message
+
+        metadata_dict = {**self._catalog_args, **dynamic_args}
+
+        # 4. Post-merge processing: Handle the version placeholder in the URI
+        location = metadata_dict.get("location", {})
+        if "uri" in location:
+            # Format the template string first, then convert to URI if needed
+            formatted_filesystem_path_str = location["uri"].format(
+                version=metadata_dict["version"]
+            )
+            if not is_uri(formatted_filesystem_path_str):
+                path_obj = Path(formatted_filesystem_path_str)
+                if not path_obj.is_absolute():
+                    path_obj = path_obj.resolve()
+                location["uri"] = path_obj.as_uri()
+            else:
+                location["uri"] = formatted_filesystem_path_str
+        else:
+            raise DatasetError("Required 'location.uri' missing in catalog_args.")
+
+        # 5. Validate the final dictionary and create the Pydantic object
+        try:
+            dataset_metadata = DatasetMetadataV1.model_validate(metadata_dict)
+        except Exception as e:  # Catches Pydantic's ValidationError
+            raise DatasetError(f"Invalid dataset metadata configuration: {e}") from e
+
+        # 6. Save the dataset file using the correct engine and self._save_args
+        self.get_dataset(
+            dataset_metadata.location.format.value,
+            str(dataset_metadata.location.uri),
+            {},  # load_args not used in save path
+            self._save_args,  # Pass engine-specific save_args directly
+        ).save(data)
+
+        # 7. Save the metadata YAML file
+        self._storage_service.save(
+            f"datasets/{dataset_metadata.name}/{dataset_metadata.version}/dataset.yaml",
+            yaml.dump(dataset_metadata.model_dump(mode="json", by_alias=True)),
+            commit_msg=f"🤖 Create version {dataset_metadata.version} for '{dataset_metadata.name}'",
+        )
+
+    @staticmethod
+    def best_match(versions: list[str], pattern: str | None) -> tuple[str | None, bool]:
+        """Function to find the best semver match.
+
+        Args:
+            versions: List of available versions
+            pattern: semver pattern to match
+        Returns:
+            Best match, and boolean indicating whether this is the latest version.
+        """
+        if not pattern:
+            spec = NpmSpec("*")
+        else:
+            spec = NpmSpec(pattern)
+        parsed_versions = [Version(v) for v in versions]
+
+        # Find versions that satisfy the pattern
+        matching = [v for v in parsed_versions if v in spec]
+        if not matching:
+            return None, False
+
+        best_version = max(matching)
+        latest_version = max(parsed_versions)
+        is_latest = best_version == latest_version
+
+        return str(best_version), is_latest
+
+    def prompt_version_bump(self) -> str | None:
+        """Prompt user for bumping information."""
+        parsed = [Version(v) for v in self.versions]
+        current_version = max([*parsed, Version("0.0.0")])
+        typer.echo(f"Saving dataset: '{self._dataset}'")
+        typer.echo(f"Current version: '{current_version}'")
+
+        allowed = ["major", "minor", "patch"]
+        bump_type = typer.prompt("Which part to bump? (major/minor/patch)").lower()
+        while bump_type not in allowed:
+            bump_type = typer.prompt(
+                "Invalid choice. Please choose major, minor, or patch"
+            ).lower()
+
+        new_version = {
+            "major": Version(major=current_version.major + 1, minor=0, patch=0),
+            "minor": Version(
+                major=current_version.major, minor=current_version.minor + 1, patch=0
+            ),
+            "patch": Version(
+                major=current_version.major,
+                minor=current_version.minor,
+                patch=current_version.patch + 1,
+            ),
+        }[bump_type]
+
+        if not typer.confirm(
+            f"Do you want to save dataset '{self._dataset}' with version '{new_version}'?"
+        ):
+            typer.echo("Save cancelled.")
+            return None
+
+        return str(new_version)
+
+    @property
+    def versions(self) -> list[str]:
+        """Function to get versions for dataset."""
+        paths = self._storage_service.ls(f"datasets/{self._dataset}/*")
+        return [
+            str(path.relative_to(Path(f"datasets/{self._dataset}"))) for path in paths
+        ]
+
+    def _describe(self) -> dict[str, Any]:
+        """Describe the dataset by returning its metadata."""
+        return {
+            "dataset": self._dataset,
+            "engine": self._engine,
+            "versions": self.versions,
+        }
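
`load()` resolves the concrete registry version through `best_match`, which uses npm-style ranges from `semantic-version`. A sketch of how a `~0.2.0` pattern picks the newest matching version; the version list below is made up:

```python
# Illustration of the version resolution mirrored by DataCatalogDataset.best_match();
# the registry versions listed here are made up.
from semantic_version import NpmSpec, Version

versions = ["0.1.0", "0.2.0", "0.2.3", "0.3.0"]
spec = NpmSpec("~0.2.0")  # npm-style range: >=0.2.0 <0.3.0

matching = [Version(v) for v in versions if Version(v) in spec]
best = max(matching)
is_latest = best == max(Version(v) for v in versions)

print(best, is_latest)  # 0.2.3 False -> a newer 0.3.0 exists, so assert_latest would fail
```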

everycure_datasets-0.2.33/src/everycure/datasets/kedro/hooks.py
@@ -0,0 +1,69 @@
+import logging
+import shutil
+from pathlib import Path
+
+from git import InvalidGitRepositoryError, Repo
+from kedro.framework.hooks import hook_impl
+
+from everycure.datasets.kedro.storage import GitStorageService
+
+logger = logging.getLogger(__name__)
+
+
+class GitStorageHook:
+    """Kedro hook to clone or update a Git repository before running a pipeline."""
+
+    def __init__(
+        self,
+        repo_url: str,
+        target_dir: str = "data/external/data-catalog",
+        branch: str = "main",
+        force: bool = False,
+        pull: bool = True,
+    ):
+        self.repo_url = repo_url
+        self.target_dir = Path(target_dir)
+        self.branch = branch
+        self.force = force
+        self.pull = pull
+
+    @hook_impl
+    def after_context_created(self, context):
+        """Clone or update repo before the pipeline runs."""
+
+        if self.force and self.target_dir.exists():
+            shutil.rmtree(self.target_dir)
+
+        if not self.target_dir.exists():
+            repo = Repo.clone_from(self.repo_url, self.target_dir, branch=self.branch)
+
+        else:
+            try:
+                repo = Repo(self.target_dir)
+                logger.info(f"📁 Existing repo found: {repo.working_dir}")
+
+                if self.pull:
+                    logger.info(f"⬇️ Pulling latest changes from {self.branch}")
+                    origin = repo.remotes.origin
+                    origin.fetch()
+                    origin.pull(self.branch)
+                    logger.info("✅ Repository updated.")
+                else:
+                    logger.info("🔸 Skipping pull, using existing contents.")
+
+                # Ensure branch consistency
+                repo.git.checkout(self.branch)
+            except InvalidGitRepositoryError:
+                logger.info(
+                    f"⚠️ {self.target_dir} exists but is not a Git repo. Re-cloning."
+                )
+                shutil.rmtree(self.target_dir)
+                Repo.clone_from(self.repo_url, self.target_dir, branch=self.branch)
+
+        self.git_service = GitStorageService(
+            root_path=str(self.target_dir),
+            user="Kedro",
+            email="kedro@everycure.org",
+        )
+
+        logger.info(f"🔁 GitStorageService ready at {self.target_dir}")
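
The hook is meant to be registered in a Kedro project's `settings.py` so the registry repository is cloned or refreshed before the catalog is used. A hedged sketch; the repository URL and target directory below are placeholders, not values from the package:

```python
# Sketch of a Kedro project's settings.py registering the hook; the repository
# URL and target directory are placeholders.
from everycure.datasets.kedro.hooks import GitStorageHook

HOOKS = (
    GitStorageHook(
        repo_url="https://github.com/example-org/data-catalog.git",
        target_dir="data/external/data-catalog",
        branch="main",
        pull=True,
    ),
)
```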

everycure_datasets-0.2.33/src/everycure/datasets/kedro/storage.py
@@ -0,0 +1,233 @@
+import abc
+import glob
+import logging
+import os
+from pathlib import Path
+from urllib.parse import urlparse
+
+from git import InvalidGitRepositoryError, NoSuchPathError, Repo
+
+logger = logging.getLogger(__name__)
+
+
+def is_uri(path: str) -> bool:
+    """Check if a string is a valid URI (has a scheme)."""
+    parsed = urlparse(path)
+    return bool(parsed.scheme)
+
+
+class StorageService(abc.ABC):
+    """
+    Abstract base class defining a storage service.
+    """
+
+    def __init__(self, root_path: str):
+        self._root_path = root_path
+
+    @abc.abstractmethod
+    def exists(self, file_path: Path) -> bool:
+        """
+        Function to verify whether given path exists.
+
+        Args:
+            file_path: Path to the file
+        Returns:
+            boolean representing existence
+        """
+
+    @abc.abstractmethod
+    def ls(self, glob_path: str) -> list[str]:
+        """
+        Function to list files in the given path.
+
+        Args:
+            glob_path: Path to the directory
+        Returns:
+            list of paths to files in the directory
+        """
+
+    @abc.abstractmethod
+    def get(self, file_path: Path) -> str | None:
+        """
+        Function to retrieve the contents of the given path.
+
+        Args:
+            file_path: Path to the file
+        Returns:
+            string representing file contents
+        """
+
+    @abc.abstractmethod
+    def save(
+        self,
+        file_path: Path | list[Path],
+        contents: str,
+        overwrite: bool = False,
+        **kwargs,
+    ) -> Path:
+        """
+        Function to save data in the given location.
+
+        Args:
+            file_path: path or paths to files
+            contents: file contents
+            overwrite: boolean indicating file can be overwritten
+        Returns:
+            path to the materialized file
+        """
+
+
+class LocalStorageService(StorageService):
+    """
+    Specific StorageService that materializes files locally.
+    """
+
+    def exists(self, file_path: Path) -> bool:
+        """
+        Function to verify whether given path exists.
+
+        Args:
+            file_path: Path to the file
+        Returns:
+            boolean representing existence
+        """
+        full_path = Path(self._root_path) / file_path
+        return full_path.exists()
+
+    def ls(self, glob_path: str) -> list[Path]:
+        """
+        Function to list files in the given path.
+
+        Args:
+            glob_path: Path to the directory
+        Returns:
+            list of paths to files in the directory
+        """
+        globs = glob.glob(
+            f"{self._root_path}/{glob_path}",
+            recursive=True,
+        )
+
+        return [Path(glob).relative_to(self._root_path) for glob in globs]
+
+    def get(self, file_path: Path) -> str | None:
+        """
+        Function to retrieve the contents of the given path.
+
+        Args:
+            file_path: Path to the file
+        Returns:
+            string representing file contents
+        """
+        full_path = Path(self._root_path) / file_path
+
+        if full_path.exists():
+            return full_path.open(encoding="utf-8").read()
+
+        return None
+
+    def save(
+        self, file_path: Path, contents: str, overwrite: bool = False, **kwargs
+    ) -> Path:
+        """
+        Function to save data in the given location.
+
+        Args:
+            file_path: file destination path
+            contents: file contents
+            overwrite: boolean indicating file can be overwritten
+        Returns:
+            path to the materialized file
+        """
+
+        full_path = Path(self._root_path) / file_path
+
+        if overwrite is False and full_path.exists():
+            raise FileExistsError()
+
+        os.makedirs(os.path.dirname(full_path), exist_ok=True)
+        with full_path.open("w+", encoding="utf-8") as file:
+            file.write(contents)
+
+        return full_path
+
+
+class GitStorageService(LocalStorageService):
+    _instance = None
+
+    def __new__(cls, root_path: str, user: str, email: str, remote: str = "origin"):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self, root_path: str, user: str, email: str, remote: str = "origin"):
+        # prevent reinitialization
+        if getattr(self, "_initialized", False):
+            return
+
+        try:
+            self._root_path = root_path
+            self._remote = remote
+            self._repo = Repo(str(root_path))
+
+            with self._repo.config_writer() as git_config:
+                git_config.set_value("user", "email", email)
+                git_config.set_value("user", "name", user)
+
+            logging.info(f"✅ Initialized GitStorageService at {root_path}")
+        except (InvalidGitRepositoryError, NoSuchPathError) as err:
+            logging.error(f"❌ Git repo error: {err}")
+            raise err
+
+        self._initialized = True
+
+    def save(
+        self,
+        file_path: Path,
+        contents: str,
+        overwrite: bool = False,
+        fetch_latest: bool = True,
+        auto_commit: bool = True,
+        commit_msg: str = None,
+        **kwargs,
+    ) -> Path:
+        """
+        Function to save data in the given location.
+
+        Args:
+            file_path: path or paths to files
+            contents: file contents
+            fetch_latest: boolean indicating to fetch latest state before saving
+            overwrite: boolean indicating file can be overwritten
+            auto_commit: boolean indicating whether to auto-commit changes
+            commit_msg: commit message
+        Returns:
+            path to the materialized file
+        """
+        if fetch_latest:
+            self._repo.remote(self._remote).pull()
+
+        full_path = super().save(file_path, contents, overwrite)
+
+        if auto_commit and full_path is not None:
+            commit_msg = (
+                f"add {os.path.basename(full_path)}" if not commit_msg else commit_msg
+            )
+            self.commit_and_push([file_path], commit_msg)
+
+        return full_path
+
+    def commit_and_push(self, file_paths: list[Path], msg: str):
+        """
+        Helper function to commit and push the given file.
+        """
+        self._repo.git.add(file_paths)
+        self._repo.index.commit(msg)
+        push = self._repo.remote(self._remote).push()
+        push.raise_if_error()
+
+    @staticmethod
+    def get_instance() -> "GitStorageService":
+        if GitStorageService._instance is None:
+            raise RuntimeError("GitStorageService has not been initialized yet.")
+        return GitStorageService._instance
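
`LocalStorageService` is the plain-filesystem base that `GitStorageService` extends with pull/commit/push behaviour. A quick round trip against a temporary directory; the registry layout shown here is arbitrary:

```python
# Round trip through LocalStorageService using a temporary directory; the
# "datasets/demo/0.1.0" layout is arbitrary.
import tempfile
from pathlib import Path

from everycure.datasets.kedro.storage import LocalStorageService

with tempfile.TemporaryDirectory() as root:
    storage = LocalStorageService(root_path=root)
    storage.save(Path("datasets/demo/0.1.0/dataset.yaml"), "name: demo\n")
    print(storage.exists(Path("datasets/demo/0.1.0/dataset.yaml")))  # True
    print(storage.get(Path("datasets/demo/0.1.0/dataset.yaml")))     # name: demo
    print(storage.ls("datasets/*/*"))  # e.g. [PosixPath('datasets/demo/0.1.0')]
```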

everycure_datasets-0.2.33/src/everycure/datasets/models/v1.py
@@ -0,0 +1,194 @@
+"""Dataset Metadata v1 - Initial version matching the JSON schema."""
+
+from datetime import UTC, datetime
+from enum import Enum
+from typing import Any, Optional
+
+from pydantic import AnyUrl, BaseModel, Field, HttpUrl, field_validator
+
+
+class StorageType(str, Enum):
+    """Storage type for dataset location."""
+
+    GCS = "gcs"
+    S3 = "s3"
+    LOCAL = "local"
+    BIGQUERY = "bigquery"
+    POSTGRES = "postgres"
+
+
+class FileFormat(str, Enum):
+    """File format for dataset."""
+
+    TSV = "tsv"
+    CSV = "csv"
+    PARQUET = "parquet"
+    JSON = "json"
+    JSONL = "jsonl"
+    AVRO = "avro"
+    ORC = "orc"
+
+
+class DatasetStatus(str, Enum):
+    """Dataset status."""
+
+    ACTIVE = "active"
+    DEPRECATED = "deprecated"
+    ARCHIVED = "archived"
+
+
+class Location(BaseModel):
+    """Dataset location information."""
+
+    type: StorageType = Field(..., description="Storage type")
+    uri: AnyUrl = Field(..., description="Full URI to the dataset")
+    format: FileFormat = Field(..., description="File format")
+
+    model_config = {"extra": "forbid"}
+
+
+class Owner(BaseModel):
+    """Dataset owner information."""
+
+    name: str = Field(..., min_length=1, description="Owner name")
+    email: Optional[str] = Field(None, description="Owner email address")
+
+    model_config = {"extra": "forbid"}
+
+
+class Origin(BaseModel):
+    """Dataset origin information."""
+
+    system: str = Field(..., description="Pipeline or system name")
+    url: HttpUrl = Field(..., description="GitHub URL to source code")
+    commit: Optional[str] = Field(
+        None,
+        pattern=r"^[a-f0-9]{7,40}$",
+        description="Git commit hash (7-40 hex characters)",
+    )
+    tag: Optional[str] = Field(None, description="Git tag")
+
+    model_config = {"extra": "forbid"}
+
+
+class ColumnSchema(BaseModel):
+    """Schema definition for a single column."""
+
+    name: str = Field(..., description="Column name")
+    type: str = Field(..., description="Column data type")
+    description: Optional[str] = Field(None, description="Column description")
+
+    model_config = {"extra": "forbid"}
+
+
+class DatasetSchema(BaseModel):
+    """Dataset schema information."""
+
+    row_count: Optional[int] = Field(None, ge=0, description="Number of rows")
+    columns: Optional[list[ColumnSchema]] = Field(
+        None, description="List of column definitions"
+    )
+
+    model_config = {"extra": "forbid"}
+
+
+class DatasetMetadataV1(BaseModel):
+    """
+    Dataset Metadata v1.
+
+    This model represents the metadata for a dataset in the registry.
+    It matches the structure defined in dataset.schema.json.
+    """
+
+    # Schema version - tracks the version of this metadata definition itself
+    schema_version: str = Field(
+        default="1.0.0",
+        description="Version of the dataset metadata schema definition",
+    )
+    name: str = Field(
+        ...,
+        pattern=r"^[a-z][a-z0-9_]*$",
+        description="Dataset name in snake_case",
+    )
+    version: str = Field(
+        ...,
+        pattern=r"^\d+\.\d+\.\d+$",
+        description="Semantic version (e.g., 0.2.0)",
+    )
+    description: Optional[str] = Field(
+        None, min_length=10, description="Brief description of the dataset"
+    )
+    message: Optional[str] = Field(
+        None, description="Optional message about this dataset version"
+    )
+    location: Location = Field(..., description="Dataset location")
+    created_at: datetime = Field(
+        default_factory=lambda: datetime.now(UTC),
+        description="ISO 8601 timestamp",
+    )
+    owner: Owner = Field(..., description="Dataset owner")
+    origin: Origin = Field(..., description="Dataset origin")
+    status: DatasetStatus = Field(
+        default=DatasetStatus.ACTIVE, description="Dataset status"
+    )
+    lineage: Optional[dict[str, Any]] = Field(
+        default=None, description="Placeholder for future lineage tracking"
+    )
+    # Use a separate field name with a "schema" alias to avoid shadowing BaseModel.schema
+    dataset_schema: Optional[DatasetSchema] = Field(
+        default=None,
+        alias="schema",
+        description="Dataset schema information",
+    )
+    metadata: Optional[dict[str, Any]] = Field(
+        default=None, description="Additional metadata dictionary"
+    )
+    tags: Optional[list[str]] = Field(
+        default=None,
+        description="Tags for discoverability (lowercase with hyphens)",
+    )
+    related_docs: Optional[HttpUrl] = Field(
+        default=None, description="Link to documentation"
+    )
+    deprecated_by: Optional[str] = Field(
+        default=None, description="Version that replaces this dataset"
+    )
+    deprecation_date: Optional[datetime] = Field(
+        default=None, description="Date when dataset was deprecated"
+    )
+
+    @field_validator("description")
+    @classmethod
+    def validate_description(cls, v: Optional[str]) -> Optional[str]:
+        """Validate description length if provided."""
+        if v is not None and len(v) < 10:
+            raise ValueError("Description must be at least 10 characters long")
+        return v
+
+    @field_validator("tags")
+    @classmethod
+    def validate_tags(cls, v: Optional[list[str]]) -> Optional[list[str]]:
+        """Validate tag format."""
+        if v is None:
+            return v
+        for tag in v:
+            if not tag or not tag.replace("-", "").replace("_", "").isalnum():
+                raise ValueError(
+                    f"Tag '{tag}' must contain only lowercase alphanumeric characters and hyphens"
+                )
+            if tag != tag.lower():
+                raise ValueError(f"Tag '{tag}' must be lowercase")
+        # Ensure unique tags
+        if len(v) != len(set(v)):
+            raise ValueError("Tags must be unique")
+        return v
+
+    model_config = {
+        "extra": "forbid",
+        "json_schema_extra": {
+            "$schema": "http://json-schema.org/2020-12/schema#",
+            "$id": "https://everycure.org/schemas/dataset.v1.schema.json",
+            "title": "Dataset Metadata v1",
+            "description": "Schema for dataset registry metadata files (v1)",
+        },
+    }
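
A `dataset.yaml` document is validated against `DatasetMetadataV1`, with the `schema` key feeding the `dataset_schema` field through its alias. A minimal sketch with made-up values:

```python
# Minimal dataset.yaml-style document, with made-up values, validated against
# the v1 model; note the "schema" key mapping to the aliased dataset_schema field.
from everycure.datasets.models.v1 import DatasetMetadataV1

doc = {
    "name": "disease_list",
    "version": "0.2.0",
    "description": "Curated list of diseases.",
    "location": {
        "type": "gcs",
        "uri": "gs://example-bucket/disease_list/0.2.0/data.parquet",
        "format": "parquet",
    },
    "owner": {"name": "John Doe", "email": "john@example.com"},
    "origin": {"system": "example-pipeline", "url": "https://github.com/example-org/pipeline"},
    "schema": {"row_count": 123, "columns": [{"name": "id", "type": "string"}]},
}

metadata = DatasetMetadataV1.model_validate(doc)
print(metadata.dataset_schema.row_count)                            # 123
print("schema" in metadata.model_dump(mode="json", by_alias=True))  # True
```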

everycure_datasets-0.2.33/src/everycure/datasets/models/v2.py
@@ -0,0 +1,31 @@
+"""Dataset Metadata v2 - Refined version (placeholder for future improvements)."""
+
+from everycure.datasets.models.v1 import DatasetMetadataV1
+
+
+class DatasetMetadataV2(DatasetMetadataV1):
+    """
+    Dataset Metadata v2.
+
+    This is a placeholder for a future refined version of the dataset metadata model.
+    When v2 is implemented, it will include improvements and refinements over v1.
+
+    For now, this class inherits from v1 to maintain compatibility.
+    """
+
+    # TODO: Add v2-specific fields and improvements here
+    # Examples of potential improvements:
+    # - Better validation
+    # - Additional metadata fields
+    # - Improved lineage tracking
+    # - Enhanced schema definitions
+
+    model_config = {
+        "extra": "forbid",
+        "json_schema_extra": {
+            "$schema": "http://json-schema.org/2020-12/schema#",
+            "$id": "https://everycure.org/schemas/dataset.v2.schema.json",
+            "title": "Dataset Metadata v2",
+            "description": "Schema for dataset registry metadata files (v2)",
+        },
+    }

everycure_datasets-0.2.33/src/everycure/datasets/validate.py
@@ -0,0 +1,161 @@
+"""
+Pre-commit validation script for dataset registry.
+
+Checks:
+1. All version directories follow semantic versioning (MAJOR.MINOR.PATCH)
+2. Dataset folder names are snake_case
+3. No files are edited in datasets/ on main branch (immutability check)
+"""
+
+import yaml
+import re
+import sys
+from pathlib import Path
+from pydantic import ValidationError
+
+from logging import getLogger
+
+from everycure.datasets.models.v1 import DatasetMetadataV1
+
+logger = getLogger(__name__)
+
+# Patterns
+SNAKE_CASE_PATTERN = re.compile(r"^[a-z][a-z0-9_]*$")
+SEMVER_PATTERN = re.compile(r"^\d+\.\d+\.\d+$")
+
+
+def get_dataset_directories(datasets_dir: Path) -> list[Path]:
+    """Get all dataset directories that don't start with '.' or '_'."""
+    return [
+        item
+        for item in datasets_dir.iterdir()
+        if item.is_dir()
+        and not item.name.startswith(".")
+        and not item.name.startswith("_")
+    ]
+
+
+def get_version_directories(dataset_dir: Path) -> list[Path]:
+    """Get all valid version directories in a dataset directory."""
+    versions = []
+    for item in dataset_dir.iterdir():
+        if item.is_dir() and SEMVER_PATTERN.match(item.name):
+            versions.append(item)
+    return versions
+
+
+def get_dataset_specification_files(datasets_dir: Path) -> list[Path]:
+    """Get all dataset specification files in a dataset directory."""
+    files = []
+
+    for dataset_dir in datasets_dir.iterdir():
+        for versioned_dataset in get_version_directories(dataset_dir):
+            if (versioned_dataset / "dataset.yaml").is_file():
+                files.append(versioned_dataset / "dataset.yaml")
+    return files
+
+
+def check_snake_case_names(datasets_dir: Path) -> list[str]:
+    """Check that all dataset names are snake_case."""
+    errors = []
+
+    for item in get_dataset_directories(datasets_dir):
+        if not SNAKE_CASE_PATTERN.match(item.name):
+            errors.append(
+                f"Dataset name '{item.name}' is not snake_case. "
+                f"Use lowercase letters, numbers, and underscores only."
+            )
+
+    return errors
+
+
+def check_semver_directories(datasets_dir: Path) -> list[str]:
+    """Check that all version directories follow semantic versioning."""
+    errors = []
+
+    for dataset in get_dataset_directories(datasets_dir):
+        for item in dataset.iterdir():
+            # Skip hidden files
+            if item.name.startswith("."):
+                continue
+
+            if item.is_dir() and not SEMVER_PATTERN.match(item.name):
+                errors.append(
+                    f"Version directory '{dataset.name}/{item.name}' does not follow "
+                    f"semantic versioning (MAJOR.MINOR.PATCH). Example: 0.1.0"
+                )
+
+    return errors
+
+
+def check_schema_validity(datasets_dir: Path) -> list[str]:
+    """Check that all schema files are valid."""
+    errors = []
+
+    for dataset_file in get_dataset_specification_files(datasets_dir):
+        dataset_yaml = yaml.safe_load(dataset_file.read_text())
+        try:
+            DatasetMetadataV1.model_validate(dataset_yaml)
+        except ValidationError as e:
+            print(f"Schema file '{dataset_file.relative_to(datasets_dir)}' is not valid: {e}")
+            errors.append(f"Schema file '{dataset_file.relative_to(datasets_dir)}' is not valid.")
+
+    return errors
+
+def _find_repo_root() -> Path:
+    """Find the repository root by walking up from current directory or file location."""
+    # Start from current working directory
+    current = Path.cwd()
+
+    # Walk up looking for pyproject.toml (repo marker)
+    for path in [current, *current.parents]:
+        if (path / "pyproject.toml").exists() and (path / "datasets").exists():
+            return path
+
+    # Fallback: use file location (we're in src/everycure/datasets/validate.py)
+    return Path(__file__).parent.parent.parent.parent
+
+
+def validate_datasets(datasets_dir: Path | None = None) -> int:
+    """
+    Run all validation checks.
+
+    Args:
+        datasets_dir: Path to the datasets directory. If None, will try to find it
+            relative to the current working directory or repository root.
+
+    Returns:
+        0 if validation passes, 1 if it fails.
+    """
+    if datasets_dir is None:
+        repo_root = _find_repo_root()
+        datasets_dir = repo_root / "datasets"
+
+    if not datasets_dir.exists():
+        print(
+            f"Error: datasets/ directory not found at {datasets_dir}", file=sys.stderr
+        )
+        return 1
+
+    all_errors = []
+
+    # Run all checks
+    print("Checking dataset naming conventions...")
+    all_errors.extend(check_snake_case_names(datasets_dir))
+
+    print("Checking semantic versioning...")
+    all_errors.extend(check_semver_directories(datasets_dir))
+
+    print("Checking schema validity...")
+    all_errors.extend(check_schema_validity(datasets_dir))
+
+    # Report results
+    if all_errors:
+        print("\n❌ Validation failed with the following errors:\n", file=sys.stderr)
+        for error in all_errors:
+            print(f"  {error}", file=sys.stderr)
+        print(f"\nTotal errors: {len(all_errors)}", file=sys.stderr)
+        return 1
+    else:
+        print("\n✅ All validation checks passed!")
+        return 0
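
The validator expects a `datasets/<name>/<MAJOR.MINOR.PATCH>/dataset.yaml` layout. A self-contained sketch that builds a toy registry in a temporary directory and runs the checks; every dataset name and metadata value below is made up:

```python
# Build a toy registry layout in a temporary directory and run the checks on it;
# all names and metadata values are made up.
import tempfile
from pathlib import Path

import yaml

from everycure.datasets.validate import validate_datasets

with tempfile.TemporaryDirectory() as tmp:
    version_dir = Path(tmp) / "datasets" / "disease_list" / "0.1.0"
    version_dir.mkdir(parents=True)
    metadata = {
        "name": "disease_list",
        "version": "0.1.0",
        "location": {
            "type": "local",
            "uri": "file:///tmp/disease_list/0.1.0/data.parquet",
            "format": "parquet",
        },
        "owner": {"name": "John Doe"},
        "origin": {"system": "example-pipeline", "url": "https://github.com/example-org/pipeline"},
    }
    (version_dir / "dataset.yaml").write_text(yaml.safe_dump(metadata))
    print(validate_datasets(Path(tmp) / "datasets"))  # 0 when every check passes
```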