everycure_datasets-0.1.1.tar.gz

--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,28 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *.pyc
+
+ # Virtual environments
+ .venv/
+ venv/
+
+ # uv
+ .uv/
+
+ # Build artifacts
+ dist/
+ build/
+ *.egg-info/
+
+ # IDE (keep .vscode/settings.json for schema validation)
+ .vscode/*
+ !.vscode/settings.json
+ .idea/
+
+ # Environment variables
+ .env
+ .env.local
+
+ # OS
+ .DS_Store
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,7 @@
+ Metadata-Version: 2.4
+ Name: everycure-datasets
+ Version: 0.1.1
+ Requires-Python: >=3.12
+ Requires-Dist: pre-commit>=4.3.0
+ Requires-Dist: pydantic>=2.0.0
+ Requires-Dist: typer>=0.12.0
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,38 @@
+ [project]
+ name = "everycure-datasets"
+ version = "0.1.1"
+ requires-python = ">=3.12"
+ dependencies = [
+     "pre-commit>=4.3.0",
+     "pydantic>=2.0.0",
+     "typer>=0.12.0",
+ ]
+ scripts = {datasets = "everycure.datasets.cli:app"}
+
+ [tool.hatch.build]
+ include = ["src/**"]
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/everycure"]
+
+ [tool.uv]
+ keyring-provider = "subprocess"
+
+ [[tool.uv.index]]
+ name = "pypi"
+ url = "https://pypi.org/simple"
+
+ [[tool.uv.index]]
+ name = "everycure"
+ url = "https://oauth2accesstoken@us-central1-python.pkg.dev/core-422020/everycure/simple/"
+ publish-url = "https://us-central1-python.pkg.dev/core-422020/everycure/"
+ default = false
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [dependency-groups]
+ dev = [
+     "keyrings-google-artifactregistry-auth>=1.1.2",
+ ]
--- /dev/null
+++ b/src/everycure/__init__.py
@@ -0,0 +1,3 @@
+ """Every Cure namespace package."""
+
+ __path__ = __import__("pkgutil").extend_path(__path__, __name__)
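
The `pkgutil.extend_path` line makes `everycure` a pkgutil-style namespace package, so other distributions can ship their own `everycure.*` subpackages alongside this one. A minimal check of the effect, assuming the package is installed (any sibling `everycure.*` distribution here is hypothetical):

```python
# Each installed distribution that provides an everycure/ package with the
# same extend_path line contributes an entry to everycure.__path__, letting
# everycure.datasets coexist with a hypothetical everycure.other.
import everycure

print(everycure.__path__)
```
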
--- /dev/null
+++ b/src/everycure/datasets/__init__.py
@@ -0,0 +1,6 @@
+ """Dataset Registry - Centralized dataset metadata management."""
+
+ from everycure.datasets.models.v1 import DatasetMetadataV1
+ from everycure.datasets.models.v2 import DatasetMetadataV2
+
+ __all__ = ["DatasetMetadataV1", "DatasetMetadataV2"]
--- /dev/null
+++ b/src/everycure/datasets/cli.py
@@ -0,0 +1,67 @@
+ """CLI for datasets registry operations."""
+
+ from pathlib import Path
+
+ import typer
+
+ from everycure.datasets.generate_schema import generate_all_schemas
+ from everycure.datasets.validate import validate_datasets
+
+ app = typer.Typer(
+     name="datasets",
+     help="Datasets registry management CLI",
+     add_completion=False,
+     no_args_is_help=True,
+ )
+
+ schema_app = typer.Typer(
+     name="schema",
+     help="Schema management commands",
+     add_completion=False,
+ )
+
+ app.add_typer(schema_app)
+
+
+ @app.callback(invoke_without_command=True)
+ def main(ctx: typer.Context) -> None:
+     """Datasets registry management CLI."""
+     if ctx.invoked_subcommand is None:
+         typer.echo(ctx.get_help())
+
+
+ @app.command()
+ def validate(
+     datasets_dir: Path | None = typer.Option(
+         None,
+         "--datasets-dir",
+         "-d",
+         help="Path to the datasets directory (default: auto-detect)",
+     ),
+ ) -> None:
+     """
+     Validate dataset YAML files and directory structure.
+
+     Checks:
+     - Dataset names are snake_case
+     - Version directories follow semantic versioning (MAJOR.MINOR.PATCH)
+     """
+     # Typer ignores return values, so surface a failure as a non-zero exit code.
+     exit_code = validate_datasets(datasets_dir)
+     if exit_code != 0:
+         raise typer.Exit(code=exit_code)
+
+
+ @schema_app.command()
+ def generate() -> None:
+     """
+     Generate JSON schema(s) from the Pydantic models.
+     """
+     repo_root = Path.cwd()
+     schema_dir = repo_root / ".schema"
+     generate_all_schemas(schema_dir)
+     typer.echo(f"✓ Generated all JSON schemas in {schema_dir}")
+
+
+ if __name__ == "__main__":
+     app()
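
A quick way to exercise the CLI without installing the console script is Typer's test runner; this is a sketch, and the `--datasets-dir` value is illustrative:

```python
from typer.testing import CliRunner

from everycure.datasets.cli import app

runner = CliRunner()

help_result = runner.invoke(app, ["--help"])
assert help_result.exit_code == 0

validate_result = runner.invoke(app, ["validate", "--datasets-dir", "datasets"])
print(validate_result.exit_code)  # 0 when the registry layout is valid
print(validate_result.output)
```

Because `validate` raises `typer.Exit` with the validator's return code, a broken registry layout surfaces as a non-zero `exit_code`, which is what a pre-commit hook needs.
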
--- /dev/null
+++ b/src/everycure/datasets/generate_schema.py
@@ -0,0 +1,66 @@
+ """Generate JSON schema from Pydantic models."""
+
+ import json
+ from pathlib import Path
+
+ from pydantic import BaseModel
+
+ from everycure.datasets.models.v1 import DatasetMetadataV1
+ from everycure.datasets.models.v2 import DatasetMetadataV2
+
+
+ def generate_schema(
+     model_class: type[BaseModel], output_path: Path, schema_id: str
+ ) -> None:
+     """
+     Generate JSON schema from a Pydantic model.
+
+     Args:
+         model_class: The Pydantic model class to generate schema from
+         output_path: Path where the JSON schema should be written
+         schema_id: The $id for the schema (FQDN)
+     """
+     # Get the JSON schema from the Pydantic model
+     schema = model_class.model_json_schema(
+         mode="serialization",
+         by_alias=True,
+     )
+
+     # Update the $id to use the provided FQDN
+     schema["$id"] = schema_id
+
+     # Remove fields with defaults from the required list
+     # but keep them in properties so they're documented
+     fields_with_defaults = ["schema_version", "status", "created_at"]
+     if "required" in schema:
+         for field in fields_with_defaults:
+             if field in schema["required"]:
+                 schema["required"].remove(field)
+
+     # Write to file with proper formatting
+     with open(output_path, "w") as f:
+         json.dump(schema, f, indent=2)
+         f.write("\n")  # Add trailing newline
+
+     print(f"Generated JSON schema at {output_path}")
+
+
+ def generate_all_schemas(schema_dir: Path) -> None:
+     """Generate all schema versions."""
+     schema_dir.mkdir(parents=True, exist_ok=True)
+
+     # Generate v1 schema
+     v1_path = schema_dir / "dataset.v1.schema.json"
+     generate_schema(
+         DatasetMetadataV1,
+         v1_path,
+         "https://everycure.org/schemas/dataset.v1.schema.json",
+     )
+
+     # Generate v2 schema
+     v2_path = schema_dir / "dataset.v2.schema.json"
+     generate_schema(
+         DatasetMetadataV2,
+         v2_path,
+         "https://everycure.org/schemas/dataset.v2.schema.json",
+     )
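
A sketch of the generation round trip into a throwaway directory (file names match `generate_all_schemas` above; the printed fields depend on the models):

```python
import json
import tempfile
from pathlib import Path

from everycure.datasets.generate_schema import generate_all_schemas

with tempfile.TemporaryDirectory() as tmp:
    schema_dir = Path(tmp)
    generate_all_schemas(schema_dir)
    v1 = json.loads((schema_dir / "dataset.v1.schema.json").read_text())
    print(v1["$id"])                       # the FQDN set by generate_schema
    print(sorted(v1.get("required", [])))  # defaulted fields have been removed
```
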
--- /dev/null
+++ b/src/everycure/datasets/models/__init__.py
@@ -0,0 +1,6 @@
+ """Dataset metadata models."""
+
+ from everycure.datasets.models.v1 import DatasetMetadataV1
+ from everycure.datasets.models.v2 import DatasetMetadataV2
+
+ __all__ = ["DatasetMetadataV1", "DatasetMetadataV2"]
--- /dev/null
+++ b/src/everycure/datasets/models/v1.py
@@ -0,0 +1,195 @@
+ """Dataset Metadata v1 - Initial version matching the JSON schema."""
+
+ from datetime import UTC, datetime
+ from enum import Enum
+ from typing import Any, Optional
+
+ from pydantic import AnyUrl, BaseModel, Field, HttpUrl, field_validator
+
+
+ class StorageType(str, Enum):
+     """Storage type for dataset location."""
+
+     GCS = "gcs"
+     S3 = "s3"
+     LOCAL = "local"
+     BIGQUERY = "bigquery"
+     POSTGRES = "postgres"
+
+
+ class FileFormat(str, Enum):
+     """File format for dataset."""
+
+     TSV = "tsv"
+     CSV = "csv"
+     PARQUET = "parquet"
+     JSON = "json"
+     JSONL = "jsonl"
+     AVRO = "avro"
+     ORC = "orc"
+
+
+ class DatasetStatus(str, Enum):
+     """Dataset status."""
+
+     ACTIVE = "active"
+     DEPRECATED = "deprecated"
+     ARCHIVED = "archived"
+
+
+ class Location(BaseModel):
+     """Dataset location information."""
+
+     type: StorageType = Field(..., description="Storage type")
+     # AnyUrl rather than HttpUrl so non-HTTP URIs (gs://, s3://) validate
+     uri: AnyUrl = Field(..., description="Full URI to the dataset")
+     format: FileFormat = Field(..., description="File format")
+
+     model_config = {"extra": "forbid"}
+
+
+ class Owner(BaseModel):
+     """Dataset owner information."""
+
+     name: str = Field(..., min_length=1, description="Owner name")
+     email: Optional[str] = Field(None, description="Owner email address")
+
+     model_config = {"extra": "forbid"}
+
+
+ class Origin(BaseModel):
+     """Dataset origin information."""
+
+     system: str = Field(..., description="Pipeline or system name")
+     url: HttpUrl = Field(..., description="GitHub URL to source code")
+     commit: str = Field(
+         ...,
+         pattern=r"^[a-f0-9]{7,40}$",
+         description="Git commit hash (7-40 hex characters)",
+     )
+
+     model_config = {"extra": "forbid"}
+
+
+ class ColumnSchema(BaseModel):
+     """Schema definition for a single column."""
+
+     name: str = Field(..., description="Column name")
+     type: str = Field(..., description="Column data type")
+     description: Optional[str] = Field(None, description="Column description")
+
+     model_config = {"extra": "forbid"}
+
+
+ class DatasetSchema(BaseModel):
+     """Dataset schema information."""
+
+     row_count: Optional[int] = Field(None, ge=0, description="Number of rows")
+     columns: Optional[list[ColumnSchema]] = Field(
+         None, description="List of column definitions"
+     )
+
+     model_config = {"extra": "forbid"}
+
+
+ class DatasetMetadataV1(BaseModel):
+     """
+     Dataset Metadata v1.
+
+     This model represents the metadata for a dataset in the registry.
+     It matches the structure defined in dataset.schema.json.
+     """
+
+     # Schema version - tracks the version of this metadata definition itself
+     schema_version: str = Field(
+         default="1.0.0",
+         description="Version of the dataset metadata schema definition",
+     )
+     name: str = Field(
+         ...,
+         pattern=r"^[a-z][a-z0-9_]*$",
+         description="Dataset name in snake_case",
+     )
+     version: str = Field(
+         ...,
+         pattern=r"^\d+\.\d+\.\d+$",
+         description="Semantic version (e.g., 0.2.0)",
+     )
+     description: Optional[str] = Field(
+         None, min_length=10, description="Brief description of the dataset"
+     )
+     message: Optional[str] = Field(
+         None, description="Optional message about this dataset version"
+     )
+     location: Location = Field(..., description="Dataset location")
+     created_at: datetime = Field(
+         default_factory=lambda: datetime.now(UTC),
+         description="ISO 8601 timestamp",
+     )
+     owner: Owner = Field(..., description="Dataset owner")
+     origin: Origin = Field(..., description="Dataset origin")
+     status: DatasetStatus = Field(
+         default=DatasetStatus.ACTIVE, description="Dataset status"
+     )
+     lineage: Optional[dict[str, Any]] = Field(
+         default=None, description="Placeholder for future lineage tracking"
+     )
+     # Use dataset_schema with alias "schema" to avoid shadowing BaseModel.schema
+     dataset_schema: Optional[DatasetSchema] = Field(
+         default=None,
+         alias="schema",
+         description="Dataset schema information",
+     )
+     metadata: Optional[dict[str, Any]] = Field(
+         default=None, description="Additional metadata dictionary"
+     )
+     tags: Optional[list[str]] = Field(
+         default=None,
+         description="Tags for discoverability (lowercase with hyphens)",
+     )
+     related_docs: Optional[HttpUrl] = Field(
+         default=None, description="Link to documentation"
+     )
+     deprecated_by: Optional[str] = Field(
+         default=None, description="Version that replaces this dataset"
+     )
+     deprecation_date: Optional[datetime] = Field(
+         default=None, description="Date when dataset was deprecated"
+     )
+
+     @field_validator("description")
+     @classmethod
+     def validate_description(cls, v: Optional[str]) -> Optional[str]:
+         """Validate description length if provided."""
+         if v is not None and len(v) < 10:
+             raise ValueError("Description must be at least 10 characters long")
+         return v
+
+     @field_validator("tags")
+     @classmethod
+     def validate_tags(cls, v: Optional[list[str]]) -> Optional[list[str]]:
+         """Validate tag format."""
+         if v is None:
+             return v
+         for tag in v:
+             if not tag or not tag.replace("-", "").replace("_", "").isalnum():
+                 raise ValueError(
+                     f"Tag '{tag}' must contain only lowercase alphanumeric "
+                     f"characters, hyphens, and underscores"
+                 )
+             if tag != tag.lower():
+                 raise ValueError(f"Tag '{tag}' must be lowercase")
+         # Ensure unique tags
+         if len(v) != len(set(v)):
+             raise ValueError("Tags must be unique")
+         return v
+
+     model_config = {
+         "extra": "forbid",
+         "json_schema_extra": {
+             "$schema": "https://json-schema.org/draft/2020-12/schema",
+             "$id": "https://everycure.org/schemas/dataset.v1.schema.json",
+             "title": "Dataset Metadata v1",
+             "description": "Schema for dataset registry metadata files (v1)",
+         },
+     }
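
A hedged usage sketch for the model: validating a mapping shaped like a dataset YAML entry (all values below are illustrative, not taken from a real registry):

```python
from everycure.datasets.models.v1 import DatasetMetadataV1

raw = {
    "name": "example_dataset",
    "version": "0.1.0",
    "description": "An illustrative dataset entry.",
    "location": {
        "type": "gcs",
        "uri": "gs://example-bucket/data.parquet",
        "format": "parquet",
    },
    "owner": {"name": "Data Team"},
    "origin": {
        "system": "example-pipeline",
        "url": "https://github.com/example/repo",
        "commit": "abc1234",
    },
    "schema": {"row_count": 100},  # fills dataset_schema via its alias
}

meta = DatasetMetadataV1.model_validate(raw)
print(meta.status)      # DatasetStatus.ACTIVE (default)
print(meta.created_at)  # populated by the default factory
```

Note that `extra="forbid"` means any unknown key in the mapping raises a `ValidationError`.
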
--- /dev/null
+++ b/src/everycure/datasets/models/v2.py
@@ -0,0 +1,31 @@
+ """Dataset Metadata v2 - Refined version (placeholder for future improvements)."""
+
+ from everycure.datasets.models.v1 import DatasetMetadataV1
+
+
+ class DatasetMetadataV2(DatasetMetadataV1):
+     """
+     Dataset Metadata v2.
+
+     This is a placeholder for a future refined version of the dataset metadata model.
+     When v2 is implemented, it will include improvements and refinements over v1.
+
+     For now, this class inherits from v1 to maintain compatibility.
+     """
+
+     # TODO: Add v2-specific fields and improvements here
+     # Examples of potential improvements:
+     # - Better validation
+     # - Additional metadata fields
+     # - Improved lineage tracking
+     # - Enhanced schema definitions
+
+     model_config = {
+         "extra": "forbid",
+         "json_schema_extra": {
+             "$schema": "https://json-schema.org/draft/2020-12/schema",
+             "$id": "https://everycure.org/schemas/dataset.v2.schema.json",
+             "title": "Dataset Metadata v2",
+             "description": "Schema for dataset registry metadata files (v2)",
+         },
+     }
--- /dev/null
+++ b/src/everycure/datasets/validate.py
@@ -0,0 +1,125 @@
+ """
+ Pre-commit validation script for dataset registry.
+
+ Checks:
+ 1. All version directories follow semantic versioning (MAJOR.MINOR.PATCH)
+ 2. Dataset folder names are snake_case
+ 3. No files are edited in datasets/ on main branch (immutability check)
+ """
+
+ import re
+ import sys
+ from pathlib import Path
+
+ # Patterns
+ SNAKE_CASE_PATTERN = re.compile(r"^[a-z][a-z0-9_]*$")
+ SEMVER_PATTERN = re.compile(r"^\d+\.\d+\.\d+$")
+
+
+ def get_dataset_directories(datasets_dir: Path) -> list[Path]:
+     """Get all dataset directories that don't start with '.' or '_'."""
+     return [
+         item
+         for item in datasets_dir.iterdir()
+         if item.is_dir()
+         and not item.name.startswith(".")
+         and not item.name.startswith("_")
+     ]
+
+
+ def get_version_directories(dataset_dir: Path) -> list[Path]:
+     """Get all valid version directories in a dataset directory."""
+     versions = []
+     for item in dataset_dir.iterdir():
+         if item.is_dir() and SEMVER_PATTERN.match(item.name):
+             versions.append(item)
+     return versions
+
+
+ def check_snake_case_names(datasets_dir: Path) -> list[str]:
+     """Check that all dataset names are snake_case."""
+     errors = []
+
+     for item in get_dataset_directories(datasets_dir):
+         if not SNAKE_CASE_PATTERN.match(item.name):
+             errors.append(
+                 f"Dataset name '{item.name}' is not snake_case. "
+                 f"Use lowercase letters, numbers, and underscores only."
+             )
+
+     return errors
+
+
+ def check_semver_directories(datasets_dir: Path) -> list[str]:
+     """Check that all version directories follow semantic versioning."""
+     errors = []
+
+     for dataset in get_dataset_directories(datasets_dir):
+         for item in dataset.iterdir():
+             # Skip hidden files
+             if item.name.startswith("."):
+                 continue
+
+             if item.is_dir() and not SEMVER_PATTERN.match(item.name):
+                 errors.append(
+                     f"Version directory '{dataset.name}/{item.name}' does not follow "
+                     f"semantic versioning (MAJOR.MINOR.PATCH). Example: 0.1.0"
+                 )
+
+     return errors
+
+
+ def _find_repo_root() -> Path:
+     """Find the repository root by walking up from current directory or file location."""
+     # Start from current working directory
+     current = Path.cwd()
+
+     # Walk up looking for pyproject.toml (repo marker)
+     for path in [current, *current.parents]:
+         if (path / "pyproject.toml").exists() and (path / "datasets").exists():
+             return path
+
+     # Fallback: use file location (we're in src/everycure/datasets/validate.py)
+     return Path(__file__).parent.parent.parent.parent
+
+
+ def validate_datasets(datasets_dir: Path | None = None) -> int:
+     """
+     Run all validation checks.
+
+     Args:
+         datasets_dir: Path to the datasets directory. If None, will try to find it
+             relative to the current working directory or repository root.
+
+     Returns:
+         0 if validation passes, 1 if it fails.
+     """
+     if datasets_dir is None:
+         repo_root = _find_repo_root()
+         datasets_dir = repo_root / "datasets"
+
+     if not datasets_dir.exists():
+         print(
+             f"Error: datasets/ directory not found at {datasets_dir}", file=sys.stderr
+         )
+         return 1
+
+     all_errors = []
+
+     # Run all checks
+     print("Checking dataset naming conventions...")
+     all_errors.extend(check_snake_case_names(datasets_dir))
+
+     print("Checking semantic versioning...")
+     all_errors.extend(check_semver_directories(datasets_dir))
+
+     # Report results
+     if all_errors:
+         print("\n❌ Validation failed with the following errors:\n", file=sys.stderr)
+         for error in all_errors:
+             print(f"  {error}", file=sys.stderr)
+         print(f"\nTotal errors: {len(all_errors)}", file=sys.stderr)
+         return 1
+     else:
+         print("\n✅ All validation checks passed!")
+         return 0
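
A self-contained sketch of the validator against a throwaway registry layout (directory names are illustrative): `My-Dataset` trips the snake_case check and `v1` trips the semver check, so the call reports both errors and returns 1:

```python
import tempfile
from pathlib import Path

from everycure.datasets.validate import validate_datasets

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp) / "datasets"
    (root / "good_dataset" / "0.1.0").mkdir(parents=True)
    (root / "My-Dataset" / "v1").mkdir(parents=True)
    print(validate_datasets(root))  # prints both errors, then 1
```
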