sunstone-py 0.5.2__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sunstone/cli.py +542 -0
- sunstone/dataframe.py +16 -89
- sunstone/datasets.py +78 -17
- sunstone/lineage.py +58 -29
- {sunstone_py-0.5.2.dist-info → sunstone_py-0.6.0.dist-info}/METADATA +4 -2
- sunstone_py-0.6.0.dist-info/RECORD +16 -0
- {sunstone_py-0.5.2.dist-info → sunstone_py-0.6.0.dist-info}/WHEEL +1 -1
- {sunstone_py-0.5.2.dist-info → sunstone_py-0.6.0.dist-info}/entry_points.txt +1 -0
- sunstone_py-0.5.2.dist-info/RECORD +0 -15
- {sunstone_py-0.5.2.dist-info → sunstone_py-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {sunstone_py-0.5.2.dist-info → sunstone_py-0.6.0.dist-info}/top_level.txt +0 -0
sunstone/cli.py
ADDED
@@ -0,0 +1,542 @@
+"""
+Sunstone command-line interface.
+"""
+
+import json
+import os
+import re
+import sys
+import tomllib
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+
+import click
+from click.shell_completion import CompletionItem
+from ruamel.yaml import YAML
+
+from .datasets import DatasetsManager
+from .exceptions import DatasetNotFoundError
+
+# Configure ruamel.yaml for round-trip parsing
+_yaml = YAML()
+_yaml.preserve_quotes = True
+_yaml.default_flow_style = False
+_yaml.indent(mapping=2, sequence=4, offset=2)
+
+# Valid field types
+VALID_FIELD_TYPES = {"string", "number", "integer", "boolean", "date", "datetime"}
+
+# Pattern for ${VAR} or ${VAR:-default} substitution
+ENV_VAR_PATTERN = re.compile(r"\$\{([^}:]+)(?::-([^}]*))?\}")
+
+
+def get_project_slug(project_path: Path) -> str:
+    """
+    Get the project slug from pyproject.toml or directory name.
+
+    Args:
+        project_path: Path to the project directory.
+
+    Returns:
+        The project slug (kebab-case identifier).
+    """
+    pyproject_path = project_path / "pyproject.toml"
+    if pyproject_path.exists():
+        try:
+            with open(pyproject_path, "rb") as f:
+                pyproject = tomllib.load(f)
+            name = pyproject.get("project", {}).get("name")
+            if isinstance(name, str):
+                return name
+        except Exception:
+            pass
+    return project_path.name
+
+
+def expand_env_vars(text: str) -> str:
+    """
+    Expand environment variables in text using ${VAR} or ${VAR:-default} syntax.
+
+    Args:
+        text: The text containing environment variable references.
+
+    Returns:
+        The text with environment variables expanded.
+    """
+
+    def replace_var(match: re.Match[str]) -> str:
+        var_name = match.group(1)
+        default_value = match.group(2)
+        value = os.environ.get(var_name)
+        if value is not None:
+            return value
+        if default_value is not None:
+            return default_value
+        return match.group(0)  # Return original if no value and no default
+
+    return ENV_VAR_PATTERN.sub(replace_var, text)
+
+
+def get_manager(datasets_file: str) -> tuple[DatasetsManager, Path]:
+    """Get DatasetsManager and project path from datasets file."""
+    datasets_path = Path(datasets_file).resolve()
+    project_path = datasets_path.parent
+    manager = DatasetsManager(project_path)
+    return manager, project_path
+
+
+def complete_dataset_slugs(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[CompletionItem]:
+    """Shell completion for dataset slugs."""
+    # Get the datasets file from context or use default
+    datasets_file = ctx.params.get("datasets_file", "datasets.yaml")
+
+    try:
+        manager, _ = get_manager(datasets_file)
+        all_datasets = manager.get_all_inputs() + manager.get_all_outputs()
+        slugs = [ds.slug for ds in all_datasets]
+
+        return [CompletionItem(slug) for slug in slugs if slug.startswith(incomplete)]
+    except Exception:
+        return []
+
+
+# =============================================================================
+# Main CLI group
+# =============================================================================
+
+
+@click.group()
+@click.version_option()
+def main() -> None:
+    """Sunstone dataset and package management CLI."""
+    pass
+
+
+# =============================================================================
+# Dataset commands
+# =============================================================================
+
+
+@main.group()
+def dataset() -> None:
+    """Manage datasets in datasets.yaml."""
+    pass
+
+
+@dataset.command("list")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+def dataset_list(datasets_file: str) -> None:
+    """List all datasets."""
+    try:
+        manager, _ = get_manager(datasets_file)
+    except FileNotFoundError as e:
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+
+    inputs = manager.get_all_inputs()
+    outputs = manager.get_all_outputs()
+
+    if inputs:
+        click.echo("Inputs:")
+        for ds in inputs:
+            flags = []
+            if ds.strict:
+                flags.append("strict")
+            flag_str = f" [{', '.join(flags)}]" if flags else ""
+            click.echo(f" - {ds.slug} ({ds.name}){flag_str}")
+
+    if outputs:
+        if inputs:
+            click.echo()
+        click.echo("Outputs:")
+        for ds in outputs:
+            flags = []
+            if ds.is_publishable:
+                flags.append("publish")
+            if ds.strict:
+                flags.append("strict")
+            flag_str = f" [{', '.join(flags)}]" if flags else ""
+            click.echo(f" - {ds.slug} ({ds.name}){flag_str}")
+
+    if not inputs and not outputs:
+        click.echo("No datasets found.")
+
+
+@dataset.command("validate")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+@click.argument("datasets", nargs=-1, shell_complete=complete_dataset_slugs)
+def dataset_validate(datasets_file: str, datasets: tuple[str, ...]) -> None:
+    """Validate datasets.
+
+    If no datasets are specified, validates all datasets.
+    """
+    datasets_path = Path(datasets_file).resolve()
+
+    errors: list[str] = []
+
+    # Load and parse YAML
+    try:
+        with open(datasets_path, "r") as f:
+            data = _yaml.load(f)
+    except Exception as e:
+        click.echo(f"Error: Failed to parse YAML: {e}", err=True)
+        sys.exit(1)
+
+    if data is None:
+        data = {}
+
+    # Check structure
+    if "inputs" not in data and "outputs" not in data:
+        errors.append("datasets.yaml must contain 'inputs' and/or 'outputs' lists")
+
+    # Track slugs for duplicate detection
+    all_slugs: dict[str, str] = {}  # slug -> type
+    datasets_to_validate = set(datasets) if datasets else None
+
+    def validate_dataset_entry(ds: dict, ds_type: str, index: int) -> None:
+        prefix = f"{ds_type}[{index}]"
+        slug = ds.get("slug")
+
+        # Skip if specific datasets requested and this isn't one of them
+        if datasets_to_validate and slug not in datasets_to_validate:
+            # Still track slug for duplicate detection
+            if slug:
+                all_slugs[slug] = ds_type
+            return
+
+        # Required fields
+        for field in ["name", "slug", "location", "fields"]:
+            if field not in ds:
+                errors.append(f"{prefix}: missing required field '{field}'")
+
+        # Check slug
+        if slug:
+            if slug in all_slugs:
+                errors.append(f"{prefix}: duplicate slug '{slug}' (also in {all_slugs[slug]})")
+            else:
+                all_slugs[slug] = ds_type
+
+        # Check fields
+        fields = ds.get("fields", [])
+        if not isinstance(fields, list):
+            errors.append(f"{prefix}: 'fields' must be a list")
+        else:
+            for i, field in enumerate(fields):
+                if not isinstance(field, dict):
+                    errors.append(f"{prefix}.fields[{i}]: must be an object")
+                    continue
+                if "name" not in field:
+                    errors.append(f"{prefix}.fields[{i}]: missing 'name'")
+                if "type" not in field:
+                    errors.append(f"{prefix}.fields[{i}]: missing 'type'")
+                elif field["type"] not in VALID_FIELD_TYPES:
+                    errors.append(
+                        f"{prefix}.fields[{i}]: invalid type '{field['type']}' "
+                        f"(must be one of: {', '.join(sorted(VALID_FIELD_TYPES))})"
+                    )
+
+    # Validate inputs
+    inputs = data.get("inputs", [])
+    if not isinstance(inputs, list):
+        errors.append("'inputs' must be a list")
+    else:
+        for i, ds in enumerate(inputs):
+            if not isinstance(ds, dict):
+                errors.append(f"inputs[{i}]: must be an object")
+            else:
+                validate_dataset_entry(ds, "inputs", i)
+
+    # Validate outputs
+    outputs = data.get("outputs", [])
+    if not isinstance(outputs, list):
+        errors.append("'outputs' must be a list")
+    else:
+        for i, ds in enumerate(outputs):
+            if not isinstance(ds, dict):
+                errors.append(f"outputs[{i}]: must be an object")
+            else:
+                validate_dataset_entry(ds, "outputs", i)
+
+    # Check if requested datasets were found
+    if datasets_to_validate:
+        found_slugs = set(all_slugs.keys())
+        missing = datasets_to_validate - found_slugs
+        for slug in missing:
+            errors.append(f"Dataset '{slug}' not found")
+
+    if errors:
+        click.echo("Validation errors:", err=True)
+        for error in errors:
+            click.echo(f" - {error}", err=True)
+        sys.exit(1)
+    else:
+        if datasets:
+            click.echo(f"✓ {len(datasets)} dataset(s) valid")
+        else:
+            click.echo(f"✓ {datasets_file} is valid")
+
+
+@dataset.command("lock")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+@click.argument("datasets", nargs=-1, shell_complete=complete_dataset_slugs)
+def dataset_lock(datasets_file: str, datasets: tuple[str, ...]) -> None:
+    """Enable strict mode for datasets.
+
+    If no datasets are specified, locks all datasets.
+    """
+    try:
+        manager, _ = get_manager(datasets_file)
+    except FileNotFoundError as e:
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+
+    # Get all datasets if none specified
+    if not datasets:
+        all_datasets = manager.get_all_inputs() + manager.get_all_outputs()
+        datasets = tuple(ds.slug for ds in all_datasets)
+
+    if not datasets:
+        click.echo("No datasets found.")
+        return
+
+    locked = []
+    for slug in datasets:
+        try:
+            manager.set_dataset_strict(slug, strict=True)
+            locked.append(slug)
+        except DatasetNotFoundError:
+            click.echo(f"Warning: Dataset '{slug}' not found", err=True)
+
+    if locked:
+        click.echo(f"✓ Locked {len(locked)} dataset(s): {', '.join(locked)}")
+
+
+@dataset.command("unlock")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+@click.argument("datasets", nargs=-1, shell_complete=complete_dataset_slugs)
+def dataset_unlock(datasets_file: str, datasets: tuple[str, ...]) -> None:
+    """Disable strict mode for datasets.
+
+    If no datasets are specified, unlocks all datasets.
+    """
+    try:
+        manager, _ = get_manager(datasets_file)
+    except FileNotFoundError as e:
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+
+    # Get all datasets if none specified
+    if not datasets:
+        all_datasets = manager.get_all_inputs() + manager.get_all_outputs()
+        datasets = tuple(ds.slug for ds in all_datasets)
+
+    if not datasets:
+        click.echo("No datasets found.")
+        return
+
+    unlocked = []
+    for slug in datasets:
+        try:
+            manager.set_dataset_strict(slug, strict=False)
+            unlocked.append(slug)
+        except DatasetNotFoundError:
+            click.echo(f"Warning: Dataset '{slug}' not found", err=True)
+
+    if unlocked:
+        click.echo(f"✓ Unlocked {len(unlocked)} dataset(s): {', '.join(unlocked)}")
+
+
+# =============================================================================
+# Package commands
+# =============================================================================
+
+
+@main.group()
+def package() -> None:
+    """Manage data packages."""
+    pass
+
+
+@package.command("build")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+@click.option("-o", "--output", "output_file", type=click.Path(), default="datapackage.json", help="Output file path")
+def package_build(datasets_file: str, output_file: str) -> None:
+    """Build a datapackage.json from datasets.yaml.
+
+    Creates a Data Package (https://datapackage.org/) with all output datasets as resources.
+    """
+    try:
+        manager, project_path = get_manager(datasets_file)
+    except FileNotFoundError as e:
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+
+    outputs = manager.get_all_outputs()
+    if not outputs:
+        click.echo("No output datasets found.", err=True)
+        sys.exit(1)
+
+    project_slug = get_project_slug(project_path)
+
+    try:
+        from frictionless import describe
+    except ImportError:
+        click.echo("Error: frictionless is required for package build", err=True)
+        sys.exit(1)
+
+    resources = []
+    for ds in outputs:
+        data_path = manager.get_absolute_path(ds.location)
+        if not data_path.exists():
+            click.echo(f"Warning: Data file not found for '{ds.slug}': {data_path}", err=True)
+            continue
+
+        try:
+            resource = describe(str(data_path))
+            resource.name = ds.slug
+            resource.title = ds.name
+            # Use relative path in the package
+            resource.path = ds.location
+            resources.append(resource.to_dict())
+            click.echo(f" + {ds.slug}")
+        except Exception as e:
+            click.echo(f"Warning: Failed to describe '{ds.slug}': {e}", err=True)
+
+    if not resources:
+        click.echo("Error: No resources could be added to the package", err=True)
+        sys.exit(1)
+
+    datapackage = {
+        "name": project_slug,
+        "resources": resources,
+    }
+
+    output_path = Path(output_file)
+    with open(output_path, "w") as f:
+        json.dump(datapackage, f, indent=2)
+
+    click.echo(f"\n✓ Created {output_file} with {len(resources)} resource(s)")
+
+
+@package.command("push")
+@click.option("--env", type=click.Choice(["dev", "prod"]), default="dev", help="Target environment")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+@click.option("--destination", "-d", "destination", type=str, default=None, help="Override destination gs:// URL")
+def package_push(env: str, datasets_file: str, destination: Optional[str]) -> None:
+    """Push the data package to Google Cloud Storage.
+
+    Uploads datapackage.json and all publishable output datasets.
+    """
+    try:
+        manager, project_path = get_manager(datasets_file)
+    except FileNotFoundError as e:
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+
+    outputs = manager.get_all_outputs()
+    publishable = [ds for ds in outputs if ds.is_publishable]
+
+    if not publishable:
+        click.echo("Error: No publishable datasets found (need publish.enabled: true)", err=True)
+        sys.exit(1)
+
+    project_slug = get_project_slug(project_path)
+
+    # Determine destination
+    if destination:
+        dest_url = expand_env_vars(destination)
+    elif publishable[0].publish and publishable[0].publish.to:
+        # Use first dataset's publish.to as package destination
+        dest_url = expand_env_vars(publishable[0].publish.to)
+    else:
+        dest_url = f"gs://payloadcms-{env}/datasets/projects/{project_slug}/"
+
+    parsed = urlparse(dest_url)
+    if parsed.scheme != "gs":
+        click.echo(f"Error: Destination must be a gs:// URL, got: {dest_url}", err=True)
+        sys.exit(1)
+
+    bucket_name = parsed.netloc
+    gcs_prefix = parsed.path.lstrip("/")
+    if gcs_prefix and not gcs_prefix.endswith("/"):
+        gcs_prefix += "/"
+
+    # Build the datapackage
+    try:
+        from frictionless import describe
+    except ImportError:
+        click.echo("Error: frictionless is required for package push", err=True)
+        sys.exit(1)
+
+    resources = []
+    data_files: list[tuple[Path, str]] = []  # (local_path, remote_name)
+
+    for ds in publishable:
+        data_path = manager.get_absolute_path(ds.location)
+        if not data_path.exists():
+            click.echo(f"Warning: Data file not found for '{ds.slug}': {data_path}", err=True)
+            continue
+
+        try:
+            resource = describe(str(data_path))
+            resource.name = ds.slug
+            resource.title = ds.name
+            resource.path = data_path.name  # Just the filename in the package
+            resources.append(resource.to_dict())
+            data_files.append((data_path, data_path.name))
+        except Exception as e:
+            click.echo(f"Warning: Failed to describe '{ds.slug}': {e}", err=True)
+
+    if not resources:
+        click.echo("Error: No resources could be added to the package", err=True)
+        sys.exit(1)
+
+    datapackage = {
+        "name": project_slug,
+        "resources": resources,
+    }
+
+    # Upload to GCS
+    try:
+        from google.cloud import storage  # type: ignore[import-untyped]
+
+        client = storage.Client()
+        bucket = client.bucket(bucket_name)
+
+        # Upload datapackage.json
+        datapackage_blob = bucket.blob(f"{gcs_prefix}datapackage.json")
+        datapackage_blob.upload_from_string(json.dumps(datapackage, indent=2), content_type="application/json")
+        click.echo("✓ Uploaded datapackage.json")
+
+        # Upload data files
+        for local_path, remote_name in data_files:
+            data_blob = bucket.blob(f"{gcs_prefix}{remote_name}")
+            data_blob.upload_from_filename(str(local_path))
+            click.echo(f"✓ Uploaded {remote_name}")
+
+        click.echo(f"\nPackage pushed to: gs://{bucket_name}/{gcs_prefix}")
+
+    except ImportError:
+        click.echo("Error: google-cloud-storage is required for push", err=True)
+        click.echo("Install with: pip install google-cloud-storage", err=True)
+        sys.exit(1)
+    except Exception as e:
+        click.echo(f"Error uploading to GCS: {e}", err=True)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
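
The destination handling in package push above runs gs:// URLs through expand_env_vars, so ${VAR} and ${VAR:-default} references in a dataset's publish.to or in --destination are resolved from the environment at push time. A small illustration of that behaviour, assuming sunstone 0.6.0 is installed (the bucket names here are made up):

import os

from sunstone.cli import expand_env_vars

os.environ["SUNSTONE_BUCKET"] = "my-bucket"

# A set variable is substituted, whether or not a default is given.
print(expand_env_vars("gs://${SUNSTONE_BUCKET}/datasets/"))        # gs://my-bucket/datasets/
# An unset variable falls back to the ${VAR:-default} value.
print(expand_env_vars("gs://${MISSING_VAR:-fallback}/datasets/"))  # gs://fallback/datasets/
# An unset variable with no default is left untouched (assuming MISSING_VAR is not set).
print(expand_env_vars("gs://${MISSING_VAR}/datasets/"))            # gs://${MISSING_VAR}/datasets/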
sunstone/dataframe.py
CHANGED
@@ -10,7 +10,7 @@ import pandas as pd
 
 from .datasets import DatasetsManager
 from .exceptions import DatasetNotFoundError, StrictModeError
-from .lineage import FieldSchema, LineageMetadata
+from .lineage import FieldSchema, LineageMetadata, compute_dataframe_hash
 
 pd.options.mode.copy_on_write = True
 
@@ -196,7 +196,6 @@ class DataFrame:
         # Create lineage metadata
         lineage = LineageMetadata(project_path=str(manager.project_path))
         lineage.add_source(dataset)
-        lineage.add_operation(f"read_dataset({dataset.slug}, format={format})")
 
         # Return wrapped DataFrame
         return cls(data=df, lineage=lineage, strict=strict, project_path=project_path)
@@ -294,7 +293,6 @@ class DataFrame:
         # Create lineage metadata
         lineage = LineageMetadata(project_path=str(manager.project_path))
         lineage.add_source(dataset)
-        lineage.add_operation(f"read_csv({dataset.slug})")
 
         # Return wrapped DataFrame
         return cls(data=df, lineage=lineage, strict=strict, project_path=project_path)
@@ -363,11 +361,13 @@ class DataFrame:
         absolute_path.parent.mkdir(parents=True, exist_ok=True)
         self.data.to_csv(absolute_path, **kwargs)
 
-        #
-        self.
+        # Compute content hash for change detection
+        content_hash = compute_dataframe_hash(self.data)
 
         # Persist lineage metadata to datasets.yaml
-        manager.update_output_lineage(
+        manager.update_output_lineage(
+            slug=dataset.slug, lineage=self.lineage, content_hash=content_hash, strict=self.strict_mode
+        )
 
     def _infer_field_schema(self) -> List[FieldSchema]:
         """
@@ -410,11 +410,8 @@ class DataFrame:
         # Perform the merge
         merged_data = pd.merge(self.data, right.data, **kwargs)
 
-        # Combine lineage
+        # Combine lineage (sources from both DataFrames)
         merged_lineage = self.lineage.merge(right.lineage)
-        merged_lineage.add_operation(
-            f"merge(left={len(self.lineage.sources)} sources, right={len(right.lineage.sources)} sources)"
-        )
 
         return DataFrame(
             data=merged_data,
@@ -437,11 +434,8 @@ class DataFrame:
         # Perform the join
         joined_data = self.data.join(other.data, **kwargs)
 
-        # Combine lineage
+        # Combine lineage (sources from both DataFrames)
         joined_lineage = self.lineage.merge(other.lineage)
-        joined_lineage.add_operation(
-            f"join(left={len(self.lineage.sources)} sources, right={len(other.lineage.sources)} sources)"
-        )
 
         return DataFrame(
             data=joined_data,
@@ -467,16 +461,11 @@ class DataFrame:
         # Concatenate
         concatenated_data = pd.concat(all_dfs, **kwargs)
 
-        # Combine lineage from all DataFrames
+        # Combine lineage (sources from all DataFrames)
         combined_lineage = self.lineage
         for other in others:
            combined_lineage = combined_lineage.merge(other.lineage)
 
-        combined_lineage.add_operation(
-            f"concat({len(others) + 1} dataframes, "
-            f"{sum(len(df.lineage.sources) for df in [self] + others)} total sources)"
-        )
-
         return DataFrame(
             data=concatenated_data,
             lineage=combined_lineage,
@@ -484,42 +473,12 @@ class DataFrame:
             project_path=self.lineage.project_path,
         )
 
-    def
-        """
-        Apply a transformation operation to the DataFrame.
-
-        Args:
-            operation: Function that takes a pandas DataFrame and returns a DataFrame.
-            description: Human-readable description of the operation.
-
-        Returns:
-            A new DataFrame with the operation applied and recorded in lineage.
-        """
-        # Apply the operation
-        new_data = operation(self.data)
-
-        # Copy lineage and add operation
-        new_lineage = LineageMetadata(
-            sources=self.lineage.sources.copy(),
-            operations=self.lineage.operations.copy(),
-            project_path=self.lineage.project_path,
-        )
-        new_lineage.add_operation(description)
-
-        return DataFrame(
-            data=new_data,
-            lineage=new_lineage,
-            strict=self.strict_mode,
-            project_path=self.lineage.project_path,
-        )
-
-    def _wrap_result(self, result: Any, operation: Optional[str] = None) -> Any:
+    def _wrap_result(self, result: Any) -> Any:
         """
         Wrap a pandas result in a Sunstone DataFrame if applicable.
 
         Args:
             result: The result from a pandas operation.
-            operation: Name of the operation performed. If None, no operation is recorded.
 
         Returns:
             Wrapped DataFrame if result is a DataFrame, otherwise the result.
@@ -527,11 +486,8 @@ class DataFrame:
         if isinstance(result, pd.DataFrame):
             new_lineage = LineageMetadata(
                 sources=self.lineage.sources.copy(),
-                operations=self.lineage.operations.copy(),
                 project_path=self.lineage.project_path,
             )
-            if operation is not None:
-                new_lineage.add_operation(operation)
 
             return DataFrame(
                 data=result,
@@ -541,28 +497,6 @@ class DataFrame:
             )
         return result
 
-    # Methods that don't represent meaningful data transformations
-    # These return DataFrames but shouldn't be tracked in lineage
-    _NON_TRACKING_METHODS = frozenset(
-        {
-            # Copy operations - same data, no transformation
-            "copy",
-            # Index operations - same data, different index
-            "reset_index",
-            "set_index",
-            "reindex",
-            # Type conversions without data change
-            "astype",
-            "infer_objects",
-            # Column/index renaming - same data, different labels
-            "rename",
-            "rename_axis",
-            # Reshaping without data loss
-            "T",
-            "transpose",
-        }
-    )
-
     def __getattr__(self, name: str) -> Any:
         """
         Delegate attribute access to the underlying pandas DataFrame.
@@ -583,14 +517,11 @@ class DataFrame:
 
             def wrapper(*args: Any, **kwargs: Any) -> Any:
                 result = attr(*args, **kwargs)
-
-                if name in DataFrame._NON_TRACKING_METHODS:
-                    return self._wrap_result(result, operation=None)
-                return self._wrap_result(result, operation=f"{name}")
+                return self._wrap_result(result)
 
             return wrapper
 
-        return self._wrap_result(attr
+        return self._wrap_result(attr)
 
     def __getitem__(self, key: Any) -> Any:
         """
@@ -603,9 +534,7 @@ class DataFrame:
             The item from the underlying DataFrame, wrapped if it's a DataFrame.
         """
         result = self.data[key]
-
-        # not a meaningful transformation
-        return self._wrap_result(result, operation=None)
+        return self._wrap_result(result)
 
     def __setitem__(self, key: Any, value: Any) -> None:
         """
@@ -616,14 +545,12 @@ class DataFrame:
             value: Value to assign.
         """
        self.data[key] = value
-        #
-
+        # Don't track column assignments automatically
+        # Users should use add_operation() for meaningful transformations
 
     def __repr__(self) -> str:
         """String representation of the DataFrame."""
-        lineage_info = (
-            f"\n\nLineage: {len(self.lineage.sources)} source(s), {len(self.lineage.operations)} operation(s)"
-        )
+        lineage_info = f"\n\nLineage: {len(self.lineage.sources)} source(s)"
         return repr(self.data) + lineage_info
 
     def __str__(self) -> str:
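
With the per-operation log and _NON_TRACKING_METHODS gone, _wrap_result now only copies the sources list into each new LineageMetadata; nothing else travels with wrapped results. A standalone sketch of that delegation pattern, using a made-up TrackedFrame class rather than the package's DataFrame (illustrative only, not the package's implementation):

from typing import Any

import pandas as pd


class TrackedFrame:
    def __init__(self, data: pd.DataFrame, sources: list[str]) -> None:
        self.data = data
        self.sources = sources

    def _wrap(self, result: Any) -> Any:
        # Re-wrap DataFrame results so the list of source datasets travels along.
        if isinstance(result, pd.DataFrame):
            return TrackedFrame(result, self.sources.copy())
        return result

    def __getattr__(self, name: str) -> Any:
        # Delegate everything else to pandas, wrapping DataFrame outputs.
        attr = getattr(self.data, name)
        if callable(attr):
            def wrapper(*args: Any, **kwargs: Any) -> Any:
                return self._wrap(attr(*args, **kwargs))
            return wrapper
        return self._wrap(attr)


tf = TrackedFrame(pd.DataFrame({"a": [1, 2, 3]}), sources=["demo-input"])
head = tf.head(2)    # still a TrackedFrame
print(head.sources)  # ['demo-input']

The same idea is what __getattr__ and __getitem__ above rely on: any pandas call that returns a DataFrame comes back wrapped, with the source list copied but no per-operation history.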
sunstone/datasets.py
CHANGED
@@ -15,7 +15,7 @@ import requests
 from ruamel.yaml import YAML
 
 from .exceptions import DatasetNotFoundError, DatasetValidationError
-from .lineage import DatasetMetadata, FieldSchema, LineageMetadata, Source, SourceLocation
+from .lineage import DatasetMetadata, FieldSchema, LineageMetadata, PublishConfig, Source, SourceLocation
 
 logger = logging.getLogger(__name__)
 
@@ -156,6 +156,26 @@ class DatasetsManager:
             for field in fields_data
         ]
 
+    def _parse_publish(self, publish_data: Any) -> Optional[PublishConfig]:
+        """
+        Parse publish configuration from YAML.
+
+        Supports both legacy boolean format and new object format:
+        - publish: true -> PublishConfig(enabled=True)
+        - publish: false -> None
+        - publish: { enabled: true, to: "..." } -> PublishConfig(enabled=True, to="...")
+        """
+        if publish_data is None:
+            return None
+        if isinstance(publish_data, bool):
+            return PublishConfig(enabled=publish_data) if publish_data else None
+        if isinstance(publish_data, dict):
+            enabled = publish_data.get("enabled", False)
+            if not enabled:
+                return None
+            return PublishConfig(enabled=True, to=publish_data.get("to"))
+        return None
+
     def _parse_dataset(self, dataset_data: Dict[str, Any], dataset_type: str) -> DatasetMetadata:
         """
         Parse dataset metadata from YAML data.
@@ -177,7 +197,8 @@ class DatasetsManager:
             location=dataset_data["location"],
             fields=self._parse_fields(dataset_data["fields"]),
             source=source,
-            publish=dataset_data.get("publish"
+            publish=self._parse_publish(dataset_data.get("publish")),
+            strict=dataset_data.get("strict", False),
             dataset_type=dataset_type,
         )
 
@@ -380,22 +401,57 @@ class DatasetsManager:
 
         raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
 
-    def
+    def set_dataset_strict(self, slug: str, strict: bool, dataset_type: Optional[str] = None) -> None:
+        """
+        Set or remove strict mode for a dataset.
+
+        Args:
+            slug: The slug of the dataset to update.
+            strict: If True, enable strict mode. If False, disable it.
+            dataset_type: Optional filter by 'input' or 'output'. If None, searches both.
+
+        Raises:
+            DatasetNotFoundError: If the dataset doesn't exist.
+        """
+        search_types = ["input", "output"] if dataset_type is None else [dataset_type]
+
+        for dtype in search_types:
+            key = "inputs" if dtype == "input" else "outputs"
+            for dataset_data in self._data.get(key, []):
+                if dataset_data["slug"] == slug:
+                    if strict:
+                        dataset_data["strict"] = True
+                    elif "strict" in dataset_data:
+                        del dataset_data["strict"]
+                    self._save()
+                    return
+
+        raise DatasetNotFoundError(f"Dataset with slug '{slug}' not found")
+
+    def update_output_lineage(
+        self, slug: str, lineage: LineageMetadata, content_hash: str, strict: bool = False
+    ) -> None:
         """
         Update lineage metadata for an output dataset.
 
+        The timestamp is only updated when the content hash changes, preventing
+        unnecessary updates when the data hasn't changed.
+
         In strict mode, validates that the lineage matches what would be written
         without modifying the file. In relaxed mode, updates the file with lineage.
 
         Args:
             slug: The slug of the output dataset to update.
             lineage: The lineage metadata to persist.
+            content_hash: SHA256 hash of the DataFrame content.
            strict: If True, validate without modifying. If False, update the file.
 
         Raises:
             DatasetNotFoundError: If the dataset doesn't exist.
             DatasetValidationError: In strict mode, if lineage differs from what's in the file.
         """
+        from datetime import datetime
+
         # Find the output dataset
         dataset_idx = None
         for i, dataset_data in enumerate(self._data["outputs"]):
@@ -406,23 +462,28 @@ class DatasetsManager:
         if dataset_idx is None:
             raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
 
-        #
-
+        # Get existing lineage data if present
+        existing_lineage = self._data["outputs"][dataset_idx].get("lineage", {})
+        existing_hash = existing_lineage.get("content_hash")
+        existing_timestamp = existing_lineage.get("created_at")
 
-        if
-
-            {
-                "slug": src.slug,
-                "name": src.name,
-            }
-            for src in lineage.sources
-        ]
+        # Determine if content has changed
+        content_changed = existing_hash != content_hash
 
-        if
-
+        # Only update timestamp if content changed
+        if content_changed:
+            timestamp = datetime.now().isoformat()
+        else:
+            # Preserve existing timestamp
+            timestamp = existing_timestamp
 
-
-
+        # Build lineage metadata to add (order: content_hash, created_at, sources)
+        lineage_data: dict[str, Any] = {}
+        lineage_data["content_hash"] = content_hash
+        if timestamp:
+            lineage_data["created_at"] = timestamp
+        if lineage.sources:
+            lineage_data["sources"] = [{"slug": src.slug} for src in lineage.sources]
 
         # Create a copy of the data with updated lineage
         updated_data = self._data.copy()
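
The _parse_publish docstring above spells out the accepted forms. The sketch below re-implements that mapping as a free function for illustration only; parse_publish is not part of the package API, and only PublishConfig is imported from the package:

from typing import Any, Optional

from sunstone.lineage import PublishConfig


def parse_publish(publish_data: Any) -> Optional[PublishConfig]:
    # publish: true                -> PublishConfig(enabled=True)
    # publish: false / absent      -> None
    # publish: {enabled: true, to: "gs://..."} -> PublishConfig(enabled=True, to="gs://...")
    if isinstance(publish_data, bool):
        return PublishConfig(enabled=True) if publish_data else None
    if isinstance(publish_data, dict) and publish_data.get("enabled", False):
        return PublishConfig(enabled=True, to=publish_data.get("to"))
    return None


print(parse_publish(True))
print(parse_publish({"enabled": True, "to": "gs://${BUCKET:-example}/datasets/"}))
print(parse_publish(False))

The first two calls return PublishConfig instances (the second carrying the still-unexpanded to URL); the last returns None, which is what makes is_publishable evaluate to false.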
sunstone/lineage.py
CHANGED
@@ -2,9 +2,13 @@
 Lineage metadata structures for tracking data provenance.
 """
 
+import hashlib
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+if TYPE_CHECKING:
+    import pandas as pd
 
 
 @dataclass
@@ -62,6 +66,17 @@ class FieldSchema:
     """Optional constraints (e.g., enum values)."""
 
 
+@dataclass
+class PublishConfig:
+    """Configuration for publishing a dataset."""
+
+    enabled: bool = False
+    """Whether publishing is enabled."""
+
+    to: Optional[str] = None
+    """Optional destination URL (supports ${VAR:-default} substitution)."""
+
+
 @dataclass
 class DatasetMetadata:
     """Metadata for a dataset from datasets.yaml."""
@@ -81,30 +96,56 @@ class DatasetMetadata:
     source: Optional[Source] = None
     """Source attribution (for input datasets)."""
 
-    publish:
-    """
+    publish: Optional[PublishConfig] = None
+    """Publishing configuration (for output datasets)."""
+
+    strict: bool = False
+    """Whether strict mode is enabled (lineage cannot be modified)."""
 
     dataset_type: str = "input"
     """Type of dataset: 'input' or 'output'."""
 
+    @property
+    def is_publishable(self) -> bool:
+        """Check if this dataset is configured for publishing."""
+        return self.publish is not None and self.publish.enabled
+
+
+def compute_dataframe_hash(df: "pd.DataFrame") -> str:
+    """
+    Compute a fast SHA256 hash of a pandas DataFrame's content.
+
+    Uses pickle serialization for a consistent, fast representation of the data.
+
+    Args:
+        df: The pandas DataFrame to hash.
+
+    Returns:
+        A SHA256 hex digest string representing the DataFrame content.
+    """
+    import pickle
+
+    # Use pickle protocol 5 for efficiency; hash the bytes directly
+    data_bytes = pickle.dumps(df, protocol=5)
+    return hashlib.sha256(data_bytes).hexdigest()
+
 
 @dataclass
 class LineageMetadata:
     """
     Lineage metadata tracking the provenance of data in a DataFrame.
 
-    This tracks all source datasets that contributed to the current DataFrame
-    including information about transformations and operations performed.
+    This tracks all source datasets that contributed to the current DataFrame.
     """
 
     sources: List[DatasetMetadata] = field(default_factory=list)
     """List of source datasets that contributed to this data."""
 
-
-    """
+    created_at: Optional[datetime] = None
+    """Timestamp when this lineage was last updated (content changed)."""
 
-
-    """
+    content_hash: Optional[str] = None
+    """SHA256 hash of the DataFrame content, used to detect changes."""
 
     project_path: Optional[str] = None
     """Path to the project directory containing datasets.yaml."""
@@ -119,15 +160,6 @@ class LineageMetadata:
         if dataset not in self.sources:
             self.sources.append(dataset)
 
-    def add_operation(self, operation: str) -> None:
-        """
-        Record an operation performed on the data.
-
-        Args:
-            operation: Description of the operation.
-        """
-        self.operations.append(operation)
-
     def merge(self, other: "LineageMetadata") -> "LineageMetadata":
         """
         Merge lineage from another DataFrame.
@@ -136,12 +168,10 @@ class LineageMetadata:
             other: The other lineage metadata to merge.
 
         Returns:
-            A new LineageMetadata with combined sources
+            A new LineageMetadata with combined sources.
         """
         merged = LineageMetadata(
             sources=self.sources.copy(),
-            operations=self.operations.copy(),
-            created_at=datetime.now(),
             project_path=self.project_path or other.project_path,
         )
 
@@ -150,9 +180,6 @@ class LineageMetadata:
             if source not in merged.sources:
                 merged.sources.append(source)
 
-        # Combine operations
-        merged.operations.extend(other.operations)
-
         return merged
 
     def get_licenses(self) -> List[str]:
@@ -175,16 +202,18 @@ class LineageMetadata:
         Returns:
             Dictionary containing lineage information.
         """
-
+        result: Dict[str, Any] = {
             "sources": [
                 {
-                    "name": src.name,
                     "slug": src.slug,
+                    "name": src.name,
                     "location": src.location,
                 }
                 for src in self.sources
            ],
-            "operations": self.operations,
-            "created_at": self.created_at.isoformat(),
-            "licenses": self.get_licenses(),
         }
+        if self.created_at is not None:
+            result["created_at"] = self.created_at.isoformat()
+        if self.content_hash is not None:
+            result["content_hash"] = self.content_hash
+        return result
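
compute_dataframe_hash is what the to_csv path in dataframe.py now feeds into update_output_lineage: identical content produces an identical digest, so the created_at timestamp in datasets.yaml is left alone when the data has not changed. A minimal check, assuming sunstone 0.6.0 and pandas are installed:

import pandas as pd

from sunstone.lineage import compute_dataframe_hash

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

h1 = compute_dataframe_hash(df)
h2 = compute_dataframe_hash(df)                       # unchanged content -> same digest
h3 = compute_dataframe_hash(df.assign(a=[9, 9, 9]))   # changed content -> different digest

print(h1 == h2, h1 == h3)  # True False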
{sunstone_py-0.5.2.dist-info → sunstone_py-0.6.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sunstone-py
-Version: 0.5.2
+Version: 0.6.0
 Summary: Python library for managing datasets with lineage tracking in Sunstone projects
 Author-email: Sunstone Institute <stig@sunstone.institute>
 License: MIT
@@ -17,8 +17,10 @@ Classifier: Programming Language :: Python :: 3.14
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: click>=8.0
 Requires-Dist: frictionless>=5.18.1
 Requires-Dist: google-auth>=2.43.0
+Requires-Dist: google-cloud-storage>=2.0.0
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: pyyaml>=6.0
 Requires-Dist: requests>=2.31.0
@@ -29,7 +31,7 @@ Dynamic: license-file
 
 A Python library for managing datasets with lineage tracking in data science projects.
 
-[](https://www.python.org/downloads/)
 [](https://opensource.org/licenses/MIT)
 
 ## Features
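
The new click>=8.0 requirement and the one-line change to entry_points.txt (its diff is not shown here) line up with the CLI added in cli.py; the console-script name itself is not visible in this diff. One way to exercise the command group without any installed entry point is click's test runner:

from click.testing import CliRunner

from sunstone.cli import main

runner = CliRunner()
result = runner.invoke(main, ["dataset", "list", "--help"])
print(result.output)  # usage text for the `dataset list` subcommand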
sunstone_py-0.6.0.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+sunstone/__init__.py,sha256=LC0ZtmxP26eXPLKejbg7UStcHOnE_lwttNTL4m3F4yM,2032
+sunstone/_release.py,sha256=MQNaUD7mSK6h8vu6EIgJuaMlAxuFxv82NQwHgBpLZm4,14907
+sunstone/cli.py,sha256=YNwMXWCezQCJikJEC1iprf4rl5hsTr0V8toETVoRVCk,17905
+sunstone/dataframe.py,sha256=rFGuMq-63Haua_QQfR3E708KYc1g43yEyCej11_Gl3A,20679
+sunstone/datasets.py,sha256=9mJJ02UFcjFtbbx01rFLUMAacUPaJdothfqnTsc66kw,23851
+sunstone/exceptions.py,sha256=fiixXazur3LtQGy21bGEaSr356DObFcYxQJ3FvOxNec,623
+sunstone/lineage.py,sha256=iZiVBY-l-iEeVVlEORkow29fMM5UGtah8FU5ZVLetAI,6001
+sunstone/pandas.py,sha256=CLEqIIgTbMmpH73TPy_vDUPxQa37Hpmqn4r6No8PJwo,8188
+sunstone/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sunstone/validation.py,sha256=1356vcUc72a1zGBUe9Xjrcb5h41Xo53PaK2nnQ_FuSM,8286
+sunstone_py-0.6.0.dist-info/licenses/LICENSE,sha256=pB6VuR4QRjwjMjy8RSNGho-N1SUdu07ntIhT5lrhkzU,1078
+sunstone_py-0.6.0.dist-info/METADATA,sha256=3eqIzvMuCIMbuzLaAMcVMV_KsUxcvJNlh5drnUfV7hk,9529
+sunstone_py-0.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+sunstone_py-0.6.0.dist-info/entry_points.txt,sha256=DT-mp-lPl6UEcHBNs2o3HJ8dLp4iqMnzvHJhiLfCd0g,80
+sunstone_py-0.6.0.dist-info/top_level.txt,sha256=A2fW-7JO10rlx_L28Bc4FVvWt2R8kgvS8_TGPBhQp3c,9
+sunstone_py-0.6.0.dist-info/RECORD,,
sunstone_py-0.5.2.dist-info/RECORD
REMOVED
@@ -1,15 +0,0 @@
-sunstone/__init__.py,sha256=LC0ZtmxP26eXPLKejbg7UStcHOnE_lwttNTL4m3F4yM,2032
-sunstone/_release.py,sha256=MQNaUD7mSK6h8vu6EIgJuaMlAxuFxv82NQwHgBpLZm4,14907
-sunstone/dataframe.py,sha256=UJgQx7auiNb6hSIvhB8EQs2afu-7S22xdWL5DZUr29g,23602
-sunstone/datasets.py,sha256=LdHk3Vkfc7QH2VxhSskRCm9wUFSkldCmgS_1c2KDAPA,21142
-sunstone/exceptions.py,sha256=fiixXazur3LtQGy21bGEaSr356DObFcYxQJ3FvOxNec,623
-sunstone/lineage.py,sha256=B9GKMu5-v8Izos5G40K_EvsCPJL3Z2Tg1T_Fc7ezSMI,5240
-sunstone/pandas.py,sha256=CLEqIIgTbMmpH73TPy_vDUPxQa37Hpmqn4r6No8PJwo,8188
-sunstone/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sunstone/validation.py,sha256=1356vcUc72a1zGBUe9Xjrcb5h41Xo53PaK2nnQ_FuSM,8286
-sunstone_py-0.5.2.dist-info/licenses/LICENSE,sha256=pB6VuR4QRjwjMjy8RSNGho-N1SUdu07ntIhT5lrhkzU,1078
-sunstone_py-0.5.2.dist-info/METADATA,sha256=uR8iPIENJBiPVFhtr5EXT3V6VAmLiju0CfFjm6oQubI,9460
-sunstone_py-0.5.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-sunstone_py-0.5.2.dist-info/entry_points.txt,sha256=0h6E88rH9a_503BAzXvFPR-UfmkrRFjcOf29DXgJNjk,51
-sunstone_py-0.5.2.dist-info/top_level.txt,sha256=A2fW-7JO10rlx_L28Bc4FVvWt2R8kgvS8_TGPBhQp3c,9
-sunstone_py-0.5.2.dist-info/RECORD,,
{sunstone_py-0.5.2.dist-info → sunstone_py-0.6.0.dist-info}/licenses/LICENSE
File without changes

{sunstone_py-0.5.2.dist-info → sunstone_py-0.6.0.dist-info}/top_level.txt
File without changes