sunstone-py 0.5.2__py3-none-any.whl → 0.6.0__py3-none-any.whl

sunstone/cli.py ADDED
@@ -0,0 +1,542 @@
+"""
+Sunstone command-line interface.
+"""
+
+import json
+import os
+import re
+import sys
+import tomllib
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+
+import click
+from click.shell_completion import CompletionItem
+from ruamel.yaml import YAML
+
+from .datasets import DatasetsManager
+from .exceptions import DatasetNotFoundError
+
+# Configure ruamel.yaml for round-trip parsing
+_yaml = YAML()
+_yaml.preserve_quotes = True
+_yaml.default_flow_style = False
+_yaml.indent(mapping=2, sequence=4, offset=2)
+
+# Valid field types
+VALID_FIELD_TYPES = {"string", "number", "integer", "boolean", "date", "datetime"}
+
+# Pattern for ${VAR} or ${VAR:-default} substitution
+ENV_VAR_PATTERN = re.compile(r"\$\{([^}:]+)(?::-([^}]*))?\}")
+
+
+def get_project_slug(project_path: Path) -> str:
+    """
+    Get the project slug from pyproject.toml or directory name.
+
+    Args:
+        project_path: Path to the project directory.
+
+    Returns:
+        The project slug (kebab-case identifier).
+    """
+    pyproject_path = project_path / "pyproject.toml"
+    if pyproject_path.exists():
+        try:
+            with open(pyproject_path, "rb") as f:
+                pyproject = tomllib.load(f)
+            name = pyproject.get("project", {}).get("name")
+            if isinstance(name, str):
+                return name
+        except Exception:
+            pass
+    return project_path.name
+
+
+def expand_env_vars(text: str) -> str:
+    """
+    Expand environment variables in text using ${VAR} or ${VAR:-default} syntax.
+
+    Args:
+        text: The text containing environment variable references.
+
+    Returns:
+        The text with environment variables expanded.
+    """
+
+    def replace_var(match: re.Match[str]) -> str:
+        var_name = match.group(1)
+        default_value = match.group(2)
+        value = os.environ.get(var_name)
+        if value is not None:
+            return value
+        if default_value is not None:
+            return default_value
+        return match.group(0)  # Return original if no value and no default
+
+    return ENV_VAR_PATTERN.sub(replace_var, text)
+
+
+def get_manager(datasets_file: str) -> tuple[DatasetsManager, Path]:
+    """Get DatasetsManager and project path from datasets file."""
+    datasets_path = Path(datasets_file).resolve()
+    project_path = datasets_path.parent
+    manager = DatasetsManager(project_path)
+    return manager, project_path
+
+
+def complete_dataset_slugs(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[CompletionItem]:
+    """Shell completion for dataset slugs."""
+    # Get the datasets file from context or use default
+    datasets_file = ctx.params.get("datasets_file", "datasets.yaml")
+
+    try:
+        manager, _ = get_manager(datasets_file)
+        all_datasets = manager.get_all_inputs() + manager.get_all_outputs()
+        slugs = [ds.slug for ds in all_datasets]
+
+        return [CompletionItem(slug) for slug in slugs if slug.startswith(incomplete)]
+    except Exception:
+        return []
+
+
+# =============================================================================
+# Main CLI group
+# =============================================================================
+
+
+@click.group()
+@click.version_option()
+def main() -> None:
+    """Sunstone dataset and package management CLI."""
+    pass
+
+
+# =============================================================================
+# Dataset commands
+# =============================================================================
+
+
+@main.group()
+def dataset() -> None:
+    """Manage datasets in datasets.yaml."""
+    pass
+
+
+@dataset.command("list")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+def dataset_list(datasets_file: str) -> None:
+    """List all datasets."""
+    try:
+        manager, _ = get_manager(datasets_file)
+    except FileNotFoundError as e:
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+
+    inputs = manager.get_all_inputs()
+    outputs = manager.get_all_outputs()
+
+    if inputs:
+        click.echo("Inputs:")
+        for ds in inputs:
+            flags = []
+            if ds.strict:
+                flags.append("strict")
+            flag_str = f" [{', '.join(flags)}]" if flags else ""
+            click.echo(f" - {ds.slug} ({ds.name}){flag_str}")
+
+    if outputs:
+        if inputs:
+            click.echo()
+        click.echo("Outputs:")
+        for ds in outputs:
+            flags = []
+            if ds.is_publishable:
+                flags.append("publish")
+            if ds.strict:
+                flags.append("strict")
+            flag_str = f" [{', '.join(flags)}]" if flags else ""
+            click.echo(f" - {ds.slug} ({ds.name}){flag_str}")
+
+    if not inputs and not outputs:
+        click.echo("No datasets found.")
+
+
+@dataset.command("validate")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+@click.argument("datasets", nargs=-1, shell_complete=complete_dataset_slugs)
+def dataset_validate(datasets_file: str, datasets: tuple[str, ...]) -> None:
+    """Validate datasets.
+
+    If no datasets are specified, validates all datasets.
+    """
+    datasets_path = Path(datasets_file).resolve()
+
+    errors: list[str] = []
+
+    # Load and parse YAML
+    try:
+        with open(datasets_path, "r") as f:
+            data = _yaml.load(f)
+    except Exception as e:
+        click.echo(f"Error: Failed to parse YAML: {e}", err=True)
+        sys.exit(1)
+
+    if data is None:
+        data = {}
+
+    # Check structure
+    if "inputs" not in data and "outputs" not in data:
+        errors.append("datasets.yaml must contain 'inputs' and/or 'outputs' lists")
+
+    # Track slugs for duplicate detection
+    all_slugs: dict[str, str] = {}  # slug -> type
+    datasets_to_validate = set(datasets) if datasets else None
+
+    def validate_dataset_entry(ds: dict, ds_type: str, index: int) -> None:
+        prefix = f"{ds_type}[{index}]"
+        slug = ds.get("slug")
+
+        # Skip if specific datasets requested and this isn't one of them
+        if datasets_to_validate and slug not in datasets_to_validate:
+            # Still track slug for duplicate detection
+            if slug:
+                all_slugs[slug] = ds_type
+            return
+
+        # Required fields
+        for field in ["name", "slug", "location", "fields"]:
+            if field not in ds:
+                errors.append(f"{prefix}: missing required field '{field}'")
+
+        # Check slug
+        if slug:
+            if slug in all_slugs:
+                errors.append(f"{prefix}: duplicate slug '{slug}' (also in {all_slugs[slug]})")
+            else:
+                all_slugs[slug] = ds_type
+
+        # Check fields
+        fields = ds.get("fields", [])
+        if not isinstance(fields, list):
+            errors.append(f"{prefix}: 'fields' must be a list")
+        else:
+            for i, field in enumerate(fields):
+                if not isinstance(field, dict):
+                    errors.append(f"{prefix}.fields[{i}]: must be an object")
+                    continue
+                if "name" not in field:
+                    errors.append(f"{prefix}.fields[{i}]: missing 'name'")
+                if "type" not in field:
+                    errors.append(f"{prefix}.fields[{i}]: missing 'type'")
+                elif field["type"] not in VALID_FIELD_TYPES:
+                    errors.append(
+                        f"{prefix}.fields[{i}]: invalid type '{field['type']}' "
+                        f"(must be one of: {', '.join(sorted(VALID_FIELD_TYPES))})"
+                    )
+
+    # Validate inputs
+    inputs = data.get("inputs", [])
+    if not isinstance(inputs, list):
+        errors.append("'inputs' must be a list")
+    else:
+        for i, ds in enumerate(inputs):
+            if not isinstance(ds, dict):
+                errors.append(f"inputs[{i}]: must be an object")
+            else:
+                validate_dataset_entry(ds, "inputs", i)
+
+    # Validate outputs
+    outputs = data.get("outputs", [])
+    if not isinstance(outputs, list):
+        errors.append("'outputs' must be a list")
+    else:
+        for i, ds in enumerate(outputs):
+            if not isinstance(ds, dict):
+                errors.append(f"outputs[{i}]: must be an object")
+            else:
+                validate_dataset_entry(ds, "outputs", i)
+
+    # Check if requested datasets were found
+    if datasets_to_validate:
+        found_slugs = set(all_slugs.keys())
+        missing = datasets_to_validate - found_slugs
+        for slug in missing:
+            errors.append(f"Dataset '{slug}' not found")
+
+    if errors:
+        click.echo("Validation errors:", err=True)
+        for error in errors:
+            click.echo(f" - {error}", err=True)
+        sys.exit(1)
+    else:
+        if datasets:
+            click.echo(f"✓ {len(datasets)} dataset(s) valid")
+        else:
+            click.echo(f"✓ {datasets_file} is valid")
+
+
+@dataset.command("lock")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+@click.argument("datasets", nargs=-1, shell_complete=complete_dataset_slugs)
+def dataset_lock(datasets_file: str, datasets: tuple[str, ...]) -> None:
+    """Enable strict mode for datasets.
+
+    If no datasets are specified, locks all datasets.
+    """
+    try:
+        manager, _ = get_manager(datasets_file)
+    except FileNotFoundError as e:
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+
+    # Get all datasets if none specified
+    if not datasets:
+        all_datasets = manager.get_all_inputs() + manager.get_all_outputs()
+        datasets = tuple(ds.slug for ds in all_datasets)
+
+    if not datasets:
+        click.echo("No datasets found.")
+        return
+
+    locked = []
+    for slug in datasets:
+        try:
+            manager.set_dataset_strict(slug, strict=True)
+            locked.append(slug)
+        except DatasetNotFoundError:
+            click.echo(f"Warning: Dataset '{slug}' not found", err=True)
+
+    if locked:
+        click.echo(f"✓ Locked {len(locked)} dataset(s): {', '.join(locked)}")
+
+
+@dataset.command("unlock")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+@click.argument("datasets", nargs=-1, shell_complete=complete_dataset_slugs)
+def dataset_unlock(datasets_file: str, datasets: tuple[str, ...]) -> None:
+    """Disable strict mode for datasets.
+
+    If no datasets are specified, unlocks all datasets.
+    """
+    try:
+        manager, _ = get_manager(datasets_file)
+    except FileNotFoundError as e:
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+
+    # Get all datasets if none specified
+    if not datasets:
+        all_datasets = manager.get_all_inputs() + manager.get_all_outputs()
+        datasets = tuple(ds.slug for ds in all_datasets)
+
+    if not datasets:
+        click.echo("No datasets found.")
+        return
+
+    unlocked = []
+    for slug in datasets:
+        try:
+            manager.set_dataset_strict(slug, strict=False)
+            unlocked.append(slug)
+        except DatasetNotFoundError:
+            click.echo(f"Warning: Dataset '{slug}' not found", err=True)
+
+    if unlocked:
+        click.echo(f"✓ Unlocked {len(unlocked)} dataset(s): {', '.join(unlocked)}")
+
+
+# =============================================================================
+# Package commands
+# =============================================================================
+
+
+@main.group()
+def package() -> None:
+    """Manage data packages."""
+    pass
+
+
+@package.command("build")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+@click.option("-o", "--output", "output_file", type=click.Path(), default="datapackage.json", help="Output file path")
+def package_build(datasets_file: str, output_file: str) -> None:
+    """Build a datapackage.json from datasets.yaml.
+
+    Creates a Data Package (https://datapackage.org/) with all output datasets as resources.
+    """
+    try:
+        manager, project_path = get_manager(datasets_file)
+    except FileNotFoundError as e:
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+
+    outputs = manager.get_all_outputs()
+    if not outputs:
+        click.echo("No output datasets found.", err=True)
+        sys.exit(1)
+
+    project_slug = get_project_slug(project_path)
+
+    try:
+        from frictionless import describe
+    except ImportError:
+        click.echo("Error: frictionless is required for package build", err=True)
+        sys.exit(1)
+
+    resources = []
+    for ds in outputs:
+        data_path = manager.get_absolute_path(ds.location)
+        if not data_path.exists():
+            click.echo(f"Warning: Data file not found for '{ds.slug}': {data_path}", err=True)
+            continue
+
+        try:
+            resource = describe(str(data_path))
+            resource.name = ds.slug
+            resource.title = ds.name
+            # Use relative path in the package
+            resource.path = ds.location
+            resources.append(resource.to_dict())
+            click.echo(f" + {ds.slug}")
+        except Exception as e:
+            click.echo(f"Warning: Failed to describe '{ds.slug}': {e}", err=True)
+
+    if not resources:
+        click.echo("Error: No resources could be added to the package", err=True)
+        sys.exit(1)
+
+    datapackage = {
+        "name": project_slug,
+        "resources": resources,
+    }
+
+    output_path = Path(output_file)
+    with open(output_path, "w") as f:
+        json.dump(datapackage, f, indent=2)
+
+    click.echo(f"\n✓ Created {output_file} with {len(resources)} resource(s)")
+
+
+@package.command("push")
+@click.option("--env", type=click.Choice(["dev", "prod"]), default="dev", help="Target environment")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+@click.option("--destination", "-d", "destination", type=str, default=None, help="Override destination gs:// URL")
+def package_push(env: str, datasets_file: str, destination: Optional[str]) -> None:
+    """Push the data package to Google Cloud Storage.
+
+    Uploads datapackage.json and all publishable output datasets.
+    """
+    try:
+        manager, project_path = get_manager(datasets_file)
+    except FileNotFoundError as e:
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+
+    outputs = manager.get_all_outputs()
+    publishable = [ds for ds in outputs if ds.is_publishable]
+
+    if not publishable:
+        click.echo("Error: No publishable datasets found (need publish.enabled: true)", err=True)
+        sys.exit(1)
+
+    project_slug = get_project_slug(project_path)
+
+    # Determine destination
+    if destination:
+        dest_url = expand_env_vars(destination)
+    elif publishable[0].publish and publishable[0].publish.to:
+        # Use first dataset's publish.to as package destination
+        dest_url = expand_env_vars(publishable[0].publish.to)
+    else:
+        dest_url = f"gs://payloadcms-{env}/datasets/projects/{project_slug}/"
+
+    parsed = urlparse(dest_url)
+    if parsed.scheme != "gs":
+        click.echo(f"Error: Destination must be a gs:// URL, got: {dest_url}", err=True)
+        sys.exit(1)
+
+    bucket_name = parsed.netloc
+    gcs_prefix = parsed.path.lstrip("/")
+    if gcs_prefix and not gcs_prefix.endswith("/"):
+        gcs_prefix += "/"
+
+    # Build the datapackage
+    try:
+        from frictionless import describe
+    except ImportError:
+        click.echo("Error: frictionless is required for package push", err=True)
+        sys.exit(1)
+
+    resources = []
+    data_files: list[tuple[Path, str]] = []  # (local_path, remote_name)
+
+    for ds in publishable:
+        data_path = manager.get_absolute_path(ds.location)
+        if not data_path.exists():
+            click.echo(f"Warning: Data file not found for '{ds.slug}': {data_path}", err=True)
+            continue
+
+        try:
+            resource = describe(str(data_path))
+            resource.name = ds.slug
+            resource.title = ds.name
+            resource.path = data_path.name  # Just the filename in the package
+            resources.append(resource.to_dict())
+            data_files.append((data_path, data_path.name))
+        except Exception as e:
+            click.echo(f"Warning: Failed to describe '{ds.slug}': {e}", err=True)
+
+    if not resources:
+        click.echo("Error: No resources could be added to the package", err=True)
+        sys.exit(1)
+
+    datapackage = {
+        "name": project_slug,
+        "resources": resources,
+    }
+
+    # Upload to GCS
+    try:
+        from google.cloud import storage  # type: ignore[import-untyped]
+
+        client = storage.Client()
+        bucket = client.bucket(bucket_name)
+
+        # Upload datapackage.json
+        datapackage_blob = bucket.blob(f"{gcs_prefix}datapackage.json")
+        datapackage_blob.upload_from_string(json.dumps(datapackage, indent=2), content_type="application/json")
+        click.echo("✓ Uploaded datapackage.json")
+
+        # Upload data files
+        for local_path, remote_name in data_files:
+            data_blob = bucket.blob(f"{gcs_prefix}{remote_name}")
+            data_blob.upload_from_filename(str(local_path))
+            click.echo(f"✓ Uploaded {remote_name}")
+
+        click.echo(f"\nPackage pushed to: gs://{bucket_name}/{gcs_prefix}")
+
+    except ImportError:
+        click.echo("Error: google-cloud-storage is required for push", err=True)
+        click.echo("Install with: pip install google-cloud-storage", err=True)
+        sys.exit(1)
+    except Exception as e:
+        click.echo(f"Error uploading to GCS: {e}", err=True)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
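
For orientation: the `package push` command above resolves its destination through `expand_env_vars`. The standalone sketch below reproduces that `${VAR}` / `${VAR:-default}` behavior with the same `ENV_VAR_PATTERN` regex from the new cli.py; the environment values and URLs are illustrative only.

```python
import os
import re

# Same pattern as sunstone/cli.py: group 1 is the variable name,
# optional group 2 is the :-default fallback.
ENV_VAR_PATTERN = re.compile(r"\$\{([^}:]+)(?::-([^}]*))?\}")

def expand_env_vars(text: str) -> str:
    def replace_var(match: re.Match[str]) -> str:
        value = os.environ.get(match.group(1))
        if value is not None:
            return value
        if match.group(2) is not None:
            return match.group(2)  # fall back to the :-default
        return match.group(0)      # leave unresolved references untouched
    return ENV_VAR_PATTERN.sub(replace_var, text)

os.environ["BUCKET"] = "my-bucket"                         # illustrative value
print(expand_env_vars("gs://${BUCKET}/datasets/"))         # gs://my-bucket/datasets/
print(expand_env_vars("gs://${MISSING:-fallback}/data/"))  # gs://fallback/data/
print(expand_env_vars("gs://${UNSET}/data/"))              # gs://${UNSET}/data/
```
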
sunstone/dataframe.py CHANGED
@@ -10,7 +10,7 @@ import pandas as pd
 
 from .datasets import DatasetsManager
 from .exceptions import DatasetNotFoundError, StrictModeError
-from .lineage import FieldSchema, LineageMetadata
+from .lineage import FieldSchema, LineageMetadata, compute_dataframe_hash
 
 pd.options.mode.copy_on_write = True
 
@@ -196,7 +196,6 @@ class DataFrame:
         # Create lineage metadata
         lineage = LineageMetadata(project_path=str(manager.project_path))
         lineage.add_source(dataset)
-        lineage.add_operation(f"read_dataset({dataset.slug}, format={format})")
 
         # Return wrapped DataFrame
         return cls(data=df, lineage=lineage, strict=strict, project_path=project_path)
@@ -294,7 +293,6 @@ class DataFrame:
         # Create lineage metadata
         lineage = LineageMetadata(project_path=str(manager.project_path))
         lineage.add_source(dataset)
-        lineage.add_operation(f"read_csv({dataset.slug})")
 
         # Return wrapped DataFrame
         return cls(data=df, lineage=lineage, strict=strict, project_path=project_path)
@@ -363,11 +361,13 @@ class DataFrame:
         absolute_path.parent.mkdir(parents=True, exist_ok=True)
         self.data.to_csv(absolute_path, **kwargs)
 
-        # Record the operation
-        self.lineage.add_operation(f"to_csv({dataset.slug})")
+        # Compute content hash for change detection
+        content_hash = compute_dataframe_hash(self.data)
 
         # Persist lineage metadata to datasets.yaml
-        manager.update_output_lineage(slug=dataset.slug, lineage=self.lineage, strict=self.strict_mode)
+        manager.update_output_lineage(
+            slug=dataset.slug, lineage=self.lineage, content_hash=content_hash, strict=self.strict_mode
+        )
 
     def _infer_field_schema(self) -> List[FieldSchema]:
         """
@@ -410,11 +410,8 @@ class DataFrame:
         # Perform the merge
        merged_data = pd.merge(self.data, right.data, **kwargs)
 
-        # Combine lineage
+        # Combine lineage (sources from both DataFrames)
         merged_lineage = self.lineage.merge(right.lineage)
-        merged_lineage.add_operation(
-            f"merge(left={len(self.lineage.sources)} sources, right={len(right.lineage.sources)} sources)"
-        )
 
         return DataFrame(
             data=merged_data,
@@ -437,11 +434,8 @@ class DataFrame:
         # Perform the join
         joined_data = self.data.join(other.data, **kwargs)
 
-        # Combine lineage
+        # Combine lineage (sources from both DataFrames)
         joined_lineage = self.lineage.merge(other.lineage)
-        joined_lineage.add_operation(
-            f"join(left={len(self.lineage.sources)} sources, right={len(other.lineage.sources)} sources)"
-        )
 
         return DataFrame(
             data=joined_data,
@@ -467,16 +461,11 @@ class DataFrame:
         # Concatenate
         concatenated_data = pd.concat(all_dfs, **kwargs)
 
-        # Combine lineage from all DataFrames
+        # Combine lineage (sources from all DataFrames)
         combined_lineage = self.lineage
         for other in others:
            combined_lineage = combined_lineage.merge(other.lineage)
 
-        combined_lineage.add_operation(
-            f"concat({len(others) + 1} dataframes, "
-            f"{sum(len(df.lineage.sources) for df in [self] + others)} total sources)"
-        )
-
         return DataFrame(
             data=concatenated_data,
             lineage=combined_lineage,
@@ -484,42 +473,12 @@ class DataFrame:
             project_path=self.lineage.project_path,
         )
 
-    def apply_operation(self, operation: Callable[[pd.DataFrame], pd.DataFrame], description: str) -> "DataFrame":
-        """
-        Apply a transformation operation to the DataFrame.
-
-        Args:
-            operation: Function that takes a pandas DataFrame and returns a DataFrame.
-            description: Human-readable description of the operation.
-
-        Returns:
-            A new DataFrame with the operation applied and recorded in lineage.
-        """
-        # Apply the operation
-        new_data = operation(self.data)
-
-        # Copy lineage and add operation
-        new_lineage = LineageMetadata(
-            sources=self.lineage.sources.copy(),
-            operations=self.lineage.operations.copy(),
-            project_path=self.lineage.project_path,
-        )
-        new_lineage.add_operation(description)
-
-        return DataFrame(
-            data=new_data,
-            lineage=new_lineage,
-            strict=self.strict_mode,
-            project_path=self.lineage.project_path,
-        )
-
-    def _wrap_result(self, result: Any, operation: Optional[str] = None) -> Any:
+    def _wrap_result(self, result: Any) -> Any:
         """
         Wrap a pandas result in a Sunstone DataFrame if applicable.
 
         Args:
             result: The result from a pandas operation.
-            operation: Name of the operation performed. If None, no operation is recorded.
 
         Returns:
             Wrapped DataFrame if result is a DataFrame, otherwise the result.
@@ -527,11 +486,8 @@ class DataFrame:
         if isinstance(result, pd.DataFrame):
             new_lineage = LineageMetadata(
                 sources=self.lineage.sources.copy(),
-                operations=self.lineage.operations.copy(),
                 project_path=self.lineage.project_path,
             )
-            if operation is not None:
-                new_lineage.add_operation(operation)
 
             return DataFrame(
                 data=result,
@@ -541,28 +497,6 @@ class DataFrame:
             )
         return result
 
-    # Methods that don't represent meaningful data transformations
-    # These return DataFrames but shouldn't be tracked in lineage
-    _NON_TRACKING_METHODS = frozenset(
-        {
-            # Copy operations - same data, no transformation
-            "copy",
-            # Index operations - same data, different index
-            "reset_index",
-            "set_index",
-            "reindex",
-            # Type conversions without data change
-            "astype",
-            "infer_objects",
-            # Column/index renaming - same data, different labels
-            "rename",
-            "rename_axis",
-            # Reshaping without data loss
-            "T",
-            "transpose",
-        }
-    )
-
     def __getattr__(self, name: str) -> Any:
         """
         Delegate attribute access to the underlying pandas DataFrame.
@@ -583,14 +517,11 @@ class DataFrame:
 
             def wrapper(*args: Any, **kwargs: Any) -> Any:
                 result = attr(*args, **kwargs)
-                # Don't track non-transforming methods
-                if name in DataFrame._NON_TRACKING_METHODS:
-                    return self._wrap_result(result, operation=None)
-                return self._wrap_result(result, operation=f"{name}")
+                return self._wrap_result(result)
 
             return wrapper
 
-        return self._wrap_result(attr, operation=None)  # Don't track attribute access
+        return self._wrap_result(attr)
 
     def __getitem__(self, key: Any) -> Any:
         """
@@ -603,9 +534,7 @@ class DataFrame:
             The item from the underlying DataFrame, wrapped if it's a DataFrame.
         """
         result = self.data[key]
-        # Don't track __getitem__ as an operation - it's just column/row access
-        # not a meaningful transformation
-        return self._wrap_result(result)
+        return self._wrap_result(result)
 
     def __setitem__(self, key: Any, value: Any) -> None:
         """
@@ -616,14 +545,12 @@ class DataFrame:
             value: Value to assign.
         """
         self.data[key] = value
-        # Track column assignment in lineage
-        self.lineage.add_operation(f"__setitem__({key!r})")
+        # Don't track column assignments automatically
+        # Users should use add_operation() for meaningful transformations
 
     def __repr__(self) -> str:
         """String representation of the DataFrame."""
-        lineage_info = (
-            f"\n\nLineage: {len(self.lineage.sources)} source(s), {len(self.lineage.operations)} operation(s)"
-        )
+        lineage_info = f"\n\nLineage: {len(self.lineage.sources)} source(s)"
        return repr(self.data) + lineage_info
 
     def __str__(self) -> str:
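
A hedged usage sketch of what this diff leaves in place: lineage now records sources only, and combining wrapped DataFrames unions those source lists. It assumes sunstone-py 0.6.0 is installed; the dataset slugs and the merge key are hypothetical, and the exact `read_dataset` signature is an assumption based on the code above.

```python
from sunstone.dataframe import DataFrame

# "orders" and "customers" are hypothetical input slugs from a project's
# datasets.yaml; read_dataset is the classmethod shown in this diff.
orders = DataFrame.read_dataset("orders")
customers = DataFrame.read_dataset("customers")

# merge() delegates to pd.merge and unions the two source lists;
# as of 0.6.0 no per-operation log is kept.
combined = orders.merge(customers, on="customer_id")
print(combined)  # repr now ends with "Lineage: 2 source(s)"
```
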
sunstone/datasets.py CHANGED
@@ -15,7 +15,7 @@ import requests
 from ruamel.yaml import YAML
 
 from .exceptions import DatasetNotFoundError, DatasetValidationError
-from .lineage import DatasetMetadata, FieldSchema, LineageMetadata, Source, SourceLocation
+from .lineage import DatasetMetadata, FieldSchema, LineageMetadata, PublishConfig, Source, SourceLocation
 
 logger = logging.getLogger(__name__)
 
@@ -156,6 +156,26 @@
             for field in fields_data
         ]
 
+    def _parse_publish(self, publish_data: Any) -> Optional[PublishConfig]:
+        """
+        Parse publish configuration from YAML.
+
+        Supports both legacy boolean format and new object format:
+        - publish: true -> PublishConfig(enabled=True)
+        - publish: false -> None
+        - publish: { enabled: true, to: "..." } -> PublishConfig(enabled=True, to="...")
+        """
+        if publish_data is None:
+            return None
+        if isinstance(publish_data, bool):
+            return PublishConfig(enabled=publish_data) if publish_data else None
+        if isinstance(publish_data, dict):
+            enabled = publish_data.get("enabled", False)
+            if not enabled:
+                return None
+            return PublishConfig(enabled=True, to=publish_data.get("to"))
+        return None
+
     def _parse_dataset(self, dataset_data: Dict[str, Any], dataset_type: str) -> DatasetMetadata:
         """
         Parse dataset metadata from YAML data.
@@ -177,7 +197,8 @@
             location=dataset_data["location"],
             fields=self._parse_fields(dataset_data["fields"]),
             source=source,
-            publish=dataset_data.get("publish", False),
+            publish=self._parse_publish(dataset_data.get("publish")),
+            strict=dataset_data.get("strict", False),
             dataset_type=dataset_type,
         )
 
@@ -380,22 +401,57 @@
 
         raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
 
-    def update_output_lineage(self, slug: str, lineage: LineageMetadata, strict: bool = False) -> None:
+    def set_dataset_strict(self, slug: str, strict: bool, dataset_type: Optional[str] = None) -> None:
+        """
+        Set or remove strict mode for a dataset.
+
+        Args:
+            slug: The slug of the dataset to update.
+            strict: If True, enable strict mode. If False, disable it.
+            dataset_type: Optional filter by 'input' or 'output'. If None, searches both.
+
+        Raises:
+            DatasetNotFoundError: If the dataset doesn't exist.
+        """
+        search_types = ["input", "output"] if dataset_type is None else [dataset_type]
+
+        for dtype in search_types:
+            key = "inputs" if dtype == "input" else "outputs"
+            for dataset_data in self._data.get(key, []):
+                if dataset_data["slug"] == slug:
+                    if strict:
+                        dataset_data["strict"] = True
+                    elif "strict" in dataset_data:
+                        del dataset_data["strict"]
+                    self._save()
+                    return
+
+        raise DatasetNotFoundError(f"Dataset with slug '{slug}' not found")
+
+    def update_output_lineage(
+        self, slug: str, lineage: LineageMetadata, content_hash: str, strict: bool = False
+    ) -> None:
         """
         Update lineage metadata for an output dataset.
 
+        The timestamp is only updated when the content hash changes, preventing
+        unnecessary updates when the data hasn't changed.
+
         In strict mode, validates that the lineage matches what would be written
         without modifying the file. In relaxed mode, updates the file with lineage.
 
         Args:
             slug: The slug of the output dataset to update.
             lineage: The lineage metadata to persist.
+            content_hash: SHA256 hash of the DataFrame content.
             strict: If True, validate without modifying. If False, update the file.
 
         Raises:
             DatasetNotFoundError: If the dataset doesn't exist.
             DatasetValidationError: In strict mode, if lineage differs from what's in the file.
         """
+        from datetime import datetime
+
         # Find the output dataset
         dataset_idx = None
         for i, dataset_data in enumerate(self._data["outputs"]):
@@ -406,23 +462,28 @@
         if dataset_idx is None:
             raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
 
-        # Build lineage metadata to add
-        lineage_data: dict[str, Any] = {}
+        # Get existing lineage data if present
+        existing_lineage = self._data["outputs"][dataset_idx].get("lineage", {})
+        existing_hash = existing_lineage.get("content_hash")
+        existing_timestamp = existing_lineage.get("created_at")
 
-        if lineage.sources:
-            lineage_data["sources"] = [
-                {
-                    "slug": src.slug,
-                    "name": src.name,
-                }
-                for src in lineage.sources
-            ]
+        # Determine if content has changed
+        content_changed = existing_hash != content_hash
 
-        if lineage.operations:
-            lineage_data["operations"] = lineage.operations.copy()
+        # Only update timestamp if content changed
+        if content_changed:
+            timestamp = datetime.now().isoformat()
+        else:
+            # Preserve existing timestamp
+            timestamp = existing_timestamp
 
-        if lineage.created_at:
-            lineage_data["created_at"] = lineage.created_at.isoformat()
+        # Build lineage metadata to add (order: content_hash, created_at, sources)
+        lineage_data: dict[str, Any] = {}
+        lineage_data["content_hash"] = content_hash
+        if timestamp:
+            lineage_data["created_at"] = timestamp
+        if lineage.sources:
+            lineage_data["sources"] = [{"slug": src.slug} for src in lineage.sources]
 
         # Create a copy of the data with updated lineage
         updated_data = self._data.copy()
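
To make the new publish semantics concrete, here is a self-contained sketch mirroring `_parse_publish` above (recast as a free function for the example): the legacy boolean form and the new object form both map onto `PublishConfig`, and anything not explicitly enabled parses to `None`.

```python
from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class PublishConfig:
    enabled: bool = False
    to: Optional[str] = None  # destination URL; supports ${VAR:-default}

def parse_publish(publish_data: Any) -> Optional[PublishConfig]:
    # Same branching as DatasetsManager._parse_publish in this diff.
    if publish_data is None:
        return None
    if isinstance(publish_data, bool):
        return PublishConfig(enabled=publish_data) if publish_data else None
    if isinstance(publish_data, dict):
        if not publish_data.get("enabled", False):
            return None
        return PublishConfig(enabled=True, to=publish_data.get("to"))
    return None

print(parse_publish(True))    # PublishConfig(enabled=True, to=None)  (legacy form)
print(parse_publish(False))   # None
print(parse_publish({"enabled": True, "to": "gs://${BUCKET}/x"}))
# PublishConfig(enabled=True, to='gs://${BUCKET}/x')
```
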
sunstone/lineage.py CHANGED
@@ -2,9 +2,13 @@
 Lineage metadata structures for tracking data provenance.
 """
 
+import hashlib
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+if TYPE_CHECKING:
+    import pandas as pd
 
 
 @dataclass
@@ -62,6 +66,17 @@
     """Optional constraints (e.g., enum values)."""
 
 
+@dataclass
+class PublishConfig:
+    """Configuration for publishing a dataset."""
+
+    enabled: bool = False
+    """Whether publishing is enabled."""
+
+    to: Optional[str] = None
+    """Optional destination URL (supports ${VAR:-default} substitution)."""
+
+
 @dataclass
 class DatasetMetadata:
     """Metadata for a dataset from datasets.yaml."""
@@ -81,30 +96,56 @@
     source: Optional[Source] = None
     """Source attribution (for input datasets)."""
 
-    publish: bool = False
-    """Whether this dataset should be published (for output datasets)."""
+    publish: Optional[PublishConfig] = None
+    """Publishing configuration (for output datasets)."""
+
+    strict: bool = False
+    """Whether strict mode is enabled (lineage cannot be modified)."""
 
     dataset_type: str = "input"
     """Type of dataset: 'input' or 'output'."""
 
+    @property
+    def is_publishable(self) -> bool:
+        """Check if this dataset is configured for publishing."""
+        return self.publish is not None and self.publish.enabled
+
+
+def compute_dataframe_hash(df: "pd.DataFrame") -> str:
+    """
+    Compute a fast SHA256 hash of a pandas DataFrame's content.
+
+    Uses pickle serialization for a consistent, fast representation of the data.
+
+    Args:
+        df: The pandas DataFrame to hash.
+
+    Returns:
+        A SHA256 hex digest string representing the DataFrame content.
+    """
+    import pickle
+
+    # Use pickle protocol 5 for efficiency; hash the bytes directly
+    data_bytes = pickle.dumps(df, protocol=5)
+    return hashlib.sha256(data_bytes).hexdigest()
+
 
 @dataclass
 class LineageMetadata:
     """
     Lineage metadata tracking the provenance of data in a DataFrame.
 
-    This tracks all source datasets that contributed to the current DataFrame,
-    including information about transformations and operations performed.
+    This tracks all source datasets that contributed to the current DataFrame.
     """
 
     sources: List[DatasetMetadata] = field(default_factory=list)
     """List of source datasets that contributed to this data."""
 
-    operations: List[str] = field(default_factory=list)
-    """List of operations performed on the data."""
+    created_at: Optional[datetime] = None
+    """Timestamp when this lineage was last updated (content changed)."""
 
-    created_at: datetime = field(default_factory=datetime.now)
-    """Timestamp when this lineage was created."""
+    content_hash: Optional[str] = None
+    """SHA256 hash of the DataFrame content, used to detect changes."""
 
     project_path: Optional[str] = None
     """Path to the project directory containing datasets.yaml."""
@@ -119,15 +160,6 @@ class LineageMetadata:
         if dataset not in self.sources:
             self.sources.append(dataset)
 
-    def add_operation(self, operation: str) -> None:
-        """
-        Record an operation performed on the data.
-
-        Args:
-            operation: Description of the operation.
-        """
-        self.operations.append(operation)
-
     def merge(self, other: "LineageMetadata") -> "LineageMetadata":
         """
         Merge lineage from another DataFrame.
@@ -136,12 +168,10 @@
             other: The other lineage metadata to merge.
 
         Returns:
-            A new LineageMetadata with combined sources and operations.
+            A new LineageMetadata with combined sources.
         """
         merged = LineageMetadata(
             sources=self.sources.copy(),
-            operations=self.operations.copy(),
-            created_at=datetime.now(),
             project_path=self.project_path or other.project_path,
         )
 
@@ -150,9 +180,6 @@
         if source not in merged.sources:
             merged.sources.append(source)
 
-        # Combine operations
-        merged.operations.extend(other.operations)
-
         return merged
 
     def get_licenses(self) -> List[str]:
@@ -175,16 +202,18 @@
         Returns:
             Dictionary containing lineage information.
         """
-        return {
+        result: Dict[str, Any] = {
             "sources": [
                 {
-                    "name": src.name,
                     "slug": src.slug,
+                    "name": src.name,
                     "location": src.location,
                 }
                 for src in self.sources
             ],
-            "operations": self.operations,
-            "created_at": self.created_at.isoformat(),
-            "licenses": self.get_licenses(),
         }
+        if self.created_at is not None:
+            result["created_at"] = self.created_at.isoformat()
+        if self.content_hash is not None:
+            result["content_hash"] = self.content_hash
+        return result
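
A short, runnable illustration of what `compute_dataframe_hash` is for: `update_output_lineage` refreshes `created_at` only when this hash changes. The function body is copied from the diff above; the sample frame is illustrative.

```python
import hashlib
import pickle

import pandas as pd

def compute_dataframe_hash(df: pd.DataFrame) -> str:
    # Same implementation as sunstone/lineage.py above.
    return hashlib.sha256(pickle.dumps(df, protocol=5)).hexdigest()

df = pd.DataFrame({"a": [1, 2, 3]})      # illustrative data
h1 = compute_dataframe_hash(df)
assert compute_dataframe_hash(df) == h1  # unchanged data -> unchanged hash, timestamp preserved
df.loc[0, "a"] = 99
assert compute_dataframe_hash(df) != h1  # content change -> new hash, new created_at
```
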
{sunstone_py-0.5.2.dist-info → sunstone_py-0.6.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sunstone-py
-Version: 0.5.2
+Version: 0.6.0
 Summary: Python library for managing datasets with lineage tracking in Sunstone projects
 Author-email: Sunstone Institute <stig@sunstone.institute>
 License: MIT
@@ -17,8 +17,10 @@ Classifier: Programming Language :: Python :: 3.14
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: click>=8.0
 Requires-Dist: frictionless>=5.18.1
 Requires-Dist: google-auth>=2.43.0
+Requires-Dist: google-cloud-storage>=2.0.0
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: pyyaml>=6.0
 Requires-Dist: requests>=2.31.0
@@ -29,7 +31,7 @@ Dynamic: license-file
 
 A Python library for managing datasets with lineage tracking in data science projects.
 
-[![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
+[![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 
 ## Features
sunstone_py-0.6.0.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+sunstone/__init__.py,sha256=LC0ZtmxP26eXPLKejbg7UStcHOnE_lwttNTL4m3F4yM,2032
+sunstone/_release.py,sha256=MQNaUD7mSK6h8vu6EIgJuaMlAxuFxv82NQwHgBpLZm4,14907
+sunstone/cli.py,sha256=YNwMXWCezQCJikJEC1iprf4rl5hsTr0V8toETVoRVCk,17905
+sunstone/dataframe.py,sha256=rFGuMq-63Haua_QQfR3E708KYc1g43yEyCej11_Gl3A,20679
+sunstone/datasets.py,sha256=9mJJ02UFcjFtbbx01rFLUMAacUPaJdothfqnTsc66kw,23851
+sunstone/exceptions.py,sha256=fiixXazur3LtQGy21bGEaSr356DObFcYxQJ3FvOxNec,623
+sunstone/lineage.py,sha256=iZiVBY-l-iEeVVlEORkow29fMM5UGtah8FU5ZVLetAI,6001
+sunstone/pandas.py,sha256=CLEqIIgTbMmpH73TPy_vDUPxQa37Hpmqn4r6No8PJwo,8188
+sunstone/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sunstone/validation.py,sha256=1356vcUc72a1zGBUe9Xjrcb5h41Xo53PaK2nnQ_FuSM,8286
+sunstone_py-0.6.0.dist-info/licenses/LICENSE,sha256=pB6VuR4QRjwjMjy8RSNGho-N1SUdu07ntIhT5lrhkzU,1078
+sunstone_py-0.6.0.dist-info/METADATA,sha256=3eqIzvMuCIMbuzLaAMcVMV_KsUxcvJNlh5drnUfV7hk,9529
+sunstone_py-0.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+sunstone_py-0.6.0.dist-info/entry_points.txt,sha256=DT-mp-lPl6UEcHBNs2o3HJ8dLp4iqMnzvHJhiLfCd0g,80
+sunstone_py-0.6.0.dist-info/top_level.txt,sha256=A2fW-7JO10rlx_L28Bc4FVvWt2R8kgvS8_TGPBhQp3c,9
+sunstone_py-0.6.0.dist-info/RECORD,,
{sunstone_py-0.5.2.dist-info → sunstone_py-0.6.0.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
{sunstone_py-0.5.2.dist-info → sunstone_py-0.6.0.dist-info}/entry_points.txt CHANGED
@@ -1,2 +1,3 @@
 [console_scripts]
 release = sunstone._release:main
+sunstone = sunstone.cli:main
sunstone_py-0.5.2.dist-info/RECORD REMOVED
@@ -1,15 +0,0 @@
-sunstone/__init__.py,sha256=LC0ZtmxP26eXPLKejbg7UStcHOnE_lwttNTL4m3F4yM,2032
-sunstone/_release.py,sha256=MQNaUD7mSK6h8vu6EIgJuaMlAxuFxv82NQwHgBpLZm4,14907
-sunstone/dataframe.py,sha256=UJgQx7auiNb6hSIvhB8EQs2afu-7S22xdWL5DZUr29g,23602
-sunstone/datasets.py,sha256=LdHk3Vkfc7QH2VxhSskRCm9wUFSkldCmgS_1c2KDAPA,21142
-sunstone/exceptions.py,sha256=fiixXazur3LtQGy21bGEaSr356DObFcYxQJ3FvOxNec,623
-sunstone/lineage.py,sha256=B9GKMu5-v8Izos5G40K_EvsCPJL3Z2Tg1T_Fc7ezSMI,5240
-sunstone/pandas.py,sha256=CLEqIIgTbMmpH73TPy_vDUPxQa37Hpmqn4r6No8PJwo,8188
-sunstone/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sunstone/validation.py,sha256=1356vcUc72a1zGBUe9Xjrcb5h41Xo53PaK2nnQ_FuSM,8286
-sunstone_py-0.5.2.dist-info/licenses/LICENSE,sha256=pB6VuR4QRjwjMjy8RSNGho-N1SUdu07ntIhT5lrhkzU,1078
-sunstone_py-0.5.2.dist-info/METADATA,sha256=uR8iPIENJBiPVFhtr5EXT3V6VAmLiju0CfFjm6oQubI,9460
-sunstone_py-0.5.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-sunstone_py-0.5.2.dist-info/entry_points.txt,sha256=0h6E88rH9a_503BAzXvFPR-UfmkrRFjcOf29DXgJNjk,51
-sunstone_py-0.5.2.dist-info/top_level.txt,sha256=A2fW-7JO10rlx_L28Bc4FVvWt2R8kgvS8_TGPBhQp3c,9
-sunstone_py-0.5.2.dist-info/RECORD,,