sunstone-py 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sunstone/dataframe.py ADDED
@@ -0,0 +1,607 @@
1
+ """
2
+ DataFrame wrapper with lineage tracking for Sunstone projects.
3
+ """
4
+
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Any, Callable, List, Optional, Union
8
+
9
+ import pandas as pd
10
+
11
+ from .datasets import DatasetsManager
12
+ from .exceptions import DatasetNotFoundError, StrictModeError
13
+ from .lineage import FieldSchema, LineageMetadata
14
+
15
+ pd.options.mode.copy_on_write = True
16
+
17
+
18
class DataFrame:
    """
    A pandas DataFrame wrapper that maintains lineage metadata.

    This class wraps a pandas DataFrame and tracks the provenance of the data,
    ensuring that all reads and writes are registered in datasets.yaml files.

    Attributes:
        data: The underlying pandas DataFrame.
        lineage: Lineage metadata tracking data provenance.
        strict_mode: Whether to operate in strict mode.
    """

    def __init__(
        self,
        data: Any = None,
        lineage: Optional[LineageMetadata] = None,
        strict: Optional[bool] = None,
        project_path: Optional[Union[str, Path]] = None,
        **kwargs: Any,
    ):
        """
        Initialize a Sunstone DataFrame.

        Args:
            data: Data to wrap. Can be a pandas DataFrame or any data accepted
                by pandas.DataFrame() constructor (dict, list of dicts, etc.).
            lineage: Optional lineage metadata.
            strict: Whether to operate in strict mode. If None, reads from
                SUNSTONE_DATAFRAME_STRICT environment variable.
            project_path: Path to the project directory. If None, uses current directory.
            **kwargs: Additional arguments passed to pandas.DataFrame constructor.
                Note: ignored when ``data`` is already a pandas DataFrame.

        Note:
            Strict mode behavior:
            - strict=True: Operations that would modify datasets.yaml will fail
            - strict=False (relaxed): datasets.yaml will be updated as needed
            - Default is determined by SUNSTONE_DATAFRAME_STRICT env var
              ("1" or "true" -> strict mode, otherwise relaxed mode)
        """
        # Normalize the wrapped data to a pandas DataFrame.
        if data is None:
            self.data = pd.DataFrame(**kwargs)
        elif isinstance(data, pd.DataFrame):
            # Already a DataFrame; kwargs are intentionally not re-applied.
            self.data = data
        else:
            # dict, list of records, etc. -- let pandas interpret it.
            self.data = pd.DataFrame(data, **kwargs)

        self.lineage = lineage if lineage is not None else LineageMetadata()

        # Resolve strict mode. Delegate the env-var parsing to the shared
        # helper so the "1"/"true" convention lives in exactly one place.
        self.strict_mode = self._get_default_strict_mode() if strict is None else strict

        # Anchor the lineage to a project directory for datasets.yaml lookups.
        if project_path is not None:
            self.lineage.project_path = str(Path(project_path).resolve())
        elif self.lineage.project_path is None:
            self.lineage.project_path = str(Path.cwd())
82
+ def _get_datasets_manager(self) -> DatasetsManager:
83
+ """Get a DatasetsManager for the current project."""
84
+ if self.lineage.project_path is None:
85
+ raise ValueError("Project path not set")
86
+ return DatasetsManager(self.lineage.project_path)
87
+
88
+ @classmethod
89
+ def read_dataset(
90
+ cls,
91
+ slug: str,
92
+ project_path: Optional[Union[str, Path]] = None,
93
+ strict: Optional[bool] = None,
94
+ fetch_from_url: bool = True,
95
+ format: Optional[str] = None,
96
+ **kwargs: Any,
97
+ ) -> "DataFrame":
98
+ """
99
+ Read a dataset by slug from datasets.yaml with format auto-detection.
100
+
101
+ This method looks up a dataset by its slug in datasets.yaml and automatically
102
+ detects the file format from the file extension unless explicitly specified.
103
+
104
+ Supported formats:
105
+ - CSV (.csv)
106
+ - JSON (.json)
107
+ - Excel (.xlsx, .xls)
108
+ - Parquet (.parquet)
109
+ - TSV (.tsv, .txt with tab delimiter)
110
+
111
+ Args:
112
+ slug: Dataset slug to look up in datasets.yaml.
113
+ project_path: Path to project directory containing datasets.yaml.
114
+ strict: Whether to operate in strict mode.
115
+ fetch_from_url: If True and dataset has a source URL but no local file,
116
+ automatically fetch from URL.
117
+ format: Optional format override ('csv', 'json', 'excel', 'parquet', 'tsv').
118
+ If not provided, format is auto-detected from file extension.
119
+ **kwargs: Additional arguments passed to the pandas reader function.
120
+
121
+ Returns:
122
+ A new Sunstone DataFrame with lineage metadata.
123
+
124
+ Raises:
125
+ DatasetNotFoundError: If dataset with slug not found in datasets.yaml.
126
+ FileNotFoundError: If datasets.yaml doesn't exist.
127
+ ValueError: If format cannot be detected or is unsupported.
128
+
129
+ Examples:
130
+ >>> # Auto-detect format from extension
131
+ >>> df = DataFrame.read_dataset('official-un-member-states', project_path='/path/to/project')
132
+ >>>
133
+ >>> # Explicitly specify format
134
+ >>> df = DataFrame.read_dataset('my-data', format='json', project_path='/path/to/project')
135
+ """
136
+ if project_path is None:
137
+ project_path = Path.cwd()
138
+
139
+ manager = DatasetsManager(project_path)
140
+
141
+ # Look up by slug
142
+ dataset = manager.find_dataset_by_slug(slug)
143
+ if dataset is None:
144
+ raise DatasetNotFoundError(
145
+ f"Dataset with slug '{slug}' not found in datasets.yaml. Check that the dataset is registered."
146
+ )
147
+
148
+ # Get the file path
149
+ absolute_path = manager.get_absolute_path(dataset.location)
150
+
151
+ # If file doesn't exist and we have a source URL, fetch it
152
+ if not absolute_path.exists() and fetch_from_url:
153
+ if dataset.source and dataset.source.location.data:
154
+ absolute_path = manager.fetch_from_url(dataset)
155
+ else:
156
+ raise FileNotFoundError(
157
+ f"File not found: {absolute_path}\nDataset '{dataset.slug}' has no source URL to fetch from."
158
+ )
159
+
160
+ # Determine format
161
+ if format is None:
162
+ # Auto-detect from file extension
163
+ extension = absolute_path.suffix.lower()
164
+ format_map = {
165
+ ".csv": "csv",
166
+ ".json": "json",
167
+ ".xlsx": "excel",
168
+ ".xls": "excel",
169
+ ".parquet": "parquet",
170
+ ".tsv": "tsv",
171
+ ".txt": "tsv", # Assume tab-delimited for .txt
172
+ }
173
+ format = format_map.get(extension)
174
+ if format is None:
175
+ raise ValueError(
176
+ f"Cannot auto-detect format for file extension '{extension}'. "
177
+ f"Supported extensions: {', '.join(format_map.keys())}. "
178
+ f"Please specify format explicitly using the 'format' parameter."
179
+ )
180
+
181
+ # Read using appropriate pandas function
182
+ reader_map: dict[str, Callable[..., pd.DataFrame]] = {
183
+ "csv": pd.read_csv,
184
+ "json": pd.read_json,
185
+ "excel": pd.read_excel,
186
+ "parquet": pd.read_parquet,
187
+ "tsv": lambda path, **kw: pd.read_csv(path, sep="\t", **kw),
188
+ }
189
+
190
+ reader = reader_map.get(format)
191
+ if reader is None:
192
+ raise ValueError(f"Unsupported format '{format}'. Supported formats: {', '.join(reader_map.keys())}")
193
+
194
+ df = reader(absolute_path, **kwargs)
195
+
196
+ # Create lineage metadata
197
+ lineage = LineageMetadata(project_path=str(manager.project_path))
198
+ lineage.add_source(dataset)
199
+ lineage.add_operation(f"read_dataset({dataset.slug}, format={format})")
200
+
201
+ # Return wrapped DataFrame
202
+ return cls(data=df, lineage=lineage, strict=strict, project_path=project_path)
203
+
204
+ @classmethod
205
+ def read_csv(
206
+ cls,
207
+ filepath_or_buffer: Union[str, Path],
208
+ project_path: Optional[Union[str, Path]] = None,
209
+ strict: Optional[bool] = None,
210
+ fetch_from_url: bool = True,
211
+ **kwargs: Any,
212
+ ) -> "DataFrame":
213
+ """
214
+ Read a CSV file into a Sunstone DataFrame.
215
+
216
+ The file must be registered in datasets.yaml, otherwise this will fail
217
+ (or in relaxed mode, register it automatically).
218
+
219
+ Args:
220
+ filepath_or_buffer: Path to CSV file, URL, or dataset slug.
221
+ If it's a slug (e.g., 'official-un-member-states'),
222
+ the dataset will be looked up in datasets.yaml.
223
+ project_path: Path to project directory containing datasets.yaml.
224
+ strict: Whether to operate in strict mode.
225
+ fetch_from_url: If True and dataset has a source URL but no local file,
226
+ automatically fetch from URL.
227
+ **kwargs: Additional arguments passed to pandas.read_csv.
228
+
229
+ Returns:
230
+ A new Sunstone DataFrame with lineage metadata.
231
+
232
+ Raises:
233
+ DatasetNotFoundError: In strict mode, if dataset not found in datasets.yaml.
234
+ FileNotFoundError: If datasets.yaml doesn't exist.
235
+
236
+ Examples:
237
+ >>> # Load by slug
238
+ >>> df = DataFrame.read_csv('official-un-member-states', project_path='/path/to/project')
239
+ >>>
240
+ >>> # Load by file path
241
+ >>> df = DataFrame.read_csv('inputs/data.csv', project_path='/path/to/project')
242
+ """
243
+ location = str(filepath_or_buffer)
244
+
245
+ # Determine if this is a slug or a file path
246
+ # Slugs don't contain path separators and typically use kebab-case
247
+ is_slug = "/" not in location and "\\" not in location and not Path(location).suffix
248
+
249
+ if is_slug:
250
+ # Delegate to read_dataset with CSV format
251
+ return cls.read_dataset(
252
+ slug=location,
253
+ project_path=project_path,
254
+ strict=strict,
255
+ fetch_from_url=fetch_from_url,
256
+ format="csv",
257
+ **kwargs,
258
+ )
259
+
260
+ # File path - handle with original logic
261
+ if project_path is None:
262
+ project_path = Path.cwd()
263
+
264
+ manager = DatasetsManager(project_path)
265
+
266
+ # Look up by location
267
+ dataset = manager.find_dataset_by_location(location)
268
+ if dataset is None:
269
+ if strict or (strict is None and cls._get_default_strict_mode()):
270
+ raise DatasetNotFoundError(
271
+ f"Dataset at '{location}' not found in datasets.yaml. "
272
+ f"In strict mode, all datasets must be registered."
273
+ )
274
+ else:
275
+ raise DatasetNotFoundError(
276
+ f"Dataset at '{location}' not found in datasets.yaml. Please add it to datasets.yaml first."
277
+ )
278
+
279
+ # Use the requested location
280
+ absolute_path = manager.get_absolute_path(location)
281
+
282
+ # If file doesn't exist and we have a source URL, fetch it
283
+ if not absolute_path.exists() and fetch_from_url:
284
+ if dataset.source and dataset.source.location.data:
285
+ absolute_path = manager.fetch_from_url(dataset)
286
+ else:
287
+ raise FileNotFoundError(
288
+ f"File not found: {absolute_path}\nDataset '{dataset.slug}' has no source URL to fetch from."
289
+ )
290
+
291
+ # Read the CSV using pandas
292
+ df = pd.read_csv(absolute_path, **kwargs)
293
+
294
+ # Create lineage metadata
295
+ lineage = LineageMetadata(project_path=str(manager.project_path))
296
+ lineage.add_source(dataset)
297
+ lineage.add_operation(f"read_csv({dataset.slug})")
298
+
299
+ # Return wrapped DataFrame
300
+ return cls(data=df, lineage=lineage, strict=strict, project_path=project_path)
301
+
302
+ @staticmethod
303
+ def _get_default_strict_mode() -> bool:
304
+ """Get the default strict mode from environment variable."""
305
+ env_strict = os.environ.get("SUNSTONE_DATAFRAME_STRICT", "").lower()
306
+ return env_strict in ("1", "true")
307
+
308
+ def to_csv(
309
+ self,
310
+ path_or_buf: Union[str, Path],
311
+ slug: Optional[str] = None,
312
+ name: Optional[str] = None,
313
+ publish: bool = False,
314
+ **kwargs: Any,
315
+ ) -> None:
316
+ """
317
+ Write DataFrame to CSV file.
318
+
319
+ In strict mode, the output must already be registered in datasets.yaml.
320
+ In relaxed mode, it will be registered automatically if not present.
321
+
322
+ Args:
323
+ path_or_buf: File path for the output CSV.
324
+ slug: Dataset slug (required in relaxed mode if not registered).
325
+ name: Dataset name (required in relaxed mode if not registered).
326
+ publish: Whether to publish the dataset.
327
+ **kwargs: Additional arguments passed to pandas.to_csv.
328
+
329
+ Raises:
330
+ StrictModeError: In strict mode, if dataset not registered.
331
+ ValueError: In relaxed mode, if slug/name not provided for new dataset.
332
+ """
333
+ manager = self._get_datasets_manager()
334
+ location = str(path_or_buf)
335
+
336
+ # Try to find existing dataset
337
+ dataset = manager.find_dataset_by_location(location, "output")
338
+
339
+ if dataset is None:
340
+ if self.strict_mode:
341
+ raise StrictModeError(
342
+ f"Output dataset at '{location}' not registered in datasets.yaml. "
343
+ f"In strict mode, outputs must be pre-registered."
344
+ )
345
+ else:
346
+ # Relaxed mode: auto-register
347
+ if slug is None or name is None:
348
+ raise ValueError(
349
+ "In relaxed mode, 'slug' and 'name' are required "
350
+ "when writing to an unregistered output location."
351
+ )
352
+
353
+ # Infer field schema from DataFrame
354
+ fields = self._infer_field_schema()
355
+
356
+ # Register the new output
357
+ dataset = manager.add_output_dataset(
358
+ name=name, slug=slug, location=location, fields=fields, publish=publish
359
+ )
360
+
361
+ # Write the CSV
362
+ absolute_path = manager.get_absolute_path(dataset.location)
363
+ absolute_path.parent.mkdir(parents=True, exist_ok=True)
364
+ self.data.to_csv(absolute_path, **kwargs)
365
+
366
+ # Record the operation
367
+ self.lineage.add_operation(f"to_csv({dataset.slug})")
368
+
369
+ def _infer_field_schema(self) -> List[FieldSchema]:
370
+ """
371
+ Infer field schema from the DataFrame.
372
+
373
+ Returns:
374
+ List of FieldSchema objects based on DataFrame columns and dtypes.
375
+ """
376
+ fields = []
377
+ for col in self.data.columns:
378
+ dtype = self.data[col].dtype
379
+
380
+ # Map pandas dtypes to dataset types
381
+ if pd.api.types.is_integer_dtype(dtype):
382
+ field_type = "integer"
383
+ elif pd.api.types.is_float_dtype(dtype):
384
+ field_type = "number"
385
+ elif pd.api.types.is_bool_dtype(dtype):
386
+ field_type = "boolean"
387
+ elif pd.api.types.is_datetime64_any_dtype(dtype):
388
+ field_type = "datetime"
389
+ else:
390
+ field_type = "string"
391
+
392
+ fields.append(FieldSchema(name=str(col), type=field_type))
393
+
394
+ return fields
395
+
396
+ def merge(self, right: "DataFrame", **kwargs: Any) -> "DataFrame":
397
+ """
398
+ Merge with another Sunstone DataFrame, combining lineage.
399
+
400
+ Args:
401
+ right: The other DataFrame to merge with.
402
+ **kwargs: Arguments passed to pandas.merge.
403
+
404
+ Returns:
405
+ A new DataFrame with combined data and lineage.
406
+ """
407
+ # Perform the merge
408
+ merged_data = pd.merge(self.data, right.data, **kwargs)
409
+
410
+ # Combine lineage
411
+ merged_lineage = self.lineage.merge(right.lineage)
412
+ merged_lineage.add_operation(
413
+ f"merge(left={len(self.lineage.sources)} sources, right={len(right.lineage.sources)} sources)"
414
+ )
415
+
416
+ return DataFrame(
417
+ data=merged_data,
418
+ lineage=merged_lineage,
419
+ strict=self.strict_mode,
420
+ project_path=self.lineage.project_path,
421
+ )
422
+
423
+ def join(self, other: "DataFrame", **kwargs: Any) -> "DataFrame":
424
+ """
425
+ Join with another Sunstone DataFrame, combining lineage.
426
+
427
+ Args:
428
+ other: The other DataFrame to join with.
429
+ **kwargs: Arguments passed to pandas.join.
430
+
431
+ Returns:
432
+ A new DataFrame with combined data and lineage.
433
+ """
434
+ # Perform the join
435
+ joined_data = self.data.join(other.data, **kwargs)
436
+
437
+ # Combine lineage
438
+ joined_lineage = self.lineage.merge(other.lineage)
439
+ joined_lineage.add_operation(
440
+ f"join(left={len(self.lineage.sources)} sources, right={len(other.lineage.sources)} sources)"
441
+ )
442
+
443
+ return DataFrame(
444
+ data=joined_data,
445
+ lineage=joined_lineage,
446
+ strict=self.strict_mode,
447
+ project_path=self.lineage.project_path,
448
+ )
449
+
450
+ def concat(self, others: List["DataFrame"], **kwargs: Any) -> "DataFrame":
451
+ """
452
+ Concatenate with other Sunstone DataFrames, combining lineage.
453
+
454
+ Args:
455
+ others: List of other DataFrames to concatenate.
456
+ **kwargs: Arguments passed to pandas.concat.
457
+
458
+ Returns:
459
+ A new DataFrame with combined data and lineage.
460
+ """
461
+ # Collect all DataFrames
462
+ all_dfs = [self.data] + [df.data for df in others]
463
+
464
+ # Concatenate
465
+ concatenated_data = pd.concat(all_dfs, **kwargs)
466
+
467
+ # Combine lineage from all DataFrames
468
+ combined_lineage = self.lineage
469
+ for other in others:
470
+ combined_lineage = combined_lineage.merge(other.lineage)
471
+
472
+ combined_lineage.add_operation(
473
+ f"concat({len(others) + 1} dataframes, "
474
+ f"{sum(len(df.lineage.sources) for df in [self] + others)} total sources)"
475
+ )
476
+
477
+ return DataFrame(
478
+ data=concatenated_data,
479
+ lineage=combined_lineage,
480
+ strict=self.strict_mode,
481
+ project_path=self.lineage.project_path,
482
+ )
483
+
484
+ def apply_operation(self, operation: Callable[[pd.DataFrame], pd.DataFrame], description: str) -> "DataFrame":
485
+ """
486
+ Apply a transformation operation to the DataFrame.
487
+
488
+ Args:
489
+ operation: Function that takes a pandas DataFrame and returns a DataFrame.
490
+ description: Human-readable description of the operation.
491
+
492
+ Returns:
493
+ A new DataFrame with the operation applied and recorded in lineage.
494
+ """
495
+ # Apply the operation
496
+ new_data = operation(self.data)
497
+
498
+ # Copy lineage and add operation
499
+ new_lineage = LineageMetadata(
500
+ sources=self.lineage.sources.copy(),
501
+ operations=self.lineage.operations.copy(),
502
+ project_path=self.lineage.project_path,
503
+ )
504
+ new_lineage.add_operation(description)
505
+
506
+ return DataFrame(
507
+ data=new_data,
508
+ lineage=new_lineage,
509
+ strict=self.strict_mode,
510
+ project_path=self.lineage.project_path,
511
+ )
512
+
513
+ def _wrap_result(self, result: Any, operation: str = "pandas_operation") -> Any:
514
+ """
515
+ Wrap a pandas result in a Sunstone DataFrame if applicable.
516
+
517
+ Args:
518
+ result: The result from a pandas operation.
519
+ operation: Name of the operation performed.
520
+
521
+ Returns:
522
+ Wrapped DataFrame if result is a DataFrame, otherwise the result.
523
+ """
524
+ if isinstance(result, pd.DataFrame):
525
+ new_lineage = LineageMetadata(
526
+ sources=self.lineage.sources.copy(),
527
+ operations=self.lineage.operations.copy(),
528
+ project_path=self.lineage.project_path,
529
+ )
530
+ new_lineage.add_operation(operation)
531
+
532
+ return DataFrame(
533
+ data=result,
534
+ lineage=new_lineage,
535
+ strict=self.strict_mode,
536
+ project_path=self.lineage.project_path,
537
+ )
538
+ return result
539
+
540
+ def __getattr__(self, name: str) -> Any:
541
+ """
542
+ Delegate attribute access to the underlying pandas DataFrame.
543
+
544
+ Args:
545
+ name: Attribute name.
546
+
547
+ Returns:
548
+ The attribute from the underlying DataFrame, wrapped if it's a method or DataFrame.
549
+ """
550
+ # Special handling for pandas indexers - return as-is
551
+ if name in ("loc", "iloc", "at", "iat"):
552
+ return getattr(self.data, name)
553
+
554
+ attr = getattr(self.data, name)
555
+
556
+ if callable(attr):
557
+
558
+ def wrapper(*args: Any, **kwargs: Any) -> Any:
559
+ result = attr(*args, **kwargs)
560
+ return self._wrap_result(result, operation=f"{name}")
561
+
562
+ return wrapper
563
+
564
+ return self._wrap_result(attr, operation=f"access_attribute_{name}")
565
+
566
+ def __getitem__(self, key: Any) -> Any:
567
+ """
568
+ Delegate item access to the underlying pandas DataFrame.
569
+
570
+ Args:
571
+ key: Index key.
572
+
573
+ Returns:
574
+ The item from the underlying DataFrame, wrapped if it's a DataFrame.
575
+ """
576
+ result = self.data[key]
577
+ return self._wrap_result(result, operation="__getitem__")
578
+
579
+ def __setitem__(self, key: Any, value: Any) -> None:
580
+ """
581
+ Delegate item assignment to the underlying pandas DataFrame.
582
+
583
+ Args:
584
+ key: Index key.
585
+ value: Value to assign.
586
+ """
587
+ self.data[key] = value
588
+ self.lineage.add_operation("__setitem__")
589
+
590
+ def __repr__(self) -> str:
591
+ """String representation of the DataFrame."""
592
+ lineage_info = (
593
+ f"\n\nLineage: {len(self.lineage.sources)} source(s), {len(self.lineage.operations)} operation(s)"
594
+ )
595
+ return repr(self.data) + lineage_info
596
+
597
+ def __str__(self) -> str:
598
+ """String representation of the DataFrame."""
599
+ return str(self.data)
600
+
601
+ def __len__(self) -> int:
602
+ """Return the number of rows in the DataFrame."""
603
+ return len(self.data)
604
+
605
+ def __iter__(self) -> Any:
606
+ """Iterate over column names."""
607
+ return iter(self.data)