sunstone-py 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sunstone/datasets.py ADDED
@@ -0,0 +1,480 @@
+"""
+Parser and manager for datasets.yaml files.
+"""
+
+import ipaddress
+import logging
+import socket
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+from urllib.parse import urljoin, urlparse
+
+import requests
+import yaml
+
+from .exceptions import DatasetNotFoundError, DatasetValidationError
+from .lineage import DatasetMetadata, FieldSchema, Source, SourceLocation
+
+logger = logging.getLogger(__name__)
+
+
+def _is_public_url(url: str) -> bool:
+    """
+    Validate that a URL points to a public (non-private) resource.
+
+    This function prevents SSRF attacks by blocking:
+    - Non-HTTP(S) schemes (e.g., file://, ftp://)
+    - Private IP addresses (10.x.x.x, 172.16-31.x.x, 192.168.x.x)
+    - Localhost and loopback addresses
+    - Link-local addresses (169.254.x.x)
+
+    Args:
+        url: The URL to validate.
+
+    Returns:
+        True if the URL points to a public resource, False otherwise.
+
+    Raises:
+        Exception: Re-raises unexpected exceptions after logging.
+    """
+    try:
+        parsed = urlparse(url)
+
+        # Only allow HTTP and HTTPS schemes
+        if parsed.scheme not in ("http", "https"):
+            logger.warning("URL scheme '%s' not allowed (only http/https permitted)", parsed.scheme)
+            return False
+
+        # Ensure hostname is present
+        if not parsed.hostname:
+            logger.warning("URL has no hostname")
+            return False
+
+        # Resolve hostname to all IP addresses (IPv4 and IPv6) and check each
+        addrinfos = socket.getaddrinfo(parsed.hostname, None)
+        for addrinfo in addrinfos:
+            sockaddr = addrinfo[4]
+            ip = sockaddr[0]
+            ip_obj = ipaddress.ip_address(ip)
+
+            # Block private, loopback, and link-local addresses
+            if ip_obj.is_private or ip_obj.is_loopback or ip_obj.is_link_local:
+                logger.warning(
+                    "URL hostname '%s' resolves to restricted IP address: %s",
+                    parsed.hostname,
+                    ip,
+                )
+                return False
+
+        return True
+
+    except socket.gaierror:
+        logger.warning("Unable to resolve hostname: %s", parsed.hostname)
+        return False
+    except ValueError as e:
+        logger.warning("Error validating URL '%s': %s", url, e)
+        return False
+    except Exception as e:
+        logger.exception("Unexpected error validating URL '%s': %s", url, e)
+        raise
+
+
+class DatasetsManager:
+    """
+    Manager for parsing and updating datasets.yaml files.
+
+    This class handles reading, parsing, and updating dataset metadata
+    from datasets.yaml files in Sunstone projects.
+    """
+
+    def __init__(self, project_path: Union[str, Path]):
+        """
+        Initialize the datasets manager.
+
+        Args:
+            project_path: Path to the project directory containing datasets.yaml.
+
+        Raises:
+            FileNotFoundError: If datasets.yaml doesn't exist in the project path.
+        """
+        self.project_path = Path(project_path).resolve()
+        self.datasets_file = self.project_path / "datasets.yaml"
+
+        if not self.datasets_file.exists():
+            raise FileNotFoundError(f"datasets.yaml not found in {self.project_path}")
+
+        self._data: Dict[str, Any] = {}
+        self._load()
+
+    def _load(self) -> None:
+        """Load and parse the datasets.yaml file."""
+        with open(self.datasets_file, "r") as f:
+            self._data = yaml.safe_load(f) or {}
+
+        if "inputs" not in self._data:
+            self._data["inputs"] = []
+        if "outputs" not in self._data:
+            self._data["outputs"] = []
+
+    def _save(self) -> None:
+        """Save the current data back to datasets.yaml."""
+        with open(self.datasets_file, "w") as f:
+            yaml.dump(self._data, f, default_flow_style=False, sort_keys=False)
+
+    def _parse_source_location(self, loc_data: Dict[str, Any]) -> SourceLocation:
+        """Parse source location data from YAML."""
+        return SourceLocation(
+            data=loc_data.get("data"),
+            metadata=loc_data.get("metadata"),
+            about=loc_data.get("about"),
+        )
+
+    def _parse_source(self, source_data: Dict[str, Any]) -> Source:
+        """Parse source attribution data from YAML."""
+        return Source(
+            name=source_data["name"],
+            location=self._parse_source_location(source_data["location"]),
+            attributed_to=source_data["attributedTo"],
+            acquired_at=source_data["acquiredAt"],
+            acquisition_method=source_data["acquisitionMethod"],
+            license=source_data["license"],
+            updated=source_data.get("updated"),
+        )
+
+    def _parse_fields(self, fields_data: List[Dict[str, Any]]) -> List[FieldSchema]:
+        """Parse field schema data from YAML."""
+        return [
+            FieldSchema(name=field["name"], type=field["type"], constraints=field.get("constraints"))
+            for field in fields_data
+        ]
+
+    def _parse_dataset(self, dataset_data: Dict[str, Any], dataset_type: str) -> DatasetMetadata:
+        """
+        Parse dataset metadata from YAML data.
+
+        Args:
+            dataset_data: Raw dataset data from YAML.
+            dataset_type: Either 'input' or 'output'.
+
+        Returns:
+            Parsed DatasetMetadata object.
+        """
+        source = None
+        if "source" in dataset_data:
+            source = self._parse_source(dataset_data["source"])
+
+        return DatasetMetadata(
+            name=dataset_data["name"],
+            slug=dataset_data["slug"],
+            location=dataset_data["location"],
+            fields=self._parse_fields(dataset_data["fields"]),
+            source=source,
+            publish=dataset_data.get("publish", False),
+            dataset_type=dataset_type,
+        )
+
+    def find_dataset_by_location(self, location: str, dataset_type: Optional[str] = None) -> Optional[DatasetMetadata]:
+        """
+        Find a dataset by its file location.
+
+        Args:
+            location: The file path or URL to search for.
+            dataset_type: Optional filter by 'input' or 'output'.
+
+        Returns:
+            DatasetMetadata if found, None otherwise.
+        """
+        # Normalize location to handle both absolute and relative paths
+        location_path = Path(location)
+        if location_path.is_absolute():
+            # Try to make it relative to project path
+            try:
+                location = str(location_path.relative_to(self.project_path))
+            except ValueError:
+                # Not relative to project path, use as-is
+                location = str(location_path)
+        else:
+            location = str(location_path)
+
+        search_types = ["input", "output"] if dataset_type is None else [dataset_type]
+
+        # Resolve the requested location to an absolute path
+        location_path = Path(location)
+        if not location_path.is_absolute():
+            location_abs = (self.project_path / location_path).resolve()
+        else:
+            location_abs = location_path.resolve()
+
+        for dtype in search_types:
+            key = "inputs" if dtype == "input" else "outputs"
+            for dataset_data in self._data.get(key, []):
+                dataset_location = dataset_data["location"]
+
+                # Try multiple resolution strategies:
+                # 1. Direct string match
+                if dataset_location == location:
+                    return self._parse_dataset(dataset_data, dtype)
+
+                # 2. Resolve dataset location as-is
+                dataset_loc = Path(dataset_location)
+                if not dataset_loc.is_absolute():
+                    dataset_abs = (self.project_path / dataset_loc).resolve()
+                else:
+                    dataset_abs = dataset_loc.resolve()
+
+                if dataset_abs == location_abs:
+                    return self._parse_dataset(dataset_data, dtype)
+
+                # 3. If the requested location exists, and just the filename matches,
+                #    check if they point to the same existing file
+                if location_abs.exists() and dataset_abs.exists():
+                    if location_abs.samefile(dataset_abs):
+                        return self._parse_dataset(dataset_data, dtype)
+
+                # 4. If requested location exists but dataset location in yaml doesn't,
+                #    check if the filename matches (for cases where the directory changed)
+                if location_abs.exists() and not dataset_abs.exists():
+                    if dataset_loc.name == location_path.name:
+                        # Same filename - this might be a match
+                        if (
+                            location_abs.samefile(self.project_path / dataset_loc.name)
+                            if (self.project_path / dataset_loc.name).exists()
+                            else False
+                        ):
+                            return self._parse_dataset(dataset_data, dtype)
+                        # Check in common subdirectories
+                        for subdir in ["inputs", "outputs", "data"]:
+                            candidate = self.project_path / subdir / dataset_loc.name
+                            if candidate.exists() and location_abs.samefile(candidate):
+                                return self._parse_dataset(dataset_data, dtype)
+
+        return None
+
+    def find_dataset_by_slug(self, slug: str, dataset_type: Optional[str] = None) -> Optional[DatasetMetadata]:
+        """
+        Find a dataset by its slug.
+
+        Args:
+            slug: The dataset slug to search for.
+            dataset_type: Optional filter by 'input' or 'output'.
+
+        Returns:
+            DatasetMetadata if found, None otherwise.
+        """
+        search_types = ["input", "output"] if dataset_type is None else [dataset_type]
+
+        for dtype in search_types:
+            key = "inputs" if dtype == "input" else "outputs"
+            for dataset_data in self._data.get(key, []):
+                if dataset_data["slug"] == slug:
+                    return self._parse_dataset(dataset_data, dtype)
+
+        return None
+
+    def get_all_inputs(self) -> List[DatasetMetadata]:
+        """
+        Get all input datasets.
+
+        Returns:
+            List of all input dataset metadata.
+        """
+        return [self._parse_dataset(data, "input") for data in self._data.get("inputs", [])]
+
+    def get_all_outputs(self) -> List[DatasetMetadata]:
+        """
+        Get all output datasets.
+
+        Returns:
+            List of all output dataset metadata.
+        """
+        return [self._parse_dataset(data, "output") for data in self._data.get("outputs", [])]
+
+    def add_output_dataset(
+        self, name: str, slug: str, location: str, fields: List[FieldSchema], publish: bool = False
+    ) -> DatasetMetadata:
+        """
+        Add a new output dataset to datasets.yaml.
+
+        Args:
+            name: Human-readable name.
+            slug: Kebab-case identifier.
+            location: File path for the output.
+            fields: List of field schemas.
+            publish: Whether to publish this dataset.
+
+        Returns:
+            The newly created DatasetMetadata.
+
+        Raises:
+            DatasetValidationError: If a dataset with this slug already exists.
+        """
+        # Check if slug already exists
+        if self.find_dataset_by_slug(slug, "output"):
+            raise DatasetValidationError(f"Output dataset with slug '{slug}' already exists")
+
+        # Create the dataset entry
+        dataset_data = {
+            "name": name,
+            "slug": slug,
+            "location": location,
+            "publish": publish,
+            "fields": [
+                {
+                    "name": field.name,
+                    "type": field.type,
+                    **({"constraints": field.constraints} if field.constraints else {}),
+                }
+                for field in fields
+            ],
+        }
+
+        # Add to outputs
+        self._data["outputs"].append(dataset_data)
+
+        # Save changes
+        self._save()
+
+        return self._parse_dataset(dataset_data, "output")
+
+    def update_output_dataset(
+        self, slug: str, fields: Optional[List[FieldSchema]] = None, location: Optional[str] = None
+    ) -> DatasetMetadata:
+        """
+        Update an existing output dataset.
+
+        Args:
+            slug: The slug of the dataset to update.
+            fields: Optional new field schema.
+            location: Optional new location.
+
+        Returns:
+            The updated DatasetMetadata.
+
+        Raises:
+            DatasetNotFoundError: If the dataset doesn't exist.
+        """
+        for i, dataset_data in enumerate(self._data["outputs"]):
+            if dataset_data["slug"] == slug:
+                if fields is not None:
+                    dataset_data["fields"] = [
+                        {
+                            "name": field.name,
+                            "type": field.type,
+                            **({"constraints": field.constraints} if field.constraints else {}),
+                        }
+                        for field in fields
+                    ]
+                if location is not None:
+                    dataset_data["location"] = location
+
+                self._save()
+                return self._parse_dataset(dataset_data, "output")
+
+        raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
+
+    def get_absolute_path(self, location: str) -> Path:
+        """
+        Get the absolute path for a dataset location.
+
+        Args:
+            location: The location string from dataset metadata.
+
+        Returns:
+            Absolute path to the dataset file.
+        """
+        location_path = Path(location)
+        if location_path.is_absolute():
+            return location_path
+        return (self.project_path / location_path).resolve()
+
+    def fetch_from_url(
+        self,
+        dataset: DatasetMetadata,
+        timeout: int = 30,
+        force: bool = False,
+        max_redirects: int = 10,
+    ) -> Path:
+        """
+        Fetch a dataset from its source URL if available.
+
+        Args:
+            dataset: The dataset metadata containing source URL.
+            timeout: Request timeout in seconds.
+            force: If True, fetch even if local file exists.
+            max_redirects: Maximum number of redirects to follow (default: 10).
+
+        Returns:
+            Path to the local file (newly downloaded or existing).
+
+        Raises:
+            ValueError: If dataset has no source URL or URL is not allowed.
+            requests.RequestException: If the fetch fails.
+        """
+        if not dataset.source or not dataset.source.location.data:
+            raise ValueError(f"Dataset '{dataset.slug}' has no source URL")
+
+        local_path = self.get_absolute_path(dataset.location)
+
+        # Skip if file exists and not forcing
+        if local_path.exists() and not force:
+            logger.info("Using existing local file: %s", local_path)
+            return local_path
+
+        url = dataset.source.location.data
+
+        # Validate URL points to public resource to prevent SSRF attacks
+        if not _is_public_url(url):
+            raise ValueError(
+                f"URL '{url}' is not allowed. Only HTTP/HTTPS URLs pointing to public internet addresses are permitted."
+            )
+
+        logger.info("Fetching dataset from URL: %s", url)
+
+        try:
+            # Disable automatic redirects and handle them manually to prevent SSRF bypass
+            # An attacker could use a public URL that redirects to a private IP
+            current_url = url
+            response = requests.get(current_url, timeout=timeout, allow_redirects=False)
+            redirect_count = 0
+
+            while response.is_redirect and redirect_count < max_redirects:
+                redirect_url = response.headers.get("Location")
+                if not redirect_url:
+                    raise ValueError("Redirect response without Location header")
+
+                # Resolve relative URLs against the current URL
+                redirect_url = urljoin(current_url, redirect_url)
+
+                # Validate the redirect target URL for SSRF protection
+                if not _is_public_url(redirect_url):
+                    raise ValueError(
+                        f"Redirect URL '{redirect_url}' is not allowed. Only HTTP/HTTPS URLs "
+                        "pointing to public internet addresses are permitted."
+                    )
+
+                logger.info("Following redirect to: %s", redirect_url)
+                current_url = redirect_url
+                response = requests.get(current_url, timeout=timeout, allow_redirects=False)
+                redirect_count += 1
+
+            if response.is_redirect:
+                raise ValueError(f"Too many redirects (max: {max_redirects})")
+
+            response.raise_for_status()
+
+            # Ensure parent directory exists
+            local_path.parent.mkdir(parents=True, exist_ok=True)
+
+            # Save to local file
+            with open(local_path, "wb") as f:
+                f.write(response.content)
+
+            logger.info("✓ Successfully saved to %s (%d bytes)", local_path, len(response.content))
+            return local_path
+
+        except requests.Timeout:
+            logger.error("Request timed out after %d seconds", timeout)
+            raise
+        except requests.RequestException as e:
+            logger.error("Failed to fetch from URL: %s", e)
+            raise
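
Taken together, datasets.py centers on DatasetsManager: it loads datasets.yaml on construction, looks datasets up by slug or location, appends new output entries, and downloads remote sources with SSRF-checked, manually followed redirects. The snippet below is a minimal usage sketch and is not part of the package contents; the project path, slug, and field names are hypothetical.

from pathlib import Path

from sunstone.datasets import DatasetsManager
from sunstone.lineage import FieldSchema

# Hypothetical project layout; any directory containing a datasets.yaml would do.
manager = DatasetsManager(Path("./my-project"))

# Look up an input dataset declared in datasets.yaml (the slug is illustrative).
dataset = manager.find_dataset_by_slug("raw-observations", dataset_type="input")
if dataset is not None and dataset.source:
    # Download from the public source URL unless the local file already exists.
    local_path = manager.fetch_from_url(dataset, timeout=30)
    print(f"Data available at {local_path}")

# Register a new output dataset; raises DatasetValidationError if the slug is taken.
manager.add_output_dataset(
    name="Cleaned observations",
    slug="cleaned-observations",
    location="outputs/cleaned_observations.csv",
    fields=[
        FieldSchema(name="id", type="integer"),
        FieldSchema(name="recorded_at", type="datetime"),
    ],
)

Note that fetch_from_url only downloads when the local file is missing or force=True, and it re-validates every redirect target with _is_public_url before following it.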
sunstone/exceptions.py ADDED
@@ -0,0 +1,33 @@
+"""
+Custom exceptions for the Sunstone library.
+"""
+
+
+class SunstoneError(Exception):
+    """Base exception for all Sunstone library errors."""
+
+    pass
+
+
+class DatasetNotFoundError(SunstoneError):
+    """Raised when a dataset is not found in datasets.yaml."""
+
+    pass
+
+
+class StrictModeError(SunstoneError):
+    """Raised when an operation would modify datasets.yaml in strict mode."""
+
+    pass
+
+
+class DatasetValidationError(SunstoneError):
+    """Raised when dataset metadata fails validation."""
+
+    pass
+
+
+class LineageError(SunstoneError):
+    """Raised when there's an issue with lineage tracking."""
+
+    pass
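
All of these exceptions inherit from SunstoneError, so callers can catch either a specific subclass or the common base. A short illustrative sketch, not part of the package; the slug is made up.

from sunstone.datasets import DatasetsManager
from sunstone.exceptions import DatasetNotFoundError, SunstoneError

manager = DatasetsManager("./my-project")

try:
    # update_output_dataset raises DatasetNotFoundError for an unknown slug.
    manager.update_output_dataset("no-such-slug", location="outputs/renamed.csv")
except DatasetNotFoundError:
    print("Output dataset is not declared in datasets.yaml")
except SunstoneError as exc:
    # Any other library-specific failure.
    print(f"Sunstone error: {exc}")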
sunstone/lineage.py ADDED
@@ -0,0 +1,190 @@
+"""
+Lineage metadata structures for tracking data provenance.
+"""
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+
+@dataclass
+class SourceLocation:
+    """Location information for a data source."""
+
+    data: Optional[str] = None
+    """URL to the data file."""
+
+    metadata: Optional[str] = None
+    """URL to metadata about the data."""
+
+    about: Optional[str] = None
+    """URL to a page describing the data source."""
+
+
+@dataclass
+class Source:
+    """Source attribution information for a dataset."""
+
+    name: str
+    """Name of the data source."""
+
+    location: SourceLocation
+    """Location information for the source."""
+
+    attributed_to: str
+    """Organization or individual to attribute the data to."""
+
+    acquired_at: str
+    """Date when the data was acquired (YYYY-MM-DD format)."""
+
+    acquisition_method: str
+    """Method used to acquire the data (e.g., 'manual-download', 'api', 'scraping')."""
+
+    license: str
+    """SPDX license identifier."""
+    # TODO: Consider using a library for SPDX license validation.
+
+    updated: Optional[str] = None
+    """Optional description of update frequency."""
+
+
+@dataclass
+class FieldSchema:
+    """Schema definition for a dataset field."""
+
+    name: str
+    """Name of the field/column."""
+
+    type: str
+    """Data type (string, number, integer, boolean, date, datetime)."""
+
+    constraints: Optional[Dict[str, Any]] = None
+    """Optional constraints (e.g., enum values)."""
+
+
+@dataclass
+class DatasetMetadata:
+    """Metadata for a dataset from datasets.yaml."""
+
+    name: str
+    """Human-readable name of the dataset."""
+
+    slug: str
+    """Kebab-case identifier for the dataset."""
+
+    location: str
+    """File path or URL for the dataset."""
+
+    fields: List[FieldSchema]
+    """Schema definitions for dataset fields."""
+
+    source: Optional[Source] = None
+    """Source attribution (for input datasets)."""
+
+    publish: bool = False
+    """Whether this dataset should be published (for output datasets)."""
+
+    dataset_type: str = "input"
+    """Type of dataset: 'input' or 'output'."""
+
+
+@dataclass
+class LineageMetadata:
+    """
+    Lineage metadata tracking the provenance of data in a DataFrame.
+
+    This tracks all source datasets that contributed to the current DataFrame,
+    including information about transformations and operations performed.
+    """
+
+    sources: List[DatasetMetadata] = field(default_factory=list)
+    """List of source datasets that contributed to this data."""
+
+    operations: List[str] = field(default_factory=list)
+    """List of operations performed on the data."""
+
+    created_at: datetime = field(default_factory=datetime.now)
+    """Timestamp when this lineage was created."""
+
+    project_path: Optional[str] = None
+    """Path to the project directory containing datasets.yaml."""
+
+    def add_source(self, dataset: DatasetMetadata) -> None:
+        """
+        Add a source dataset to the lineage.
+
+        Args:
+            dataset: The dataset metadata to add to sources.
+        """
+        if dataset not in self.sources:
+            self.sources.append(dataset)
+
+    def add_operation(self, operation: str) -> None:
+        """
+        Record an operation performed on the data.
+
+        Args:
+            operation: Description of the operation.
+        """
+        self.operations.append(operation)
+
+    def merge(self, other: "LineageMetadata") -> "LineageMetadata":
+        """
+        Merge lineage from another DataFrame.
+
+        Args:
+            other: The other lineage metadata to merge.
+
+        Returns:
+            A new LineageMetadata with combined sources and operations.
+        """
+        merged = LineageMetadata(
+            sources=self.sources.copy(),
+            operations=self.operations.copy(),
+            created_at=datetime.now(),
+            project_path=self.project_path or other.project_path,
+        )
+
+        # Add sources from other that aren't already present
+        for source in other.sources:
+            if source not in merged.sources:
+                merged.sources.append(source)
+
+        # Combine operations
+        merged.operations.extend(other.operations)
+
+        return merged
+
+    def get_licenses(self) -> List[str]:
+        """
+        Get all unique licenses from source datasets.
+
+        Returns:
+            List of unique license identifiers.
+        """
+        licenses = set()
+        for source in self.sources:
+            if source.source and source.source.license:
+                licenses.add(source.source.license)
+        return sorted(licenses)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert lineage metadata to a dictionary representation.
+
+        Returns:
+            Dictionary containing lineage information.
+        """
+        return {
+            "sources": [
+                {
+                    "name": src.name,
+                    "slug": src.slug,
+                    "location": src.location,
+                }
+                for src in self.sources
+            ],
+            "operations": self.operations,
+            "created_at": self.created_at.isoformat(),
+            "licenses": self.get_licenses(),
+        }
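
The lineage dataclasses are plain containers and can be exercised without a datasets.yaml on disk. Below is an illustrative sketch of constructing, merging, and serializing lineage records; it is not part of the package, and every name, URL, and value in it is invented for the example.

from sunstone.lineage import (
    DatasetMetadata,
    FieldSchema,
    LineageMetadata,
    Source,
    SourceLocation,
)

# Describe one input dataset by hand (normally parsed from datasets.yaml).
census = DatasetMetadata(
    name="Census extract",
    slug="census-extract",
    location="inputs/census.csv",
    fields=[FieldSchema(name="region", type="string")],
    source=Source(
        name="National statistics office",
        location=SourceLocation(data="https://example.org/census.csv"),
        attributed_to="Example Statistics Office",
        acquired_at="2024-01-15",
        acquisition_method="manual-download",
        license="CC-BY-4.0",
    ),
)

lineage = LineageMetadata(project_path="./my-project")
lineage.add_source(census)
lineage.add_operation("filtered to 2023 records")

other = LineageMetadata()
other.add_operation("joined with population totals")

# merge() returns a new record with de-duplicated sources and combined operations.
combined = lineage.merge(other)
print(combined.get_licenses())              # ['CC-BY-4.0']
print(combined.to_dict()["operations"])     # both recorded operations, in order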