sunstone_py-0.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sunstone/__init__.py +84 -0
- sunstone/_release.py +403 -0
- sunstone/dataframe.py +607 -0
- sunstone/datasets.py +480 -0
- sunstone/exceptions.py +33 -0
- sunstone/lineage.py +190 -0
- sunstone/pandas.py +246 -0
- sunstone/py.typed +0 -0
- sunstone/validation.py +253 -0
- sunstone_py-0.4.0.dist-info/METADATA +348 -0
- sunstone_py-0.4.0.dist-info/RECORD +15 -0
- sunstone_py-0.4.0.dist-info/WHEEL +5 -0
- sunstone_py-0.4.0.dist-info/entry_points.txt +2 -0
- sunstone_py-0.4.0.dist-info/licenses/LICENSE +21 -0
- sunstone_py-0.4.0.dist-info/top_level.txt +1 -0
sunstone/datasets.py
ADDED
@@ -0,0 +1,480 @@
"""
Parser and manager for datasets.yaml files.
"""

import ipaddress
import logging
import socket
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from urllib.parse import urljoin, urlparse

import requests
import yaml

from .exceptions import DatasetNotFoundError, DatasetValidationError
from .lineage import DatasetMetadata, FieldSchema, Source, SourceLocation

logger = logging.getLogger(__name__)


def _is_public_url(url: str) -> bool:
    """
    Validate that a URL points to a public (non-private) resource.

    This function prevents SSRF attacks by blocking:
    - Non-HTTP(S) schemes (e.g., file://, ftp://)
    - Private IP addresses (10.x.x.x, 172.16-31.x.x, 192.168.x.x)
    - Localhost and loopback addresses
    - Link-local addresses (169.254.x.x)

    Args:
        url: The URL to validate.

    Returns:
        True if the URL points to a public resource, False otherwise.

    Raises:
        Exception: Re-raises unexpected exceptions after logging.
    """
    try:
        parsed = urlparse(url)

        # Only allow HTTP and HTTPS schemes
        if parsed.scheme not in ("http", "https"):
            logger.warning("URL scheme '%s' not allowed (only http/https permitted)", parsed.scheme)
            return False

        # Ensure hostname is present
        if not parsed.hostname:
            logger.warning("URL has no hostname")
            return False

        # Resolve hostname to all IP addresses (IPv4 and IPv6) and check each
        addrinfos = socket.getaddrinfo(parsed.hostname, None)
        for addrinfo in addrinfos:
            sockaddr = addrinfo[4]
            ip = sockaddr[0]
            ip_obj = ipaddress.ip_address(ip)

            # Block private, loopback, and link-local addresses
            if ip_obj.is_private or ip_obj.is_loopback or ip_obj.is_link_local:
                logger.warning(
                    "URL hostname '%s' resolves to restricted IP address: %s",
                    parsed.hostname,
                    ip,
                )
                return False

        return True

    except socket.gaierror:
        logger.warning("Unable to resolve hostname: %s", parsed.hostname)
        return False
    except ValueError as e:
        logger.warning("Error validating URL '%s': %s", url, e)
        return False
    except Exception as e:
        logger.exception("Unexpected error validating URL '%s': %s", url, e)
        raise


class DatasetsManager:
    """
    Manager for parsing and updating datasets.yaml files.

    This class handles reading, parsing, and updating dataset metadata
    from datasets.yaml files in Sunstone projects.
    """

    def __init__(self, project_path: Union[str, Path]):
        """
        Initialize the datasets manager.

        Args:
            project_path: Path to the project directory containing datasets.yaml.

        Raises:
            FileNotFoundError: If datasets.yaml doesn't exist in the project path.
        """
        self.project_path = Path(project_path).resolve()
        self.datasets_file = self.project_path / "datasets.yaml"

        if not self.datasets_file.exists():
            raise FileNotFoundError(f"datasets.yaml not found in {self.project_path}")

        self._data: Dict[str, Any] = {}
        self._load()

    def _load(self) -> None:
        """Load and parse the datasets.yaml file."""
        with open(self.datasets_file, "r") as f:
            self._data = yaml.safe_load(f) or {}

        if "inputs" not in self._data:
            self._data["inputs"] = []
        if "outputs" not in self._data:
            self._data["outputs"] = []

    def _save(self) -> None:
        """Save the current data back to datasets.yaml."""
        with open(self.datasets_file, "w") as f:
            yaml.dump(self._data, f, default_flow_style=False, sort_keys=False)

    def _parse_source_location(self, loc_data: Dict[str, Any]) -> SourceLocation:
        """Parse source location data from YAML."""
        return SourceLocation(
            data=loc_data.get("data"),
            metadata=loc_data.get("metadata"),
            about=loc_data.get("about"),
        )

    def _parse_source(self, source_data: Dict[str, Any]) -> Source:
        """Parse source attribution data from YAML."""
        return Source(
            name=source_data["name"],
            location=self._parse_source_location(source_data["location"]),
            attributed_to=source_data["attributedTo"],
            acquired_at=source_data["acquiredAt"],
            acquisition_method=source_data["acquisitionMethod"],
            license=source_data["license"],
            updated=source_data.get("updated"),
        )

    def _parse_fields(self, fields_data: List[Dict[str, Any]]) -> List[FieldSchema]:
        """Parse field schema data from YAML."""
        return [
            FieldSchema(name=field["name"], type=field["type"], constraints=field.get("constraints"))
            for field in fields_data
        ]

    def _parse_dataset(self, dataset_data: Dict[str, Any], dataset_type: str) -> DatasetMetadata:
        """
        Parse dataset metadata from YAML data.

        Args:
            dataset_data: Raw dataset data from YAML.
            dataset_type: Either 'input' or 'output'.

        Returns:
            Parsed DatasetMetadata object.
        """
        source = None
        if "source" in dataset_data:
            source = self._parse_source(dataset_data["source"])

        return DatasetMetadata(
            name=dataset_data["name"],
            slug=dataset_data["slug"],
            location=dataset_data["location"],
            fields=self._parse_fields(dataset_data["fields"]),
            source=source,
            publish=dataset_data.get("publish", False),
            dataset_type=dataset_type,
        )

    def find_dataset_by_location(self, location: str, dataset_type: Optional[str] = None) -> Optional[DatasetMetadata]:
        """
        Find a dataset by its file location.

        Args:
            location: The file path or URL to search for.
            dataset_type: Optional filter by 'input' or 'output'.

        Returns:
            DatasetMetadata if found, None otherwise.
        """
        # Normalize location to handle both absolute and relative paths
        location_path = Path(location)
        if location_path.is_absolute():
            # Try to make it relative to project path
            try:
                location = str(location_path.relative_to(self.project_path))
            except ValueError:
                # Not relative to project path, use as-is
                location = str(location_path)
        else:
            location = str(location_path)

        search_types = ["input", "output"] if dataset_type is None else [dataset_type]

        # Resolve the requested location to an absolute path
        location_path = Path(location)
        if not location_path.is_absolute():
            location_abs = (self.project_path / location_path).resolve()
        else:
            location_abs = location_path.resolve()

        for dtype in search_types:
            key = "inputs" if dtype == "input" else "outputs"
            for dataset_data in self._data.get(key, []):
                dataset_location = dataset_data["location"]

                # Try multiple resolution strategies:
                # 1. Direct string match
                if dataset_location == location:
                    return self._parse_dataset(dataset_data, dtype)

                # 2. Resolve dataset location as-is
                dataset_loc = Path(dataset_location)
                if not dataset_loc.is_absolute():
                    dataset_abs = (self.project_path / dataset_loc).resolve()
                else:
                    dataset_abs = dataset_loc.resolve()

                if dataset_abs == location_abs:
                    return self._parse_dataset(dataset_data, dtype)

                # 3. If the requested location exists, and just the filename matches,
                #    check if they point to the same existing file
                if location_abs.exists() and dataset_abs.exists():
                    if location_abs.samefile(dataset_abs):
                        return self._parse_dataset(dataset_data, dtype)

                # 4. If requested location exists but dataset location in yaml doesn't,
                #    check if the filename matches (for cases where the directory changed)
                if location_abs.exists() and not dataset_abs.exists():
                    if dataset_loc.name == location_path.name:
                        # Same filename - this might be a match
                        if (
                            location_abs.samefile(self.project_path / dataset_loc.name)
                            if (self.project_path / dataset_loc.name).exists()
                            else False
                        ):
                            return self._parse_dataset(dataset_data, dtype)
                        # Check in common subdirectories
                        for subdir in ["inputs", "outputs", "data"]:
                            candidate = self.project_path / subdir / dataset_loc.name
                            if candidate.exists() and location_abs.samefile(candidate):
                                return self._parse_dataset(dataset_data, dtype)

        return None

    def find_dataset_by_slug(self, slug: str, dataset_type: Optional[str] = None) -> Optional[DatasetMetadata]:
        """
        Find a dataset by its slug.

        Args:
            slug: The dataset slug to search for.
            dataset_type: Optional filter by 'input' or 'output'.

        Returns:
            DatasetMetadata if found, None otherwise.
        """
        search_types = ["input", "output"] if dataset_type is None else [dataset_type]

        for dtype in search_types:
            key = "inputs" if dtype == "input" else "outputs"
            for dataset_data in self._data.get(key, []):
                if dataset_data["slug"] == slug:
                    return self._parse_dataset(dataset_data, dtype)

        return None

    def get_all_inputs(self) -> List[DatasetMetadata]:
        """
        Get all input datasets.

        Returns:
            List of all input dataset metadata.
        """
        return [self._parse_dataset(data, "input") for data in self._data.get("inputs", [])]

    def get_all_outputs(self) -> List[DatasetMetadata]:
        """
        Get all output datasets.

        Returns:
            List of all output dataset metadata.
        """
        return [self._parse_dataset(data, "output") for data in self._data.get("outputs", [])]

    def add_output_dataset(
        self, name: str, slug: str, location: str, fields: List[FieldSchema], publish: bool = False
    ) -> DatasetMetadata:
        """
        Add a new output dataset to datasets.yaml.

        Args:
            name: Human-readable name.
            slug: Kebab-case identifier.
            location: File path for the output.
            fields: List of field schemas.
            publish: Whether to publish this dataset.

        Returns:
            The newly created DatasetMetadata.

        Raises:
            DatasetValidationError: If a dataset with this slug already exists.
        """
        # Check if slug already exists
        if self.find_dataset_by_slug(slug, "output"):
            raise DatasetValidationError(f"Output dataset with slug '{slug}' already exists")

        # Create the dataset entry
        dataset_data = {
            "name": name,
            "slug": slug,
            "location": location,
            "publish": publish,
            "fields": [
                {
                    "name": field.name,
                    "type": field.type,
                    **({"constraints": field.constraints} if field.constraints else {}),
                }
                for field in fields
            ],
        }

        # Add to outputs
        self._data["outputs"].append(dataset_data)

        # Save changes
        self._save()

        return self._parse_dataset(dataset_data, "output")

    def update_output_dataset(
        self, slug: str, fields: Optional[List[FieldSchema]] = None, location: Optional[str] = None
    ) -> DatasetMetadata:
        """
        Update an existing output dataset.

        Args:
            slug: The slug of the dataset to update.
            fields: Optional new field schema.
            location: Optional new location.

        Returns:
            The updated DatasetMetadata.

        Raises:
            DatasetNotFoundError: If the dataset doesn't exist.
        """
        for i, dataset_data in enumerate(self._data["outputs"]):
            if dataset_data["slug"] == slug:
                if fields is not None:
                    dataset_data["fields"] = [
                        {
                            "name": field.name,
                            "type": field.type,
                            **({"constraints": field.constraints} if field.constraints else {}),
                        }
                        for field in fields
                    ]
                if location is not None:
                    dataset_data["location"] = location

                self._save()
                return self._parse_dataset(dataset_data, "output")

        raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")

    def get_absolute_path(self, location: str) -> Path:
        """
        Get the absolute path for a dataset location.

        Args:
            location: The location string from dataset metadata.

        Returns:
            Absolute path to the dataset file.
        """
        location_path = Path(location)
        if location_path.is_absolute():
            return location_path
        return (self.project_path / location_path).resolve()

    def fetch_from_url(
        self,
        dataset: DatasetMetadata,
        timeout: int = 30,
        force: bool = False,
        max_redirects: int = 10,
    ) -> Path:
        """
        Fetch a dataset from its source URL if available.

        Args:
            dataset: The dataset metadata containing source URL.
            timeout: Request timeout in seconds.
            force: If True, fetch even if local file exists.
            max_redirects: Maximum number of redirects to follow (default: 10).

        Returns:
            Path to the local file (newly downloaded or existing).

        Raises:
            ValueError: If dataset has no source URL or URL is not allowed.
            requests.RequestException: If the fetch fails.
        """
        if not dataset.source or not dataset.source.location.data:
            raise ValueError(f"Dataset '{dataset.slug}' has no source URL")

        local_path = self.get_absolute_path(dataset.location)

        # Skip if file exists and not forcing
        if local_path.exists() and not force:
            logger.info("Using existing local file: %s", local_path)
            return local_path

        url = dataset.source.location.data

        # Validate URL points to public resource to prevent SSRF attacks
        if not _is_public_url(url):
            raise ValueError(
                f"URL '{url}' is not allowed. Only HTTP/HTTPS URLs pointing to public internet addresses are permitted."
            )

        logger.info("Fetching dataset from URL: %s", url)

        try:
            # Disable automatic redirects and handle them manually to prevent SSRF bypass
            # An attacker could use a public URL that redirects to a private IP
            current_url = url
            response = requests.get(current_url, timeout=timeout, allow_redirects=False)
            redirect_count = 0

            while response.is_redirect and redirect_count < max_redirects:
                redirect_url = response.headers.get("Location")
                if not redirect_url:
                    raise ValueError("Redirect response without Location header")

                # Resolve relative URLs against the current URL
                redirect_url = urljoin(current_url, redirect_url)

                # Validate the redirect target URL for SSRF protection
                if not _is_public_url(redirect_url):
                    raise ValueError(
                        f"Redirect URL '{redirect_url}' is not allowed. Only HTTP/HTTPS URLs "
                        "pointing to public internet addresses are permitted."
                    )

                logger.info("Following redirect to: %s", redirect_url)
                current_url = redirect_url
                response = requests.get(current_url, timeout=timeout, allow_redirects=False)
                redirect_count += 1

            if response.is_redirect:
                raise ValueError(f"Too many redirects (max: {max_redirects})")

            response.raise_for_status()

            # Ensure parent directory exists
            local_path.parent.mkdir(parents=True, exist_ok=True)

            # Save to local file
            with open(local_path, "wb") as f:
                f.write(response.content)

            logger.info("✓ Successfully saved to %s (%d bytes)", local_path, len(response.content))
            return local_path

        except requests.Timeout:
            logger.error("Request timed out after %d seconds", timeout)
            raise
        except requests.RequestException as e:
            logger.error("Failed to fetch from URL: %s", e)
            raise
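
For orientation, here is a minimal usage sketch of the DatasetsManager API added above. The project path, slugs, locations, and field names are illustrative placeholders, not values shipped with the package.

from sunstone.datasets import DatasetsManager
from sunstone.lineage import FieldSchema

# Hypothetical project directory; DatasetsManager raises FileNotFoundError
# if it does not contain a datasets.yaml file.
manager = DatasetsManager("/path/to/project")

# Look up an input dataset by slug; returns None when the slug is not declared.
raw = manager.find_dataset_by_slug("raw-measurements", dataset_type="input")
if raw is not None and raw.source is not None:
    # Downloads from the declared source URL only if the local file is missing
    # (force=False) and only when the URL passes the public-address (SSRF) check.
    local_file = manager.fetch_from_url(raw)

# Declare a new output dataset and persist it to datasets.yaml;
# raises DatasetValidationError if an output with this slug already exists.
summary = manager.add_output_dataset(
    name="Daily summary",
    slug="daily-summary",
    location="outputs/daily_summary.csv",
    fields=[
        FieldSchema(name="date", type="date"),
        FieldSchema(name="total", type="number"),
    ],
    publish=True,
)
print(summary.dataset_type)  # "output"
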
sunstone/exceptions.py
ADDED
@@ -0,0 +1,33 @@
"""
Custom exceptions for the Sunstone library.
"""


class SunstoneError(Exception):
    """Base exception for all Sunstone library errors."""

    pass


class DatasetNotFoundError(SunstoneError):
    """Raised when a dataset is not found in datasets.yaml."""

    pass


class StrictModeError(SunstoneError):
    """Raised when an operation would modify datasets.yaml in strict mode."""

    pass


class DatasetValidationError(SunstoneError):
    """Raised when dataset metadata fails validation."""

    pass


class LineageError(SunstoneError):
    """Raised when there's an issue with lineage tracking."""

    pass
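
A short sketch of how this exception hierarchy might be caught when driving DatasetsManager; the project path and slug are illustrative only.

from sunstone.datasets import DatasetsManager
from sunstone.exceptions import DatasetNotFoundError, SunstoneError

try:
    manager = DatasetsManager("/path/to/project")
    manager.update_output_dataset("daily-summary", location="outputs/summary.csv")
except DatasetNotFoundError:
    # Specific subclass: the slug is not declared under outputs in datasets.yaml.
    print("output dataset not declared")
except SunstoneError as exc:
    # Base class catches any other library-defined error (validation, lineage, strict mode).
    print(f"sunstone error: {exc}")
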
sunstone/lineage.py
ADDED
@@ -0,0 +1,190 @@
"""
Lineage metadata structures for tracking data provenance.
"""

from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional


@dataclass
class SourceLocation:
    """Location information for a data source."""

    data: Optional[str] = None
    """URL to the data file."""

    metadata: Optional[str] = None
    """URL to metadata about the data."""

    about: Optional[str] = None
    """URL to a page describing the data source."""


@dataclass
class Source:
    """Source attribution information for a dataset."""

    name: str
    """Name of the data source."""

    location: SourceLocation
    """Location information for the source."""

    attributed_to: str
    """Organization or individual to attribute the data to."""

    acquired_at: str
    """Date when the data was acquired (YYYY-MM-DD format)."""

    acquisition_method: str
    """Method used to acquire the data (e.g., 'manual-download', 'api', 'scraping')."""

    license: str
    """SPDX license identifier."""
    # TODO: Consider using a library for SPDX license validation.

    updated: Optional[str] = None
    """Optional description of update frequency."""


@dataclass
class FieldSchema:
    """Schema definition for a dataset field."""

    name: str
    """Name of the field/column."""

    type: str
    """Data type (string, number, integer, boolean, date, datetime)."""

    constraints: Optional[Dict[str, Any]] = None
    """Optional constraints (e.g., enum values)."""


@dataclass
class DatasetMetadata:
    """Metadata for a dataset from datasets.yaml."""

    name: str
    """Human-readable name of the dataset."""

    slug: str
    """Kebab-case identifier for the dataset."""

    location: str
    """File path or URL for the dataset."""

    fields: List[FieldSchema]
    """Schema definitions for dataset fields."""

    source: Optional[Source] = None
    """Source attribution (for input datasets)."""

    publish: bool = False
    """Whether this dataset should be published (for output datasets)."""

    dataset_type: str = "input"
    """Type of dataset: 'input' or 'output'."""


@dataclass
class LineageMetadata:
    """
    Lineage metadata tracking the provenance of data in a DataFrame.

    This tracks all source datasets that contributed to the current DataFrame,
    including information about transformations and operations performed.
    """

    sources: List[DatasetMetadata] = field(default_factory=list)
    """List of source datasets that contributed to this data."""

    operations: List[str] = field(default_factory=list)
    """List of operations performed on the data."""

    created_at: datetime = field(default_factory=datetime.now)
    """Timestamp when this lineage was created."""

    project_path: Optional[str] = None
    """Path to the project directory containing datasets.yaml."""

    def add_source(self, dataset: DatasetMetadata) -> None:
        """
        Add a source dataset to the lineage.

        Args:
            dataset: The dataset metadata to add to sources.
        """
        if dataset not in self.sources:
            self.sources.append(dataset)

    def add_operation(self, operation: str) -> None:
        """
        Record an operation performed on the data.

        Args:
            operation: Description of the operation.
        """
        self.operations.append(operation)

    def merge(self, other: "LineageMetadata") -> "LineageMetadata":
        """
        Merge lineage from another DataFrame.

        Args:
            other: The other lineage metadata to merge.

        Returns:
            A new LineageMetadata with combined sources and operations.
        """
        merged = LineageMetadata(
            sources=self.sources.copy(),
            operations=self.operations.copy(),
            created_at=datetime.now(),
            project_path=self.project_path or other.project_path,
        )

        # Add sources from other that aren't already present
        for source in other.sources:
            if source not in merged.sources:
                merged.sources.append(source)

        # Combine operations
        merged.operations.extend(other.operations)

        return merged

    def get_licenses(self) -> List[str]:
        """
        Get all unique licenses from source datasets.

        Returns:
            List of unique license identifiers.
        """
        licenses = set()
        for source in self.sources:
            if source.source and source.source.license:
                licenses.add(source.source.license)
        return sorted(licenses)

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert lineage metadata to a dictionary representation.

        Returns:
            Dictionary containing lineage information.
        """
        return {
            "sources": [
                {
                    "name": src.name,
                    "slug": src.slug,
                    "location": src.location,
                }
                for src in self.sources
            ],
            "operations": self.operations,
            "created_at": self.created_at.isoformat(),
            "licenses": self.get_licenses(),
        }
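
A minimal sketch of the LineageMetadata bookkeeping defined above; the dataset names, slugs, locations, and operation strings are illustrative values, not part of the package.

from sunstone.lineage import DatasetMetadata, FieldSchema, LineageMetadata

left = LineageMetadata()
left.add_source(
    DatasetMetadata(
        name="Raw measurements",
        slug="raw-measurements",
        location="inputs/raw.csv",
        fields=[FieldSchema(name="value", type="number")],
    )
)
left.add_operation("filter: value > 0")

right = LineageMetadata()
right.add_operation("rename columns")

# merge() returns a new object with de-duplicated sources and concatenated operations.
combined = left.merge(right)
print(combined.to_dict()["operations"])  # ['filter: value > 0', 'rename columns']
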