aptdata 0.0.2__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in their public registries.
- aptdata/__init__.py +3 -0
- aptdata/cli/__init__.py +5 -0
- aptdata/cli/app.py +247 -0
- aptdata/cli/commands/__init__.py +9 -0
- aptdata/cli/commands/config_cmd.py +128 -0
- aptdata/cli/commands/mesh_cmd.py +435 -0
- aptdata/cli/commands/plugin_cmd.py +107 -0
- aptdata/cli/commands/system_cmd.py +90 -0
- aptdata/cli/commands/telemetry_cmd.py +57 -0
- aptdata/cli/completions.py +56 -0
- aptdata/cli/interactive.py +269 -0
- aptdata/cli/rendering/__init__.py +31 -0
- aptdata/cli/rendering/console.py +119 -0
- aptdata/cli/rendering/logger.py +26 -0
- aptdata/cli/rendering/panels.py +87 -0
- aptdata/cli/rendering/tables.py +81 -0
- aptdata/cli/scaffold.py +1089 -0
- aptdata/config/__init__.py +13 -0
- aptdata/config/parser.py +136 -0
- aptdata/config/schema.py +27 -0
- aptdata/config/secrets.py +60 -0
- aptdata/core/__init__.py +46 -0
- aptdata/core/context.py +31 -0
- aptdata/core/dataset.py +39 -0
- aptdata/core/lineage.py +213 -0
- aptdata/core/state.py +27 -0
- aptdata/core/system.py +317 -0
- aptdata/core/workflow.py +372 -0
- aptdata/mcp/__init__.py +5 -0
- aptdata/mcp/server.py +198 -0
- aptdata/plugins/__init__.py +77 -0
- aptdata/plugins/ai/__init__.py +6 -0
- aptdata/plugins/ai/chunking.py +66 -0
- aptdata/plugins/ai/embeddings.py +56 -0
- aptdata/plugins/base.py +57 -0
- aptdata/plugins/dataset.py +62 -0
- aptdata/plugins/governance/__init__.py +32 -0
- aptdata/plugins/governance/catalog.py +115 -0
- aptdata/plugins/governance/classification.py +44 -0
- aptdata/plugins/governance/lineage_store.py +49 -0
- aptdata/plugins/governance/rules.py +180 -0
- aptdata/plugins/local_fs.py +241 -0
- aptdata/plugins/manager.py +142 -0
- aptdata/plugins/postgres.py +113 -0
- aptdata/plugins/quality/__init__.py +39 -0
- aptdata/plugins/quality/contract.py +128 -0
- aptdata/plugins/quality/expectations.py +310 -0
- aptdata/plugins/quality/report.py +94 -0
- aptdata/plugins/quality/validator.py +139 -0
- aptdata/plugins/rest.py +135 -0
- aptdata/plugins/transform/__init__.py +14 -0
- aptdata/plugins/transform/pandas.py +129 -0
- aptdata/plugins/transform/spark.py +134 -0
- aptdata/plugins/vector/__init__.py +6 -0
- aptdata/plugins/vector/base.py +19 -0
- aptdata/plugins/vector/qdrant.py +41 -0
- aptdata/telemetry/__init__.py +5 -0
- aptdata/telemetry/instrumentation.py +164 -0
- aptdata/tui/__init__.py +5 -0
- aptdata/tui/monitor.py +279 -0
- aptdata-0.0.2.dist-info/METADATA +330 -0
- aptdata-0.0.2.dist-info/RECORD +65 -0
- aptdata-0.0.2.dist-info/WHEEL +4 -0
- aptdata-0.0.2.dist-info/entry_points.txt +3 -0
- aptdata-0.0.2.dist-info/licenses/LICENSE +21 -0
aptdata/config/__init__.py
ADDED
@@ -0,0 +1,13 @@
+"""Declarative configuration helpers for aptdata."""
+
+from aptdata.config.parser import ParsedConfig, YamlConfigParser
+from aptdata.config.schema import export_domain_schema, write_domain_schema
+from aptdata.config.secrets import SecretManager
+
+__all__ = [
+    "ParsedConfig",
+    "YamlConfigParser",
+    "export_domain_schema",
+    "write_domain_schema",
+    "SecretManager",
+]
aptdata/config/parser.py
ADDED
@@ -0,0 +1,136 @@
+"""YAML parser for declarative aptdata system definitions."""
+
+from __future__ import annotations
+
+from dataclasses import field
+from pathlib import Path
+from typing import Any
+
+import yaml
+from pydantic import TypeAdapter
+from pydantic.dataclasses import dataclass as pydantic_dataclass
+
+from aptdata.config.secrets import SecretManager
+from aptdata.core.dataset import IDataset
+from aptdata.core.system import BaseComponent, BaseFlow, BaseSystem, IFlow
+
+
+@pydantic_dataclass
+class ConfigEdge:
+    """Serializable flow edge for declarative YAML files."""
+
+    source_id: str
+    target_id: str
+    condition: str = ""
+
+
+@pydantic_dataclass
+class ConfigComponent(BaseComponent):
+    """Concrete component used by configuration hydration."""
+
+    def validate_inputs(self, inputs: list[IDataset]) -> bool:  # noqa: ARG002
+        return True
+
+    def execute(self, inputs: list[IDataset]) -> list[IDataset]:
+        return inputs
+
+
+@pydantic_dataclass
+class ConfigFlow(BaseFlow):
+    """Concrete flow used by configuration hydration."""
+
+    components: list[ConfigComponent] = field(default_factory=list)
+    edges: list[ConfigEdge] = field(default_factory=list)
+
+    def add_component(self, component: BaseComponent) -> None:
+        if not isinstance(component, ConfigComponent):
+            raise TypeError("ConfigFlow only accepts ConfigComponent instances.")
+        self.components.append(component)
+
+    def connect(
+        self,
+        source_id: str,
+        target_id: str,
+        condition: str | None = None,
+    ) -> None:
+        self.edges.append(
+            ConfigEdge(
+                source_id=source_id,
+                target_id=target_id,
+                condition=condition or "",
+            )
+        )
+
+    def compile(self) -> None:
+        pass
+
+    def run(self, initial_inputs: list[IDataset]) -> list[IDataset]:
+        return initial_inputs
+
+
+@pydantic_dataclass
+class ConfigSystem(BaseSystem):
+    """Concrete system used by configuration hydration."""
+
+    flows: list[ConfigFlow] = field(default_factory=list)
+
+    def register_flow(self, flow: IFlow) -> None:
+        if not isinstance(flow, ConfigFlow):
+            raise TypeError("ConfigSystem only accepts ConfigFlow instances.")
+        self.flows.append(flow)
+
+    def run(self) -> None:
+        pass
+
+
+@pydantic_dataclass
+class ParsedConfig:
+    """Hydrated config document with metadata and validated domain objects."""
+
+    metadata: dict[str, Any] = field(default_factory=dict)
+    system: ConfigSystem = field(
+        default_factory=lambda: ConfigSystem(system_id="default")
+    )
+
+
+class YamlConfigParser:
+    """Parse YAML files into validated aptdata domain objects."""
+
+    _component_adapter = TypeAdapter(ConfigComponent)
+    _flow_adapter = TypeAdapter(ConfigFlow)
+    _edge_adapter = TypeAdapter(ConfigEdge)
+    _system_adapter = TypeAdapter(ConfigSystem)
+
+    def __init__(self, secret_manager: SecretManager | None = None) -> None:
+        self._secret_manager = secret_manager or SecretManager()
+
+    def parse_file(self, path: str | Path) -> ParsedConfig:
+        """Read and parse a YAML config file."""
+        config_path = Path(path)
+        raw = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
+        if not isinstance(raw, dict):
+            raise ValueError("YAML root must be a mapping/object.")
+        return self.parse_data(raw)
+
+    def parse_data(self, payload: dict[str, Any]) -> ParsedConfig:
+        """Parse a loaded YAML dictionary."""
+        payload = self._secret_manager.resolve(payload)
+        metadata = payload.get("metadata", {})
+        system_payload = dict(payload.get("system", {}))
+        flow_payloads = system_payload.pop("flows", payload.get("flows", []))
+
+        system = self._system_adapter.validate_python(system_payload)
+        for flow_payload in flow_payloads:
+            flow_data = dict(flow_payload)
+            component_payloads = flow_data.pop("components", [])
+            edge_payloads = flow_data.pop("edges", [])
+
+            flow = self._flow_adapter.validate_python(flow_data)
+            for component_payload in component_payloads:
+                component = self._component_adapter.validate_python(component_payload)
+                flow.add_component(component)
+            for edge_payload in edge_payloads:
+                flow.edges.append(self._edge_adapter.validate_python(edge_payload))
+            system.register_flow(flow)
+
+        return ParsedConfig(metadata=metadata, system=system)
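To make the hydration path concrete, below is a minimal usage sketch. The YAML layout follows what parse_data looks for (metadata, system, flows, components, edges), but the flow_id and component_id field names are assumptions: BaseFlow and BaseComponent are defined in aptdata/core/system.py, which is not shown in this excerpt. Only system_id (used in ParsedConfig) and the ConfigEdge fields are confirmed here.

# Hedged sketch: YAML shape implied by YamlConfigParser.parse_data.
import yaml
from aptdata.config.parser import YamlConfigParser

DOC = """
metadata:
  owner: data-team
system:
  system_id: demo
  flows:
    - flow_id: ingest            # hypothetical field name
      components:
        - component_id: extract  # hypothetical field name
        - component_id: load
      edges:
        - source_id: extract
          target_id: load
          condition: ""
"""

parser = YamlConfigParser()
config = parser.parse_data(yaml.safe_load(DOC))
print(config.system.system_id, len(config.system.flows))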
aptdata/config/schema.py
ADDED
@@ -0,0 +1,27 @@
+"""JSON Schema utilities for declarative aptdata configs."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from pydantic import TypeAdapter
+
+from aptdata.config.parser import ParsedConfig
+
+
+def export_domain_schema() -> dict[str, Any]:
+    """Export JSON Schema for the full declarative config domain."""
+    return TypeAdapter(ParsedConfig).json_schema()
+
+
+def write_domain_schema(output: str | Path) -> Path:
+    """Write the domain JSON Schema to *output*."""
+    output_path = Path(output)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(
+        json.dumps(export_domain_schema(), indent=2, ensure_ascii=False),
+        encoding="utf-8",
+    )
+    return output_path
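A short usage sketch: the two helpers pair naturally, one for inspecting the schema in memory and one for writing it to disk (the output path below is illustrative).

from aptdata.config.schema import export_domain_schema, write_domain_schema

# Inspect the generated schema in memory; ParsedConfig's fields should
# surface as top-level properties.
schema = export_domain_schema()
print(sorted(schema.get("properties", {})))  # expect ['metadata', 'system']

# Write it to disk for editor/CI validation (path is illustrative).
path = write_domain_schema("schemas/aptdata-config.schema.json")
print(f"wrote {path}")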
aptdata/config/secrets.py
ADDED
@@ -0,0 +1,60 @@
+"""Secret resolution helpers for aptdata configuration and plugins."""
+
+from __future__ import annotations
+
+import os
+import re
+from typing import Any
+
+from aptdata.telemetry.instrumentation import register_secret
+
+_ENV_PATTERN = re.compile(r"\$\{([A-Za-z0-9_]+)\}")
+
+
+class SecretManager:
+    """Resolve `${ENV_VAR}` placeholders using environment variables."""
+
+    def __init__(self, *, load_dotenv_file: bool = True) -> None:
+        if load_dotenv_file:
+            try:
+                from dotenv import load_dotenv  # noqa: WPS433
+            except ImportError:
+                load_dotenv = None
+            if load_dotenv is not None:
+                load_dotenv(dotenv_path=".env")
+        self._injected_keys: set[str] = set()
+
+    def get(self, key: str, default: str | None = None) -> str:
+        """Return environment value for *key* and register it as a secret."""
+        value = os.getenv(key, default)
+        if value is None:
+            raise KeyError(f"Missing required secret: {key}")
+        self._injected_keys.add(key)
+        register_secret(key, value)
+        return value
+
+    def resolve(self, value: Any) -> Any:
+        """Recursively resolve `${ENV_VAR}` placeholders in nested structures."""
+        if isinstance(value, str):
+            return self._resolve_string(value)
+        if isinstance(value, dict):
+            return {k: self.resolve(v) for k, v in value.items()}
+        if isinstance(value, list):
+            return [self.resolve(item) for item in value]
+        if isinstance(value, tuple):
+            return tuple(self.resolve(item) for item in value)
+        return value
+
+    def injected_keys(self) -> list[str]:
+        """Return sorted secret names injected in this manager session."""
+        return sorted(self._injected_keys)
+
+    def _resolve_string(self, value: str) -> str:
+        matches = _ENV_PATTERN.findall(value)
+        if not matches:
+            return value
+        resolved = value
+        for key in matches:
+            resolved_secret = self.get(key)
+            resolved = resolved.replace(f"${{{key}}}", resolved_secret)
+        return resolved
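A minimal sketch of the resolution flow, assuming the package is installed so register_secret can be imported: placeholders are replaced recursively through nested dicts/lists, and every resolved key is remembered via injected_keys().

# Sketch: resolving ${ENV_VAR} placeholders in a nested mapping.
import os
from aptdata.config.secrets import SecretManager

os.environ["DB_PASSWORD"] = "s3cret"  # illustrative value for the demo

manager = SecretManager(load_dotenv_file=False)
resolved = manager.resolve({"db": {"dsn": "postgres://user:${DB_PASSWORD}@host/db"}})
print(resolved["db"]["dsn"])    # postgres://user:s3cret@host/db
print(manager.injected_keys())  # ['DB_PASSWORD']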
aptdata/core/__init__.py
ADDED
@@ -0,0 +1,46 @@
+"""Core interfaces and base classes for aptdata."""
+
+from aptdata.core.context import ExecutionContext
+from aptdata.core.dataset import BaseDataset, IDataset
+from aptdata.core.state import StateBackend
+from aptdata.core.system import (
+    BaseComponent,
+    BaseFlow,
+    BaseSystem,
+    ComponentKind,
+    ComponentMeta,
+    FlowEdge,
+    FlowNode,
+    IComponent,
+    IFlow,
+    ISystem,
+)
+from aptdata.core.workflow import (
+    BaseWorkflow,
+    IWorkflow,
+    Workflow,
+    WorkflowEdge,
+    WorkflowNode,
+)
+
+__all__ = [
+    "IDataset",
+    "BaseDataset",
+    "ExecutionContext",
+    "ComponentKind",
+    "ComponentMeta",
+    "IComponent",
+    "BaseComponent",
+    "FlowEdge",
+    "FlowNode",
+    "IFlow",
+    "BaseFlow",
+    "ISystem",
+    "BaseSystem",
+    "WorkflowEdge",
+    "WorkflowNode",
+    "IWorkflow",
+    "BaseWorkflow",
+    "Workflow",
+    "StateBackend",
+]
aptdata/core/context.py
ADDED
@@ -0,0 +1,31 @@
+"""Injectable execution context for state sharing across runs."""
+
+from __future__ import annotations
+
+from dataclasses import field
+from typing import Any
+
+from pydantic.dataclasses import dataclass as pydantic_dataclass
+
+
+@pydantic_dataclass
+class ExecutionContext:
+    """Simple in-memory key/value state container."""
+
+    memory: dict[str, Any] = field(default_factory=dict)
+
+    def get(self, key: str, default: Any = None) -> Any:
+        """Return value for *key* or *default* if not present."""
+        return self.memory.get(key, default)
+
+    def set(self, key: str, value: Any) -> None:
+        """Store *value* under *key*."""
+        self.memory[key] = value
+
+    def update(self, values: dict[str, Any]) -> None:
+        """Merge mapping into memory state."""
+        self.memory.update(values)
+
+    def clear(self) -> None:
+        """Remove all state."""
+        self.memory.clear()
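A quick sketch of the intended usage: a mutable bag of state handed to steps during a run.

# Sketch: sharing state between workflow steps via ExecutionContext.
from aptdata.core import ExecutionContext

ctx = ExecutionContext()
ctx.set("run_id", "2024-01-01-demo")
ctx.update({"rows_seen": 0, "dry_run": True})
print(ctx.get("run_id"))          # 2024-01-01-demo
print(ctx.get("missing", "n/a"))  # n/a
ctx.clear()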
aptdata/core/dataset.py
ADDED
@@ -0,0 +1,39 @@
+"""Dataset interface and base class."""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any
+
+from pydantic.dataclasses import dataclass as pydantic_dataclass
+
+
+@dataclass
+class IDataset(ABC):
+    """Dataclass interface for dataset types.
+
+    All dataset contracts must implement :meth:`read` and :meth:`write`.
+    No concrete fields are defined here – field declarations live in
+    :class:`BaseDataset` and its subclasses.
+    """
+
+    @abstractmethod
+    def read(self) -> Any:
+        """Read and return data from the dataset."""
+
+    @abstractmethod
+    def write(self, data: Any) -> None:
+        """Write data to the dataset."""
+
+
+@pydantic_dataclass
+class BaseDataset(IDataset):
+    """Base dataset with Pydantic-validated fields.
+
+    Provides the canonical ``uri`` and ``schema_metadata`` fields.
+    Concrete dataset implementations must inherit from this class and
+    implement the :meth:`read` and :meth:`write` abstract methods
+    inherited from :class:`IDataset`.
+    """
+
+    uri: str
+    schema_metadata: dict[str, Any] = field(default_factory=dict)
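To illustrate the contract, here is a hypothetical concrete dataset. JsonFileDataset is not part of the package; it is only a sketch of how read/write would be filled in over the uri field.

# Sketch: a minimal concrete dataset built on BaseDataset.
import json
from pathlib import Path
from typing import Any

from pydantic.dataclasses import dataclass as pydantic_dataclass

from aptdata.core.dataset import BaseDataset


@pydantic_dataclass
class JsonFileDataset(BaseDataset):
    """Reads/writes a JSON document at ``uri`` (a local path here)."""

    def read(self) -> Any:
        return json.loads(Path(self.uri).read_text(encoding="utf-8"))

    def write(self, data: Any) -> None:
        Path(self.uri).write_text(json.dumps(data), encoding="utf-8")


ds = JsonFileDataset(uri="out.json", schema_metadata={"format": "json"})
ds.write({"ok": True})
print(ds.read())  # {'ok': True}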
aptdata/core/lineage.py
ADDED
@@ -0,0 +1,213 @@
+"""Data lineage graph for tracking dataset transformations and provenance.
+
+Provides :class:`LineageGraph` which is built up during a workflow run and
+stores :class:`LineageNode` instances that capture every read, transform,
+quality-check, and write event.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any
+from uuid import uuid4
+
+
+class LineageEventType(str, Enum):
+    """Enumeration of lineage event types."""
+
+    READ = "READ"
+    TRANSFORM = "TRANSFORM"
+    QUALITY_CHECK = "QUALITY_CHECK"
+    BUSINESS_RULE = "BUSINESS_RULE"
+    WRITE = "WRITE"
+    SCHEMA_CHANGE = "SCHEMA_CHANGE"
+
+
+@dataclass
+class ColumnLineage:
+    """Column-level lineage mapping a target column to its source columns.
+
+    Parameters
+    ----------
+    target_column:
+        Name of the output column.
+    source_columns:
+        Names of the input columns that contributed to *target_column*.
+    transformation:
+        Human-readable description of the transformation applied.
+    source_dataset_uri:
+        URI of the dataset that contains the source columns.
+    """
+
+    target_column: str
+    source_columns: list[str] = field(default_factory=list)
+    transformation: str = ""
+    source_dataset_uri: str = ""
+
+
+@dataclass
+class LineageNode:
+    """A single event node in the lineage graph.
+
+    Parameters
+    ----------
+    dataset_uri:
+        Logical URI of the dataset involved in this event.
+    event_type:
+        The kind of lineage event (see :class:`LineageEventType`).
+    workflow_name:
+        Name of the workflow that produced this node.
+    step_name:
+        Name of the individual step within the workflow.
+    transformer_name:
+        Name of the transformer (for TRANSFORM events).
+    engine:
+        Processing engine (e.g. ``"pandas"`` or ``"pyspark"``).
+    rows_in:
+        Number of input rows.
+    rows_out:
+        Number of output rows.
+    columns_in:
+        List of input column names.
+    columns_out:
+        List of output column names.
+    column_lineage:
+        Column-level lineage mappings.
+    business_rules_applied:
+        IDs of business rules applied during this event.
+    quality_checks:
+        Names of quality checks executed.
+    quality_status:
+        Overall quality check outcome (e.g. ``"PASSED"`` or ``"FAILED"``).
+    trace_id:
+        OpenTelemetry trace identifier.
+    span_id:
+        OpenTelemetry span identifier.
+    parent_node_ids:
+        IDs of upstream :class:`LineageNode` objects.
+    metadata:
+        Arbitrary extra metadata.
+    node_id:
+        Auto-generated unique identifier (UUID hex).
+    timestamp:
+        UTC ISO-8601 timestamp of the event.
+    """
+
+    dataset_uri: str
+    event_type: LineageEventType
+    workflow_name: str = ""
+    step_name: str = ""
+    transformer_name: str = ""
+    engine: str = ""
+    rows_in: int = 0
+    rows_out: int = 0
+    columns_in: list[str] = field(default_factory=list)
+    columns_out: list[str] = field(default_factory=list)
+    column_lineage: list[ColumnLineage] = field(default_factory=list)
+    business_rules_applied: list[str] = field(default_factory=list)
+    quality_checks: list[str] = field(default_factory=list)
+    quality_status: str = ""
+    trace_id: str = ""
+    span_id: str = ""
+    parent_node_ids: list[str] = field(default_factory=list)
+    metadata: dict[str, Any] = field(default_factory=dict)
+    node_id: str = field(default_factory=lambda: uuid4().hex)
+    timestamp: str = field(
+        default_factory=lambda: datetime.now(timezone.utc).isoformat()
+    )
+
+    def to_dict(self) -> dict[str, Any]:
+        """Serialize this node to a plain dictionary."""
+        return {
+            "node_id": self.node_id,
+            "dataset_uri": self.dataset_uri,
+            "event_type": self.event_type.value,
+            "workflow_name": self.workflow_name,
+            "step_name": self.step_name,
+            "transformer_name": self.transformer_name,
+            "engine": self.engine,
+            "rows_in": self.rows_in,
+            "rows_out": self.rows_out,
+            "columns_in": self.columns_in,
+            "columns_out": self.columns_out,
+            "column_lineage": [
+                {
+                    "target_column": cl.target_column,
+                    "source_columns": cl.source_columns,
+                    "transformation": cl.transformation,
+                    "source_dataset_uri": cl.source_dataset_uri,
+                }
+                for cl in self.column_lineage
+            ],
+            "business_rules_applied": self.business_rules_applied,
+            "quality_checks": self.quality_checks,
+            "quality_status": self.quality_status,
+            "trace_id": self.trace_id,
+            "span_id": self.span_id,
+            "parent_node_ids": self.parent_node_ids,
+            "metadata": self.metadata,
+            "timestamp": self.timestamp,
+        }
+
+
+@dataclass
+class LineageGraph:
+    """A complete lineage graph for a single workflow run.
+
+    Parameters
+    ----------
+    run_id:
+        Unique identifier for the workflow run.
+    workflow_name:
+        Human-readable name of the workflow.
+    nodes:
+        Ordered list of :class:`LineageNode` objects.
+    started_at:
+        UTC ISO-8601 timestamp when the run started.
+    """
+
+    run_id: str
+    workflow_name: str
+    nodes: list[LineageNode] = field(default_factory=list)
+    started_at: str = field(
+        default_factory=lambda: datetime.now(timezone.utc).isoformat()
+    )
+
+    def add_node(self, node: LineageNode) -> None:
+        """Append *node* to the graph."""
+        self.nodes.append(node)
+
+    def get_upstream(self, node_id: str) -> list[LineageNode]:
+        """Return all nodes that are direct parents of *node_id*.
+
+        Nodes are considered parents when their ``node_id`` appears in
+        the target node's :attr:`~LineageNode.parent_node_ids` list.
+        """
+        target = next((n for n in self.nodes if n.node_id == node_id), None)
+        if target is None:
+            return []
+        parent_ids = set(target.parent_node_ids)
+        return [n for n in self.nodes if n.node_id in parent_ids]
+
+    def get_downstream(self, node_id: str) -> list[LineageNode]:
+        """Return all nodes that list *node_id* as a parent."""
+        return [n for n in self.nodes if node_id in n.parent_node_ids]
+
+    def to_dict(self) -> dict[str, Any]:
+        """Serialize the full graph to a plain dictionary."""
+        return {
+            "run_id": self.run_id,
+            "workflow_name": self.workflow_name,
+            "started_at": self.started_at,
+            "nodes": [n.to_dict() for n in self.nodes],
+        }
+
+
+__all__ = [
+    "LineageEventType",
+    "ColumnLineage",
+    "LineageNode",
+    "LineageGraph",
+]
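A small sketch of graph construction and traversal, wiring a TRANSFORM node to its upstream READ via parent_node_ids.

# Sketch: recording a READ -> TRANSFORM chain and querying it.
from aptdata.core.lineage import LineageEventType, LineageGraph, LineageNode

graph = LineageGraph(run_id="run-001", workflow_name="demo")

read = LineageNode(
    dataset_uri="file://raw/orders.csv",
    event_type=LineageEventType.READ,
    rows_out=100,
)
transform = LineageNode(
    dataset_uri="file://staged/orders.parquet",
    event_type=LineageEventType.TRANSFORM,
    engine="pandas",
    rows_in=100,
    rows_out=97,
    parent_node_ids=[read.node_id],
)
graph.add_node(read)
graph.add_node(transform)

assert graph.get_upstream(transform.node_id) == [read]
assert graph.get_downstream(read.node_id) == [transform]
print(len(graph.to_dict()["nodes"]))  # 2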
aptdata/core/state.py
ADDED
@@ -0,0 +1,27 @@
+"""Persistent workflow execution state backend."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+
+class StateBackend:
+    """Simple JSON-on-disk backend used for checkpointing workflow state."""
+
+    def __init__(self, base_dir: str | Path = ".aptdata_state") -> None:
+        self.base_dir = Path(base_dir)
+        self.base_dir.mkdir(parents=True, exist_ok=True)
+
+    def save(self, run_id: str, state: dict[str, Any]) -> None:
+        """Persist *state* for *run_id*."""
+        path = self.base_dir / f"{run_id}.json"
+        path.write_text(
+            json.dumps(state, ensure_ascii=False, default=str), encoding="utf-8"
+        )
+
+    def load(self, run_id: str) -> dict[str, Any]:
+        """Load state for *run_id*."""
+        path = self.base_dir / f"{run_id}.json"
+        return json.loads(path.read_text(encoding="utf-8"))
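A round-trip sketch: save writes <base_dir>/<run_id>.json and load reads it back. Note that save serializes non-JSON-native values with default=str, so such values come back as strings.

# Sketch: checkpoint and restore workflow state as JSON files on disk.
import tempfile

from aptdata.core.state import StateBackend

backend = StateBackend(base_dir=tempfile.mkdtemp())
backend.save("run-001", {"step": "transform", "rows": 97})
print(backend.load("run-001"))  # {'step': 'transform', 'rows': 97}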