vd-dlt 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vd_dlt-0.1.0/PKG-INFO +71 -0
- vd_dlt-0.1.0/README.md +43 -0
- vd_dlt-0.1.0/pyproject.toml +47 -0
- vd_dlt-0.1.0/src/vd_dlt/__init__.py +33 -0
- vd_dlt-0.1.0/src/vd_dlt/config_resolver.py +238 -0
- vd_dlt-0.1.0/src/vd_dlt/config_utils.py +364 -0
- vd_dlt-0.1.0/src/vd_dlt/credential_resolver.py +126 -0
- vd_dlt-0.1.0/src/vd_dlt/fabric_utils.py +177 -0
- vd_dlt-0.1.0/src/vd_dlt/observability.py +323 -0
- vd_dlt-0.1.0/src/vd_dlt/pipeline_runner.py +404 -0
- vd_dlt-0.1.0/src/vd_dlt/runner.py +296 -0
- vd_dlt-0.1.0/src/vd_dlt/schema_validator.py +387 -0
- vd_dlt-0.1.0/src/vd_dlt/schemas/target_schema.json +62 -0
- vd_dlt-0.1.0/src/vd_dlt/utils.py +148 -0
vd_dlt-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vd-dlt
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Core DLT ingestion framework for VibeData pipelines
|
|
5
|
+
Project-URL: Homepage, https://github.com/accelerate-data/vd-dlt-connectors
|
|
6
|
+
Project-URL: Repository, https://github.com/accelerate-data/vd-dlt-connectors
|
|
7
|
+
Author-email: VibeData <info@vibedata.dev>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Database
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Requires-Dist: pyyaml>=6.0
|
|
20
|
+
Provides-Extra: notion
|
|
21
|
+
Requires-Dist: vd-dlt-notion; extra == 'notion'
|
|
22
|
+
Provides-Extra: notion-schema
|
|
23
|
+
Requires-Dist: vd-dlt-notion-schema; extra == 'notion-schema'
|
|
24
|
+
Provides-Extra: pipeline
|
|
25
|
+
Requires-Dist: dlt[deltalake,filesystem]; extra == 'pipeline'
|
|
26
|
+
Requires-Dist: pyarrow>=17.0.0; extra == 'pipeline'
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# vd-dlt
|
|
30
|
+
|
|
31
|
+
Core DLT ingestion framework for VibeData pipelines. Provides config resolution, credential management (Azure Key Vault), pipeline execution, and observability.
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
# Core only
|
|
37
|
+
pip install vd-dlt
|
|
38
|
+
|
|
39
|
+
# With pipeline dependencies (dlt, pyarrow)
|
|
40
|
+
pip install vd-dlt[pipeline]
|
|
41
|
+
|
|
42
|
+
# With Notion connector
|
|
43
|
+
pip install vd-dlt[notion]
|
|
44
|
+
|
|
45
|
+
# With Notion schema (defaults, docs)
|
|
46
|
+
pip install vd-dlt[notion-schema]
|
|
47
|
+
|
|
48
|
+
# Everything for Notion
|
|
49
|
+
pip install vd-dlt[pipeline,notion,notion-schema]
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Quick Start
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from vd_dlt import PipelineRunner
|
|
56
|
+
|
|
57
|
+
runner = PipelineRunner(
|
|
58
|
+
vault_url="https://my-vault.vault.azure.net/",
|
|
59
|
+
)
|
|
60
|
+
result = runner.run("my_source_name")
|
|
61
|
+
print(f"Loaded {result.total_rows_loaded} rows")
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Architecture
|
|
65
|
+
|
|
66
|
+
The framework uses a 4-level config hierarchy (most specific wins):
|
|
67
|
+
|
|
68
|
+
1. **Connector Defaults** - from connector schema package
|
|
69
|
+
2. **Source Config** - from source YAML (including `default_sync`)
|
|
70
|
+
3. **Group Config** - optional grouping within source
|
|
71
|
+
4. **Resource Config** - per-table overrides
|
vd_dlt-0.1.0/README.md
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# vd-dlt
|
|
2
|
+
|
|
3
|
+
Core DLT ingestion framework for VibeData pipelines. Provides config resolution, credential management (Azure Key Vault), pipeline execution, and observability.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Core only
|
|
9
|
+
pip install vd-dlt
|
|
10
|
+
|
|
11
|
+
# With pipeline dependencies (dlt, pyarrow)
|
|
12
|
+
pip install vd-dlt[pipeline]
|
|
13
|
+
|
|
14
|
+
# With Notion connector
|
|
15
|
+
pip install vd-dlt[notion]
|
|
16
|
+
|
|
17
|
+
# With Notion schema (defaults, docs)
|
|
18
|
+
pip install vd-dlt[notion-schema]
|
|
19
|
+
|
|
20
|
+
# Everything for Notion
|
|
21
|
+
pip install vd-dlt[pipeline,notion,notion-schema]
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from vd_dlt import PipelineRunner
|
|
28
|
+
|
|
29
|
+
runner = PipelineRunner(
|
|
30
|
+
vault_url="https://my-vault.vault.azure.net/",
|
|
31
|
+
)
|
|
32
|
+
result = runner.run("my_source_name")
|
|
33
|
+
print(f"Loaded {result.total_rows_loaded} rows")
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Architecture
|
|
37
|
+
|
|
38
|
+
The framework uses a 4-level config hierarchy, listed below from lowest to highest priority (the most specific level wins):
|
|
39
|
+
|
|
40
|
+
1. **Connector Defaults** - from connector schema package
|
|
41
|
+
2. **Source Config** - from source YAML (including `default_sync`)
|
|
42
|
+
3. **Group Config** - optional grouping within source
|
|
43
|
+
4. **Resource Config** - per-table overrides
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vd-dlt"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Core DLT ingestion framework for VibeData pipelines"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "VibeData", email = "info@vibedata.dev" },
|
|
14
|
+
]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 4 - Beta",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.9",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: Database",
|
|
24
|
+
"Topic :: Software Development :: Libraries",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"pyyaml>=6.0",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
pipeline = [
|
|
32
|
+
"dlt[filesystem,deltalake]",
|
|
33
|
+
"pyarrow>=17.0.0",
|
|
34
|
+
]
|
|
35
|
+
notion = [
|
|
36
|
+
"vd-dlt-notion",
|
|
37
|
+
]
|
|
38
|
+
notion-schema = [
|
|
39
|
+
"vd-dlt-notion-schema",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[project.urls]
|
|
43
|
+
Homepage = "https://github.com/accelerate-data/vd-dlt-connectors"
|
|
44
|
+
Repository = "https://github.com/accelerate-data/vd-dlt-connectors"
|
|
45
|
+
|
|
46
|
+
[tool.hatch.build.targets.wheel]
|
|
47
|
+
packages = ["src/vd_dlt"]
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""
|
|
2
|
+
vd-dlt: Core DLT Ingestion Framework
|
|
3
|
+
|
|
4
|
+
Provides:
|
|
5
|
+
- Config resolution (4-level hierarchy)
|
|
6
|
+
- Credential resolution (Key Vault integration)
|
|
7
|
+
- Pipeline execution
|
|
8
|
+
- Observability (logging, sync history)
|
|
9
|
+
- Schema validation
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
__version__ = "0.1.0"
|
|
13
|
+
|
|
14
|
+
from .config_resolver import ConfigResolver, resolve_resource_config
|
|
15
|
+
from .credential_resolver import CredentialResolver, resolve_credentials
|
|
16
|
+
from .pipeline_runner import PipelineRunner, run_source
|
|
17
|
+
from .observability import setup_logging, SyncHistoryTracker, SyncRecord, ResourceResult
|
|
18
|
+
from .schema_validator import ValidationError, validate_all
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"ConfigResolver",
|
|
22
|
+
"resolve_resource_config",
|
|
23
|
+
"CredentialResolver",
|
|
24
|
+
"resolve_credentials",
|
|
25
|
+
"PipelineRunner",
|
|
26
|
+
"run_source",
|
|
27
|
+
"setup_logging",
|
|
28
|
+
"SyncHistoryTracker",
|
|
29
|
+
"SyncRecord",
|
|
30
|
+
"ResourceResult",
|
|
31
|
+
"ValidationError",
|
|
32
|
+
"validate_all",
|
|
33
|
+
]
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Config Resolution Engine - 4-Level Hierarchy
|
|
3
|
+
|
|
4
|
+
Resolution order (most specific wins):
|
|
5
|
+
1. Connector Defaults - connectors/{name}/defaults.yml
|
|
6
|
+
2. Source - sources/{source_name}.yml (including default_sync)
|
|
7
|
+
3. Group - Optional grouping within source config
|
|
8
|
+
4. Resource - Per-table overrides in source config
|
|
9
|
+
|
|
10
|
+
Priority: Resource > Group > Source > Connector Defaults
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
from typing import Any, Dict, List, Optional
|
|
15
|
+
from .utils import (
|
|
16
|
+
deep_merge,
|
|
17
|
+
load_yaml,
|
|
18
|
+
get_connectors_path,
|
|
19
|
+
get_configs_path,
|
|
20
|
+
file_exists,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ConfigResolver:
    """
    Resolves configuration from the 4-level hierarchy.

    Levels, from lowest to highest priority:
        1. Connector defaults
        2. Source config
        3. Group config
        4. Resource config

    Usage:
        resolver = ConfigResolver()
        source_config = resolver.load_source_config("notion_wiki")
        resource_config = resolver.resolve_resource_config(source_config, "database_rows")
    """

    # Top-level source keys that are NOT merged into a resource's effective
    # config when the source-level layer (Level 2) is applied.
    _NON_INHERITED_SOURCE_KEYS = frozenset(
        {
            "groups",
            "resources",
            "source_name",
            "connector",
            "connector_version",
            "secrets",
        }
    )

    def __init__(self, base_path: Optional[str] = None):
        """
        Initialize config resolver.

        Args:
            base_path: Base path for configs. When given, connectors are read
                from "{base_path}/connectors" and configs from
                "{base_path}/configs"; otherwise both locations come from
                get_connectors_path() / get_configs_path().
        """
        if base_path:
            self.connectors_path = f"{base_path}/connectors"
            self.configs_path = f"{base_path}/configs"
        else:
            self.connectors_path = get_connectors_path()
            self.configs_path = get_configs_path()

        # connector name -> loaded defaults; filled lazily by
        # load_connector_defaults().
        self._connector_cache: Dict[str, Dict] = {}

    def load_connector_defaults(self, connector_name: str) -> Dict[str, Any]:
        """
        Load connector defaults (Level 1).

        The result is cached per connector name for the lifetime of this
        resolver. The cached dictionary itself is returned (not a copy).

        Args:
            connector_name: Name of the connector (e.g., "notion")

        Returns:
            Connector defaults dictionary; empty if the connector has no
            defaults.yml.
        """
        if connector_name in self._connector_cache:
            return self._connector_cache[connector_name]

        defaults_path = f"{self.connectors_path}/{connector_name}/defaults.yml"
        if file_exists(defaults_path):
            # NOTE(review): presumably load_yaml returns None for an empty
            # document; normalise so callers can always use .get().
            defaults = load_yaml(defaults_path) or {}
        else:
            defaults = {}

        self._connector_cache[connector_name] = defaults
        return defaults

    def load_connector_manifest(self, connector_name: str) -> Dict[str, Any]:
        """
        Load connector manifest (metadata).

        Unlike connector defaults, the manifest is not cached.

        Args:
            connector_name: Name of the connector

        Returns:
            Connector manifest dictionary; empty if manifest.yml is missing.
        """
        manifest_path = f"{self.connectors_path}/{connector_name}/manifest.yml"
        if file_exists(manifest_path):
            # Same empty-document normalisation as load_connector_defaults.
            return load_yaml(manifest_path) or {}
        return {}

    def load_source_config(self, source_name: str) -> Dict[str, Any]:
        """
        Load source config (Level 2).

        Args:
            source_name: Name of the source (e.g., "notion_wiki")

        Returns:
            Source config dictionary

        Raises:
            FileNotFoundError: If source config doesn't exist
        """
        source_path = f"{self.configs_path}/sources/{source_name}.yml"
        if not file_exists(source_path):
            raise FileNotFoundError(f"Source config not found: {source_path}")

        return load_yaml(source_path)

    def get_group_for_resource(
        self, source_config: Dict[str, Any], resource_name: str
    ) -> Optional[Dict[str, Any]]:
        """
        Find the group config for a resource (Level 3).

        The first group (in mapping order) whose "resources" list contains
        the resource wins.

        Args:
            source_config: Source configuration
            resource_name: Name of the resource

        Returns:
            Group settings without the "resources" list if the resource
            belongs to a group, None otherwise
        """
        # "or {}" / "or []": a key that is present but empty in YAML loads as
        # None, which dict.get's default argument does not cover.
        groups = source_config.get("groups") or {}
        for group_config in groups.values():
            group_config = group_config or {}
            if resource_name in (group_config.get("resources") or []):
                # Return group settings without the 'resources' list
                return {k: v for k, v in group_config.items() if k != "resources"}
        return None

    def resolve_resource_config(
        self, source_config: Dict[str, Any], resource_name: str
    ) -> Dict[str, Any]:
        """
        Resolve effective config for a resource.

        Applies the 4-level hierarchy:
        Priority: Resource > Group > Source > Connector Defaults

        Args:
            source_config: Source configuration (from load_source_config)
            resource_name: Name of the resource to resolve

        Returns:
            Fully resolved configuration for the resource, plus the metadata
            keys "_resource_name", "_source_name" and "_connector".
        """
        connector_name = source_config.get("connector", "rest_api_generic")

        # Level 1: Start with connector defaults. Merging into a fresh dict
        # keeps the cached connector defaults from being used as the target.
        connector_defaults = self.load_connector_defaults(connector_name)
        effective = deep_merge({}, connector_defaults.get("defaults") or {})

        # Apply resource-specific connector defaults if they exist
        per_resource_defaults = connector_defaults.get("resource_defaults") or {}
        effective = deep_merge(effective, per_resource_defaults.get(resource_name) or {})

        # Level 2: Apply source-level config (including default_sync)
        source_level = {
            k: v
            for k, v in source_config.items()
            if k not in self._NON_INHERITED_SOURCE_KEYS
        }
        effective = deep_merge(effective, source_level)

        # Level 3: Apply group config (if resource belongs to a group)
        group_config = self.get_group_for_resource(source_config, resource_name)
        if group_config:
            effective = deep_merge(effective, group_config)

        # Level 4: Apply resource-specific overrides
        # Handle both list format (new) and dict format (legacy)
        resources = source_config.get("resources", [])
        resource_config: Dict[str, Any] = {}
        if isinstance(resources, list):
            for res in resources:
                if res.get("name") == resource_name:
                    resource_config = {k: v for k, v in res.items() if k != "name"}
                    break
        elif isinstance(resources, dict):
            # A legacy entry with no value ("projects:") loads as None and is
            # treated as enabled by get_enabled_resources(); treat it here as
            # "no overrides" instead of handing None to deep_merge.
            resource_config = resources.get(resource_name) or {}
        effective = deep_merge(effective, resource_config)

        # Add metadata
        effective["_resource_name"] = resource_name
        effective["_source_name"] = source_config.get("source_name", "unknown")
        effective["_connector"] = connector_name

        return effective

    def get_enabled_resources(self, source_config: Dict[str, Any]) -> List[str]:
        """
        Get list of enabled resources from source config.

        Supports both list format (new) and dict format (legacy):
        - List: [{"name": "projects", "enabled": true}, ...]
        - Dict: {"projects": {"enabled": true}, ...}

        A resource is enabled unless it explicitly sets "enabled" to a falsy
        value. In list format, entries without a "name" are skipped. In dict
        format, an entry with no value (None) counts as enabled.

        Args:
            source_config: Source configuration

        Returns:
            List of enabled resource names, in config order
        """
        resources = source_config.get("resources", [])

        if isinstance(resources, list):
            # New list format
            return [
                resource["name"]
                for resource in resources
                if resource.get("enabled", True) and resource.get("name")
            ]

        enabled: List[str] = []
        if isinstance(resources, dict):
            # Legacy dict format
            for name, config in resources.items():
                if config is None:
                    enabled.append(name)
                elif isinstance(config, dict) and config.get("enabled", True):
                    enabled.append(name)

        return enabled
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def resolve_resource_config(
    source_name: str,
    resource_name: str,
    base_path: Optional[str] = None,
) -> Dict[str, Any]:
    """
    One-shot helper: build a ConfigResolver, load a source, resolve a resource.

    Args:
        source_name: Name of the source whose config file is loaded
        resource_name: Name of the resource to resolve within that source
        base_path: Optional base path passed through to ConfigResolver

    Returns:
        Resolved resource configuration
    """
    config_resolver = ConfigResolver(base_path)
    return config_resolver.resolve_resource_config(
        config_resolver.load_source_config(source_name),
        resource_name,
    )
|