dg-kit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dg_kit/__init__.py +0 -0
- dg_kit/base/__init__.py +0 -0
- dg_kit/base/business_information.py +60 -0
- dg_kit/base/convention.py +57 -0
- dg_kit/base/data_catalog.py +7 -0
- dg_kit/base/dataclasses/__init__.py +7 -0
- dg_kit/base/dataclasses/business_information.py +77 -0
- dg_kit/base/dataclasses/convention.py +30 -0
- dg_kit/base/dataclasses/data_catalog.py +64 -0
- dg_kit/base/dataclasses/logical_model.py +86 -0
- dg_kit/base/dataclasses/physical_model.py +38 -0
- dg_kit/base/enums.py +13 -0
- dg_kit/base/logical_model.py +66 -0
- dg_kit/base/physical_model.py +41 -0
- dg_kit/integrations/__init__.py +0 -0
- dg_kit/integrations/dbt/README.md +27 -0
- dg_kit/integrations/dbt/__init__.py +0 -0
- dg_kit/integrations/dbt/parser.py +202 -0
- dg_kit/integrations/notion/README.md +38 -0
- dg_kit/integrations/notion/__init__.py +0 -0
- dg_kit/integrations/notion/api.py +495 -0
- dg_kit/integrations/notion/formater.py +65 -0
- dg_kit/integrations/odm/README.md +46 -0
- dg_kit/integrations/odm/__init__.py +0 -0
- dg_kit/integrations/odm/attr_types.py +6 -0
- dg_kit/integrations/odm/parser.py +490 -0
- dg_kit-0.1.0.dist-info/METADATA +99 -0
- dg_kit-0.1.0.dist-info/RECORD +29 -0
- dg_kit-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import yaml
|
|
7
|
+
|
|
8
|
+
from dg_kit.base.physical_model import PhysicalModel
|
|
9
|
+
from dg_kit.base.dataclasses.physical_model import Table, Column, Layer
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
_REF_RE = re.compile(
|
|
13
|
+
r"""ref\(\s*(['"])(?P<a>[^'"]+)\1\s*(?:,\s*(['"])(?P<b>[^'"]+)\3\s*)?\)""",
|
|
14
|
+
re.IGNORECASE,
|
|
15
|
+
)
|
|
16
|
+
_SOURCE_RE = re.compile(
|
|
17
|
+
r"""source\(\s*(['"])(?P<src>[^'"]+)\1\s*,\s*(['"])(?P<table>[^'"]+)\3\s*\)""",
|
|
18
|
+
re.IGNORECASE,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class DBTPhysicalModel(PhysicalModel):
    """Physical model that also keeps a lookup of its tables by name."""

    def __init__(self, version: str):
        """Initialise the base model for ``version`` with an empty name index."""
        super().__init__(version)
        # Table objects keyed by their bare name (no layer/source prefix).
        self.all_tables_by_name: dict[str, Table] = dict()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class DBTParser:
    """Parse a dbt project directory into a :class:`DBTPhysicalModel`.

    The parser reads ``dbt_project.yml`` and the ``models`` folder of the
    project. Source definitions become landing layers with their tables and
    columns, folder keys found under ``models:`` in ``dbt_project.yml`` become
    non-landing layers, and ``ref()``/``source()`` calls found in model SQL
    files are registered as dependencies between tables.
    """

    def __init__(self, dbt_project_path: Path, version: str = "unknown"):
        """Validate the project layout and load ``dbt_project.yml``.

        Args:
            dbt_project_path: Root directory of the dbt project. Non-``Path``
                values are converted with ``Path(...)``.
            version: Version label passed to the physical model.

        Raises:
            ValueError: If ``dbt_project_path`` is not an existing directory.
            FileNotFoundError: If the ``models`` folder or ``dbt_project.yml``
                is missing from the project directory.
        """
        if not isinstance(dbt_project_path, Path):
            dbt_project_path = Path(dbt_project_path)
        if not dbt_project_path.is_dir():
            raise ValueError(
                f"dbt_project_path must be a valid dbt project directory, got: {dbt_project_path}"
            )

        self.dbt_project_path = dbt_project_path
        self.models_path = self.dbt_project_path / "models"

        if not self.models_path.is_dir():
            raise FileNotFoundError(
                f"Expected 'models' folder in dbt project, but not found: {self.models_path}"
            )

        self.dbt_project_yml_path = self.dbt_project_path / "dbt_project.yml"

        if not self.dbt_project_yml_path.is_file():
            raise FileNotFoundError(
                f"Expected 'dbt_project.yml' config in dbt project, but not found: {self.dbt_project_yml_path}"
            )

        dbt_project_raw_yml = self.dbt_project_yml_path.read_text(encoding="utf-8")
        # An empty YAML document loads as None; normalise it to an empty
        # mapping so later lookups do not fail on a NoneType.
        self.dbt_project_conf = yaml.safe_load(dbt_project_raw_yml) or {}

        self.PM = DBTPhysicalModel(version)

    def _register_columns(self, columns, table_nk: str, layer_id, table_id) -> None:
        """Register every column definition of one table.

        Args:
            columns: List of column mappings from a dbt YAML file, or ``None``
                when the table declares no columns.
            table_nk: Natural key of the owning table; column natural keys are
                built as ``<table_nk>.<column name>``.
            layer_id: Identifier of the layer that owns the table.
            table_id: Identifier of the owning table.
        """
        for column in columns or []:
            column_name = str(column["name"]).strip()
            column_nk = f"{table_nk}.{column_name}"

            # Columns already known under this natural key are left untouched.
            if column_nk in self.PM.all_units_by_natural_key:
                continue

            # ``data_type`` and ``description`` are optional in dbt YAML, so a
            # missing or null value is stored as an empty string.
            col_obj = Column(
                natural_key=column_nk,
                layer_id=layer_id,
                table_id=table_id,
                name=column_name,
                data_type=str(column.get("data_type") or ""),
                description=str(column.get("description") or ""),
            )
            self.PM.register_column(col_obj)

    def _parse_source_model_yml(self, source_yml_path: Path) -> None:
        """Register layers, tables and columns declared under ``sources:``.

        Each source becomes a landing layer named after the source; each of
        its tables gets the natural key ``<source name>.<table name>``.
        Files without a ``sources`` list are ignored.
        """
        raw = source_yml_path.read_text(encoding="utf-8")
        doc = yaml.safe_load(raw) or {}
        if not isinstance(doc, dict):
            return

        sources = doc.get("sources") or []
        if not isinstance(sources, list):
            return

        for source in sources:
            source_name = source["name"]

            # 1) Layer for the source
            # NOTE(review): the layer is registered unconditionally; confirm
            # that register_layer tolerates the same source name appearing in
            # more than one file.
            layer_obj = Layer(
                natural_key=source_name,
                name=source_name,
                is_landing=True,
            )
            self.PM.register_layer(layer_obj)

            # ``tables`` is optional in a source definition.
            for table in source.get("tables") or []:
                # 2) Table
                table_name = table["name"]
                table_nk = f"{source_name}.{table_name}"
                table_obj = Table(
                    natural_key=table_nk,
                    layer_id=layer_obj.id,
                    name=table_name,
                )
                self.PM.register_table(table_obj)
                self.PM.all_tables_by_name[table_name] = table_obj

                # 3) Columns
                self._register_columns(
                    table.get("columns"), table_nk, layer_obj.id, table_obj.id
                )

    def _parse_model_yml(self, model_yml_path: Path, layer_id: str) -> None:
        """Register tables and columns declared under ``models:``.

        Args:
            model_yml_path: YAML properties file located inside a layer folder.
            layer_id: Identifier of the already registered layer the models
                belong to.

        Files that are empty or carry no ``models`` entries (for example files
        that only describe sources) are skipped.
        """
        raw = model_yml_path.read_text(encoding="utf-8")
        doc = yaml.safe_load(raw) or {}
        if not isinstance(doc, dict):
            return

        models = doc.get("models") or []
        if not models:
            return

        layer_name = self.PM.all_units_by_id[layer_id].name

        for model in models:
            model_name = model["name"].strip()

            # 1) Ensure table exists
            table_nk = layer_name + "." + model_name
            table_obj = Table(
                natural_key=table_nk,
                layer_id=layer_id,
                name=model_name,
            )
            self.PM.register_table(table_obj)
            self.PM.all_tables_by_name[model_name] = table_obj

            # 2) Parse columns
            self._register_columns(
                model.get("columns"), table_nk, layer_id, table_obj.id
            )

    def _resolve_ref(self, first: str, second: str | None):
        """Return the table targeted by a ``ref()`` call, or ``None``.

        ``ref('package', 'model')`` is first looked up by the natural key
        ``package.model``. Table natural keys are prefixed with their layer or
        source name, so a bare ``ref('model')`` can never match one; the
        model name is therefore also looked up in ``all_tables_by_name``.
        """
        dep_nk = f"{first}.{second}" if second else first
        candidate = self.PM.all_units_by_natural_key.get(dep_nk)
        if isinstance(candidate, Table):
            return candidate
        return self.PM.all_tables_by_name.get(second or first)

    def _parse_model_sql(self, model_name: str, model_sql_path: Path) -> None:
        """Register dependencies found in a model's SQL via ref()/source().

        Args:
            model_name: Name of the model the SQL file defines (file stem).
            model_sql_path: Path to the model's ``.sql`` file.

        SQL files whose model was never declared in a YAML file are skipped,
        because there is no table to attach the dependencies to.
        """
        model_table = self.PM.all_tables_by_name.get(model_name)
        if model_table is None:
            return

        text = model_sql_path.read_text(encoding="utf-8")

        # ref('model') or ref('package', 'model')
        for m in _REF_RE.finditer(text):
            table_obj = self._resolve_ref(m.group("a"), m.group("b"))
            if isinstance(table_obj, Table):
                self.PM.register_dependency(model_table, table_obj)

        # source('source_name', 'table_name') maps directly onto the natural
        # key used when the source tables were registered.
        for m in _SOURCE_RE.finditer(text):
            dep_nk = f"{m.group('src')}.{m.group('table')}"
            table_obj = self.PM.all_units_by_natural_key.get(dep_nk)
            if isinstance(table_obj, Table):
                self.PM.register_dependency(model_table, table_obj)

    def parse_pm(self) -> PhysicalModel:
        """Parse the whole project and return the populated physical model.

        Order matters: sources and model YAML files are parsed first so that
        every table exists before the SQL files are scanned for dependencies.
        """
        # 1) parse source definitions (top-level YAML files of ``models``)
        for source_yml_path in self.models_path.glob("*.yml"):
            self._parse_source_model_yml(source_yml_path)

        models_conf = self.dbt_project_conf.get("models") or {}
        for project, project_conf in models_conf.items():
            # Keys starting with "+" are dbt config keys and scalar values
            # cannot hold folder entries, so neither describes a project.
            if str(project).startswith("+") or not isinstance(project_conf, dict):
                continue

            for layer_name, layer_conf in project_conf.items():
                # Skip config keys ("+materialized", "materialized: view", ...):
                # a folder entry is always a mapping or left empty.
                if str(layer_name).startswith("+"):
                    continue
                if layer_conf is not None and not isinstance(layer_conf, dict):
                    continue

                layer_obj = Layer(
                    natural_key=layer_name, name=layer_name, is_landing=False
                )
                self.PM.register_layer(layer_obj)

                layer_folder_path = self.models_path / layer_name
                for model_yml_file in layer_folder_path.rglob("*.yml"):
                    self._parse_model_yml(model_yml_file, layer_id=layer_obj.id)

        # 2) SQL second
        for sql_path in self.models_path.rglob("*.sql"):
            self._parse_model_sql(sql_path.stem, sql_path)

        return self.PM
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
## Notion Integration
|
|
2
|
+
|
|
3
|
+
Syncs Data Governance objects to a Notion data source (database) and keeps properties and page bodies in sync.
|
|
4
|
+
|
|
5
|
+
This integration is useful for pushing logical and physical model details into a shared catalog space
|
|
6
|
+
where teams can browse governance metadata.
|
|
7
|
+
|
|
8
|
+
## Requirements
|
|
9
|
+
- A Notion integration token
|
|
10
|
+
- A Notion data source (database) ID
|
|
11
|
+
- Proper access granted to the data source for the integration
|
|
12
|
+
|
|
13
|
+
## Usage
|
|
14
|
+
```python
|
|
15
|
+
from dg_kit.integrations.notion.api import NotionDataCatalog
|
|
16
|
+
|
|
17
|
+
catalog = NotionDataCatalog(
|
|
18
|
+
notion_token="secret",
|
|
19
|
+
dc_table_id="data_source_id",
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
rows = catalog.pull()
|
|
23
|
+
print(len(rows))
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Properties Expected
|
|
27
|
+
By default, the Notion database should contain the following properties:
|
|
28
|
+
- `Data unit` (title)
|
|
29
|
+
- `Data unit type` (select)
|
|
30
|
+
- `Data unit uuid` (rich text)
|
|
31
|
+
- `Domain` (select)
|
|
32
|
+
|
|
33
|
+
Property names can be overridden when constructing `NotionDataCatalog`.
|
|
34
|
+
|
|
35
|
+
## Notes
|
|
36
|
+
- `update_page_by_uuid` rewrites page blocks to reflect the latest entity/attribute/relation details.
|
|
37
|
+
- `update_properties_by_uuid` updates the Notion properties only.
|
|
38
|
+
- `add_data_unit` creates a new page if the external UUID does not exist.
|
|
File without changes
|