dg-kit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,202 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from pathlib import Path
5
+
6
+ import yaml
7
+
8
+ from dg_kit.base.physical_model import PhysicalModel
9
+ from dg_kit.base.dataclasses.physical_model import Table, Column, Layer
10
+
11
+
12
+ _REF_RE = re.compile(
13
+ r"""ref\(\s*(['"])(?P<a>[^'"]+)\1\s*(?:,\s*(['"])(?P<b>[^'"]+)\3\s*)?\)""",
14
+ re.IGNORECASE,
15
+ )
16
+ _SOURCE_RE = re.compile(
17
+ r"""source\(\s*(['"])(?P<src>[^'"]+)\1\s*,\s*(['"])(?P<table>[^'"]+)\3\s*\)""",
18
+ re.IGNORECASE,
19
+ )
20
+
21
+
22
class DBTPhysicalModel(PhysicalModel):
    """Physical model flavour for dbt projects.

    Extends the base ``PhysicalModel`` with a name -> ``Table`` lookup so
    dependencies discovered in SQL can be resolved by bare model name.
    """

    def __init__(self, version: str):
        super().__init__(version)
        # Secondary index: dbt model/source table name -> registered Table.
        self.all_tables_by_name: dict[str, Table] = {}
26
+
27
+
28
class DBTParser:
    """Parse a dbt project on disk into a :class:`DBTPhysicalModel`.

    Reads ``dbt_project.yml`` plus the ``models`` folder and registers:
    layers (dbt sources and model sub-folders), tables (source tables and
    models), columns, and table-to-table dependencies found via
    ``ref()`` / ``source()`` calls in the model SQL files.
    """

    def __init__(self, dbt_project_path: Path, version: str = "unknown"):
        """Validate the project layout and load ``dbt_project.yml``.

        Args:
            dbt_project_path: Root folder of the dbt project.
            version: Version label forwarded to the physical model.

        Raises:
            ValueError: If ``dbt_project_path`` is not a directory.
            FileNotFoundError: If the ``models`` folder or the
                ``dbt_project.yml`` file is missing.
        """
        if not isinstance(dbt_project_path, Path):
            dbt_project_path = Path(dbt_project_path)
        if not dbt_project_path.is_dir():
            raise ValueError(
                f"dbt_project_path must be a valid dbt project directory, got: {dbt_project_path}"
            )

        self.dbt_project_path = dbt_project_path
        self.models_path = self.dbt_project_path / "models"

        if not self.models_path.is_dir():
            raise FileNotFoundError(
                f"Expected 'models' folder in dbt project, but not found: {str(self.models_path)}"
            )

        self.dbt_project_yml_path = self.dbt_project_path / "dbt_project.yml"

        if not self.dbt_project_yml_path.is_file():
            raise FileNotFoundError(
                f"Expected 'dbt_project.yml' config in dbt project, but not found: {str(self.dbt_project_yml_path)}"
            )

        dbt_project_raw_yml = self.dbt_project_yml_path.read_text(encoding="utf-8")
        # ``or {}`` guards against an empty/comment-only dbt_project.yml,
        # where safe_load returns None.
        self.dbt_project_conf = yaml.safe_load(dbt_project_raw_yml) or {}

        self.PM = DBTPhysicalModel(version)

    def _parse_source_model_yml(self, source_yml_path: Path) -> None:
        """Register layers, tables and columns from a dbt *sources* yml file.

        Files without a ``sources`` key are silently ignored so that any
        yml at the models root can be passed in safely.
        """
        raw = source_yml_path.read_text(encoding="utf-8")
        doc = yaml.safe_load(raw) or {}

        sources = doc.get("sources") or []
        if not isinstance(sources, list):
            return

        for source in sources:
            source_name = source["name"]

            # 1) Each dbt source becomes a landing layer.
            layer_obj = Layer(
                natural_key=source_name,
                name=source_name,
                is_landing=True,
            )
            self.PM.register_layer(layer_obj)

            # ``tables`` is optional in dbt source definitions.
            for table in source.get("tables") or []:
                # 2) Table, keyed as "<source>.<table>".
                table_nk = f"{source_name}.{table['name']}"
                table_obj = Table(
                    natural_key=table_nk,
                    layer_id=layer_obj.id,
                    name=table["name"],
                )
                self.PM.register_table(table_obj)
                self.PM.all_tables_by_name[table["name"]] = table_obj

                # 3) Columns — dbt does not require columns, data_type or
                # description, so all of those are treated as optional.
                for column in table.get("columns") or []:
                    col_name = column["name"]
                    col_nk = f"{table_nk}.{col_name}"

                    col_obj = Column(
                        natural_key=col_nk,
                        layer_id=layer_obj.id,
                        table_id=table_obj.id,
                        name=str(col_name),
                        data_type=str(column.get("data_type", "")),
                        description=str(column.get("description", "")),
                    )

                    if col_nk not in self.PM.all_units_by_natural_key:
                        self.PM.register_column(col_obj)

    def _parse_model_yml(self, model_yml_path: Path, layer_id: str) -> None:
        """Register tables and columns for models declared in a schema yml.

        Args:
            model_yml_path: Path to a yml file inside a layer folder.
            layer_id: Id of the already-registered layer the models belong to.
        """
        raw = model_yml_path.read_text(encoding="utf-8")
        # Empty files, or yml files without a ``models`` section (e.g. a
        # sources yml placed inside a layer folder), must not crash parsing.
        doc = yaml.safe_load(raw) or {}
        models = doc.get("models") or []
        if not isinstance(models, list):
            return

        layer_name = self.PM.all_units_by_id[layer_id].name

        for model in models:
            model_name = model["name"].strip()

            # 1) Table, keyed as "<layer>.<model>".
            table_nk = f"{layer_name}.{model_name}"

            table_obj = Table(
                natural_key=table_nk,
                layer_id=layer_id,
                name=model_name,
            )

            self.PM.register_table(table_obj)
            self.PM.all_tables_by_name[model_name] = table_obj

            # 2) Columns — optional, with optional data_type/description.
            for column in model.get("columns") or []:
                column_name = column["name"].strip()
                column_nk = f"{table_nk}.{column_name}"

                col_obj = Column(
                    natural_key=column_nk,
                    layer_id=layer_id,
                    table_id=table_obj.id,
                    name=column_name,
                    data_type=str(column.get("data_type", "")),
                    description=str(column.get("description", "")),
                )
                if column_nk not in self.PM.all_units_by_natural_key:
                    self.PM.register_column(col_obj)

    def _parse_model_sql(self, model_name: str, model_sql_path: Path) -> None:
        """Register dependencies found in SQL via ``ref()``/``source()``.

        A SQL file whose model was never declared in any yml is skipped
        instead of raising ``KeyError``.
        """
        src_table = self.PM.all_tables_by_name.get(model_name)
        if src_table is None:
            return

        text = model_sql_path.read_text(encoding="utf-8")

        for m in _REF_RE.finditer(text):
            a = m.group("a")
            b = m.group("b")
            if b:
                # ref('package', 'model') — look up by natural key.
                dep_table = self.PM.all_units_by_natural_key.get(f"{a}.{b}")
            else:
                # ref('model') — natural keys are "<layer>.<model>", so a
                # bare model name must be resolved through the name index.
                dep_table = self.PM.all_tables_by_name.get(a)
            if isinstance(dep_table, Table):
                # NOTE(review): assumes register_dependency(child, parent)
                # ordering — confirm direction against the base class.
                self.PM.register_dependency(src_table, dep_table)

        for m in _SOURCE_RE.finditer(text):
            dep_nk = f"{m.group('src')}.{m.group('table')}"
            dep_table = self.PM.all_units_by_natural_key.get(dep_nk)
            if isinstance(dep_table, Table):
                self.PM.register_dependency(src_table, dep_table)

    def parse_pm(self) -> PhysicalModel:
        """Parse the whole project and return the populated physical model."""
        # 1) Source definitions: yml files at the models root.
        for source_yml_path in self.models_path.glob("*.yml"):
            self._parse_source_model_yml(source_yml_path)

        # 2) Model layers come from the 'models' section of dbt_project.yml.
        models_conf = self.dbt_project_conf.get("models") or {}
        for project, project_conf in models_conf.items():
            if not isinstance(project_conf, dict):
                continue
            for layer_name in project_conf:
                # Keys starting with '+' (e.g. '+materialized') are dbt
                # configs, not layer folders — do not register them.
                if layer_name.startswith("+"):
                    continue

                layer_obj = Layer(
                    natural_key=layer_name, name=layer_name, is_landing=False
                )
                self.PM.register_layer(layer_obj)

                layer_folder_path = self.models_path / layer_name

                for model_yml_file in layer_folder_path.rglob("*.yml"):
                    self._parse_model_yml(model_yml_file, layer_id=layer_obj.id)

        # 3) SQL last, so every table exists before dependencies are linked.
        for sql_path in self.models_path.rglob("*.sql"):
            self._parse_model_sql(sql_path.stem, sql_path)

        return self.PM
@@ -0,0 +1,38 @@
1
+ ## Notion Integration
2
+
3
+ Syncs Data Governance objects to a Notion data source (database), keeping both page properties and page bodies up to date.
4
+
5
+ This integration is useful for pushing logical and physical model details into a shared catalog space
6
+ where teams can browse governance metadata.
7
+
8
+ ## Requirements
9
+ - A Notion integration token
10
+ - A Notion data source (database) ID
11
+ - Proper access granted to the data source for the integration
12
+
13
+ ## Usage
14
+ ```python
15
+ from dg_kit.integrations.notion.api import NotionDataCatalog
16
+
17
+ catalog = NotionDataCatalog(
18
+ notion_token="secret",
19
+ dc_table_id="data_source_id",
20
+ )
21
+
22
+ rows = catalog.pull()
23
+ print(len(rows))
24
+ ```
25
+
26
+ ## Properties Expected
27
+ By default, the Notion database should contain the following properties:
28
+ - `Data unit` (title)
29
+ - `Data unit type` (select)
30
+ - `Data unit uuid` (rich text)
31
+ - `Domain` (select)
32
+
33
+ Property names can be overridden when constructing `NotionDataCatalog`.
34
+
35
+ ## Notes
36
+ - `update_page_by_uuid` rewrites page blocks to reflect the latest entity/attribute/relation details.
37
+ - `update_properties_by_uuid` updates the Notion properties only.
38
+ - `add_data_unit` creates a new page if the external UUID does not exist.
File without changes