cloe-nessy 0.3.5__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/__init__.py +0 -0
- cloe_nessy/clients/__init__.py +0 -0
- cloe_nessy/clients/api_client/__init__.py +0 -0
- cloe_nessy/clients/api_client/api_client.py +0 -0
- cloe_nessy/clients/api_client/api_response.py +0 -0
- cloe_nessy/clients/api_client/auth.py +0 -0
- cloe_nessy/clients/api_client/exceptions.py +0 -0
- cloe_nessy/file_utilities/__init__.py +0 -0
- cloe_nessy/file_utilities/exceptions.py +0 -0
- cloe_nessy/file_utilities/factory.py +0 -0
- cloe_nessy/file_utilities/get_file_paths.py +0 -0
- cloe_nessy/file_utilities/location_types.py +0 -0
- cloe_nessy/file_utilities/strategies/__init__.py +0 -0
- cloe_nessy/file_utilities/strategies/base_strategy.py +0 -0
- cloe_nessy/file_utilities/strategies/local_strategy.py +0 -0
- cloe_nessy/file_utilities/strategies/onelake_strategy.py +0 -0
- cloe_nessy/file_utilities/strategies/utils_strategy.py +0 -0
- cloe_nessy/integration/__init__.py +0 -0
- cloe_nessy/integration/reader/__init__.py +0 -0
- cloe_nessy/integration/reader/api_reader.py +0 -0
- cloe_nessy/integration/reader/catalog_reader.py +0 -0
- cloe_nessy/integration/reader/excel_reader.py +0 -0
- cloe_nessy/integration/reader/exceptions.py +0 -0
- cloe_nessy/integration/reader/file_reader.py +0 -0
- cloe_nessy/integration/reader/reader.py +0 -0
- cloe_nessy/integration/writer/__init__.py +0 -0
- cloe_nessy/integration/writer/catalog_writer.py +0 -0
- cloe_nessy/logging/__init__.py +0 -0
- cloe_nessy/logging/logger_mixin.py +0 -0
- cloe_nessy/models/__init__.py +4 -0
- cloe_nessy/models/adapter/__init__.py +3 -0
- cloe_nessy/models/adapter/unity_catalog_adapter.py +292 -0
- cloe_nessy/models/catalog.py +10 -0
- cloe_nessy/models/column.py +0 -0
- cloe_nessy/models/constraint.py +0 -0
- cloe_nessy/models/foreign_key.py +0 -0
- cloe_nessy/models/mixins/__init__.py +0 -0
- cloe_nessy/models/mixins/read_instance_mixin.py +0 -0
- cloe_nessy/models/mixins/template_loader_mixin.py +0 -0
- cloe_nessy/models/schema.py +19 -0
- cloe_nessy/models/table.py +50 -5
- cloe_nessy/models/types.py +0 -0
- cloe_nessy/models/volume.py +67 -0
- cloe_nessy/object_manager/__init__.py +7 -2
- cloe_nessy/object_manager/table_manager.py +183 -7
- cloe_nessy/object_manager/volume_manager.py +70 -0
- cloe_nessy/pipeline/__init__.py +0 -0
- cloe_nessy/pipeline/actions/__init__.py +2 -0
- cloe_nessy/pipeline/actions/read_api.py +0 -0
- cloe_nessy/pipeline/actions/read_catalog_table.py +0 -0
- cloe_nessy/pipeline/actions/read_excel.py +0 -0
- cloe_nessy/pipeline/actions/read_files.py +0 -0
- cloe_nessy/pipeline/actions/read_metadata_yaml.py +0 -0
- cloe_nessy/pipeline/actions/transform_change_datatype.py +0 -0
- cloe_nessy/pipeline/actions/transform_clean_column_names.py +0 -0
- cloe_nessy/pipeline/actions/transform_concat_columns.py +0 -0
- cloe_nessy/pipeline/actions/transform_decode.py +0 -0
- cloe_nessy/pipeline/actions/transform_deduplication.py +0 -0
- cloe_nessy/pipeline/actions/transform_distinct.py +0 -0
- cloe_nessy/pipeline/actions/transform_filter.py +0 -0
- cloe_nessy/pipeline/actions/transform_generic_sql.py +0 -0
- cloe_nessy/pipeline/actions/transform_group_aggregate.py +0 -0
- cloe_nessy/pipeline/actions/transform_hash_columns.py +209 -0
- cloe_nessy/pipeline/actions/transform_join.py +0 -0
- cloe_nessy/pipeline/actions/transform_json_normalize.py +0 -0
- cloe_nessy/pipeline/actions/transform_rename_columns.py +0 -0
- cloe_nessy/pipeline/actions/transform_replace_values.py +0 -0
- cloe_nessy/pipeline/actions/transform_select_columns.py +0 -0
- cloe_nessy/pipeline/actions/transform_union.py +0 -0
- cloe_nessy/pipeline/actions/write_catalog_table.py +0 -0
- cloe_nessy/pipeline/pipeline.py +44 -2
- cloe_nessy/pipeline/pipeline_action.py +0 -0
- cloe_nessy/pipeline/pipeline_config.py +0 -0
- cloe_nessy/pipeline/pipeline_context.py +0 -0
- cloe_nessy/pipeline/pipeline_parsing_service.py +0 -0
- cloe_nessy/pipeline/pipeline_step.py +0 -0
- cloe_nessy/py.typed +0 -0
- cloe_nessy/session/__init__.py +0 -0
- cloe_nessy/session/session_manager.py +27 -0
- cloe_nessy/settings/__init__.py +0 -0
- cloe_nessy/settings/settings.py +0 -0
- cloe_nessy/utils/__init__.py +0 -0
- cloe_nessy/utils/file_and_directory_handler.py +0 -0
- cloe_nessy-0.3.8.dist-info/METADATA +46 -0
- {cloe_nessy-0.3.5.dist-info → cloe_nessy-0.3.8.dist-info}/RECORD +18 -12
- {cloe_nessy-0.3.5.dist-info → cloe_nessy-0.3.8.dist-info}/WHEEL +1 -1
- {cloe_nessy-0.3.5.dist-info → cloe_nessy-0.3.8.dist-info}/top_level.txt +0 -0
- cloe_nessy-0.3.5.dist-info/METADATA +0 -26
cloe_nessy/__init__.py
CHANGED
File without changes

cloe_nessy/clients/__init__.py
CHANGED
File without changes
cloe_nessy/logging/__init__.py
CHANGED
File without changes
cloe_nessy/models/__init__.py
CHANGED
@@ -1,13 +1,17 @@
+from .catalog import Catalog
 from .column import Column
 from .constraint import Constraint
 from .foreign_key import ForeignKey
 from .schema import Schema
 from .table import Table
+from .volume import Volume
 
 __all__ = [
+    "Catalog",
     "Column",
     "Constraint",
     "Table",
     "Schema",
     "ForeignKey",
+    "Volume",
 ]
cloe_nessy/models/adapter/unity_catalog_adapter.py
ADDED
@@ -0,0 +1,292 @@
+from pyspark.sql import SparkSession
+from pyspark.sql import functions as F
+
+from cloe_nessy.logging.logger_mixin import LoggerMixin
+from cloe_nessy.models import ForeignKey
+
+from ...session import SessionManager
+from ..catalog import Catalog
+from ..column import Column
+from ..schema import Schema
+from ..table import Table
+
+
+class UnityCatalogAdapter(LoggerMixin):
+    """Acts as a translator between Unity Catalog metadata and Nessy Models."""
+
+    def __init__(self, spark: SparkSession | None = None):
+        """Initializes the UnityCatalogAdapter class."""
+        self._spark = spark or SessionManager.get_spark_session()
+        self._console_logger = self.get_console_logger()
+        self._catalogs = self.get_catalogs()
+
+    def _execute_sql(self, query):
+        """Execute a SQL query and return a DataFrame.
+
+        This wrapper is used for better testability.
+
+        Returns:
+            The resulting DataFrame after executing the SQL query.
+        """
+        return self._spark.sql(query)
+
+    def get_catalogs(self) -> list[Catalog]:
+        """Retrieve a list of catalogs with their associated metadata.
+
+        Returns:
+            A list of `Catalog` objects.
+        """
+        df = self._execute_sql("SHOW CATALOGS")
+        catalogs = []
+        for catalog in df.collect():
+            name = catalog["catalog"]
+            catalog_metadata = self._execute_sql(f"DESCRIBE CATALOG EXTENDED {name}")
+            pivoted_metadata = catalog_metadata.withColumn("dummy", F.lit("dummy"))
+            pivoted_df = pivoted_metadata.groupBy("dummy").pivot("info_name").agg(F.first("info_value"))
+            catalog_owner = pivoted_df.collect()[0]["Owner"]
+            comment = pivoted_df.collect()[0]["Comment"]
+            catalogs.append(Catalog(name=name, owner=catalog_owner, comment=comment))
+        return catalogs
+
+    def get_catalog_by_name(self, name: str) -> Catalog | None:
+        """Returns a Catalog by its name.
+
+        Args:
+            name: The name of the Catalog.
+
+        Returns:
+            The Catalog with the specified name.
+        """
+        for catalog in self._catalogs:
+            if catalog.name == name:
+                return catalog
+        self._console_logger.warning(f"No catalog found with name: {name}")
+        return None
+
+    def get_catalog_schemas(self, catalog: str | Catalog) -> list[Schema]:
+        """Collects all schemas in a given catalog.
+
+        Args:
+            catalog: The catalog from which the schemas are to be collected.
+
+        Returns:
+            A list of `Schema` objects.
+        """
+        schemas = []
+        if isinstance(catalog, Catalog):
+            catalog = catalog.name
+        schemas_df = self._execute_sql(f"SELECT * FROM {catalog}.information_schema.schemata").collect()
+
+        for schema in schemas_df:
+            schemas.append(
+                Schema(name=schema["schema_name"], catalog=catalog, comment=schema["comment"]),
+            )
+        return schemas
+
+    def get_schema_by_name(self, catalog: str | Catalog, name: str) -> Schema | None:
+        """Retrieve a schema by its name from a specified catalog.
+
+        Args:
+            catalog: The catalog from which to retrieve the schema.
+                This can be either a string representing the catalog name or a
+                `Catalog` object.
+            name: The name of the schema to retrieve.
+
+        Returns:
+            The `Schema` object if found, otherwise `None`.
+        """
+        if isinstance(catalog, Catalog):
+            catalog = catalog.name
+        schemas = self.get_catalog_schemas(catalog)
+        for schema in schemas:
+            if schema.name == name:
+                schema = self.add_tables_to_schema(catalog, schema)
+                return schema
+
+        self._console_logger.warning(f"No Schema in Catalog [ '{catalog}' ] found with name [ '{name}' ]")
+        return None
+
+    def get_table_by_name(self, table_identifier: str) -> Table | None:
+        """Retrieve a table by it's name."""
+        if len(table_identifier.split(".")) != 3:
+            raise ValueError("The identifier must be in the format 'catalog.schema.table'")
+
+        catalog_name, schema_name, table_name = table_identifier.split(".")
+        table_metadata_df = self._execute_sql(
+            f"""
+            SELECT * FROM {catalog_name}.information_schema.tables
+            WHERE table_catalog == '{catalog_name}'
+            AND table_schema == '{schema_name}'
+            AND table_name == '{table_name}'
+            AND table_type <> 'VIEW'
+            """,
+        )
+        if not table_metadata_df.head(1):
+            table = None
+        else:
+            table_metadata = table_metadata_df.collect()[0]
+            table_tags_list = self._execute_sql(
+                f"""
+                SELECT tag_name, tag_value FROM {catalog_name}.information_schema.table_tags
+                WHERE catalog_name == '{catalog_name}'
+                AND schema_name == '{schema_name}'
+                AND table_name == '{table_name}'
+                """,
+            ).collect()
+            table_tags = {r["tag_name"]: r["tag_value"] for r in table_tags_list}
+            table = Table(
+                identifier=table_identifier,
+                data_source_format=table_metadata["data_source_format"],
+                business_properties=table_tags,
+                storage_path=table_metadata["storage_path"],
+                columns=[],
+                is_external=table_metadata["table_type"] != "MANAGED",
+            )
+            table = self.add_columns_to_table(table)
+        return table
+
+    def add_tables_to_schema(self, catalog: str | Catalog, schema: str | Schema) -> Schema:
+        """Add tables to a schema within a specified catalog.
+
+        This method retrieves all tables within the specified schema and catalog,
+        and adds them to the `Schema` object. The schema is updated with `Table`
+        objects containing details about each table.
+
+        Args:
+            catalog: The catalog containing the schema. This can be
+                either a string representing the catalog name or a `Catalog` object.
+            schema: The schema to which tables will be added. This
+                can be either a string representing the schema name or a `Schema`
+                object.
+
+        Returns:
+            The updated `Schema` object with tables added.
+        """
+        if isinstance(catalog, Catalog):
+            catalog_name = catalog.name
+        else:
+            catalog_name = catalog
+        if isinstance(schema, str):
+            schema_obj = self.get_schema_by_name(catalog_name, schema)
+            if schema_obj is None:
+                raise ValueError(f"Schema {schema} not found in catalog {catalog_name}.")
+        else:
+            schema_obj = schema
+        tables_df = self._execute_sql(
+            f"SELECT * FROM {catalog_name}.information_schema.tables WHERE table_catalog == '{catalog_name}' AND table_schema == '{schema_obj.name}' AND table_type <> 'VIEW'",
+        ).collect()
+        for table_row in tables_df:
+            table_name = table_row["table_name"]
+            table_tags_list = self._execute_sql(
+                f"""SELECT tag_name, tag_value FROM {catalog_name}.information_schema.table_tags
+                WHERE
+                    catalog_name == '{catalog_name}'
+                    AND schema_name == '{schema_obj.name}'
+                    AND table_name == '{table_name}'
+                """,
+            ).collect()
+            table_tags = {r["tag_name"]: r["tag_value"] for r in table_tags_list}
+
+            table = Table(
+                data_source_format=table_row["data_source_format"],
+                identifier=f"{catalog}.{schema_obj.name}.{table_name}",
+                business_properties=table_tags,
+                columns=[],
+            )
+            table = self.add_columns_to_table(table)
+            schema_obj.add_table(table)
+        return schema_obj
+
+    def add_columns_to_table(self, table: Table) -> Table:
+        """Add columns to a table by retrieving column metadata from the information schema.
+
+        This method retrieves column details for the specified `table` from the
+        information schema and adds `Column` objects to the `Table`. It also identifies
+        primary key columns for the table.
+
+        Args:
+            table: The `Table` object to which columns will be added. The
+                `Table` object must have its `identifier` attribute set.
+
+        Returns:
+            The updated `Table` object with columns added.
+        """
+        if not table.identifier:
+            raise ValueError("Please set the Identifier of the Table to use this method.")
+        cols_df = self._execute_sql(
+            f"""
+            SELECT * FROM {table.catalog}.information_schema.columns
+            WHERE table_name == '{table.name}'
+            AND table_schema == '{table.schema}'
+            ORDER BY ordinal_position
+            """,
+        ).collect()
+        partition_cols_indexed = {}
+        for col_row in cols_df:
+            generated = "GENERATED ALWAYS AS IDENTITY" if col_row["is_identity"] == "YES" else None
+            table.add_column(
+                Column(
+                    name=col_row["column_name"],
+                    data_type=col_row["data_type"],
+                    default_value=col_row["column_default"],
+                    generated=generated,
+                    nullable=col_row["is_nullable"] == "YES",
+                ),
+            )
+            if col_row["partition_index"] is not None:
+                partition_cols_indexed.update({str(col_row["partition_index"]): col_row["column_name"]})
+        partitioned_by = [val for _, val in sorted(partition_cols_indexed.items())]
+        if partitioned_by:
+            table.liquid_clustering = False
+            table.partition_by = partitioned_by
+        table = self._identify_pk_columns(table)
+        table = self._identify_fk_constraints(table)
+        return table
+
+    def _identify_pk_columns(self, table: Table) -> Table:
+        result = self._execute_sql(
+            f"""
+            SELECT A.column_name
+            FROM {table.catalog}.information_schema.key_column_usage AS A
+            JOIN {table.catalog}.information_schema.table_constraints AS B
+            USING (constraint_catalog, constraint_schema, constraint_name)
+            WHERE
+                A.table_catalog = '{table.catalog}'
+                AND A.table_schema = '{table.schema}'
+                AND A.table_name = '{table.name}'
+                AND B.constraint_type = 'PRIMARY KEY'
+            """,
+        ).collect()
+        table.composite_primary_key = [col_row["column_name"] for col_row in result]
+        return table
+
+    def _identify_fk_constraints(self, table: Table) -> Table:
+        result = self._execute_sql(
+            f"""
+            SELECT
+                concat_ws(".", C.table_catalog, C.table_schema, C.table_name) as source_table_identifier,
+                C.column_name as source_column,
+                concat_ws(".", B.table_catalog, B.table_schema, B.table_name) as parent_table_identifier,
+                B.column_name as parent_column
+                -- fk_option currently not supported
+                -- ,concat_ws(" ",D.match_option, "ON UPDATE", D.update_rule, "ON DELETE", D.delete_rule) AS fk_options
+            FROM {table.catalog}.information_schema.table_constraints AS A
+            LEFT JOIN {table.catalog}.information_schema.constraint_column_usage AS B USING(constraint_name)
+            LEFT JOIN {table.catalog}.information_schema.key_column_usage AS C USING(constraint_name)
+            -- LEFT JOIN {table.catalog}.information_schema.referential_constraints AS D USING(constraint_name)
+            WHERE
+                A.table_catalog == '{table.catalog}'
+                AND A.table_schema = '{table.schema}'
+                AND A.table_name == '{table.name}'
+                AND A.constraint_type == "FOREIGN KEY"
+            """,
+        ).collect()
+        table.foreign_keys = [
+            ForeignKey(
+                foreign_key_columns=fk_row["source_column"],
+                parent_table=fk_row["parent_table_identifier"],
+                parent_columns=fk_row["parent_column"],
+            )
+            for fk_row in result
+        ]
+        return table
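
For orientation, a minimal usage sketch of the new adapter (not part of the package diff): the catalog, schema, and table names below are hypothetical, and a Unity-Catalog-enabled Spark session (e.g. on Databricks) is assumed.

    from pyspark.sql import SparkSession

    from cloe_nessy.models.adapter.unity_catalog_adapter import UnityCatalogAdapter

    # Assumes a Spark session attached to a Unity Catalog metastore.
    spark = SparkSession.builder.getOrCreate()
    adapter = UnityCatalogAdapter(spark)  # reads the available catalogs on init

    catalog = adapter.get_catalog_by_name("my_catalog")             # hypothetical names
    schema = adapter.get_schema_by_name("my_catalog", "my_schema")  # tables are attached via add_tables_to_schema
    table = adapter.get_table_by_name("my_catalog.my_schema.my_table")
    if table is not None:
        print(table.composite_primary_key, table.partition_by)
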
cloe_nessy/models/column.py
CHANGED
File without changes

cloe_nessy/models/constraint.py
CHANGED
File without changes

cloe_nessy/models/foreign_key.py
CHANGED
File without changes
cloe_nessy/models/schema.py
CHANGED
@@ -17,6 +17,7 @@ class Schema(ReadInstancesMixin):
     storage_path: str | None = None
     tables: list[Table] = Field(default_factory=list)
     properties: dict[str, Any] = Field(default_factory=dict)
+    comment: str | None = None
 
     @classmethod
     def read_instance_from_file(
@@ -74,3 +75,21 @@ class Schema(ReadInstancesMixin):
             raise ValueError(f"Table {table_name} not found in {self.catalog}.{self.name} metadata.")
 
         return table
+
+    def add_table(self, table: Table):
+        """Adds a table to the schema and sets the table identifier accordingly.
+
+        Args:
+            table: A Table object that is added to the Schema tables.
+        """
+        table.identifier = f"{self.catalog}.{self.name}.{table.name}"
+        self.tables.append(table)
+
+    def add_tables(self, tables: list[Table]) -> None:
+        """Adds tables to the schema.
+
+        Args:
+            tables: A list of Table objects that are added to the Schema tables.
+        """
+        for table in tables:
+            self.add_table(table)
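
A short sketch of the new add_table/add_tables helpers (not part of the package diff), assuming Schema and Table can be constructed with just the keyword fields used in the adapter code above; all names are hypothetical.

    from cloe_nessy.models import Schema, Table

    schema = Schema(name="sales", catalog="analytics")
    orders = Table(identifier="analytics.sales.orders", columns=[])

    schema.add_table(orders)    # rewrites orders.identifier to "analytics.sales.orders" and appends it
    schema.add_tables([Table(identifier="analytics.sales.customers", columns=[])])  # bulk variant, delegates to add_table
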
cloe_nessy/models/table.py
CHANGED
@@ -24,11 +24,14 @@ class Table(TemplateLoaderMixin, ReadInstancesMixin, LoggerMixin):
     is_external: bool | None = None
     partition_by: list[str] = Field(default_factory=list)
     liquid_clustering: bool | None = None
+    composite_primary_key: list[str] = Field(default_factory=list)
     properties: dict[str, str] = Field(default_factory=dict)
     constraints: list[Constraint] = Field(default_factory=list)
     foreign_keys: list[ForeignKey] = Field(default_factory=list)
     storage_path: Path | None = None
+    business_properties: dict[str, str] = Field(default_factory=dict)
     comment: str | None = None
+    data_source_format: str | None = None
 
     def model_post_init(self, __context: Any) -> None:
         """Post init method for the Table model."""
@@ -87,10 +90,8 @@ class Table(TemplateLoaderMixin, ReadInstancesMixin, LoggerMixin):
 
     @model_validator(mode="after")
     def _validate_is_external(cls, table: Self):
-        """If is_external is set to
-        if
-            raise ValueError("is_external cannot be false while storage_path is set.")
-        elif table.is_external and table.storage_path is None:
+        """If is_external is set to True, storage_path has to be set."""
+        if table.is_external and table.storage_path is None:
             raise ValueError("is_external cannot be true while storage_path is None.")
 
     @classmethod
@@ -222,7 +223,7 @@ class Table(TemplateLoaderMixin, ReadInstancesMixin, LoggerMixin):
 
     def get_create_statement(
         self,
-        templates: Path = Path("./templates"),
+        templates: Path = Path("./src/cloe_nessy/models/templates/"),
         template_name: str = "create_table.sql.j2",
         replace: bool = True,
     ):
@@ -234,3 +235,47 @@ class Table(TemplateLoaderMixin, ReadInstancesMixin, LoggerMixin):
             raise err
         render = template.render(table=self, replace=replace)
         return render
+
+    def get_column_by_name(self, column_name: str) -> Column | None:
+        """Get a column by name.
+
+        Args:
+            column_name: The name of the column to get.
+
+        Returns:
+            The column if found, else None.
+        """
+        for column in self.columns:
+            if column.name == column_name:
+                return column
+        return None
+
+    def update_column(self, column: Column) -> None:
+        """Replaces a Column with a new Column object to update it.
+
+        Args:
+            column: The new column object, to replace the old one.
+        """
+        self.remove_column(column)
+        self.add_column(column)
+
+    def add_column(self, column: Column):
+        """Adds a column to the table.
+
+        Args:
+            column: The column to be added.
+        """
+        self.columns.append(column)
+
+    def remove_column(self, column: str | Column) -> None:
+        """Remove a column from the Table.
+
+        Args.
+            column: The column to be removed.
+        """
+        if isinstance(column, Column):
+            column_name = column.name
+        else:
+            column_name = column
+
+        self.columns = [col for col in self.columns if col.name != column_name]
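
A sketch of the new column helpers (not part of the package diff), assuming a Table can be built from just an identifier and an empty column list, and that the Column fields omitted here (default_value, generated) have defaults; names are hypothetical.

    from cloe_nessy.models import Column, Table

    table = Table(identifier="analytics.sales.orders", columns=[])
    table.add_column(Column(name="order_id", data_type="BIGINT", nullable=False))

    col = table.get_column_by_name("order_id")   # returns None if the column is missing
    if col is not None:
        # update_column removes the old column and re-adds the new one
        table.update_column(Column(name="order_id", data_type="BIGINT", nullable=True))

    table.remove_column("order_id")              # accepts a Column object or a plain column name
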
cloe_nessy/models/types.py
CHANGED
File without changes
cloe_nessy/models/volume.py
ADDED
@@ -0,0 +1,67 @@
+from pathlib import Path
+from typing import Any
+
+from jinja2 import TemplateNotFound
+from pydantic import BaseModel, field_validator
+
+from ..logging import LoggerMixin
+from .mixins.template_loader_mixin import TemplateLoaderMixin
+
+
+class Volume(TemplateLoaderMixin, LoggerMixin, BaseModel):
+    """Volume class for managing volumes."""
+
+    identifier: str
+    storage_path: str | Path
+    comment: str | None = None
+
+    @field_validator("identifier")
+    def check_identifier(cls, value):
+        """Check the identifier."""
+        if value.count(".") != 2:
+            raise ValueError("The identifier must be in the format 'catalog.schema.volume_name'.")
+        return value
+
+    @property
+    def storage_identifier(self) -> str:
+        """Return the storage identifier."""
+        return f"/Volumes/{self.catalog}/{self.schema}/{self.name}/"
+
+    @property
+    def catalog(self) -> str:
+        """Return the catalog name."""
+        return self.identifier.split(".")[0]
+
+    @property
+    def schema_name(self) -> str:
+        """Return the schema name."""
+        return self.identifier.split(".")[1]
+
+    @property
+    def name(self) -> str:
+        """Return the table name."""
+        return self.identifier.split(".")[2]
+
+    @property
+    def escaped_identifier(self) -> str:
+        """Return the escaped identifier."""
+        return f"`{self.catalog}`.`{self.schema_name}`.`{self.name}`"
+
+    def model_post_init(self, __context: Any) -> None:
+        """Post init method for the Table model."""
+        self._console_logger = self.get_console_logger()
+        self._console_logger.debug(f"Model for volume [ '{self.identifier}' ] has been initialized.")
+
+    def get_create_statement(
+        self,
+        templates: Path = Path("./src/cloe_nessy/models/templates/"),
+        template_name: str = "create_volume.sql.j2",
+    ):
+        """Get the create statement for the Volume."""
+        try:
+            template = self.get_template(templates, template_name)
+        except TemplateNotFound as err:
+            self._console_logger.error(f"Template [ {template_name} ] not found.")
+            raise err
+        render = template.render(volume=self)
+        return render
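
A sketch of the new Volume model (not part of the package diff): the identifier and storage path below are placeholders, and note that get_create_statement's default templates path points at a source checkout rather than an installed wheel, so a real call may need an explicit templates directory.

    from cloe_nessy.models import Volume

    volume = Volume(
        identifier="analytics.sales.raw_files",  # must be 'catalog.schema.volume_name'
        storage_path="abfss://landing@myaccount.dfs.core.windows.net/raw_files",
        comment="Landing zone for raw files",
    )
    print(volume.catalog, volume.schema_name, volume.name)
    print(volume.escaped_identifier)   # `analytics`.`sales`.`raw_files`

    ddl = volume.get_create_statement()  # renders create_volume.sql.j2 with volume=self
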