datacontract-cli 0.10.20__py3-none-any.whl → 0.10.22__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (38)
  1. datacontract/{web.py → api.py} +55 -3
  2. datacontract/breaking/breaking.py +1 -1
  3. datacontract/breaking/breaking_rules.py +1 -1
  4. datacontract/cli.py +32 -10
  5. datacontract/data_contract.py +14 -100
  6. datacontract/engines/data_contract_checks.py +735 -0
  7. datacontract/engines/data_contract_test.py +51 -0
  8. datacontract/engines/soda/check_soda_execute.py +36 -30
  9. datacontract/engines/soda/connections/kafka.py +8 -3
  10. datacontract/export/avro_converter.py +2 -0
  11. datacontract/export/custom_converter.py +40 -0
  12. datacontract/export/exporter.py +1 -2
  13. datacontract/export/exporter_factory.py +4 -12
  14. datacontract/export/sodacl_converter.py +22 -294
  15. datacontract/export/sql_type_converter.py +7 -2
  16. datacontract/imports/odcs_importer.py +6 -3
  17. datacontract/imports/odcs_v3_importer.py +2 -0
  18. datacontract/imports/sql_importer.py +229 -29
  19. datacontract/lint/urls.py +4 -4
  20. datacontract/model/data_contract_specification.py +130 -129
  21. datacontract/model/exceptions.py +4 -1
  22. datacontract/model/run.py +25 -18
  23. datacontract/templates/datacontract.html +16 -2
  24. datacontract/templates/partials/definition.html +3 -95
  25. datacontract/templates/partials/model_field.html +13 -0
  26. datacontract/templates/partials/quality.html +49 -0
  27. datacontract/templates/style/output.css +151 -152
  28. {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/METADATA +238 -184
  29. {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/RECORD +34 -34
  30. datacontract/engines/soda/connections/dask.py +0 -28
  31. datacontract/export/odcs_v2_exporter.py +0 -124
  32. datacontract/imports/odcs_v2_importer.py +0 -177
  33. datacontract/lint/linters/example_model_linter.py +0 -91
  34. /datacontract/{model → breaking}/breaking_change.py +0 -0
  35. {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/LICENSE +0 -0
  36. {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/WHEEL +0 -0
  37. {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/entry_points.txt +0 -0
  38. {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/top_level.txt +0 -0
datacontract/imports/odcs_v3_importer.py CHANGED
@@ -287,6 +287,8 @@ def import_fields(
 
 
 def map_type(odcs_type: str, custom_mappings: Dict[str, str]) -> str | None:
+    if odcs_type is None:
+        return None
     t = odcs_type.lower()
     if t in DATACONTRACT_TYPES:
         return t
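The two added lines guard against ODCS properties that carry no logical type. A minimal sketch of the behavior, with an illustrative `DATACONTRACT_TYPES` subset and a hypothetical custom-mapping fallback (the real module defines both):

```python
from typing import Dict

# Illustrative subset; the real module defines the full list.
DATACONTRACT_TYPES = ["string", "int", "long", "decimal", "timestamp"]


def map_type(odcs_type: str, custom_mappings: Dict[str, str]) -> str | None:
    if odcs_type is None:
        return None  # new guard: a property without a type no longer crashes on .lower()
    t = odcs_type.lower()
    if t in DATACONTRACT_TYPES:
        return t
    return custom_mappings.get(t)  # hypothetical fallback, for illustration only


assert map_type(None, {}) is None  # previously raised AttributeError
assert map_type("STRING", {}) == "string"
```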
datacontract/imports/sql_importer.py CHANGED
@@ -1,44 +1,76 @@
-from simple_ddl_parser import parse_from_file
+import logging
+import os
+
+import sqlglot
+from sqlglot.dialects.dialect import Dialects
 
 from datacontract.imports.importer import Importer
-from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server
+from datacontract.model.exceptions import DataContractException
+from datacontract.model.run import ResultEnum
 
 
 class SqlImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
     ) -> DataContractSpecification:
-        return import_sql(data_contract_specification, self.import_format, source)
+        return import_sql(data_contract_specification, self.import_format, source, import_args)
+
 
+def import_sql(
+    data_contract_specification: DataContractSpecification, format: str, source: str, import_args: dict = None
+) -> DataContractSpecification:
+    sql = read_file(source)
 
-def import_sql(data_contract_specification: DataContractSpecification, format: str, source: str):
-    ddl = parse_from_file(source, group_by_type=True)
-    tables = ddl["tables"]
+    dialect = to_dialect(import_args)
+
+    try:
+        parsed = sqlglot.parse_one(sql=sql, read=dialect)
+    except Exception as e:
+        logging.error(f"Error parsing SQL: {str(e)}")
+        raise DataContractException(
+            type="import",
+            name=f"Reading source from {source}",
+            reason=f"Error parsing SQL: {str(e)}",
+            engine="datacontract",
+            result=ResultEnum.error,
+        )
+
+    server_type: str | None = to_server_type(source, dialect)
+    if server_type is not None:
+        data_contract_specification.servers[server_type] = Server(type=server_type)
+
+    tables = parsed.find_all(sqlglot.expressions.Table)
 
     for table in tables:
         if data_contract_specification.models is None:
             data_contract_specification.models = {}
 
-        table_name = table["table_name"]
+        table_name = table.this.name
 
         fields = {}
-        for column in table["columns"]:
+        for column in parsed.find_all(sqlglot.exp.ColumnDef):
+            if column.parent.this.name != table_name:
+                continue
+
             field = Field()
-            field.type = map_type_from_sql(map_type_from_sql(column["type"]))
-            if not column["nullable"]:
-                field.required = True
-            if column["unique"]:
-                field.unique = True
-            fields[column["name"]] = field
-            if column["size"] is not None:
-                field.maxLength = column["size"]
-
-        if len(table["primary_key"]) == 1:
-            primary_key = table["primary_key"][0]
-            if primary_key in fields:
-                fields[primary_key].unique = True
-                fields[primary_key].required = True
-                fields[primary_key].primaryKey = True
+            col_name = column.this.name
+            col_type = to_col_type(column, dialect)
+            field.type = map_type_from_sql(col_type)
+            col_description = get_description(column)
+            field.description = col_description
+            field.maxLength = get_max_length(column)
+            precision, scale = get_precision_scale(column)
+            field.precision = precision
+            field.scale = scale
+            field.primaryKey = get_primary_key(column)
+            field.required = column.find(sqlglot.exp.NotNullColumnConstraint) is not None or None
+            physical_type_key = to_physical_type_key(dialect)
+            field.config = {
+                physical_type_key: col_type,
+            }
+
+            fields[col_name] = field
 
         data_contract_specification.models[table_name] = Model(
             type="table",
@@ -48,6 +80,124 @@ def import_sql(data_contract_specification: DataContractSpecification, format: s
     return data_contract_specification
 
 
+def get_primary_key(column) -> bool | None:
+    if column.find(sqlglot.exp.PrimaryKeyColumnConstraint) is not None:
+        return True
+    if column.find(sqlglot.exp.PrimaryKey) is not None:
+        return True
+    return None
+
+
+def to_dialect(import_args: dict) -> Dialects | None:
+    if import_args is None:
+        return None
+    if "dialect" not in import_args:
+        return None
+    dialect = import_args.get("dialect")
+    if dialect is None:
+        return None
+    if dialect == "sqlserver":
+        return Dialects.TSQL
+    if dialect.upper() in Dialects.__members__:
+        return Dialects[dialect.upper()]
+    if dialect == "sqlserver":
+        return Dialects.TSQL
+    return None
+
+
+def to_physical_type_key(dialect: Dialects | None) -> str:
+    dialect_map = {
+        Dialects.TSQL: "sqlserverType",
+        Dialects.POSTGRES: "postgresType",
+        Dialects.BIGQUERY: "bigqueryType",
+        Dialects.SNOWFLAKE: "snowflakeType",
+        Dialects.REDSHIFT: "redshiftType",
+        Dialects.ORACLE: "oracleType",
+        Dialects.MYSQL: "mysqlType",
+        Dialects.DATABRICKS: "databricksType",
+    }
+    return dialect_map.get(dialect, "physicalType")
+
+
+def to_server_type(source, dialect: Dialects | None) -> str | None:
+    if dialect is None:
+        return None
+    dialect_map = {
+        Dialects.TSQL: "sqlserver",
+        Dialects.POSTGRES: "postgres",
+        Dialects.BIGQUERY: "bigquery",
+        Dialects.SNOWFLAKE: "snowflake",
+        Dialects.REDSHIFT: "redshift",
+        Dialects.ORACLE: "oracle",
+        Dialects.MYSQL: "mysql",
+        Dialects.DATABRICKS: "databricks",
+    }
+    return dialect_map.get(dialect, None)
+
+
+def to_col_type(column, dialect):
+    col_type_kind = column.args["kind"]
+    if col_type_kind is None:
+        return None
+
+    return col_type_kind.sql(dialect)
+
+
+def to_col_type_normalized(column):
+    col_type = column.args["kind"].this.name
+    if col_type is None:
+        return None
+    return col_type.lower()
+
+
+def get_description(column: sqlglot.expressions.ColumnDef) -> str | None:
+    if column.comments is None:
+        return None
+    return " ".join(comment.strip() for comment in column.comments)
+
+
+def get_max_length(column: sqlglot.expressions.ColumnDef) -> int | None:
+    col_type = to_col_type_normalized(column)
+    if col_type is None:
+        return None
+    if col_type not in ["varchar", "char", "nvarchar", "nchar"]:
+        return None
+    col_params = list(column.args["kind"].find_all(sqlglot.expressions.DataTypeParam))
+    max_length_str = None
+    if len(col_params) == 0:
+        return None
+    if len(col_params) == 1:
+        max_length_str = col_params[0].name
+    if len(col_params) == 2:
+        max_length_str = col_params[1].name
+    if max_length_str is not None:
+        return int(max_length_str) if max_length_str.isdigit() else None
+
+
+def get_precision_scale(column):
+    col_type = to_col_type_normalized(column)
+    if col_type is None:
+        return None, None
+    if col_type not in ["decimal", "numeric", "float", "number"]:
+        return None, None
+    col_params = list(column.args["kind"].find_all(sqlglot.expressions.DataTypeParam))
+    if len(col_params) == 0:
+        return None, None
+    if len(col_params) == 1:
+        if not col_params[0].name.isdigit():
+            return None, None
+        precision = int(col_params[0].name)
+        scale = 0
+        return precision, scale
+    if len(col_params) == 2:
+        if not col_params[0].name.isdigit() or not col_params[1].name.isdigit():
+            return None, None
+        precision = int(col_params[0].name)
+        scale = int(col_params[1].name)
+        return precision, scale
+    return None, None
+
+
 def map_type_from_sql(sql_type: str):
     if sql_type is None:
         return None
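How these helpers behave for a concrete dialect, as a hedged sketch (assumes the functions above are in scope; the DDL is made up):

```python
import sqlglot
from sqlglot.dialects.dialect import Dialects

assert to_dialect({"dialect": "postgres"}) == Dialects.POSTGRES
assert to_dialect({"dialect": "sqlserver"}) == Dialects.TSQL  # aliased to TSQL
assert to_dialect(None) is None  # no dialect import arg: dialect stays unknown
assert to_physical_type_key(Dialects.TSQL) == "sqlserverType"
assert to_physical_type_key(None) == "physicalType"  # generic fallback config key

ddl = sqlglot.parse_one("CREATE TABLE t (a DECIMAL(10, 2), b VARCHAR(255))")
a, b = ddl.find_all(sqlglot.exp.ColumnDef)
assert get_precision_scale(a) == (10, 2)
assert get_max_length(b) == 255
assert get_max_length(a) is None  # only the char/varchar family carries a max length
```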
@@ -55,25 +205,55 @@ def map_type_from_sql(sql_type: str):
     sql_type_normed = sql_type.lower().strip()
 
     if sql_type_normed.startswith("varchar"):
-        return "varchar"
+        return "string"
+    elif sql_type_normed.startswith("char"):
+        return "string"
     elif sql_type_normed.startswith("string"):
         return "string"
+    elif sql_type_normed.startswith("nchar"):
+        return "string"
     elif sql_type_normed.startswith("text"):
-        return "text"
+        return "string"
+    elif sql_type_normed.startswith("nvarchar"):
+        return "string"
+    elif sql_type_normed.startswith("ntext"):
+        return "string"
     elif sql_type_normed.startswith("int"):
-        return "integer"
+        return "int"
+    elif sql_type_normed.startswith("bigint"):
+        return "long"
+    elif sql_type_normed.startswith("tinyint"):
+        return "int"
+    elif sql_type_normed.startswith("smallint"):
+        return "int"
     elif sql_type_normed.startswith("float"):
         return "float"
     elif sql_type_normed.startswith("decimal"):
         return "decimal"
     elif sql_type_normed.startswith("numeric"):
-        return "numeric"
+        return "decimal"
     elif sql_type_normed.startswith("bool"):
         return "boolean"
-    elif sql_type_normed.startswith("timestamp"):
-        return "timestamp"
+    elif sql_type_normed.startswith("bit"):
+        return "boolean"
+    elif sql_type_normed.startswith("binary"):
+        return "bytes"
+    elif sql_type_normed.startswith("varbinary"):
+        return "bytes"
     elif sql_type_normed == "date":
         return "date"
+    elif sql_type_normed == "time":
+        return "string"
+    elif sql_type_normed == "timestamp":
+        return "timestamp_ntz"
+    elif (
+        sql_type_normed == "timestamptz"
+        or sql_type_normed == "timestamp_tz"
+        or sql_type_normed == "timestamp with time zone"
+    ):
+        return "timestamp_tz"
+    elif sql_type_normed == "timestampntz" or sql_type_normed == "timestamp_ntz":
+        return "timestamp_ntz"
     elif sql_type_normed == "smalldatetime":
         return "timestamp_ntz"
     elif sql_type_normed == "datetime":
@@ -82,5 +262,25 @@ def map_type_from_sql(sql_type: str):
         return "timestamp_ntz"
     elif sql_type_normed == "datetimeoffset":
         return "timestamp_tz"
+    elif sql_type_normed == "uniqueidentifier":  # tsql
+        return "string"
+    elif sql_type_normed == "json":
+        return "string"
+    elif sql_type_normed == "xml":  # tsql
+        return "string"
     else:
         return "variant"
+
+
+def read_file(path):
+    if not os.path.exists(path):
+        raise DataContractException(
+            type="import",
+            name=f"Reading source from {path}",
+            reason=f"The file '{path}' does not exist.",
+            engine="datacontract",
+            result=ResultEnum.error,
+        )
+    with open(path, "r") as file:
+        file_content = file.read()
+    return file_content
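Putting the pieces together, a hedged end-to-end sketch of the new import path. The file name is hypothetical, and a default-constructed DataContractSpecification is assumed to be valid:

```python
from datacontract.imports.sql_importer import import_sql
from datacontract.model.data_contract_specification import DataContractSpecification

# "orders.sql" is a hypothetical file containing CREATE TABLE statements.
spec = import_sql(
    DataContractSpecification(),  # assumes a default-constructible specification
    format="sql",
    source="orders.sql",
    import_args={"dialect": "postgres"},  # resolved to Dialects.POSTGRES by to_dialect
)
print(spec.servers)  # expect: {'postgres': Server(type='postgres')}
print(spec.models)   # expect: one Model per table, fields typed via map_type_from_sql
```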
datacontract/lint/urls.py CHANGED
@@ -33,22 +33,22 @@ def _set_api_key(headers, url):
 
     if hostname == "datamesh-manager.com" or hostname.endswith(".datamesh-manager.com"):
         if datamesh_manager_api_key is None or datamesh_manager_api_key == "":
-            print("Error: Data Mesh Manager API Key is not set. Set env variable DATAMESH_MANAGER_API_KEY.")
+            print("Error: Data Mesh Manager API key is not set. Set env variable DATAMESH_MANAGER_API_KEY.")
             raise DataContractException(
                 type="lint",
                 name=f"Reading data contract from {url}",
-                reason="Error: Data Mesh Manager API Key is not set. Set env variable DATAMESH_MANAGER_API_KEY.",
+                reason="Error: Data Mesh Manager API key is not set. Set env variable DATAMESH_MANAGER_API_KEY.",
                 engine="datacontract",
                 result="error",
             )
         headers["x-api-key"] = datamesh_manager_api_key
     elif hostname == "datacontract-manager.com" or hostname.endswith(".datacontract-manager.com"):
         if datacontract_manager_api_key is None or datacontract_manager_api_key == "":
-            print("Error: Data Contract Manager API Key is not set. Set env variable DATACONTRACT_MANAGER_API_KEY.")
+            print("Error: Data Contract Manager API key is not set. Set env variable DATACONTRACT_MANAGER_API_KEY.")
             raise DataContractException(
                 type="lint",
                 name=f"Reading data contract from {url}",
-                reason="Error: Data Contract Manager API Key is not set. Set env variable DATACONTRACT_MANAGER_API_KEY.",
+                reason="Error: Data Contract Manager API key is not set. Set env variable DATACONTRACT_MANAGER_API_KEY.",
                 engine="datacontract",
                 result="error",
             )
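For reference, a minimal sketch of the lookup these messages describe (simplified; the real _set_api_key dispatches on the URL's hostname and raises a DataContractException rather than a RuntimeError):

```python
import os

# DATAMESH_MANAGER_API_KEY (or DATACONTRACT_MANAGER_API_KEY for the
# Data Contract Manager host) must be set and non-empty.
api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
if api_key is None or api_key == "":
    raise RuntimeError("Data Mesh Manager API key is not set. Set env variable DATAMESH_MANAGER_API_KEY.")
headers = {"x-api-key": api_key}  # same header the linter sends
```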