datacontract-cli 0.10.21__py3-none-any.whl → 0.10.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datacontract-cli might be problematic; review the changed files below for details.

Files changed (29)
  1. datacontract/breaking/breaking.py +1 -1
  2. datacontract/breaking/breaking_rules.py +1 -1
  3. datacontract/cli.py +5 -5
  4. datacontract/data_contract.py +14 -100
  5. datacontract/engines/data_contract_checks.py +735 -0
  6. datacontract/engines/data_contract_test.py +51 -0
  7. datacontract/engines/soda/check_soda_execute.py +36 -30
  8. datacontract/engines/soda/connections/kafka.py +8 -3
  9. datacontract/export/avro_converter.py +2 -0
  10. datacontract/export/exporter.py +0 -2
  11. datacontract/export/exporter_factory.py +0 -12
  12. datacontract/export/sodacl_converter.py +22 -294
  13. datacontract/export/sql_type_converter.py +7 -2
  14. datacontract/imports/odcs_importer.py +6 -3
  15. datacontract/imports/odcs_v3_importer.py +2 -0
  16. datacontract/imports/sql_importer.py +229 -29
  17. datacontract/model/exceptions.py +4 -1
  18. datacontract/model/run.py +11 -4
  19. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/METADATA +139 -166
  20. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/RECORD +25 -27
  21. datacontract/engines/soda/connections/dask.py +0 -28
  22. datacontract/export/odcs_v2_exporter.py +0 -124
  23. datacontract/imports/odcs_v2_importer.py +0 -177
  24. datacontract/lint/linters/example_model_linter.py +0 -91
  25. /datacontract/{model → breaking}/breaking_change.py +0 -0
  26. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/LICENSE +0 -0
  27. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/WHEEL +0 -0
  28. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/entry_points.txt +0 -0
  29. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/top_level.txt +0 -0
@@ -1,44 +1,76 @@
1
- from simple_ddl_parser import parse_from_file
1
+ import logging
2
+ import os
3
+
4
+ import sqlglot
5
+ from sqlglot.dialects.dialect import Dialects
2
6
 
3
7
  from datacontract.imports.importer import Importer
4
- from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
8
+ from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server
9
+ from datacontract.model.exceptions import DataContractException
10
+ from datacontract.model.run import ResultEnum
5
11
 
6
12
 
7
13
  class SqlImporter(Importer):
8
14
  def import_source(
9
15
  self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
10
16
  ) -> DataContractSpecification:
11
- return import_sql(data_contract_specification, self.import_format, source)
17
+ return import_sql(data_contract_specification, self.import_format, source, import_args)
18
+
12
19
 
20
+ def import_sql(
21
+ data_contract_specification: DataContractSpecification, format: str, source: str, import_args: dict = None
22
+ ) -> DataContractSpecification:
23
+ sql = read_file(source)
13
24
 
14
- def import_sql(data_contract_specification: DataContractSpecification, format: str, source: str):
15
- ddl = parse_from_file(source, group_by_type=True)
16
- tables = ddl["tables"]
25
+ dialect = to_dialect(import_args)
26
+
27
+ try:
28
+ parsed = sqlglot.parse_one(sql=sql, read=dialect)
29
+ except Exception as e:
30
+ logging.error(f"Error parsing SQL: {str(e)}")
31
+ raise DataContractException(
32
+ type="import",
33
+ name=f"Reading source from {source}",
34
+ reason=f"Error parsing SQL: {str(e)}",
35
+ engine="datacontract",
36
+ result=ResultEnum.error,
37
+ )
38
+
39
+ server_type: str | None = to_server_type(source, dialect)
40
+ if server_type is not None:
41
+ data_contract_specification.servers[server_type] = Server(type=server_type)
42
+
43
+ tables = parsed.find_all(sqlglot.expressions.Table)
17
44
 
18
45
  for table in tables:
19
46
  if data_contract_specification.models is None:
20
47
  data_contract_specification.models = {}
21
48
 
22
- table_name = table["table_name"]
49
+ table_name = table.this.name
23
50
 
24
51
  fields = {}
25
- for column in table["columns"]:
52
+ for column in parsed.find_all(sqlglot.exp.ColumnDef):
53
+ if column.parent.this.name != table_name:
54
+ continue
55
+
26
56
  field = Field()
27
- field.type = map_type_from_sql(map_type_from_sql(column["type"]))
28
- if not column["nullable"]:
29
- field.required = True
30
- if column["unique"]:
31
- field.unique = True
32
- fields[column["name"]] = field
33
- if column["size"] is not None:
34
- field.maxLength = column["size"]
35
-
36
- if len(table["primary_key"]) == 1:
37
- primary_key = table["primary_key"][0]
38
- if primary_key in fields:
39
- fields[primary_key].unique = True
40
- fields[primary_key].required = True
41
- fields[primary_key].primaryKey = True
57
+ col_name = column.this.name
58
+ col_type = to_col_type(column, dialect)
59
+ field.type = map_type_from_sql(col_type)
60
+ col_description = get_description(column)
61
+ field.description = col_description
62
+ field.maxLength = get_max_length(column)
63
+ precision, scale = get_precision_scale(column)
64
+ field.precision = precision
65
+ field.scale = scale
66
+ field.primaryKey = get_primary_key(column)
67
+ field.required = column.find(sqlglot.exp.NotNullColumnConstraint) is not None or None
68
+ physical_type_key = to_physical_type_key(dialect)
69
+ field.config = {
70
+ physical_type_key: col_type,
71
+ }
72
+
73
+ fields[col_name] = field
42
74
 
43
75
  data_contract_specification.models[table_name] = Model(
44
76
  type="table",
@@ -48,6 +80,124 @@ def import_sql(data_contract_specification: DataContractSpecification, format: s
48
80
  return data_contract_specification
49
81
 
50
82
 
83
+ def get_primary_key(column) -> bool | None:
84
+ if column.find(sqlglot.exp.PrimaryKeyColumnConstraint) is not None:
85
+ return True
86
+ if column.find(sqlglot.exp.PrimaryKey) is not None:
87
+ return True
88
+ return None
89
+
90
+
91
+ def to_dialect(import_args: dict) -> Dialects | None:
92
+ if import_args is None:
93
+ return None
94
+ if "dialect" not in import_args:
95
+ return None
96
+ dialect = import_args.get("dialect")
97
+ if dialect is None:
98
+ return None
99
+ if dialect == "sqlserver":
100
+ return Dialects.TSQL
101
+ if dialect.upper() in Dialects.__members__:
102
+ return Dialects[dialect.upper()]
103
+ if dialect == "sqlserver":
104
+ return Dialects.TSQL
105
+ return None
106
+
107
+
108
+ def to_physical_type_key(dialect: Dialects | None) -> str:
109
+ dialect_map = {
110
+ Dialects.TSQL: "sqlserverType",
111
+ Dialects.POSTGRES: "postgresType",
112
+ Dialects.BIGQUERY: "bigqueryType",
113
+ Dialects.SNOWFLAKE: "snowflakeType",
114
+ Dialects.REDSHIFT: "redshiftType",
115
+ Dialects.ORACLE: "oracleType",
116
+ Dialects.MYSQL: "mysqlType",
117
+ Dialects.DATABRICKS: "databricksType",
118
+ }
119
+ return dialect_map.get(dialect, "physicalType")
120
+
121
+
122
+ def to_server_type(source, dialect: Dialects | None) -> str | None:
123
+ if dialect is None:
124
+ return None
125
+ dialect_map = {
126
+ Dialects.TSQL: "sqlserver",
127
+ Dialects.POSTGRES: "postgres",
128
+ Dialects.BIGQUERY: "bigquery",
129
+ Dialects.SNOWFLAKE: "snowflake",
130
+ Dialects.REDSHIFT: "redshift",
131
+ Dialects.ORACLE: "oracle",
132
+ Dialects.MYSQL: "mysql",
133
+ Dialects.DATABRICKS: "databricks",
134
+ }
135
+ return dialect_map.get(dialect, None)
136
+
137
+
138
+ def to_col_type(column, dialect):
139
+ col_type_kind = column.args["kind"]
140
+ if col_type_kind is None:
141
+ return None
142
+
143
+ return col_type_kind.sql(dialect)
144
+
145
+
146
+ def to_col_type_normalized(column):
147
+ col_type = column.args["kind"].this.name
148
+ if col_type is None:
149
+ return None
150
+ return col_type.lower()
151
+
152
+
153
+ def get_description(column: sqlglot.expressions.ColumnDef) -> str | None:
154
+ if column.comments is None:
155
+ return None
156
+ return " ".join(comment.strip() for comment in column.comments)
157
+
158
+
159
+ def get_max_length(column: sqlglot.expressions.ColumnDef) -> int | None:
160
+ col_type = to_col_type_normalized(column)
161
+ if col_type is None:
162
+ return None
163
+ if col_type not in ["varchar", "char", "nvarchar", "nchar"]:
164
+ return None
165
+ col_params = list(column.args["kind"].find_all(sqlglot.expressions.DataTypeParam))
166
+ max_length_str = None
167
+ if len(col_params) == 0:
168
+ return None
169
+ if len(col_params) == 1:
170
+ max_length_str = col_params[0].name
171
+ if len(col_params) == 2:
172
+ max_length_str = col_params[1].name
173
+ if max_length_str is not None:
174
+ return int(max_length_str) if max_length_str.isdigit() else None
175
+
176
+
177
+ def get_precision_scale(column):
178
+ col_type = to_col_type_normalized(column)
179
+ if col_type is None:
180
+ return None, None
181
+ if col_type not in ["decimal", "numeric", "float", "number"]:
182
+ return None, None
183
+ col_params = list(column.args["kind"].find_all(sqlglot.expressions.DataTypeParam))
184
+ if len(col_params) == 0:
185
+ return None, None
186
+ if len(col_params) == 1:
187
+ if not col_params[0].name.isdigit():
188
+ return None, None
189
+ precision = int(col_params[0].name)
190
+ scale = 0
191
+ return precision, scale
192
+ if len(col_params) == 2:
193
+ if not col_params[0].name.isdigit() or not col_params[1].name.isdigit():
194
+ return None, None
195
+ precision = int(col_params[0].name)
196
+ scale = int(col_params[1].name)
197
+ return precision, scale
198
+ return None, None
199
+
200
+
51
201
  def map_type_from_sql(sql_type: str):
52
202
  if sql_type is None:
53
203
  return None
@@ -55,25 +205,55 @@ def map_type_from_sql(sql_type: str):
55
205
  sql_type_normed = sql_type.lower().strip()
56
206
 
57
207
  if sql_type_normed.startswith("varchar"):
58
- return "varchar"
208
+ return "string"
209
+ elif sql_type_normed.startswith("char"):
210
+ return "string"
59
211
  elif sql_type_normed.startswith("string"):
60
212
  return "string"
213
+ elif sql_type_normed.startswith("nchar"):
214
+ return "string"
61
215
  elif sql_type_normed.startswith("text"):
62
- return "text"
216
+ return "string"
217
+ elif sql_type_normed.startswith("nvarchar"):
218
+ return "string"
219
+ elif sql_type_normed.startswith("ntext"):
220
+ return "string"
63
221
  elif sql_type_normed.startswith("int"):
64
- return "integer"
222
+ return "int"
223
+ elif sql_type_normed.startswith("bigint"):
224
+ return "long"
225
+ elif sql_type_normed.startswith("tinyint"):
226
+ return "int"
227
+ elif sql_type_normed.startswith("smallint"):
228
+ return "int"
65
229
  elif sql_type_normed.startswith("float"):
66
230
  return "float"
67
231
  elif sql_type_normed.startswith("decimal"):
68
232
  return "decimal"
69
233
  elif sql_type_normed.startswith("numeric"):
70
- return "numeric"
234
+ return "decimal"
71
235
  elif sql_type_normed.startswith("bool"):
72
236
  return "boolean"
73
- elif sql_type_normed.startswith("timestamp"):
74
- return "timestamp"
237
+ elif sql_type_normed.startswith("bit"):
238
+ return "boolean"
239
+ elif sql_type_normed.startswith("binary"):
240
+ return "bytes"
241
+ elif sql_type_normed.startswith("varbinary"):
242
+ return "bytes"
75
243
  elif sql_type_normed == "date":
76
244
  return "date"
245
+ elif sql_type_normed == "time":
246
+ return "string"
247
+ elif sql_type_normed == "timestamp":
248
+ return "timestamp_ntz"
249
+ elif (
250
+ sql_type_normed == "timestamptz"
251
+ or sql_type_normed == "timestamp_tz"
252
+ or sql_type_normed == "timestamp with time zone"
253
+ ):
254
+ return "timestamp_tz"
255
+ elif sql_type_normed == "timestampntz" or sql_type_normed == "timestamp_ntz":
256
+ return "timestamp_ntz"
77
257
  elif sql_type_normed == "smalldatetime":
78
258
  return "timestamp_ntz"
79
259
  elif sql_type_normed == "datetime":
@@ -82,5 +262,25 @@ def map_type_from_sql(sql_type: str):
82
262
  return "timestamp_ntz"
83
263
  elif sql_type_normed == "datetimeoffset":
84
264
  return "timestamp_tz"
265
+ elif sql_type_normed == "uniqueidentifier": # tsql
266
+ return "string"
267
+ elif sql_type_normed == "json":
268
+ return "string"
269
+ elif sql_type_normed == "xml": # tsql
270
+ return "string"
85
271
  else:
86
272
  return "variant"
273
+
274
+
275
+ def read_file(path):
276
+ if not os.path.exists(path):
277
+ raise DataContractException(
278
+ type="import",
279
+ name=f"Reading source from {path}",
280
+ reason=f"The file '{path}' does not exist.",
281
+ engine="datacontract",
282
+ result=ResultEnum.error,
283
+ )
284
+ with open(path, "r") as file:
285
+ file_content = file.read()
286
+ return file_content
@@ -1,3 +1,6 @@
1
+ from datacontract.model.run import ResultEnum
2
+
3
+
1
4
  class DataContractException(Exception):
2
5
  """Exception raised for errors in the execution of a run.
3
6
 
@@ -19,7 +22,7 @@ class DataContractException(Exception):
19
22
  engine="datacontract",
20
23
  model=None,
21
24
  original_exception=None,
22
- result: str = "failed",
25
+ result: ResultEnum = ResultEnum.failed,
23
26
  message="Run operation failed",
24
27
  ):
25
28
  self.type = type
datacontract/model/run.py CHANGED
@@ -17,13 +17,20 @@ class ResultEnum(str, Enum):
17
17
 
18
18
 
19
19
  class Check(BaseModel):
20
+ id: str | None = None
21
+ key: str | None = None
22
+ category: str | None = None
20
23
  type: str
21
- name: str | None
22
- result: ResultEnum | None
23
- engine: str | None
24
- reason: str | None = None
24
+ name: str | None = None
25
25
  model: str | None = None
26
26
  field: str | None = None
27
+
28
+ engine: str | None = None
29
+ language: str | None = None
30
+ implementation: str | None = None
31
+
32
+ result: ResultEnum | None = None
33
+ reason: str | None = None
27
34
  details: str | None = None
28
35
  diagnostics: dict | None = None
29
36