datacontract-cli 0.10.23__py3-none-any.whl → 0.10.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datacontract-cli has been flagged as possibly problematic.

Files changed (43)
  1. datacontract/__init__.py +13 -0
  2. datacontract/api.py +3 -3
  3. datacontract/catalog/catalog.py +2 -2
  4. datacontract/cli.py +1 -1
  5. datacontract/data_contract.py +5 -3
  6. datacontract/engines/data_contract_test.py +13 -4
  7. datacontract/engines/fastjsonschema/s3/s3_read_files.py +3 -2
  8. datacontract/engines/soda/check_soda_execute.py +16 -3
  9. datacontract/engines/soda/connections/duckdb_connection.py +61 -5
  10. datacontract/engines/soda/connections/kafka.py +3 -2
  11. datacontract/export/avro_converter.py +8 -1
  12. datacontract/export/bigquery_converter.py +1 -1
  13. datacontract/export/duckdb_type_converter.py +57 -0
  14. datacontract/export/great_expectations_converter.py +49 -2
  15. datacontract/export/odcs_v3_exporter.py +162 -136
  16. datacontract/export/protobuf_converter.py +163 -69
  17. datacontract/export/spark_converter.py +1 -1
  18. datacontract/imports/avro_importer.py +30 -5
  19. datacontract/imports/csv_importer.py +111 -57
  20. datacontract/imports/excel_importer.py +850 -0
  21. datacontract/imports/importer.py +5 -2
  22. datacontract/imports/importer_factory.py +10 -0
  23. datacontract/imports/odcs_v3_importer.py +226 -127
  24. datacontract/imports/protobuf_importer.py +264 -0
  25. datacontract/lint/linters/description_linter.py +1 -3
  26. datacontract/lint/linters/field_reference_linter.py +1 -2
  27. datacontract/lint/linters/notice_period_linter.py +2 -2
  28. datacontract/lint/linters/valid_constraints_linter.py +3 -3
  29. datacontract/lint/resolve.py +23 -8
  30. datacontract/model/data_contract_specification/__init__.py +1 -0
  31. datacontract/model/run.py +3 -0
  32. datacontract/output/__init__.py +0 -0
  33. datacontract/templates/datacontract.html +2 -1
  34. datacontract/templates/index.html +2 -1
  35. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.25.dist-info}/METADATA +305 -195
  36. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.25.dist-info}/RECORD +40 -38
  37. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.25.dist-info}/WHEEL +1 -1
  38. datacontract/export/csv_type_converter.py +0 -36
  39. datacontract/lint/linters/quality_schema_linter.py +0 -52
  40. datacontract/model/data_contract_specification.py +0 -327
  41. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.25.dist-info}/entry_points.txt +0 -0
  42. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.25.dist-info/licenses}/LICENSE +0 -0
  43. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.25.dist-info}/top_level.txt +0 -0
datacontract/__init__.py CHANGED
@@ -0,0 +1,13 @@
+ # Configuration so that yaml.safe_dump dumps strings with line breaks with yaml literal |
+ import yaml
+
+ yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str
+
+
+ def repr_str(dumper, data):
+     if "\n" in data:
+         return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+     return dumper.org_represent_str(data)
+
+
+ yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)
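The representer registered above makes yaml.safe_dump emit multi-line strings with the YAML literal block style. A minimal sketch of the effect, assuming that importing the datacontract package is enough to register the dumper shown in this hunk:

import yaml

import datacontract  # noqa: F401  - importing the package registers the SafeDumper representer

# Multi-line strings are dumped as literal blocks (|) instead of quoted strings with "\n".
print(yaml.safe_dump({"description": "first line\nsecond line"}))
# description: |-
#   first line
#   second line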
datacontract/api.py CHANGED
@@ -162,7 +162,7 @@ async def test(
      server: Annotated[
          str | None,
          Query(
-             example="production",
+             examples=["production"],
              description="The server name to test. Optional, if there is only one server.",
          ),
      ] = None,
@@ -191,7 +191,7 @@ async def lint(
      schema: Annotated[
          str | None,
          Query(
-             example="https://datacontract.com/datacontract.schema.json",
+             examples=["https://datacontract.com/datacontract.schema.json"],
              description="The schema to use for validation. This must be a URL.",
          ),
      ] = None,
@@ -220,7 +220,7 @@ def export(
      server: Annotated[
          str | None,
          Query(
-             example="production",
+             examples=["production"],
              description="The server name to export. Optional, if there is only one server.",
          ),
      ] = None,
datacontract/catalog/catalog.py CHANGED
@@ -19,7 +19,7 @@ def create_data_contract_html(contracts, file: Path, path: Path, schema: str):
      file_without_suffix = file.with_suffix(".html")
      html_filepath = path / file_without_suffix
      html_filepath.parent.mkdir(parents=True, exist_ok=True)
-     with open(html_filepath, "w") as f:
+     with open(html_filepath, "w", encoding="utf-8") as f:
          f.write(html)
      contracts.append(
          DataContractView(
@@ -42,7 +42,7 @@ class DataContractView:

  def create_index_html(contracts, path):
      index_filepath = path / "index.html"
-     with open(index_filepath, "w") as f:
+     with open(index_filepath, "w", encoding="utf-8") as f:
          # Load templates from templates folder
          package_loader = PackageLoader("datacontract", "templates")
          env = Environment(
datacontract/cli.py CHANGED
@@ -244,7 +244,7 @@ def import_(
      ] = None,
      source: Annotated[
          Optional[str],
-         typer.Option(help="The path to the file or Glue Database that should be imported."),
+         typer.Option(help="The path to the file that should be imported."),
      ] = None,
      dialect: Annotated[
          Optional[str],
datacontract/data_contract.py CHANGED
@@ -4,6 +4,8 @@ import typing
  if typing.TYPE_CHECKING:
      from pyspark.sql import SparkSession

+ from duckdb.duckdb import DuckDBPyConnection
+
  from datacontract.breaking.breaking import (
      info_breaking_changes,
      models_breaking_changes,
@@ -22,7 +24,6 @@ from datacontract.lint.linters.description_linter import DescriptionLinter
  from datacontract.lint.linters.field_pattern_linter import FieldPatternLinter
  from datacontract.lint.linters.field_reference_linter import FieldReferenceLinter
  from datacontract.lint.linters.notice_period_linter import NoticePeriodLinter
- from datacontract.lint.linters.quality_schema_linter import QualityUsesSchemaLinter
  from datacontract.lint.linters.valid_constraints_linter import ValidFieldConstraintsLinter
  from datacontract.model.data_contract_specification import DataContractSpecification
  from datacontract.model.exceptions import DataContractException
@@ -39,6 +40,7 @@ class DataContract:
          server: str = None,
          publish_url: str = None,
          spark: "SparkSession" = None,
+         duckdb_connection: DuckDBPyConnection = None,
          inline_definitions: bool = True,
          inline_quality: bool = True,
          ssl_verification: bool = True,
@@ -50,11 +52,11 @@ class DataContract:
          self._server = server
          self._publish_url = publish_url
          self._spark = spark
+         self._duckdb_connection = duckdb_connection
          self._inline_definitions = inline_definitions
          self._inline_quality = inline_quality
          self._ssl_verification = ssl_verification
          self.all_linters = {
-             QualityUsesSchemaLinter(),
              FieldPatternLinter(),
              FieldReferenceLinter(),
              NoticePeriodLinter(),
@@ -146,7 +148,7 @@ class DataContract:
                  inline_quality=self._inline_quality,
              )

-             execute_data_contract_test(data_contract, run, self._server, self._spark)
+             execute_data_contract_test(data_contract, run, self._server, self._spark, self._duckdb_connection)

          except DataContractException as e:
              run.checks.append(
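The new duckdb_connection constructor argument threads an externally managed DuckDB connection through to the test engine (see the get_duckdb_connection change further down). A minimal usage sketch; the data_contract_file argument name and the run.result attribute are assumptions, not shown in this diff:

import duckdb

from datacontract.data_contract import DataContract

# Reuse a pre-configured DuckDB connection instead of letting the engine
# open its own in-memory database.
con = duckdb.connect(database=":memory:")

data_contract = DataContract(
    data_contract_file="datacontract.yaml",  # assumed argument name
    duckdb_connection=con,
)
run = data_contract.test()
print(run.result)  # assumed attribute on the returned Run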
datacontract/engines/data_contract_test.py CHANGED
@@ -1,5 +1,7 @@
  import typing

+ from duckdb.duckdb import DuckDBPyConnection
+
  from datacontract.engines.data_contract_checks import create_checks

  if typing.TYPE_CHECKING:
@@ -10,7 +12,7 @@ from datacontract.engines.datacontract.check_that_datacontract_contains_valid_se
  )
  from datacontract.engines.fastjsonschema.check_jsonschema import check_jsonschema
  from datacontract.engines.soda.check_soda_execute import check_soda_execute
- from datacontract.model.data_contract_specification import DataContractSpecification
+ from datacontract.model.data_contract_specification import DataContractSpecification, Server
  from datacontract.model.exceptions import DataContractException
  from datacontract.model.run import ResultEnum, Run

@@ -20,6 +22,7 @@ def execute_data_contract_test(
      run: Run,
      server_name: str = None,
      spark: "SparkSession" = None,
+     duckdb_connection: DuckDBPyConnection = None,
  ):
      if data_contract_specification.models is None or len(data_contract_specification.models) == 0:
          raise DataContractException(
@@ -29,6 +32,12 @@ def execute_data_contract_test(
              reason="Models block is missing. Skip executing tests.",
              engine="datacontract",
          )
+     if (
+         server_name is None
+         and data_contract_specification.servers is not None
+         and len(data_contract_specification.servers) > 0
+     ):
+         server_name = list(data_contract_specification.servers.keys())[0]
      server = get_server(data_contract_specification, server_name)
      run.log_info(f"Running tests for data contract {data_contract_specification.id} with server {server_name}")
      run.dataContractId = data_contract_specification.id
@@ -43,10 +52,10 @@ def execute_data_contract_test(
      # TODO check server credentials are complete for nicer error messages
      if server.format == "json" and server.type != "kafka":
          check_jsonschema(run, data_contract_specification, server)
-     check_soda_execute(run, data_contract_specification, server, spark)
+     check_soda_execute(run, data_contract_specification, server, spark, duckdb_connection)


- def get_server(data_contract_specification: DataContractSpecification, server_name: str = None):
+ def get_server(data_contract_specification: DataContractSpecification, server_name: str = None) -> Server | None:
      """Get the server configuration from the data contract specification.

      Args:
@@ -59,7 +68,7 @@ def get_server(data_contract_specification: DataContractSpecification, server_na

      check_that_datacontract_contains_valid_server_configuration(data_contract_specification, server_name)

-     if server_name:
+     if server_name is not None:
          server = data_contract_specification.servers.get(server_name)
      else:
          server_name = list(data_contract_specification.servers.keys())[0]
datacontract/engines/fastjsonschema/s3/s3_read_files.py CHANGED
@@ -2,6 +2,7 @@ import logging
  import os

  from datacontract.model.exceptions import DataContractException
+ from datacontract.model.run import ResultEnum


  def yield_s3_files(s3_endpoint_url, s3_location):
@@ -19,9 +20,9 @@ def s3_fs(s3_endpoint_url):
      except ImportError as e:
          raise DataContractException(
              type="schema",
-             result="failed",
+             result=ResultEnum.failed,
              name="s3 extra missing",
-             reason="Install the extra datacontract-cli\[s3] to use s3",
+             reason="Install the extra s3 to use s3",
              engine="datacontract",
              original_exception=e,
          )
datacontract/engines/soda/check_soda_execute.py CHANGED
@@ -1,6 +1,12 @@
  import logging
+ import typing
  import uuid

+ if typing.TYPE_CHECKING:
+     from pyspark.sql import SparkSession
+
+ from duckdb.duckdb import DuckDBPyConnection
+
  from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
  from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration
  from datacontract.engines.soda.connections.duckdb_connection import get_duckdb_connection
@@ -14,7 +20,13 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
  from datacontract.model.run import Check, Log, ResultEnum, Run


- def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark):
+ def check_soda_execute(
+     run: Run,
+     data_contract: DataContractSpecification,
+     server: Server,
+     spark: "SparkSession" = None,
+     duckdb_connection: DuckDBPyConnection = None,
+ ):
      from soda.common.config_helper import ConfigHelper

      ConfigHelper.get_instance().upsert_value("send_anonymous_usage_stats", False)
@@ -30,7 +42,7 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
      if server.type in ["s3", "gcs", "azure", "local"]:
          if server.format in ["json", "parquet", "csv", "delta"]:
              run.log_info(f"Configuring engine soda-core to connect to {server.type} {server.format} with duckdb")
-             con = get_duckdb_connection(data_contract, server, run)
+             con = get_duckdb_connection(data_contract, server, run, duckdb_connection)
              scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)
              scan.set_data_source_name(server.type)
          else:
@@ -62,7 +74,8 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
          run.log_info("Connecting to databricks via spark")
          scan.add_spark_session(spark, data_source_name=server.type)
          scan.set_data_source_name(server.type)
-         spark.sql(f"USE {server.catalog}.{server.schema_}")
+         database_name = ".".join(filter(None, [server.catalog, server.schema_]))
+         spark.sql(f"USE {database_name}")
      else:
          run.log_info("Connecting to databricks directly")
          soda_configuration_str = to_databricks_soda_configuration(server)
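The databricks branch now builds the USE target with ".".join(filter(None, ...)), so an unset catalog no longer produces a leading "None." prefix. A small illustration with made-up values:

# Catalog not set: fall back to the schema alone.
print(".".join(filter(None, [None, "sales"])))            # -> "sales"

# Both set: fully qualified name, as before.
print(".".join(filter(None, ["prod_catalog", "sales"])))  # -> "prod_catalog.sales"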
datacontract/engines/soda/connections/duckdb_connection.py CHANGED
@@ -1,14 +1,24 @@
  import os
- from typing import Any
+ from typing import Any, Dict

  import duckdb

- from datacontract.export.csv_type_converter import convert_to_duckdb_csv_type
+ from datacontract.export.duckdb_type_converter import convert_to_duckdb_csv_type, convert_to_duckdb_json_type
+ from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server
  from datacontract.model.run import Run


- def get_duckdb_connection(data_contract, server, run: Run):
-     con = duckdb.connect(database=":memory:")
+ def get_duckdb_connection(
+     data_contract: DataContractSpecification,
+     server: Server,
+     run: Run,
+     duckdb_connection: duckdb.DuckDBPyConnection | None = None,
+ ) -> duckdb.DuckDBPyConnection:
+     if duckdb_connection is None:
+         con = duckdb.connect(database=":memory:")
+     else:
+         con = duckdb_connection
+
      path: str = ""
      if server.type == "local":
          path = server.path
@@ -33,9 +43,16 @@ def get_duckdb_connection(data_contract, server, run: Run):
                  json_format = "newline_delimited"
              elif server.delimiter == "array":
                  json_format = "array"
-             con.sql(f"""
+             columns = to_json_types(model)
+             if columns is None:
+                 con.sql(f"""
                  CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', hive_partitioning=1);
                  """)
+             else:
+                 con.sql(
+                     f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', columns={columns}, hive_partitioning=1);"""
+                 )
+             add_nested_views(con, model_name, model.fields)
          elif server.format == "parquet":
              con.sql(f"""
                  CREATE VIEW "{model_name}" AS SELECT * FROM read_parquet('{model_path}', hive_partitioning=1);
@@ -67,6 +84,45 @@ def to_csv_types(model) -> dict[Any, str | None] | None:
      return columns


+ def to_json_types(model: Model) -> dict[Any, str | None] | None:
+     if model is None:
+         return None
+     columns = {}
+     for field_name, field in model.fields.items():
+         columns[field_name] = convert_to_duckdb_json_type(field)
+     return columns
+
+
+ def add_nested_views(con: duckdb.DuckDBPyConnection, model_name: str, fields: Dict[str, Field] | None):
+     model_name = model_name.strip('"')
+     if fields is None:
+         return
+     for field_name, field in fields.items():
+         if field.type is None or field.type.lower() not in ["array", "object"]:
+             continue
+         field_type = field.type.lower()
+         if field_type == "array" and field.items is None:
+             continue
+         elif field_type == "object" and field.fields is None:
+             continue
+
+         nested_model_name = f"{model_name}__{field_name}"
+         max_depth = 2 if field_type == "array" else 1
+
+         ## if parent field is not required, the nested objects may resolve
+         ## to a row of NULLs -- but if the objects themselves have required
+         ## fields, this will fail the check.
+         where = "" if field.required else f" WHERE {field_name} IS NOT NULL"
+         con.sql(f"""
+             CREATE VIEW IF NOT EXISTS "{nested_model_name}" AS
+             SELECT unnest({field_name}, max_depth := {max_depth}) as {field_name} FROM "{model_name}" {where}
+         """)
+         if field_type == "array":
+             add_nested_views(con, nested_model_name, field.items.fields)
+         elif field_type == "object":
+             add_nested_views(con, nested_model_name, field.fields)
+
+
  def setup_s3_connection(con, server):
      s3_region = os.getenv("DATACONTRACT_S3_REGION")
      s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
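add_nested_views creates one additional view per nested array/object field, named "<model>__<field>", and filters out NULL parents for optional fields so required-field checks on children do not fail. A standalone sketch of the generated pattern, with invented names, run directly against DuckDB:

import duckdb

con = duckdb.connect()  # in-memory database
# Stand-in for a JSON-backed model view with one optional array column.
con.sql("CREATE VIEW orders AS SELECT [1, 2, 3] AS item_ids")

# The view that add_nested_views would generate for the array field:
con.sql("""
    CREATE VIEW IF NOT EXISTS orders__item_ids AS
    SELECT unnest(item_ids, max_depth := 2) AS item_ids FROM orders WHERE item_ids IS NOT NULL
""")
print(con.sql("SELECT * FROM orders__item_ids").fetchall())  # [(1,), (2,), (3,)]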
datacontract/engines/soda/connections/kafka.py CHANGED
@@ -6,6 +6,7 @@ import tempfile
  from datacontract.export.avro_converter import to_avro_schema_json
  from datacontract.model.data_contract_specification import DataContractSpecification, Field, Server
  from datacontract.model.exceptions import DataContractException
+ from datacontract.model.run import ResultEnum


  def create_spark_session():
@@ -16,7 +17,7 @@ def create_spark_session():
      except ImportError as e:
          raise DataContractException(
              type="schema",
-             result="failed",
+             result=ResultEnum.failed,
              name="pyspark is missing",
              reason="Install the extra datacontract-cli[kafka] to use kafka",
              engine="datacontract",
@@ -33,7 +34,7 @@ def create_spark_session():
          .config("spark.ui.enabled", "false")
          .config(
              "spark.jars.packages",
-             "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.2,org.apache.spark:spark-avro_2.12:3.5.2",
+             "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.5,org.apache.spark:spark-avro_2.12:3.5.5",
          )
          .getOrCreate()
      )
datacontract/export/avro_converter.py CHANGED
@@ -91,7 +91,9 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
          if field.precision is not None:
              typeVal["precision"] = field.precision
          return typeVal
-     elif field.type in ["float", "double"]:
+     elif field.type in ["float"]:
+         return "float"
+     elif field.type in ["double"]:
          return "double"
      elif field.type in ["integer", "int"]:
          return "int"
@@ -107,6 +109,11 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
          return {"type": "int", "logicalType": "date"}
      elif field.type in ["time"]:
          return "long"
+     elif field.type in ["map"]:
+         if field.config is not None and "values" in field.config:
+             return {"type": "map", "values": field.config["values"]}
+         else:
+             return "bytes"
      elif field.type in ["object", "record", "struct"]:
          if field.config is not None and "namespace" in field.config:
              return to_avro_record(field_name, field.fields, field.description, field.config["namespace"])
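With this change, map fields are exported to Avro as a proper map type when the value type is declared in the field config, and fall back to "bytes" otherwise. A hedged sketch; the keyword-argument construction of Field and the "labels" field name are assumptions:

from datacontract.export.avro_converter import to_avro_type
from datacontract.model.data_contract_specification import Field

# Value type declared via config -> Avro map.
print(to_avro_type(Field(type="map", config={"values": "string"}), "labels"))
# -> {'type': 'map', 'values': 'string'}

# No "values" entry in config -> fallback.
print(to_avro_type(Field(type="map"), "labels"))
# -> 'bytes'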
datacontract/export/bigquery_converter.py CHANGED
@@ -103,7 +103,7 @@ def map_type_to_bigquery(field: Field) -> str:
      elif field_type.lower() == "date":
          return "DATE"
      elif field_type.lower() == "timestamp_ntz":
-         return "TIME"
+         return "DATETIME"
      elif field_type.lower() in ["number", "decimal", "numeric"]:
          return "NUMERIC"
      elif field_type.lower() == "double":
datacontract/export/duckdb_type_converter.py ADDED
@@ -0,0 +1,57 @@
+ from typing import Dict
+
+ from datacontract.model.data_contract_specification import Field
+
+
+ # https://duckdb.org/docs/data/csv/overview.html
+ # ['SQLNULL', 'BOOLEAN', 'BIGINT', 'DOUBLE', 'TIME', 'DATE', 'TIMESTAMP', 'VARCHAR']
+ def convert_to_duckdb_csv_type(field) -> None | str:
+     datacontract_type = field.type
+     if datacontract_type is None:
+         return "VARCHAR"
+     if datacontract_type.lower() in ["string", "varchar", "text"]:
+         return "VARCHAR"
+     if datacontract_type.lower() in ["timestamp", "timestamp_tz"]:
+         return "TIMESTAMP"
+     if datacontract_type.lower() in ["timestamp_ntz"]:
+         return "TIMESTAMP"
+     if datacontract_type.lower() in ["date"]:
+         return "DATE"
+     if datacontract_type.lower() in ["time"]:
+         return "TIME"
+     if datacontract_type.lower() in ["number", "decimal", "numeric"]:
+         # precision and scale not supported by data contract
+         return "VARCHAR"
+     if datacontract_type.lower() in ["float", "double"]:
+         return "DOUBLE"
+     if datacontract_type.lower() in ["integer", "int", "long", "bigint"]:
+         return "BIGINT"
+     if datacontract_type.lower() in ["boolean"]:
+         return "BOOLEAN"
+     if datacontract_type.lower() in ["object", "record", "struct"]:
+         # not supported in CSV
+         return "VARCHAR"
+     if datacontract_type.lower() in ["bytes"]:
+         # not supported in CSV
+         return "VARCHAR"
+     if datacontract_type.lower() in ["array"]:
+         return "VARCHAR"
+     if datacontract_type.lower() in ["null"]:
+         return "SQLNULL"
+     return "VARCHAR"
+
+
+ def convert_to_duckdb_json_type(field: Field) -> None | str:
+     datacontract_type = field.type
+     if datacontract_type is None:
+         return "VARCHAR"
+     if datacontract_type.lower() in ["array"]:
+         return convert_to_duckdb_json_type(field.items) + "[]"  # type: ignore
+     if datacontract_type.lower() in ["object", "record", "struct"]:
+         return convert_to_duckdb_object(field.fields)
+     return convert_to_duckdb_csv_type(field)
+
+
+ def convert_to_duckdb_object(fields: Dict[str, Field]):
+     columns = [f'"{x[0]}" {convert_to_duckdb_json_type(x[1])}' for x in fields.items()]
+     return f"STRUCT({', '.join(columns)})"
datacontract/export/great_expectations_converter.py CHANGED
@@ -19,6 +19,7 @@ from datacontract.export.spark_converter import to_spark_data_type
  from datacontract.export.sql_type_converter import convert_to_sql_type
  from datacontract.model.data_contract_specification import (
      DataContractSpecification,
+     DeprecatedQuality,
      Field,
      Quality,
  )
@@ -91,8 +92,14 @@ def to_great_expectations(
          model_key=model_key, contract_version=data_contract_spec.info.version
      )
      model_value = data_contract_spec.models.get(model_key)
-     quality_checks = get_quality_checks(data_contract_spec.quality)
+
+     # Support for Deprecated Quality
+     quality_checks = get_deprecated_quality_checks(data_contract_spec.quality)
+
+     expectations.extend(get_quality_checks(model_value.quality))
+
      expectations.extend(model_to_expectations(model_value.fields, engine, sql_server_type))
+
      expectations.extend(checks_to_expectations(quality_checks, model_key))
      model_expectation_suite = to_suite(expectations, expectation_suite_name)

@@ -135,6 +142,7 @@ def model_to_expectations(fields: Dict[str, Field], engine: str | None, sql_serv
      add_column_order_exp(fields, expectations)
      for field_name, field in fields.items():
          add_field_expectations(field_name, field, expectations, engine, sql_server_type)
+         expectations.extend(get_quality_checks(field.quality, field_name))
      return expectations


@@ -173,6 +181,8 @@ def add_field_expectations(
          expectations.append(to_column_length_exp(field_name, field.minLength, field.maxLength))
      if field.minimum is not None or field.maximum is not None:
          expectations.append(to_column_min_max_exp(field_name, field.minimum, field.maximum))
+     if field.enum is not None and len(field.enum) != 0:
+         expectations.append(to_column_enum_exp(field_name, field.enum))

      return expectations

@@ -266,7 +276,24 @@ def to_column_min_max_exp(field_name, minimum, maximum) -> Dict[str, Any]:
      }


- def get_quality_checks(quality: Quality) -> Dict[str, Any]:
+ def to_column_enum_exp(field_name, enum_list: List[str]) -> Dict[str, Any]:
+     """Creates a expect_column_values_to_be_in_set expectation.
+
+     Args:
+         field_name (str): The name of the field.
+         enum_list (Set[str]): enum list of value.
+
+     Returns:
+         Dict[str, Any]: Column value in set expectation.
+     """
+     return {
+         "expectation_type": "expect_column_values_to_be_in_set",
+         "kwargs": {"column": field_name, "value_set": enum_list},
+         "meta": {},
+     }
+
+
+ def get_deprecated_quality_checks(quality: DeprecatedQuality) -> Dict[str, Any]:
      """Retrieves quality checks defined in a data contract.

      Args:
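to_column_enum_exp turns an enum constraint into an expect_column_values_to_be_in_set expectation. For illustration, with invented field and enum values:

from datacontract.export.great_expectations_converter import to_column_enum_exp

print(to_column_enum_exp("status", ["active", "inactive"]))
# -> {'expectation_type': 'expect_column_values_to_be_in_set',
#     'kwargs': {'column': 'status', 'value_set': ['active', 'inactive']},
#     'meta': {}}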
@@ -288,6 +315,26 @@ def get_quality_checks(quality: Quality) -> Dict[str, Any]:
      return quality_specification


+ def get_quality_checks(qualities: List[Quality], field_name: str | None = None) -> List[Dict[str, Any]]:
+     """Retrieves quality checks defined in a data contract.
+
+     Args:
+         qualities (List[Quality]): List of quality object from the model specification.
+         field_name (str | None): field name if the quality list is attached to a specific field
+
+     Returns:
+         Dict[str, Any]: Dictionary of quality checks.
+     """
+     quality_specification = []
+     for quality in qualities:
+         if quality is not None and quality.engine is not None and quality.engine.lower() == "great-expectations":
+             ge_expectation = quality.implementation
+             if field_name is not None:
+                 ge_expectation["column"] = field_name
+             quality_specification.append(ge_expectation)
+     return quality_specification
+
+
  def checks_to_expectations(quality_checks: Dict[str, Any], model_key: str) -> List[Dict[str, Any]]:
      """Converts quality checks to a list of expectations.