datacontract-cli 0.10.11__py3-none-any.whl → 0.10.13__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Files changed (40)
  1. datacontract/cli.py +19 -3
  2. datacontract/data_contract.py +5 -10
  3. datacontract/engines/fastjsonschema/check_jsonschema.py +11 -0
  4. datacontract/engines/fastjsonschema/s3/s3_read_files.py +2 -0
  5. datacontract/engines/soda/check_soda_execute.py +2 -8
  6. datacontract/engines/soda/connections/duckdb.py +23 -24
  7. datacontract/engines/soda/connections/kafka.py +84 -25
  8. datacontract/export/avro_converter.py +12 -2
  9. datacontract/export/bigquery_converter.py +30 -23
  10. datacontract/export/data_caterer_converter.py +148 -0
  11. datacontract/export/dbml_converter.py +3 -2
  12. datacontract/export/exporter.py +2 -0
  13. datacontract/export/exporter_factory.py +12 -0
  14. datacontract/export/jsonschema_converter.py +13 -2
  15. datacontract/export/spark_converter.py +5 -1
  16. datacontract/export/sql_type_converter.py +65 -39
  17. datacontract/export/sqlalchemy_converter.py +169 -0
  18. datacontract/imports/avro_importer.py +1 -0
  19. datacontract/imports/bigquery_importer.py +2 -2
  20. datacontract/imports/dbml_importer.py +112 -0
  21. datacontract/imports/dbt_importer.py +67 -91
  22. datacontract/imports/glue_importer.py +62 -58
  23. datacontract/imports/importer.py +2 -1
  24. datacontract/imports/importer_factory.py +5 -0
  25. datacontract/imports/odcs_importer.py +1 -1
  26. datacontract/imports/spark_importer.py +34 -11
  27. datacontract/imports/sql_importer.py +1 -1
  28. datacontract/imports/unity_importer.py +106 -85
  29. datacontract/integration/{publish_datamesh_manager.py → datamesh_manager.py} +33 -5
  30. datacontract/integration/{publish_opentelemetry.py → opentelemetry.py} +1 -1
  31. datacontract/lint/resolve.py +10 -1
  32. datacontract/lint/urls.py +27 -13
  33. datacontract/model/data_contract_specification.py +6 -2
  34. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/METADATA +123 -32
  35. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/RECORD +39 -37
  36. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/WHEEL +1 -1
  37. datacontract/publish/publish.py +0 -32
  38. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/LICENSE +0 -0
  39. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/entry_points.txt +0 -0
  40. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/top_level.txt +0 -0
datacontract/cli.py CHANGED
@@ -17,7 +17,7 @@ from datacontract.catalog.catalog import create_index_html, create_data_contract
  from datacontract.data_contract import DataContract, ExportFormat
  from datacontract.imports.importer import ImportFormat
  from datacontract.init.download_datacontract_file import download_datacontract_file, FileExistsException
- from datacontract.publish.publish import publish_to_datamesh_manager
+ from datacontract.integration.datamesh_manager import publish_data_contract_to_datamesh_manager
 
  DEFAULT_DATA_CONTRACT_SCHEMA_URL = "https://datacontract.com/datacontract.schema.json"
 
@@ -232,6 +232,18 @@ def import_(
          help="List of models names to import from the dbt manifest file (repeat for multiple models names, leave empty for all models in the dataset)."
      ),
  ] = None,
+ dbml_schema: Annotated[
+     Optional[List[str]],
+     typer.Option(
+         help="List of schema names to import from the DBML file (repeat for multiple schema names, leave empty for all tables in the file)."
+     ),
+ ] = None,
+ dbml_table: Annotated[
+     Optional[List[str]],
+     typer.Option(
+         help="List of table names to import from the DBML file (repeat for multiple table names, leave empty for all tables in the file)."
+     ),
+ ] = None,
  ):
      """
      Create a data contract from the given source location. Prints to stdout.
@@ -245,6 +257,8 @@ def import_(
      bigquery_dataset=bigquery_dataset,
      unity_table_full_name=unity_table_full_name,
      dbt_model=dbt_model,
+     dbml_schema=dbml_schema,
+     dbml_table=dbml_table,
  )
  console.print(result.to_yaml())
 
@@ -261,8 +275,10 @@ def publish(
      """
      Publish the data contract to the Data Mesh Manager.
      """
-     publish_to_datamesh_manager(
-         data_contract=DataContract(data_contract_file=location, schema_location=schema),
+     publish_data_contract_to_datamesh_manager(
+         data_contract_specification=DataContract(
+             data_contract_file=location, schema_location=schema
+         ).get_data_contract_specification(),
      )
 
 
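
Note: the reworked publish command resolves the contract to a specification before handing it to the integration. A minimal Python sketch of the equivalent call, built only from the names visible in this diff (the file name datacontract.yaml is illustrative):

    from datacontract.data_contract import DataContract
    from datacontract.integration.datamesh_manager import publish_data_contract_to_datamesh_manager

    # Resolve the contract first, then pass the specification to the integration,
    # mirroring the rewritten publish() command above.
    spec = DataContract(data_contract_file="datacontract.yaml").get_data_contract_specification()
    publish_data_contract_to_datamesh_manager(data_contract_specification=spec)
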
datacontract/data_contract.py CHANGED
@@ -18,8 +18,8 @@ from datacontract.export.exporter import ExportFormat
  from datacontract.export.exporter_factory import exporter_factory
  from datacontract.imports.importer_factory import importer_factory
 
- from datacontract.integration.publish_datamesh_manager import publish_datamesh_manager
- from datacontract.integration.publish_opentelemetry import publish_opentelemetry
+ from datacontract.integration.datamesh_manager import publish_test_results_to_datamesh_manager
+ from datacontract.integration.opentelemetry import publish_test_results_to_opentelemetry
  from datacontract.lint import resolve
  from datacontract.lint.linters.description_linter import DescriptionLinter
  from datacontract.lint.linters.example_model_linter import ExampleModelLinter
@@ -218,15 +218,10 @@ class DataContract:
      run.finish()
 
      if self._publish_url is not None:
-         try:
-             publish_datamesh_manager(run, self._publish_url)
-         except Exception:
-             run.log_error("Failed to publish to datamesh manager")
+         publish_test_results_to_datamesh_manager(run, self._publish_url)
+
      if self._publish_to_opentelemetry:
-         try:
-             publish_opentelemetry(run)
-         except Exception:
-             run.log_error("Failed to publish to opentelemetry")
+         publish_test_results_to_opentelemetry(run)
 
      return run
 
datacontract/engines/fastjsonschema/check_jsonschema.py CHANGED
@@ -158,6 +158,17 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
          process_local_file(run, server, model_name, validate)
      elif server.type == "s3":
          process_s3_file(server, model_name, validate)
+     elif server.type == "gcs":
+         run.checks.append(
+             Check(
+                 type="schema",
+                 name="Check that JSON has valid schema",
+                 model=model_name,
+                 result="info",
+                 reason="JSON Schema check skipped for GCS, as GCS is currently not supported",
+                 engine="jsonschema",
+             )
+         )
      else:
          run.checks.append(
              Check(
datacontract/engines/fastjsonschema/s3/s3_read_files.py CHANGED
@@ -28,9 +28,11 @@ def s3_fs(s3_endpoint_url):
 
      aws_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
      aws_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
+     aws_session_token = os.getenv("DATACONTRACT_S3_SESSION_TOKEN")
      return s3fs.S3FileSystem(
          key=aws_access_key_id,
          secret=aws_secret_access_key,
+         token=aws_session_token,
          anon=aws_access_key_id is None,
          client_kwargs={"endpoint_url": s3_endpoint_url},
      )
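
Note: the added token parameter lets the JSON Schema engine use temporary STS credentials. A sketch of the environment s3_fs() now reads (all values are placeholders; the session token is optional):

    import os

    os.environ["DATACONTRACT_S3_ACCESS_KEY_ID"] = "AKIA..."     # placeholder
    os.environ["DATACONTRACT_S3_SECRET_ACCESS_KEY"] = "..."     # placeholder
    os.environ["DATACONTRACT_S3_SESSION_TOKEN"] = "..."         # new: only needed for temporary credentials
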
datacontract/engines/soda/check_soda_execute.py CHANGED
@@ -1,8 +1,4 @@
  import logging
- import typing
-
- if typing.TYPE_CHECKING:
-     from pyspark.sql import SparkSession
 
  from soda.scan import Scan
 
@@ -19,9 +15,7 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
  from datacontract.model.run import Run, Check, Log
 
 
- def check_soda_execute(
-     run: Run, data_contract: DataContractSpecification, server: Server, spark: "SparkSession", tmp_dir
- ):
+ def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark, tmp_dir):
      if data_contract is None:
          run.log_warn("Cannot run engine soda-core, as data contract is invalid")
          return
@@ -29,7 +23,7 @@ def check_soda_execute(
      run.log_info("Running engine soda-core")
      scan = Scan()
 
-     if server.type in ["s3", "azure", "local"]:
+     if server.type in ["s3", "gcs", "azure", "local"]:
          if server.format in ["json", "parquet", "csv", "delta"]:
              con = get_duckdb_connection(data_contract, server, run)
              scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)
datacontract/engines/soda/connections/duckdb.py CHANGED
@@ -1,7 +1,5 @@
  import os
 
- from deltalake import DeltaTable
-
  import duckdb
  from datacontract.export.csv_type_converter import convert_to_duckdb_csv_type
  from datacontract.model.run import Run
@@ -15,6 +13,9 @@ def get_duckdb_connection(data_contract, server, run: Run):
      if server.type == "s3":
          path = server.location
          setup_s3_connection(con, server)
+     if server.type == "gcs":
+         path = server.location
+         setup_gcs_connection(con, server)
      if server.type == "azure":
          path = server.location
          setup_azure_connection(con, server)
@@ -49,28 +50,8 @@ def get_duckdb_connection(data_contract, server, run: Run):
              f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1, columns={columns});"""
          )
      elif server.format == "delta":
-         if server.type == "local":
-             delta_table_arrow = DeltaTable(model_path).to_pyarrow_dataset()
-             con.register(model_name, delta_table_arrow)
-
-         if server.type == "azure":
-             # After switching to native delta table support
-             # in https://github.com/datacontract/datacontract-cli/issues/258,
-             # azure storage should also work
-             # https://github.com/duckdb/duckdb_delta/issues/21
-             raise NotImplementedError("Support for Delta Tables on Azure Storage is not implemented yet")
-         if server.type == "s3":
-             storage_options = {
-                 "AWS_ENDPOINT_URL": server.endpointUrl,
-                 "AWS_ACCESS_KEY_ID": os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID"),
-                 "AWS_SECRET_ACCESS_KEY": os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY"),
-                 "AWS_REGION": os.getenv("DATACONTRACT_S3_REGION", "us-east-1"),
-                 "AWS_ALLOW_HTTP": "True" if server.endpointUrl.startswith("http://") else "False",
-             }
-
-             delta_table_arrow = DeltaTable(model_path, storage_options=storage_options).to_pyarrow_dataset()
-
-             con.register(model_name, delta_table_arrow)
+         con.sql("update extensions;")  # Make sure we have the latest delta extension
+         con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM delta_scan('{model_path}');""")
  return con
 
 
@@ -142,6 +123,24 @@ def setup_s3_connection(con, server):
      # print(con.sql("SELECT * FROM duckdb_settings() WHERE name like 's3%'"))
 
 
+ def setup_gcs_connection(con, server):
+     key_id = os.getenv("DATACONTRACT_GCS_KEY_ID")
+     secret = os.getenv("DATACONTRACT_GCS_SECRET")
+
+     if key_id is None:
+         raise ValueError("Error: Environment variable DATACONTRACT_GCS_KEY_ID is not set")
+     if secret is None:
+         raise ValueError("Error: Environment variable DATACONTRACT_GCS_SECRET is not set")
+
+     con.sql(f"""
+     CREATE SECRET gcs_secret (
+         TYPE GCS,
+         KEY_ID '{key_id}',
+         SECRET '{secret}'
+     );
+     """)
+
+
  def setup_azure_connection(con, server):
      tenant_id = os.getenv("DATACONTRACT_AZURE_TENANT_ID")
      client_id = os.getenv("DATACONTRACT_AZURE_CLIENT_ID")
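
Note: setup_gcs_connection() registers a DuckDB GCS secret from two new environment variables (an HMAC key id and secret). A standalone sketch of the same statement, assuming a DuckDB build with CREATE SECRET and the httpfs extension; bucket and key values are placeholders:

    import duckdb

    con = duckdb.connect()
    con.sql("""
        CREATE SECRET gcs_secret (
            TYPE GCS,
            KEY_ID 'GOOG1EXAMPLEKEY',
            SECRET 'example-secret'
        );
    """)
    # With the secret registered, gs:// paths can be read, e.g.:
    con.sql("SELECT * FROM read_parquet('gs://my-bucket/orders/*.parquet')")
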
datacontract/engines/soda/connections/kafka.py CHANGED
@@ -1,40 +1,34 @@
  import logging
  import os
- from pyspark.sql import SparkSession
- from pyspark.sql.functions import col, expr, from_json
- from pyspark.sql.avro.functions import from_avro
- from pyspark.sql.types import (
-     StructType,
-     StructField,
-     StringType,
-     DecimalType,
-     DoubleType,
-     IntegerType,
-     LongType,
-     BooleanType,
-     TimestampType,
-     TimestampNTZType,
-     DateType,
-     BinaryType,
-     ArrayType,
-     NullType,
-     DataType,
- )
 
  from datacontract.export.avro_converter import to_avro_schema_json
  from datacontract.model.data_contract_specification import DataContractSpecification, Server, Field
  from datacontract.model.exceptions import DataContractException
 
 
- def create_spark_session(tmp_dir: str) -> SparkSession:
+ def create_spark_session(tmp_dir: str):
      """Create and configure a Spark session."""
+
+     try:
+         from pyspark.sql import SparkSession
+     except ImportError as e:
+         raise DataContractException(
+             type="schema",
+             result="failed",
+             name="pyspark is missing",
+             reason="Install the extra datacontract-cli[kafka] to use kafka",
+             engine="datacontract",
+             original_exception=e,
+         )
+
      spark = (
          SparkSession.builder.appName("datacontract")
          .config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse")
          .config("spark.streaming.stopGracefullyOnShutdown", "true")
+         .config("spark.ui.enabled", "false")
          .config(
              "spark.jars.packages",
-             "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,org.apache.spark:spark-avro_2.12:3.5.0",
+             "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.2,org.apache.spark:spark-avro_2.12:3.5.2",
          )
          .getOrCreate()
      )
@@ -43,7 +37,7 @@ def create_spark_session(tmp_dir: str) -> SparkSession:
      return spark
 
 
- def read_kafka_topic(spark: SparkSession, data_contract: DataContractSpecification, server: Server, tmp_dir):
+ def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Server, tmp_dir):
      """Read and process data from a Kafka topic based on the server configuration."""
 
      logging.info("Reading data from Kafka server %s topic %s", server.host, server.topic)
@@ -74,6 +68,19 @@ def read_kafka_topic(spark: SparkSession, data_contract: DataContractSpecificati
 
 
  def process_avro_format(df, model_name, model):
+     try:
+         from pyspark.sql.functions import col, expr
+         from pyspark.sql.avro.functions import from_avro
+     except ImportError as e:
+         raise DataContractException(
+             type="schema",
+             result="failed",
+             name="pyspark is missing",
+             reason="Install the extra datacontract-cli[kafka] to use kafka",
+             engine="datacontract",
+             original_exception=e,
+         )
+
      avro_schema = to_avro_schema_json(model_name, model)
      df2 = df.withColumn("fixedValue", expr("substring(value, 6, length(value)-5)"))
      options = {"mode": "PERMISSIVE"}
@@ -83,6 +90,18 @@ def process_avro_format(df, model_name, model):
 
 
  def process_json_format(df, model_name, model):
+     try:
+         from pyspark.sql.functions import col, from_json
+     except ImportError as e:
+         raise DataContractException(
+             type="schema",
+             result="failed",
+             name="pyspark is missing",
+             reason="Install the extra datacontract-cli[kafka] to use kafka",
+             engine="datacontract",
+             original_exception=e,
+         )
+
      struct_type = to_struct_type(model.fields)
      df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").select(
          from_json(col("value"), struct_type, {"mode": "PERMISSIVE"}).alias("json")
@@ -94,7 +113,7 @@ def get_auth_options():
      kafka_sasl_username = os.getenv("DATACONTRACT_KAFKA_SASL_USERNAME")
      kafka_sasl_password = os.getenv("DATACONTRACT_KAFKA_SASL_PASSWORD")
 
-     if kafka_sasl_username is None:
+     if kafka_sasl_username is None or kafka_sasl_username == "":
          return {}
 
      return {
@@ -108,11 +127,51 @@
 
 
  def to_struct_type(fields):
+     try:
+         from pyspark.sql.types import StructType
+     except ImportError as e:
+         raise DataContractException(
+             type="schema",
+             result="failed",
+             name="pyspark is missing",
+             reason="Install the extra datacontract-cli[kafka] to use kafka",
+             engine="datacontract",
+             original_exception=e,
+         )
+
      """Convert field definitions to Spark StructType."""
      return StructType([to_struct_field(field_name, field) for field_name, field in fields.items()])
 
 
- def to_struct_field(field_name: str, field: Field) -> StructField:
+ def to_struct_field(field_name: str, field: Field):
+     try:
+         from pyspark.sql.types import (
+             StructType,
+             StructField,
+             StringType,
+             DecimalType,
+             DoubleType,
+             IntegerType,
+             LongType,
+             BooleanType,
+             TimestampType,
+             TimestampNTZType,
+             DateType,
+             BinaryType,
+             ArrayType,
+             NullType,
+             DataType,
+         )
+     except ImportError as e:
+         raise DataContractException(
+             type="schema",
+             result="failed",
+             name="pyspark is missing",
+             reason="Install the extra datacontract-cli[kafka] to use kafka",
+             engine="datacontract",
+             original_exception=e,
+         )
+
      """Map field definitions to Spark StructField using match-case."""
      match field.type:
          case "string" | "varchar" | "text":
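
Note: every pyspark import in this module is now deferred to call time so the base install works without pyspark. The repeated guard boils down to this pattern (the helper name _require_pyspark is hypothetical, not part of the package):

    from datacontract.model.exceptions import DataContractException


    def _require_pyspark():
        # Import lazily; raise the package's own exception if the [kafka] extra is missing.
        try:
            from pyspark.sql import SparkSession  # noqa: F401
        except ImportError as e:
            raise DataContractException(
                type="schema",
                result="failed",
                name="pyspark is missing",
                reason="Install the extra datacontract-cli[kafka] to use kafka",
                engine="datacontract",
                original_exception=e,
            )
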
datacontract/export/avro_converter.py CHANGED
@@ -40,11 +40,21 @@ def to_avro_field(field, field_name):
      avro_field = {"name": field_name}
      if field.description is not None:
          avro_field["doc"] = field.description
-     avro_field["type"] = to_avro_type(field, field_name)
+     is_required_avro = field.required if field.required is not None else True
+     avro_type = to_avro_type(field, field_name)
+     avro_field["type"] = avro_type if is_required_avro else ["null", avro_type]
+
+     if avro_field["type"] == "enum":
+         avro_field["type"] = {
+             "type": "enum",
+             "name": field.title,
+             "symbols": field.enum,
+         }
 
      if field.config:
          if "avroDefault" in field.config:
-             avro_field["default"] = field.config["avroDefault"]
+             if field.config.get("avroType") != "enum":
+                 avro_field["default"] = field.config["avroDefault"]
 
      return avro_field
 
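
Note: a field with required: false now exports as an Avro union type. An illustrative call, assuming Field accepts these keyword arguments directly (not taken from the package's tests):

    from datacontract.export.avro_converter import to_avro_field
    from datacontract.model.data_contract_specification import Field

    field = Field(type="string", required=False)   # illustrative field definition
    print(to_avro_field(field, "order_note"))
    # expected shape: {'name': 'order_note', 'type': ['null', 'string']}
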
datacontract/export/bigquery_converter.py CHANGED
@@ -44,7 +44,7 @@ def to_fields_array(fields: Dict[str, Field]) -> List[Dict[str, Field]]:
 
 
  def to_field(field_name: str, field: Field) -> dict:
-     bq_type = map_type_to_bigquery(field.type, field_name)
+     bq_type = map_type_to_bigquery(field)
      bq_field = {
          "name": field_name,
          "type": bq_type,
@@ -59,10 +59,8 @@ def to_field(field_name: str, field: Field) -> dict:
          # in case the array type is a complex object, we want to copy all its fields
          bq_field["fields"] = to_fields_array(field.items.fields)
      else:
-         # otherwise we make up a structure that gets us a single field of the specified type
-         bq_field["fields"] = to_fields_array(
-             {f"{field_name}_1": Field(type=field.items.type, required=False, description="")}
-         )
+         bq_field["type"] = map_type_to_bigquery(field.items)
+
  # all of these can carry other fields
  elif bq_type.lower() in ["record", "struct"]:
      bq_field["fields"] = to_fields_array(field.fields)
@@ -79,37 +77,46 @@ def to_field(field_name: str, field: Field) -> dict:
      return bq_field
 
 
- def map_type_to_bigquery(type_str: str, field_name: str) -> str:
+ def map_type_to_bigquery(field: Field) -> str:
      logger = logging.getLogger(__name__)
-     if type_str.lower() in ["string", "varchar", "text"]:
+
+     field_type = field.type
+     if not field_type:
+         return None
+
+     if field.config and "bigqueryType" in field.config:
+         return field.config["bigqueryType"]
+
+     if field_type.lower() in ["string", "varchar", "text"]:
          return "STRING"
-     elif type_str == "bytes":
+     elif field_type.lower() == "bytes":
          return "BYTES"
-     elif type_str.lower() in ["int", "integer"]:
+     elif field_type.lower() in ["int", "integer"]:
          return "INTEGER"
-     elif type_str.lower() in ["long", "bigint"]:
+     elif field_type.lower() in ["long", "bigint"]:
          return "INT64"
-     elif type_str == "float":
-         return "FLOAT"
-     elif type_str == "boolean":
+     elif field_type.lower() == "float":
+         return "FLOAT64"
+     elif field_type.lower() == "boolean":
          return "BOOL"
-     elif type_str.lower() in ["timestamp", "timestamp_tz"]:
+     elif field_type.lower() in ["timestamp", "timestamp_tz"]:
          return "TIMESTAMP"
-     elif type_str == "date":
+     elif field_type.lower() == "date":
          return "DATE"
-     elif type_str == "timestamp_ntz":
+     elif field_type.lower() == "timestamp_ntz":
          return "TIME"
-     elif type_str.lower() in ["number", "decimal", "numeric"]:
+     elif field_type.lower() in ["number", "decimal", "numeric"]:
          return "NUMERIC"
-     elif type_str == "double":
+     elif field_type.lower() == "double":
          return "BIGNUMERIC"
-     elif type_str.lower() in ["object", "record", "array"]:
+     elif field_type.lower() in ["object", "record", "array"]:
          return "RECORD"
-     elif type_str == "struct":
+     elif field_type.lower() == "struct":
          return "STRUCT"
-     elif type_str == "null":
+     elif field_type.lower() == "null":
          logger.info(
-             f"Can't properly map {field_name} to bigquery Schema, as 'null' is not supported as a type. Mapping it to STRING."
+             f"Can't properly map {field.title} to bigquery Schema, as 'null' \
+             is not supported as a type. Mapping it to STRING."
          )
          return "STRING"
      else:
@@ -117,6 +124,6 @@ def map_type_to_bigquery(type_str: str, field_name: str) -> str:
              type="schema",
              result="failed",
              name="Map datacontract type to bigquery data type",
-             reason=f"Unsupported type {type_str} in data contract definition.",
+             reason=f"Unsupported type {field_type} in data contract definition.",
              engine="datacontract",
          )
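
Note: map_type_to_bigquery() now receives the whole Field and honors a bigqueryType config override before falling back to the built-in mapping. A hedged sketch (the JSON override value is only an example):

    from datacontract.export.bigquery_converter import map_type_to_bigquery
    from datacontract.model.data_contract_specification import Field

    print(map_type_to_bigquery(Field(type="string")))                                   # -> "STRING"
    print(map_type_to_bigquery(Field(type="string", config={"bigqueryType": "JSON"})))  # -> "JSON"
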
datacontract/export/data_caterer_converter.py ADDED
@@ -0,0 +1,148 @@
+ from typing import Dict
+
+ import yaml
+
+ from datacontract.export.exporter import Exporter
+ from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field, Server
+
+
+ class DataCatererExporter(Exporter):
+     """
+     Exporter class for Data Caterer.
+     Creates a YAML file, based on the data contract, for Data Caterer to generate synthetic data.
+     """
+
+     def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+         return to_data_caterer_generate_yaml(data_contract, server)
+
+
+ def to_data_caterer_generate_yaml(data_contract_spec: DataContractSpecification, server):
+     generation_task = {"name": data_contract_spec.info.title, "steps": []}
+     server_info = _get_server_info(data_contract_spec, server)
+
+     for model_key, model_value in data_contract_spec.models.items():
+         odcs_table = _to_data_caterer_generate_step(model_key, model_value, server_info)
+         generation_task["steps"].append(odcs_table)
+     return yaml.dump(generation_task, indent=2, sort_keys=False, allow_unicode=True)
+
+
+ def _get_server_info(data_contract_spec: DataContractSpecification, server):
+     if server is not None and server in data_contract_spec.servers:
+         return data_contract_spec.servers.get(server)
+     elif server is not None:
+         raise Exception(f"Server name not found in servers list in data contract, server-name={server}")
+     elif len(data_contract_spec.servers.keys()) > 0:
+         return next(iter(data_contract_spec.servers.values()))
+     else:
+         return None
+
+
+ def _to_data_caterer_generate_step(model_key, model_value: Model, server: Server) -> dict:
+     step = {
+         "name": model_key,
+         "type": _to_step_type(server),
+         "options": _to_data_source_options(model_key, server),
+         "schema": [],
+     }
+     fields = _to_fields(model_value.fields)
+     if fields:
+         step["schema"] = fields
+     return step
+
+
+ def _to_step_type(server: Server):
+     if server is not None and server.type is not None:
+         if server.type in ["s3", "gcs", "azure", "local"]:
+             return server.format
+         else:
+             return server.type
+     else:
+         return "csv"
+
+
+ def _to_data_source_options(model_key, server: Server):
+     options = {}
+     if server is not None and server.type is not None:
+         if server.type in ["s3", "gcs", "azure", "local"]:
+             if server.path is not None:
+                 options["path"] = server.path
+             elif server.location is not None:
+                 options["path"] = server.location
+             else:
+                 options["path"] = "/tmp/data_caterer_data"
+         elif server.type == "postgres":
+             options["schema"] = server.schema_
+             options["table"] = model_key
+         elif server.type == "kafka":
+             options["topic"] = server.topic
+
+     return options
+
+
+ def _to_fields(fields: Dict[str, Field]) -> list:
+     dc_fields = []
+     for field_name, field in fields.items():
+         column = _to_field(field_name, field)
+         dc_fields.append(column)
+     return dc_fields
+
+
+ def _to_field(field_name: str, field: Field) -> dict:
+     dc_field = {"name": field_name}
+     dc_generator_opts = {}
+
+     if field.type is not None:
+         new_type = _to_data_type(field.type)
+         dc_field["type"] = _to_data_type(field.type)
+         if new_type == "object" or new_type == "record" or new_type == "struct":
+             # need to get nested field definitions
+             nested_fields = _to_fields(field.fields)
+             dc_field["schema"] = {"fields": nested_fields}
+
+     if field.enum is not None and len(field.enum) > 0:
+         dc_generator_opts["oneOf"] = field.enum
+     if field.unique is not None and field.unique:
+         dc_generator_opts["isUnique"] = field.unique
+     if field.minLength is not None:
+         dc_generator_opts["minLength"] = field.minLength
+     if field.maxLength is not None:
+         dc_generator_opts["maxLength"] = field.maxLength
+     if field.pattern is not None:
+         dc_generator_opts["regex"] = field.pattern
+     if field.minimum is not None:
+         dc_generator_opts["min"] = field.minimum
+     if field.maximum is not None:
+         dc_generator_opts["max"] = field.maximum
+
+     if len(dc_generator_opts.keys()) > 0:
+         dc_field["generator"] = {"options": dc_generator_opts}
+     return dc_field
+
+
+ def _to_data_type(data_type):
+     if data_type == "number" or data_type == "numeric" or data_type == "double":
+         return "double"
+     elif data_type == "decimal" or data_type == "bigint":
+         return "decimal"
+     elif data_type == "int":
+         return "integer"
+     elif data_type == "long":
+         return "long"
+     elif data_type == "float":
+         return "float"
+     elif data_type == "string" or data_type == "text" or data_type == "varchar":
+         return "string"
+     if data_type == "boolean":
+         return "boolean"
+     if data_type == "timestamp" or data_type == "timestamp_tz" or data_type == "timestamp_ntz":
+         return "timestamp"
+     elif data_type == "date":
+         return "date"
+     elif data_type == "array":
+         return "array"
+     elif data_type == "map" or data_type == "object" or data_type == "record" or data_type == "struct":
+         return "struct"
+     elif data_type == "bytes":
+         return "binary"
+     else:
+         return "string"
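
Note: together with the new data_caterer value in ExportFormat (see the exporter.py change below), this exporter can be driven through the regular export path. A rough sketch, assuming DataContract.export() accepts the new enum value; the file name is illustrative:

    from datacontract.data_contract import DataContract
    from datacontract.export.exporter import ExportFormat

    dc = DataContract(data_contract_file="datacontract.yaml")
    # Produces a Data Caterer generation task as a YAML string.
    print(dc.export(export_format=ExportFormat.data_caterer))
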
datacontract/export/dbml_converter.py CHANGED
@@ -142,8 +142,9 @@ def generate_field(field_name: str, field: spec.Field, model_name: str, server:
      ref_str = "{0}.{1} > {2}".format(model_name, field_name, field.references)
      return (ref_str, field_str)
 
+
  def formatDescription(input: str) -> str:
-     if '\n' in input or '\r' in input or '"' in input:
+     if "\n" in input or "\r" in input or '"' in input:
          return "'''{0}'''".format(input)
      else:
-         return '"{0}"'.format(input)
+         return '"{0}"'.format(input)
datacontract/export/exporter.py CHANGED
@@ -35,6 +35,8 @@ class ExportFormat(str, Enum):
      bigquery = "bigquery"
      dbml = "dbml"
      spark = "spark"
+     sqlalchemy = "sqlalchemy"
+     data_caterer = "data-caterer"
 
      @classmethod
      def get_supported_formats(cls):