datacontract-cli 0.10.10__py3-none-any.whl → 0.10.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of datacontract-cli was flagged as a potentially problematic release.

Files changed (39)
  1. datacontract/cli.py +19 -3
  2. datacontract/data_contract.py +17 -17
  3. datacontract/engines/fastjsonschema/check_jsonschema.py +15 -1
  4. datacontract/engines/fastjsonschema/s3/s3_read_files.py +2 -0
  5. datacontract/engines/soda/check_soda_execute.py +2 -8
  6. datacontract/engines/soda/connections/duckdb.py +23 -20
  7. datacontract/engines/soda/connections/kafka.py +81 -23
  8. datacontract/engines/soda/connections/snowflake.py +8 -5
  9. datacontract/export/avro_converter.py +12 -2
  10. datacontract/export/dbml_converter.py +42 -19
  11. datacontract/export/exporter.py +2 -1
  12. datacontract/export/exporter_factory.py +6 -0
  13. datacontract/export/jsonschema_converter.py +1 -4
  14. datacontract/export/spark_converter.py +4 -0
  15. datacontract/export/sql_type_converter.py +64 -29
  16. datacontract/export/sqlalchemy_converter.py +169 -0
  17. datacontract/imports/avro_importer.py +1 -0
  18. datacontract/imports/bigquery_importer.py +2 -2
  19. datacontract/imports/dbml_importer.py +112 -0
  20. datacontract/imports/dbt_importer.py +67 -91
  21. datacontract/imports/glue_importer.py +64 -54
  22. datacontract/imports/importer.py +3 -2
  23. datacontract/imports/importer_factory.py +5 -0
  24. datacontract/imports/jsonschema_importer.py +106 -120
  25. datacontract/imports/odcs_importer.py +1 -1
  26. datacontract/imports/spark_importer.py +29 -10
  27. datacontract/imports/sql_importer.py +5 -1
  28. datacontract/imports/unity_importer.py +1 -1
  29. datacontract/integration/{publish_datamesh_manager.py → datamesh_manager.py} +33 -5
  30. datacontract/integration/{publish_opentelemetry.py → opentelemetry.py} +1 -1
  31. datacontract/model/data_contract_specification.py +6 -2
  32. datacontract/templates/partials/model_field.html +10 -2
  33. {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/METADATA +283 -113
  34. {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/RECORD +38 -37
  35. {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/WHEEL +1 -1
  36. datacontract/publish/publish.py +0 -32
  37. {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/LICENSE +0 -0
  38. {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/entry_points.txt +0 -0
  39. {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/top_level.txt +0 -0
datacontract/cli.py CHANGED
@@ -17,7 +17,7 @@ from datacontract.catalog.catalog import create_index_html, create_data_contract
 from datacontract.data_contract import DataContract, ExportFormat
 from datacontract.imports.importer import ImportFormat
 from datacontract.init.download_datacontract_file import download_datacontract_file, FileExistsException
-from datacontract.publish.publish import publish_to_datamesh_manager
+from datacontract.integration.datamesh_manager import publish_data_contract_to_datamesh_manager

 DEFAULT_DATA_CONTRACT_SCHEMA_URL = "https://datacontract.com/datacontract.schema.json"

@@ -232,6 +232,18 @@ def import_(
             help="List of models names to import from the dbt manifest file (repeat for multiple models names, leave empty for all models in the dataset)."
         ),
     ] = None,
+    dbml_schema: Annotated[
+        Optional[List[str]],
+        typer.Option(
+            help="List of schema names to import from the DBML file (repeat for multiple schema names, leave empty for all tables in the file)."
+        ),
+    ] = None,
+    dbml_table: Annotated[
+        Optional[List[str]],
+        typer.Option(
+            help="List of table names to import from the DBML file (repeat for multiple table names, leave empty for all tables in the file)."
+        ),
+    ] = None,
 ):
     """
     Create a data contract from the given source location. Prints to stdout.
@@ -245,6 +257,8 @@ def import_(
         bigquery_dataset=bigquery_dataset,
         unity_table_full_name=unity_table_full_name,
         dbt_model=dbt_model,
+        dbml_schema=dbml_schema,
+        dbml_table=dbml_table,
     )
     console.print(result.to_yaml())

@@ -261,8 +275,10 @@ def publish(
     """
     Publish the data contract to the Data Mesh Manager.
     """
-    publish_to_datamesh_manager(
-        data_contract=DataContract(data_contract_file=location, schema_location=schema),
+    publish_data_contract_to_datamesh_manager(
+        data_contract_specification=DataContract(
+            data_contract_file=location, schema_location=schema
+        ).get_data_contract_specification(),
     )


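The two new options follow the same typer pattern as the existing dbt_model option. A minimal standalone sketch of that pattern (command function and strings are hypothetical): typer turns the parameter name into a repeatable --dbml-schema flag and passes None when the flag is omitted.

    from typing import Annotated, List, Optional

    import typer

    app = typer.Typer()

    @app.command()
    def import_demo(
        # Repeatable option: `--dbml-schema a --dbml-schema b` arrives as ["a", "b"];
        # omitting the flag leaves the parameter as None (meaning "all schemas").
        dbml_schema: Annotated[
            Optional[List[str]],
            typer.Option(help="List of schema names to import from the DBML file."),
        ] = None,
    ):
        typer.echo(f"schemas: {dbml_schema}")

    if __name__ == "__main__":
        app()
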

datacontract/data_contract.py CHANGED
@@ -18,8 +18,8 @@ from datacontract.export.exporter import ExportFormat
 from datacontract.export.exporter_factory import exporter_factory
 from datacontract.imports.importer_factory import importer_factory

-from datacontract.integration.publish_datamesh_manager import publish_datamesh_manager
-from datacontract.integration.publish_opentelemetry import publish_opentelemetry
+from datacontract.integration.datamesh_manager import publish_test_results_to_datamesh_manager
+from datacontract.integration.opentelemetry import publish_test_results_to_opentelemetry
 from datacontract.lint import resolve
 from datacontract.lint.linters.description_linter import DescriptionLinter
 from datacontract.lint.linters.example_model_linter import ExampleModelLinter
@@ -46,8 +46,8 @@ class DataContract:
         publish_url: str = None,
         publish_to_opentelemetry: bool = False,
         spark: "SparkSession" = None,
-        inline_definitions: bool = False,
-        inline_quality: bool = False,
+        inline_definitions: bool = True,
+        inline_quality: bool = True,
     ):
         self._data_contract_file = data_contract_file
         self._data_contract_str = data_contract_str
@@ -87,8 +87,8 @@ class DataContract:
                 self._data_contract_str,
                 self._data_contract,
                 self._schema_location,
-                inline_definitions=True,
-                inline_quality=True,
+                inline_definitions=self._inline_definitions,
+                inline_quality=self._inline_quality,
             )
             run.checks.append(
                 Check(type="lint", result="passed", name="Data contract is syntactically valid", engine="datacontract")
@@ -140,7 +140,12 @@ class DataContract:
         try:
             run.log_info("Testing data contract")
             data_contract = resolve.resolve_data_contract(
-                self._data_contract_file, self._data_contract_str, self._data_contract, self._schema_location
+                self._data_contract_file,
+                self._data_contract_str,
+                self._data_contract,
+                self._schema_location,
+                inline_definitions=self._inline_definitions,
+                inline_quality=self._inline_quality,
             )

             if data_contract.models is None or len(data_contract.models) == 0:
@@ -213,15 +218,10 @@ class DataContract:
         run.finish()

         if self._publish_url is not None:
-            try:
-                publish_datamesh_manager(run, self._publish_url)
-            except Exception:
-                run.log_error("Failed to publish to datamesh manager")
+            publish_test_results_to_datamesh_manager(run, self._publish_url)
+
         if self._publish_to_opentelemetry:
-            try:
-                publish_opentelemetry(run)
-            except Exception:
-                run.log_error("Failed to publish to opentelemetry")
+            publish_test_results_to_opentelemetry(run)

         return run

@@ -304,8 +304,8 @@ class DataContract:
             self._data_contract_str,
             self._data_contract,
             schema_location=self._schema_location,
-            inline_definitions=True,
-            inline_quality=True,
+            inline_definitions=self._inline_definitions,
+            inline_quality=self._inline_quality,
         )

         return exporter_factory.create(export_format).export(
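Net effect: inlining of definitions and quality blocks is now on by default, and the constructor flags are honored by lint(), test(), and export() instead of being hard-coded to True. A minimal sketch of the new behavior (file name hypothetical):

    from datacontract.data_contract import DataContract

    # Defaults now inline definitions and quality into the resolved specification.
    data_contract = DataContract(data_contract_file="datacontract.yaml")

    # Opting out is now respected across lint/test/export instead of being overridden.
    raw_contract = DataContract(
        data_contract_file="datacontract.yaml",
        inline_definitions=False,
        inline_quality=False,
    )

    run = data_contract.test()
    print(len(run.checks))
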

datacontract/engines/fastjsonschema/check_jsonschema.py CHANGED
@@ -148,13 +148,27 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
         schema = to_jsonschema(model_name, model)
         run.log_info(f"jsonschema: {schema}")

-        validate = fastjsonschema.compile(schema)
+        validate = fastjsonschema.compile(
+            schema,
+            formats={"uuid": r"^[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12}$"},
+        )

         # Process files based on server type
         if server.type == "local":
             process_local_file(run, server, model_name, validate)
         elif server.type == "s3":
             process_s3_file(server, model_name, validate)
+        elif server.type == "gcs":
+            run.checks.append(
+                Check(
+                    type="schema",
+                    name="Check that JSON has valid schema",
+                    model=model_name,
+                    result="info",
+                    reason="JSON Schema check skipped for GCS, as GCS is currently not supported",
+                    engine="jsonschema",
+                )
+            )
         else:
             run.checks.append(
                 Check(
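For reference, fastjsonschema.compile accepts a formats mapping from format name to a regex string or a callable, which is what enables the uuid handling above. A minimal standalone sketch (schema and values hypothetical):

    import fastjsonschema

    # Register a "uuid" format as a regex (a callable would work as well).
    validate = fastjsonschema.compile(
        {"type": "string", "format": "uuid"},
        formats={"uuid": r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"},
    )

    validate("123e4567-e89b-12d3-a456-426614174000")  # passes
    # validate("not-a-uuid")  # would raise fastjsonschema.JsonSchemaValueException
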

datacontract/engines/fastjsonschema/s3/s3_read_files.py CHANGED
@@ -28,9 +28,11 @@ def s3_fs(s3_endpoint_url):

     aws_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
     aws_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
+    aws_session_token = os.getenv("DATACONTRACT_S3_SESSION_TOKEN")
     return s3fs.S3FileSystem(
         key=aws_access_key_id,
         secret=aws_secret_access_key,
+        token=aws_session_token,
         anon=aws_access_key_id is None,
         client_kwargs={"endpoint_url": s3_endpoint_url},
     )

datacontract/engines/soda/check_soda_execute.py CHANGED
@@ -1,8 +1,4 @@
 import logging
-import typing
-
-if typing.TYPE_CHECKING:
-    from pyspark.sql import SparkSession

 from soda.scan import Scan

@@ -19,9 +15,7 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
 from datacontract.model.run import Run, Check, Log


-def check_soda_execute(
-    run: Run, data_contract: DataContractSpecification, server: Server, spark: "SparkSession", tmp_dir
-):
+def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark, tmp_dir):
     if data_contract is None:
         run.log_warn("Cannot run engine soda-core, as data contract is invalid")
         return
@@ -29,7 +23,7 @@ check_soda_execute(
     run.log_info("Running engine soda-core")
     scan = Scan()

-    if server.type in ["s3", "azure", "local"]:
+    if server.type in ["s3", "gcs", "azure", "local"]:
         if server.format in ["json", "parquet", "csv", "delta"]:
             con = get_duckdb_connection(data_contract, server, run)
             scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)

datacontract/engines/soda/connections/duckdb.py CHANGED
@@ -1,7 +1,5 @@
 import os

-from deltalake import DeltaTable
-
 import duckdb
 from datacontract.export.csv_type_converter import convert_to_duckdb_csv_type
 from datacontract.model.run import Run
@@ -15,6 +13,9 @@ def get_duckdb_connection(data_contract, server, run: Run):
     if server.type == "s3":
         path = server.location
         setup_s3_connection(con, server)
+    if server.type == "gcs":
+        path = server.location
+        setup_gcs_connection(con, server)
     if server.type == "azure":
         path = server.location
         setup_azure_connection(con, server)
@@ -49,24 +50,8 @@ def get_duckdb_connection(data_contract, server, run: Run):
                 f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1, columns={columns});"""
             )
         elif server.format == "delta":
-            if server.type == "azure":
-                # After switching to native delta table support
-                # in https://github.com/datacontract/datacontract-cli/issues/258,
-                # azure storage should also work
-                # https://github.com/duckdb/duckdb_delta/issues/21
-                raise NotImplementedError("Support for Delta Tables on Azure Storage is not implemented yet")
-
-            storage_options = {
-                "AWS_ENDPOINT_URL": server.endpointUrl,
-                "AWS_ACCESS_KEY_ID": os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID"),
-                "AWS_SECRET_ACCESS_KEY": os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY"),
-                "AWS_REGION": os.getenv("DATACONTRACT_S3_REGION", "us-east-1"),
-                "AWS_ALLOW_HTTP": "True" if server.endpointUrl.startswith("http://") else "False",
-            }
-
-            delta_table_arrow = DeltaTable(model_path, storage_options=storage_options).to_pyarrow_dataset()
-
-            con.register(model_name, delta_table_arrow)
+            con.sql("update extensions;")  # Make sure we have the latest delta extension
+            con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM delta_scan('{model_path}');""")
     return con


@@ -138,6 +123,24 @@ def setup_s3_connection(con, server):
     # print(con.sql("SELECT * FROM duckdb_settings() WHERE name like 's3%'"))


+def setup_gcs_connection(con, server):
+    key_id = os.getenv("DATACONTRACT_GCS_KEY_ID")
+    secret = os.getenv("DATACONTRACT_GCS_SECRET")
+
+    if key_id is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_GCS_KEY_ID is not set")
+    if secret is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_GCS_SECRET is not set")
+
+    con.sql(f"""
+    CREATE SECRET gcs_secret (
+        TYPE GCS,
+        KEY_ID '{key_id}',
+        SECRET '{secret}'
+    );
+    """)
+
+
 def setup_azure_connection(con, server):
     tenant_id = os.getenv("DATACONTRACT_AZURE_TENANT_ID")
     client_id = os.getenv("DATACONTRACT_AZURE_CLIENT_ID")
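For context, both new code paths rely on built-in DuckDB features: a GCS secret built from HMAC keys, and the delta extension's delta_scan table function, which replaces the deltalake/pyarrow bridge removed above. A minimal standalone sketch, with hypothetical credentials, bucket, and paths:

    import duckdb

    con = duckdb.connect()

    # GCS access via HMAC keys, as setup_gcs_connection does.
    con.sql("""
        CREATE SECRET gcs_secret (
            TYPE GCS,
            KEY_ID 'my-hmac-key-id',
            SECRET 'my-hmac-secret'
        );
    """)
    con.sql("CREATE VIEW orders AS SELECT * FROM read_parquet('gs://my-bucket/orders/*.parquet');")

    # Native Delta reads via the delta extension (may require INSTALL delta; LOAD delta; once).
    con.sql("CREATE VIEW events AS SELECT * FROM delta_scan('./my_delta_table');")
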

datacontract/engines/soda/connections/kafka.py CHANGED
@@ -1,33 +1,26 @@
 import logging
 import os
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import col, expr, from_json
-from pyspark.sql.avro.functions import from_avro
-from pyspark.sql.types import (
-    StructType,
-    StructField,
-    StringType,
-    DecimalType,
-    DoubleType,
-    IntegerType,
-    LongType,
-    BooleanType,
-    TimestampType,
-    TimestampNTZType,
-    DateType,
-    BinaryType,
-    ArrayType,
-    NullType,
-    DataType,
-)

 from datacontract.export.avro_converter import to_avro_schema_json
 from datacontract.model.data_contract_specification import DataContractSpecification, Server, Field
 from datacontract.model.exceptions import DataContractException


-def create_spark_session(tmp_dir: str) -> SparkSession:
+def create_spark_session(tmp_dir: str):
     """Create and configure a Spark session."""
+
+    try:
+        from pyspark.sql import SparkSession
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
     spark = (
         SparkSession.builder.appName("datacontract")
         .config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse")
@@ -43,7 +36,7 @@ def create_spark_session(tmp_dir: str) -> SparkSession:
     return spark


-def read_kafka_topic(spark: SparkSession, data_contract: DataContractSpecification, server: Server, tmp_dir):
+def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Server, tmp_dir):
     """Read and process data from a Kafka topic based on the server configuration."""

     logging.info("Reading data from Kafka server %s topic %s", server.host, server.topic)
@@ -74,6 +67,19 @@ def read_kafka_topic(spark: SparkSession, data_contract: DataContractSpecificati


 def process_avro_format(df, model_name, model):
+    try:
+        from pyspark.sql.functions import col, expr
+        from pyspark.sql.avro.functions import from_avro
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
     avro_schema = to_avro_schema_json(model_name, model)
     df2 = df.withColumn("fixedValue", expr("substring(value, 6, length(value)-5)"))
     options = {"mode": "PERMISSIVE"}
@@ -83,6 +89,18 @@ def process_avro_format(df, model_name, model):


 def process_json_format(df, model_name, model):
+    try:
+        from pyspark.sql.functions import col, from_json
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
     struct_type = to_struct_type(model.fields)
     df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").select(
         from_json(col("value"), struct_type, {"mode": "PERMISSIVE"}).alias("json")
@@ -108,11 +126,51 @@ def get_auth_options():


 def to_struct_type(fields):
+    try:
+        from pyspark.sql.types import StructType
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
     """Convert field definitions to Spark StructType."""
     return StructType([to_struct_field(field_name, field) for field_name, field in fields.items()])


-def to_struct_field(field_name: str, field: Field) -> StructField:
+def to_struct_field(field_name: str, field: Field):
+    try:
+        from pyspark.sql.types import (
+            StructType,
+            StructField,
+            StringType,
+            DecimalType,
+            DoubleType,
+            IntegerType,
+            LongType,
+            BooleanType,
+            TimestampType,
+            TimestampNTZType,
+            DateType,
+            BinaryType,
+            ArrayType,
+            NullType,
+            DataType,
+        )
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
     """Map field definitions to Spark StructField using match-case."""
     match field.type:
         case "string" | "varchar" | "text":

datacontract/engines/soda/connections/snowflake.py CHANGED
@@ -4,17 +4,20 @@ import yaml


 def to_snowflake_soda_configuration(server):
+    prefix = "DATACONTRACT_SNOWFLAKE_"
+    snowflake_soda_params = {k.replace(prefix, "").lower(): v for k, v in os.environ.items() if k.startswith(prefix)}
+
+    # backward compatibility
+    if "connection_timeout" not in snowflake_soda_params:
+        snowflake_soda_params["connection_timeout"] = "5"  # minutes
+
     soda_configuration = {
         f"data_source {server.type}": {
             "type": "snowflake",
-            "username": os.getenv("DATACONTRACT_SNOWFLAKE_USERNAME"),
-            "password": os.getenv("DATACONTRACT_SNOWFLAKE_PASSWORD"),
-            "role": os.getenv("DATACONTRACT_SNOWFLAKE_ROLE"),
             "account": server.account,
             "database": server.database,
             "schema": server.schema_,
-            "warehouse": os.getenv("DATACONTRACT_SNOWFLAKE_WAREHOUSE"),
-            "connection_timeout": 5,  # minutes
+            **snowflake_soda_params,
         }
     }
     soda_configuration_str = yaml.dump(soda_configuration)
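In other words, every DATACONTRACT_SNOWFLAKE_* environment variable is now forwarded to the Soda data source configuration, so keys such as role, warehouse, or authenticator no longer need dedicated handling. A minimal sketch of the mapping (values hypothetical):

    import os

    os.environ["DATACONTRACT_SNOWFLAKE_USERNAME"] = "my_user"
    os.environ["DATACONTRACT_SNOWFLAKE_WAREHOUSE"] = "COMPUTE_WH"

    prefix = "DATACONTRACT_SNOWFLAKE_"
    params = {k.replace(prefix, "").lower(): v for k, v in os.environ.items() if k.startswith(prefix)}
    print(params)  # {'username': 'my_user', 'warehouse': 'COMPUTE_WH'}
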

datacontract/export/avro_converter.py CHANGED
@@ -40,11 +40,21 @@ def to_avro_field(field, field_name):
     avro_field = {"name": field_name}
     if field.description is not None:
         avro_field["doc"] = field.description
-    avro_field["type"] = to_avro_type(field, field_name)
+    is_required_avro = field.required if field.required is not None else True
+    avro_type = to_avro_type(field, field_name)
+    avro_field["type"] = avro_type if is_required_avro else ["null", avro_type]
+
+    if avro_field["type"] == "enum":
+        avro_field["type"] = {
+            "type": "enum",
+            "name": field.title,
+            "symbols": field.enum,
+        }

     if field.config:
         if "avroDefault" in field.config:
-            avro_field["default"] = field.config["avroDefault"]
+            if field.config.get("avroType") != "enum":
+                avro_field["default"] = field.config["avroDefault"]

     return avro_field

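A small illustration (field names hypothetical) of the Avro fragments this now produces: non-required fields become a nullable union, and enum fields become a named enum type built from the field's title and enum values, with avroDefault skipped for enums.

    # required: false  ->  nullable union
    optional_field = {"name": "middle_name", "type": ["null", "string"]}

    # enum field  ->  named enum type
    enum_field = {
        "name": "status",
        "type": {"type": "enum", "name": "Status", "symbols": ["ACTIVE", "INACTIVE"]},
    }
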

datacontract/export/dbml_converter.py CHANGED
@@ -3,6 +3,7 @@ from importlib.metadata import version
 from typing import Tuple

 import pytz
+from datacontract.model.exceptions import DataContractException

 import datacontract.model.data_contract_specification as spec
 from datacontract.export.sql_type_converter import convert_to_sql_type
@@ -48,17 +49,7 @@ Using {5} Types for the field types
 {0}
 */
 """.format(generated_info)
-
-    note = """Note project_info {{
-    '''
-    {0}
-    '''
-    }}
-    """.format(generated_info)
-
-    return """{0}
-    {1}
-    """.format(comment, note)
+    return comment


 def get_version() -> str:
@@ -70,19 +61,18 @@ def get_version() -> str:


 def generate_project_info(contract: spec.DataContractSpecification) -> str:
     return """Project "{0}" {{
-    Note: "{1}"
+    Note: '''{1}'''
 }}\n
-    """.format(contract.info.title, " ".join(contract.info.description.splitlines()))
+    """.format(contract.info.title, contract.info.description)


 def generate_table(model_name: str, model: spec.Model, server: spec.Server) -> str:
     result = """Table "{0}" {{
-    Note: "{1}"
-    """.format(model_name, " ".join(model.description.splitlines()))
+    Note: {1}
+    """.format(model_name, formatDescription(model.description))

     references = []

-    # Add all the fields
     for field_name, field in model.fields.items():
         ref, field_string = generate_field(field_name, field, model_name, server)
         if ref is not None:
@@ -102,6 +92,30 @@ Note: "{1}"


 def generate_field(field_name: str, field: spec.Field, model_name: str, server: spec.Server) -> Tuple[str, str]:
+    if field.primary:
+        if field.required is not None:
+            if not field.required:
+                raise DataContractException(
+                    type="lint",
+                    name="Primary key fields cannot have required == False.",
+                    result="error",
+                    reason="Primary key fields cannot have required == False.",
+                    engine="datacontract",
+                )
+        else:
+            field.required = True
+        if field.unique is not None:
+            if not field.unique:
+                raise DataContractException(
+                    type="lint",
+                    name="Primary key fields cannot have unique == False",
+                    result="error",
+                    reason="Primary key fields cannot have unique == False.",
+                    engine="datacontract",
+                )
+        else:
+            field.unique = True
+
     field_attrs = []
     if field.primary:
         field_attrs.append("pk")
@@ -115,13 +129,22 @@ def generate_field(field_name: str, field: spec.Field, model_name: str, server:
         field_attrs.append("null")

     if field.description:
-        field_attrs.append('Note: "{0}"'.format(" ".join(field.description.splitlines())))
+        field_attrs.append("""Note: {0}""".format(formatDescription(field.description)))

     field_type = field.type if server is None else convert_to_sql_type(field, server.type)

     field_str = '"{0}" "{1}" [{2}]'.format(field_name, field_type, ",".join(field_attrs))
     ref_str = None
     if (field.references) is not None:
-        # we always assume many to one, as datacontract doesn't really give us more info
-        ref_str = "{0}.{1} > {2}".format(model_name, field_name, field.references)
+        if field.unique:
+            ref_str = "{0}.{1} - {2}".format(model_name, field_name, field.references)
+        else:
+            ref_str = "{0}.{1} > {2}".format(model_name, field_name, field.references)
     return (ref_str, field_str)
+
+
+def formatDescription(input: str) -> str:
+    if "\n" in input or "\r" in input or '"' in input:
+        return "'''{0}'''".format(input)
+    else:
+        return '"{0}"'.format(input)
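A small illustration of the new helper: descriptions containing newlines or double quotes are wrapped in DBML triple quotes, everything else keeps the plain quoted form (example strings are hypothetical):

    from datacontract.export.dbml_converter import formatDescription

    print(formatDescription("Order identifier"))
    # "Order identifier"

    print(formatDescription('Contains "raw" events\nand late arrivals'))
    # '''Contains "raw" events
    # and late arrivals'''
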

datacontract/export/exporter.py CHANGED
@@ -35,9 +35,10 @@ class ExportFormat(str, Enum):
     bigquery = "bigquery"
     dbml = "dbml"
     spark = "spark"
+    sqlalchemy = "sqlalchemy"

     @classmethod
-    def get_suported_formats(cls):
+    def get_supported_formats(cls):
         return list(map(lambda c: c.value, cls))


datacontract/export/exporter_factory.py CHANGED
@@ -143,3 +143,9 @@ exporter_factory.register_lazy_exporter(
 exporter_factory.register_lazy_exporter(
     name=ExportFormat.spark, module_path="datacontract.export.spark_converter", class_name="SparkExporter"
 )
+
+exporter_factory.register_lazy_exporter(
+    name=ExportFormat.sqlalchemy,
+    module_path="datacontract.export.sqlalchemy_converter",
+    class_name="SQLAlchemyExporter",
+)
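Together with the new sqlalchemy member of ExportFormat, this makes the converter reachable through the regular export path. A sketch of the lazy lookup, assuming exporter_factory.create() accepts the ExportFormat member the same way data_contract.py's export() passes it:

    from datacontract.export.exporter import ExportFormat
    from datacontract.export.exporter_factory import exporter_factory

    # The sqlalchemy_converter module is only imported when the exporter is requested.
    exporter = exporter_factory.create(ExportFormat.sqlalchemy)
    print(type(exporter).__name__)  # SQLAlchemyExporter
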

datacontract/export/jsonschema_converter.py CHANGED
@@ -36,10 +36,7 @@ def to_property(field: Field) -> dict:
     property = {}
     json_type, json_format = convert_type_format(field.type, field.format)
     if json_type is not None:
-        if field.required:
-            property["type"] = json_type
-        else:
-            property["type"] = [json_type, "null"]
+        property["type"] = json_type
     if json_format is not None:
         property["format"] = json_format
     if field.unique:
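A small before/after illustration for a non-required string field (field layout hypothetical): the ["<type>", "null"] union is no longer emitted, leaving optionality to the enclosing object's required list.

    property_before = {"type": ["string", "null"]}
    property_after = {"type": "string"}
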

datacontract/export/spark_converter.py CHANGED
@@ -123,6 +123,8 @@ def to_data_type(field: Field) -> types.DataType:
         return types.ArrayType(to_data_type(field.items))
     if field_type in ["object", "record", "struct"]:
         return types.StructType(to_struct_type(field.fields))
+    if field_type == "map":
+        return types.MapType(to_data_type(field.keys), to_data_type(field.values))
     if field_type in ["string", "varchar", "text"]:
         return types.StringType()
     if field_type in ["number", "decimal", "numeric"]:
@@ -204,6 +206,8 @@ def print_schema(dtype: types.DataType) -> str:
         return format_struct_type(dtype)
     elif isinstance(dtype, types.ArrayType):
         return f"ArrayType({print_schema(dtype.elementType)})"
+    elif isinstance(dtype, types.MapType):
+        return f"MapType(\n{indent(print_schema(dtype.keyType), 1)}, {print_schema(dtype.valueType)})"
     elif isinstance(dtype, types.DecimalType):
         return f"DecimalType({dtype.precision}, {dtype.scale})"
     else:
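A small illustration of the new map handling (key and value types hypothetical): a contract field of type map with string keys and integer values corresponds to the Spark type below.

    from pyspark.sql import types

    dtype = types.MapType(types.StringType(), types.IntegerType())
    print(dtype.simpleString())  # map<string,int>
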