datacontract-cli 0.9.7__py3-none-any.whl → 0.9.9__py3-none-any.whl

This diff shows the content changes between package versions as released to one of the supported public registries. It is provided for informational purposes only.

Potentially problematic release: this version of datacontract-cli might be problematic.
Files changed (62):
  1. datacontract/breaking/breaking.py +48 -57
  2. datacontract/cli.py +100 -80
  3. datacontract/data_contract.py +178 -128
  4. datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +5 -1
  5. datacontract/engines/datacontract/check_that_datacontract_file_exists.py +9 -8
  6. datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +26 -22
  7. datacontract/engines/fastjsonschema/check_jsonschema.py +31 -25
  8. datacontract/engines/fastjsonschema/s3/s3_read_files.py +8 -6
  9. datacontract/engines/soda/check_soda_execute.py +58 -36
  10. datacontract/engines/soda/connections/bigquery.py +5 -3
  11. datacontract/engines/soda/connections/dask.py +0 -1
  12. datacontract/engines/soda/connections/databricks.py +2 -2
  13. datacontract/engines/soda/connections/duckdb.py +25 -8
  14. datacontract/engines/soda/connections/kafka.py +36 -17
  15. datacontract/engines/soda/connections/postgres.py +3 -3
  16. datacontract/engines/soda/connections/snowflake.py +4 -4
  17. datacontract/export/avro_converter.py +9 -11
  18. datacontract/export/avro_idl_converter.py +65 -42
  19. datacontract/export/csv_type_converter.py +36 -0
  20. datacontract/export/dbt_converter.py +43 -32
  21. datacontract/export/great_expectations_converter.py +141 -0
  22. datacontract/export/html_export.py +46 -0
  23. datacontract/export/jsonschema_converter.py +3 -1
  24. datacontract/export/odcs_converter.py +5 -7
  25. datacontract/export/protobuf_converter.py +12 -10
  26. datacontract/export/pydantic_converter.py +131 -0
  27. datacontract/export/rdf_converter.py +34 -11
  28. datacontract/export/sodacl_converter.py +118 -21
  29. datacontract/export/sql_converter.py +30 -8
  30. datacontract/export/sql_type_converter.py +44 -4
  31. datacontract/export/terraform_converter.py +4 -3
  32. datacontract/imports/avro_importer.py +65 -18
  33. datacontract/imports/sql_importer.py +0 -2
  34. datacontract/init/download_datacontract_file.py +2 -2
  35. datacontract/integration/publish_datamesh_manager.py +6 -12
  36. datacontract/integration/publish_opentelemetry.py +30 -16
  37. datacontract/lint/files.py +2 -2
  38. datacontract/lint/lint.py +26 -31
  39. datacontract/lint/linters/description_linter.py +12 -21
  40. datacontract/lint/linters/example_model_linter.py +28 -29
  41. datacontract/lint/linters/field_pattern_linter.py +8 -8
  42. datacontract/lint/linters/field_reference_linter.py +11 -10
  43. datacontract/lint/linters/notice_period_linter.py +18 -22
  44. datacontract/lint/linters/quality_schema_linter.py +16 -20
  45. datacontract/lint/linters/valid_constraints_linter.py +42 -37
  46. datacontract/lint/resolve.py +50 -14
  47. datacontract/lint/schema.py +2 -3
  48. datacontract/lint/urls.py +4 -5
  49. datacontract/model/breaking_change.py +2 -1
  50. datacontract/model/data_contract_specification.py +8 -7
  51. datacontract/model/exceptions.py +13 -2
  52. datacontract/model/run.py +3 -2
  53. datacontract/web.py +3 -7
  54. datacontract_cli-0.9.9.dist-info/METADATA +951 -0
  55. datacontract_cli-0.9.9.dist-info/RECORD +64 -0
  56. datacontract/lint/linters/primary_field_linter.py +0 -30
  57. datacontract_cli-0.9.7.dist-info/METADATA +0 -603
  58. datacontract_cli-0.9.7.dist-info/RECORD +0 -61
  59. {datacontract_cli-0.9.7.dist-info → datacontract_cli-0.9.9.dist-info}/LICENSE +0 -0
  60. {datacontract_cli-0.9.7.dist-info → datacontract_cli-0.9.9.dist-info}/WHEEL +0 -0
  61. {datacontract_cli-0.9.7.dist-info → datacontract_cli-0.9.9.dist-info}/entry_points.txt +0 -0
  62. {datacontract_cli-0.9.7.dist-info → datacontract_cli-0.9.9.dist-info}/top_level.txt +0 -0

datacontract/export/protobuf_converter.py

@@ -1,9 +1,4 @@
- from typing import Dict
-
- import yaml
-
- from datacontract.model.data_contract_specification import \
-     DataContractSpecification, Model, Field
+ from datacontract.model.data_contract_specification import DataContractSpecification
 
 
  def to_protobuf(data_contract_spec: DataContractSpecification):
@@ -24,7 +19,7 @@ def _to_protobuf_message_name(model_name):
      return model_name[0].upper() + model_name[1:]
 
 
- def to_protobuf_message(model_name, fields, description, indent_level:int = 0):
+ def to_protobuf_message(model_name, fields, description, indent_level: int = 0):
      result = ""
 
      if description is not None:
@@ -34,8 +29,15 @@ def to_protobuf_message(model_name, fields, description, indent_level:int = 0):
      number = 1
      for field_name, field in fields.items():
          if field.type in ["object", "record", "struct"]:
-             fields_protobuf += "\n".join(
-                 map(lambda x: " " + x, to_protobuf_message(field_name, field.fields, field.description, indent_level + 1).splitlines())) + "\n"
+             fields_protobuf += (
+                 "\n".join(
+                     map(
+                         lambda x: " " + x,
+                         to_protobuf_message(field_name, field.fields, field.description, indent_level + 1).splitlines(),
+                     )
+                 )
+                 + "\n"
+             )
 
          fields_protobuf += to_protobuf_field(field_name, field, field.description, number, 1) + "\n"
          number += 1
@@ -44,7 +46,7 @@ def to_protobuf_message(model_name, fields, description, indent_level:int = 0):
      return result
 
 
- def to_protobuf_field(field_name, field, description, number:int, indent_level:int = 0):
+ def to_protobuf_field(field_name, field, description, number: int, indent_level: int = 0):
      optional = ""
      if not field.required:
          optional = "optional "
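
The protobuf_converter.py changes above are black-style reformatting only; behavior is unchanged. A minimal sketch of driving the exporter directly, assuming Field can be constructed with these pydantic keywords (only the attribute names, not the constructor, are visible in this diff):

    # Hedged sketch: to_protobuf_message's signature is shown above;
    # the Field(...) keyword construction is an assumption.
    from datacontract.model.data_contract_specification import Field
    from datacontract.export.protobuf_converter import to_protobuf_message

    fields = {
        "order_id": Field(type="string", required=True, description="Business key"),
        "address": Field(type="object", fields={"city": Field(type="string")}),
    }
    # Nested object/record/struct fields are rendered as indented sub-messages
    # before the field line that references them.
    print(to_protobuf_message("orders", fields, "All orders", indent_level=0))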
datacontract/export/pydantic_converter.py

@@ -0,0 +1,131 @@
+ import ast
+ import typing
+
+ import datacontract.model.data_contract_specification as spec
+
+
+ def to_pydantic_model_str(contract: spec.DataContractSpecification) -> str:
+     classdefs = [generate_model_class(model_name, model) for (model_name, model) in contract.models.items()]
+     documentation = (
+         [ast.Expr(ast.Constant(contract.info.description))] if (contract.info and contract.info.description) else []
+     )
+     result = ast.Module(
+         body=[
+             ast.Import(
+                 names=[
+                     ast.Name("datetime", ctx=ast.Load()),
+                     ast.Name("typing", ctx=ast.Load()),
+                     ast.Name("pydantic", ctx=ast.Load()),
+                 ]
+             ),
+             *documentation,
+             *classdefs,
+         ],
+         type_ignores=[],
+     )
+     return ast.unparse(result)
+
+
+ def optional_of(node) -> ast.Subscript:
+     return ast.Subscript(
+         value=ast.Attribute(ast.Name(id="typing", ctx=ast.Load()), attr="Optional", ctx=ast.Load()), slice=node
+     )
+
+
+ def list_of(node) -> ast.Subscript:
+     return ast.Subscript(value=ast.Name(id="list", ctx=ast.Load()), slice=node)
+
+
+ def product_of(nodes: list[typing.Any]) -> ast.Subscript:
+     return ast.Subscript(
+         value=ast.Attribute(value=ast.Name(id="typing", ctx=ast.Load()), attr="Product", ctx=ast.Load()),
+         slice=ast.Tuple(nodes, ctx=ast.Load()),
+     )
+
+
+ type_annotation_type = typing.Union[ast.Name, ast.Attribute, ast.Constant, ast.Subscript]
+
+
+ def constant_field_annotation(
+     field_name: str, field: spec.Field
+ ) -> tuple[type_annotation_type, typing.Optional[ast.ClassDef]]:
+     match field.type:
+         case "string" | "text" | "varchar":
+             return (ast.Name("str", ctx=ast.Load()), None)
+         case "number", "decimal", "numeric":
+             # Either integer or float in specification,
+             # so we use float.
+             return (ast.Name("float", ctx=ast.Load()), None)
+         case "int" | "integer" | "long" | "bigint":
+             return (ast.Name("int", ctx=ast.Load()), None)
+         case "float" | "double":
+             return (ast.Name("float", ctx=ast.Load()), None)
+         case "boolean":
+             return (ast.Name("bool", ctx=ast.Load()), None)
+         case "timestamp" | "timestamp_tz" | "timestamp_ntz":
+             return (ast.Attribute(value=ast.Name(id="datetime", ctx=ast.Load()), attr="datetime"), None)
+         case "date":
+             return (ast.Attribute(value=ast.Name(id="datetime", ctx=ast.Load()), attr="date"), None)
+         case "bytes":
+             return (ast.Name("bytes", ctx=ast.Load()), None)
+         case "null":
+             return (ast.Constant("None"), None)
+         case "array":
+             (annotated_type, new_class) = type_annotation(field_name, field.items)
+             return (list_of(annotated_type), new_class)
+         case "object" | "record" | "struct":
+             classdef = generate_field_class(field_name.capitalize(), field)
+             return (ast.Name(field_name.capitalize(), ctx=ast.Load()), classdef)
+         case _:
+             raise RuntimeError(f"Unsupported field type {field.type}.")
+
+
+ def type_annotation(field_name: str, field: spec.Field) -> tuple[type_annotation_type, typing.Optional[ast.ClassDef]]:
+     if field.required:
+         return constant_field_annotation(field_name, field)
+     else:
+         (annotated_type, new_classes) = constant_field_annotation(field_name, field)
+         return (optional_of(annotated_type), new_classes)
+
+
+ def is_simple_field(field: spec.Field) -> bool:
+     return field.type not in set(["object", "record", "struct"])
+
+
+ def field_definitions(fields: dict[str, spec.Field]) -> tuple[list[ast.Expr], list[ast.ClassDef]]:
+     annotations = []
+     classes = []
+     for field_name, field in fields.items():
+         (ann, new_class) = type_annotation(field_name, field)
+         annotations.append(ast.AnnAssign(target=ast.Name(id=field_name, ctx=ast.Store()), annotation=ann, simple=1))
+         if field.description and is_simple_field(field):
+             annotations.append(ast.Expr(ast.Constant(field.description)))
+         if new_class:
+             classes.append(new_class)
+     return (annotations, classes)
+
+
+ def generate_field_class(field_name: str, field: spec.Field) -> ast.ClassDef:
+     assert field.type in set(["object", "record", "struct"])
+     (annotated_type, new_classes) = field_definitions(field.fields)
+     documentation = [ast.Expr(ast.Constant(field.description))] if field.description else []
+     return ast.ClassDef(
+         name=field_name,
+         bases=[ast.Attribute(value=ast.Name(id="pydantic", ctx=ast.Load()), attr="BaseModel", ctx=ast.Load())],
+         body=[*documentation, *new_classes, *annotated_type],
+         keywords=[],
+         decorator_list=[],
+     )
+
+
+ def generate_model_class(name: str, model_definition: spec.Model) -> ast.ClassDef:
+     (field_assignments, nested_classes) = field_definitions(model_definition.fields)
+     documentation = [ast.Expr(ast.Constant(model_definition.description))] if model_definition.description else []
+     result = ast.ClassDef(
+         name=name.capitalize(),
+         bases=[ast.Attribute(value=ast.Name(id="pydantic", ctx=ast.Load()), attr="BaseModel", ctx=ast.Load())],
+         body=[*documentation, *nested_classes, *field_assignments],
+         keywords=[],
+         decorator_list=[],
+     )
+     return result
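
The new pydantic exporter builds an ast.Module and renders it with ast.unparse (available since Python 3.9) rather than templating strings. A hedged usage sketch, with the keyword construction of the spec objects assumed from the attribute names used above:

    import datacontract.model.data_contract_specification as spec
    from datacontract.export.pydantic_converter import to_pydantic_model_str

    contract = spec.DataContractSpecification(
        models={
            "orders": spec.Model(
                description="All webshop orders",
                fields={
                    "order_id": spec.Field(type="string", required=True),
                    "order_total": spec.Field(type="long", required=False),
                },
            )
        }
    )
    # Expected shape (informal): imports for datetime/typing/pydantic, then
    #   class Orders(pydantic.BaseModel):
    #       """All webshop orders"""
    #       order_id: str
    #       order_total: typing.Optional[int]
    print(to_pydantic_model_str(contract))

One caveat in the code above: case "number", "decimal", "numeric": is a sequence pattern, not an or-pattern ("number" | "decimal" | "numeric"), so those type names fall through to case _ and raise RuntimeError.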
datacontract/export/rdf_converter.py

@@ -1,17 +1,39 @@
- from typing import Dict
- import inspect
  from pydantic import BaseModel
  from rdflib import Graph, Literal, BNode, RDF, URIRef, Namespace
 
  from datacontract.model.data_contract_specification import \
-     DataContractSpecification, Model, Field
+     DataContractSpecification
 
 
  def is_literal(property_name):
-     return property_name in ["dataContractSpecification", "title", "version", "description", "name", "url", "type",
-                              "location", "format", "delimiter", "usage", "limitations",
-                              "billing", "noticePeriod", "required", "unique", "minLength", "maxLength", "example",
-                              "pii", "classification", "data", "enum", "minimum", "maximum", "patterns"]
+     return property_name in [
+         "dataContractSpecification",
+         "title",
+         "version",
+         "description",
+         "name",
+         "url",
+         "type",
+         "location",
+         "format",
+         "delimiter",
+         "usage",
+         "limitations",
+         "billing",
+         "noticePeriod",
+         "required",
+         "unique",
+         "minLength",
+         "maxLength",
+         "example",
+         "pii",
+         "classification",
+         "data",
+         "enum",
+         "minimum",
+         "maximum",
+         "patterns",
+     ]
 
 
  def is_uriref(property_name):
@@ -21,6 +43,7 @@ def is_uriref(property_name):
  def to_rdf_n3(data_contract_spec: DataContractSpecification, base) -> str:
      return to_rdf(data_contract_spec, base).serialize(format="n3")
 
+
  def to_rdf(data_contract_spec: DataContractSpecification, base) -> Graph:
      if base is not None:
          g = Graph(base=base)
@@ -61,7 +84,7 @@ def to_rdf(data_contract_spec: DataContractSpecification, base) -> Graph:
 
  def add_example(contract, example, graph, dc, dcx):
      an_example = BNode()
-     graph.add((contract, dc['example'], an_example))
+     graph.add((contract, dc["example"], an_example))
      graph.add((an_example, RDF.type, URIRef(dc + "Example")))
      for example_property in example.model_fields:
          add_triple(sub=an_example, pred=example_property, obj=example, graph=graph, dc=dc, dcx=dcx)
@@ -81,14 +104,14 @@ def add_triple(sub, pred, obj, graph, dc, dcx):
 
  def add_model(contract, model, model_name, graph, dc, dcx):
      a_model = URIRef(model_name)
-     graph.add((contract, dc['model'], a_model))
+     graph.add((contract, dc["model"], a_model))
      graph.add((a_model, dc.description, Literal(model.description)))
      graph.add((a_model, RDF.type, URIRef(dc + "Model")))
      for field_name, field in model.fields.items():
          a_field = BNode()
-         graph.add((a_model, dc['field'], a_field))
+         graph.add((a_model, dc["field"], a_field))
          graph.add((a_field, RDF.type, URIRef(dc + "Field")))
-         graph.add((a_field, dc['name'], Literal(field_name)))
+         graph.add((a_field, dc["name"], Literal(field_name)))
          for field_property in field.model_fields:
              add_triple(sub=a_field, pred=field_property, obj=field, graph=graph, dc=dc, dcx=dcx)
 
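
The rdf_converter.py changes are quote-style and list formatting plus dropped unused imports; the two entry points keep their signatures:

    # Hedged sketch: `contract` is assumed to be an already-resolved
    # DataContractSpecification; both signatures appear in the diff above.
    from datacontract.export.rdf_converter import to_rdf, to_rdf_n3

    g = to_rdf(contract, base="https://example.com/")      # rdflib.Graph
    n3 = to_rdf_n3(contract, base="https://example.com/")  # same graph, serialized as N3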
datacontract/export/sodacl_converter.py

@@ -1,14 +1,17 @@
  import yaml
 
+ from datacontract.export.sql_type_converter import convert_to_sql_type
  from datacontract.model.data_contract_specification import \
      DataContractSpecification
 
 
- def to_sodacl_yaml(data_contract_spec: DataContractSpecification, check_types: bool = True) -> str:
+ def to_sodacl_yaml(
+     data_contract_spec: DataContractSpecification, server_type: str = None, check_types: bool = True
+ ) -> str:
      try:
          sodacl = {}
          for model_key, model_value in data_contract_spec.models.items():
-             k, v = to_checks(model_key, model_value, check_types)
+             k, v = to_checks(model_key, model_value, server_type, check_types)
              sodacl[k] = v
          add_quality_checks(sodacl, data_contract_spec)
          sodacl_yaml_str = yaml.dump(sodacl, default_flow_style=False, sort_keys=False)
@@ -17,17 +20,41 @@ def to_sodacl_yaml(data_contract_spec: DataContractSpecification, check_types: b
          return f"Error: {e}"
 
 
- def to_checks(model_key, model_value, check_types: bool):
+ def to_checks(model_key, model_value, server_type: str, check_types: bool):
      checks = []
      fields = model_value.fields
+
+     quote_field_name = server_type in ["postgres"]
+
      for field_name, field in fields.items():
          checks.append(check_field_is_present(field_name))
          if check_types and field.type is not None:
-             checks.append(check_field_type(field_name, field.type))
+             sql_type = convert_to_sql_type(field, server_type)
+             checks.append(check_field_type(field_name, sql_type))
          if field.required:
-             checks.append(check_field_required(field_name))
+             checks.append(check_field_required(field_name, quote_field_name))
          if field.unique:
-             checks.append(check_field_unique(field_name))
+             checks.append(check_field_unique(field_name, quote_field_name))
+         if field.minLength is not None:
+             checks.append(check_field_min_length(field_name, field.minLength))
+         if field.maxLength is not None:
+             checks.append(check_field_max_length(field_name, field.maxLength))
+         if field.minimum is not None:
+             checks.append(check_field_minimum(field_name, field.minimum))
+         if field.maximum is not None:
+             checks.append(check_field_maximum(field_name, field.maximum))
+         if field.exclusiveMinimum is not None:
+             checks.append(check_field_minimum(field_name, field.exclusiveMinimum))
+             checks.append(check_field_not_equal(field_name, field.exclusiveMinimum))
+         if field.exclusiveMaximum is not None:
+             checks.append(check_field_maximum(field_name, field.exclusiveMaximum))
+             checks.append(check_field_not_equal(field_name, field.exclusiveMaximum))
+         if field.pattern is not None:
+             checks.append(check_field_regex(field_name, field.pattern))
+         if field.enum is not None and len(field.enum) > 0:
+             checks.append(check_field_enum(field_name, field.enum))
+         # TODO references: str = None
+         # TODO format
 
      return f"checks for {model_key}", checks
 
@@ -37,10 +64,8 @@ def check_field_is_present(field_name):
          "schema": {
              "name": f"Check that field {field_name} is present",
              "fail": {
-                 "when required column missing": [
-                     field_name
-                 ],
-             }
+                 "when required column missing": [field_name],
+             },
          }
      }
 
@@ -49,27 +74,99 @@ def check_field_type(field_name: str, type: str):
      return {
          "schema": {
              "name": f"Check that field {field_name} has type {type}",
-             "fail": {
-                 "when wrong column type": {
-                     field_name: type
-                 }
-             }
+             "fail": {"when wrong column type": {field_name: type}},
+         }
+     }
+
+
+ def check_field_required(field_name: str, quote_field_name: bool = False):
+     if quote_field_name:
+         field_name = f'"{field_name}"'
+
+     return {f"missing_count({field_name}) = 0": {"name": f"Check that required field {field_name} has no null values"}}
+
+
+ def check_field_unique(field_name, quote_field_name: bool = False):
+     if quote_field_name:
+         field_name = f'"{field_name}"'
+     return {
+         f"duplicate_count({field_name}) = 0": {"name": f"Check that unique field {field_name} has no duplicate values"}
+     }
+
+
+ def check_field_min_length(field_name, min_length, quote_field_name: bool = False):
+     if quote_field_name:
+         field_name = f'"{field_name}"'
+     return {
+         f"invalid_count({field_name}) = 0": {
+             "name": f"Check that field {field_name} has a min length of {min}",
+             "valid min length": min_length,
+         }
+     }
+
+
+ def check_field_max_length(field_name, max_length, quote_field_name: bool = False):
+     if quote_field_name:
+         field_name = f'"{field_name}"'
+     return {
+         f"invalid_count({field_name}) = 0": {
+             "name": f"Check that field {field_name} has a max length of {max_length}",
+             "valid max length": max_length,
+         }
+     }
+
+
+ def check_field_minimum(field_name, minimum, quote_field_name: bool = False):
+     if quote_field_name:
+         field_name = f'"{field_name}"'
+     return {
+         f"invalid_count({field_name}) = 0": {
+             "name": f"Check that field {field_name} has a minimum of {min}",
+             "valid min": minimum,
+         }
+     }
+
+
+ def check_field_maximum(field_name, maximum, quote_field_name: bool = False):
+     if quote_field_name:
+         field_name = f'"{field_name}"'
+     return {
+         f"invalid_count({field_name}) = 0": {
+             "name": f"Check that field {field_name} has a maximum of {maximum}",
+             "valid max": maximum,
+         }
+     }
+
+
+ def check_field_not_equal(field_name, value, quote_field_name: bool = False):
+     if quote_field_name:
+         field_name = f'"{field_name}"'
+     return {
+         f"invalid_count({field_name}) = 0": {
+             "name": f"Check that field {field_name} is not equal to {value}",
+             "invalid values": [value],
          }
      }
 
 
- def check_field_required(field_name):
+ def check_field_enum(field_name, enum, quote_field_name: bool = False):
+     if quote_field_name:
+         field_name = f'"{field_name}"'
      return {
-         f"missing_count({field_name}) = 0": {
-             "name": f"Check that required field {field_name} has no null values"
+         f"invalid_count({field_name}) = 0": {
+             "name": f"Check that field {field_name} only contains enum values {enum}",
+             "valid values": enum,
          }
      }
 
 
- def check_field_unique(field_name):
+ def check_field_regex(field_name, pattern, quote_field_name: bool = False):
+     if quote_field_name:
+         field_name = f'"{field_name}"'
      return {
-         f'duplicate_count({field_name}) = 0': {
-             "name": f"Check that unique field {field_name} has no duplicate values"
+         f"invalid_count({field_name}) = 0": {
+             "name": f"Check that field {field_name} matches regex pattern {pattern}",
+             "valid regex": pattern,
          }
      }
 
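
The SodaCL exporter now takes a server type, resolves each field's logical type to the server's physical SQL type before the schema check, quotes field names for postgres, and adds checks for minLength/maxLength, minimum/maximum (with exclusive variants expressed as a bound check plus a not-equal check), pattern, and enum. A hedged sketch of the new call:

    # Hedged sketch: `contract` is assumed resolved elsewhere; the new
    # signature to_sodacl_yaml(spec, server_type, check_types) is shown above.
    from datacontract.export.sodacl_converter import to_sodacl_yaml

    sodacl_yaml = to_sodacl_yaml(contract, server_type="postgres")
    # For a required, unique field order_id the output now contains checks like
    #   missing_count("order_id") = 0
    #   duplicate_count("order_id") = 0
    # (quoted because server_type == "postgres"), and the schema type check
    # compares against convert_to_sql_type(field, "postgres"), e.g. text.

Note that the check names in check_field_min_length and check_field_minimum interpolate {min}, the builtin, rather than the passed value.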
datacontract/export/sql_converter.py

@@ -1,9 +1,10 @@
  from datacontract.export.sql_type_converter import convert_to_sql_type
- from datacontract.model.data_contract_specification import \
-     DataContractSpecification, Model
+ from datacontract.model.data_contract_specification import DataContractSpecification, Model
 
 
- def to_sql_query(data_contract_spec: DataContractSpecification, model_name: str, model_value: Model, server_type: str = "snowflake") -> str:
+ def to_sql_query(
+     data_contract_spec: DataContractSpecification, model_name: str, model_value: Model, server_type: str = "snowflake"
+ ) -> str:
      if data_contract_spec is None:
          return ""
      if data_contract_spec.models is None or len(data_contract_spec.models) == 0:
@@ -42,27 +43,39 @@ def to_sql_ddl(data_contract_spec: DataContractSpecification, server_type: str =
      if data_contract_spec.models is None or len(data_contract_spec.models) == 0:
          return ""
 
+     table_prefix = ""
+
      for server_name, server in iter(data_contract_spec.servers.items()):
-         if server.type == server_type:
-             break
          if server.type == "snowflake":
              server_type = "snowflake"
              break
          if server.type == "postgres":
              server_type = "postgres"
              break
+         if server.type == "databricks":
+             server_type = "databricks"
+             if server.catalog is not None and server.schema_ is not None:
+                 table_prefix = server.catalog + "." + server.schema_ + "."
+             break
+         if server.type == server_type:
+             break
 
      result = ""
      result += f"-- Data Contract: {data_contract_spec.id}\n"
      result += f"-- SQL Dialect: {server_type}\n"
      for model_name, model in iter(data_contract_spec.models.items()):
-         result += _to_sql_table(model_name, model, server_type)
+         result += _to_sql_table(table_prefix + model_name, model, server_type)
 
      return result.strip()
 
 
  def _to_sql_table(model_name, model, server_type="snowflake"):
-     result = f"CREATE TABLE {model_name} (\n"
+     if server_type == "databricks":
+         # Databricks recommends to use the CREATE OR REPLACE statement for unity managed tables
+         # https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-create-table-using.html
+         result = f"CREATE OR REPLACE TABLE {model_name} (\n"
+     else:
+         result = f"CREATE TABLE {model_name} (\n"
      fields = len(model.fields)
      current_field_index = 1
      for field_name, field in iter(model.fields.items()):
@@ -72,11 +85,20 @@ def _to_sql_table(model_name, model, server_type="snowflake"):
              result += " not null"
          if field.primary:
              result += " primary key"
+         if server_type == "databricks" and field.description is not None:
+             result += f' COMMENT "{_escape(field.description)}"'
          if current_field_index < fields:
              result += ","
          result += "\n"
          current_field_index += 1
-     result += ");\n"
+     result += ")"
+     if server_type == "databricks" and model.description is not None:
+         result += f' COMMENT "{_escape(model.description)}"'
+     result += ";\n"
      return result
 
 
+ def _escape(text: str | None) -> str | None:
+     if text is None:
+         return None
+     return text.replace('"', '\\"')
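
to_sql_ddl now recognizes databricks servers, prefixes table names with catalog and schema when both are set, switches to CREATE OR REPLACE TABLE, and emits COMMENT clauses from the model and field descriptions. A hedged sketch of the resulting DDL:

    # Hedged sketch: only the server attributes read above (type, catalog,
    # schema_) are known from this diff; the exact column layout may differ.
    from datacontract.export.sql_converter import to_sql_ddl

    ddl = to_sql_ddl(contract, server_type="databricks")
    # With server.catalog == "prod" and server.schema_ == "sales", a model
    # `orders` with descriptions renders roughly as:
    #   CREATE OR REPLACE TABLE prod.sales.orders (
    #     order_id STRING not null primary key COMMENT "Business key"
    #   ) COMMENT "All webshop orders";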
datacontract/export/sql_type_converter.py

@@ -6,7 +6,10 @@ def convert_to_sql_type(field: Field, server_type: str) -> str:
          return convert_to_snowflake(field)
      if server_type == "postgres":
          return convert_type_to_postgres(field)
-     return str(type)
+     if server_type == "databricks":
+         return convert_to_databricks(field)
+     return field.type
+
 
  # snowflake data types:
  # https://docs.snowflake.com/en/sql-reference/data-types.html
@@ -48,17 +51,16 @@ def convert_to_snowflake(field) -> None | str:
      return None
 
 
-
  # https://www.postgresql.org/docs/current/datatype.html
  # Using the name whenever possible
- def convert_type_to_postgres(field : Field) -> None | str:
+ def convert_type_to_postgres(field: Field) -> None | str:
      type = field.type
      if type is None:
          return None
      if type.lower() in ["string", "varchar", "text"]:
          if field.format == "uuid":
              return "uuid"
-         return "text" # STRING does not exist, TEXT and VARCHAR are all the same in postrges
+         return "text"  # STRING does not exist, TEXT and VARCHAR are all the same in postrges
      if type.lower() in ["timestamp", "timestamp_tz"]:
          return "timestamptz"
      if type.lower() in ["timestamp_ntz"]:
@@ -89,3 +91,41 @@ def convert_type_to_postgres(field : Field) -> None | str:
      if type.lower() in ["array"]:
          return convert_to_sql_type(field.items, "postgres") + "[]"
      return None
+
+
+ # databricks data types:
+ # https://docs.databricks.com/en/sql/language-manual/sql-ref-datatypes.html
+ def convert_to_databricks(field) -> None | str:
+     type = field.type
+     if type is None:
+         return None
+     if type.lower() in ["string", "varchar", "text"]:
+         return "STRING"
+     if type.lower() in ["timestamp", "timestamp_tz"]:
+         return "TIMESTAMP"
+     if type.lower() in ["timestamp_ntz"]:
+         return "TIMESTAMP_NTZ"
+     if type.lower() in ["date"]:
+         return "DATE"
+     if type.lower() in ["time"]:
+         return "STRING"
+     if type.lower() in ["number", "decimal", "numeric"]:
+         # precision and scale not supported by data contract
+         return "DECIMAL"
+     if type.lower() in ["float"]:
+         return "FLOAT"
+     if type.lower() in ["double"]:
+         return "DOUBLE"
+     if type.lower() in ["integer", "int"]:
+         return "INT"
+     if type.lower() in ["long", "bigint"]:
+         return "BIGINT"
+     if type.lower() in ["boolean"]:
+         return "BOOLEAN"
+     if type.lower() in ["object", "record", "struct"]:
+         return "STRUCT"
+     if type.lower() in ["bytes"]:
+         return "BINARY"
+     if type.lower() in ["array"]:
+         return "ARRAY"
+     return None
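
convert_to_sql_type gains a databricks branch and now falls back to the contract's logical type; the old return str(type) stringified the type builtin, since no local type was in scope at that point. A short sketch, assuming Field keyword construction:

    from datacontract.model.data_contract_specification import Field
    from datacontract.export.sql_type_converter import convert_to_sql_type

    convert_to_sql_type(Field(type="timestamp"), "databricks")  # "TIMESTAMP"
    convert_to_sql_type(Field(type="text"), "postgres")         # "text"
    convert_to_sql_type(Field(type="text"), "bigquery")         # "text" (fallback to field.type)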
datacontract/export/terraform_converter.py

@@ -1,6 +1,7 @@
+ import re
+
  from datacontract.model.data_contract_specification import \
      DataContractSpecification, Server
- import re
 
 
  def to_terraform(data_contract_spec: DataContractSpecification, server_id: str = None) -> str:
@@ -18,7 +19,7 @@ def to_terraform(data_contract_spec: DataContractSpecification, server_id: str =
      return result.strip()
 
 
- def server_to_terraform_resource(data_contract_spec, result, server : Server, server_name):
+ def server_to_terraform_resource(data_contract_spec, result, server: Server, server_name):
      tag_data_contract = data_contract_spec.id
      tag_name = data_contract_spec.info.title
      tag_server = server_name
@@ -60,7 +61,7 @@ def extract_bucket_name(server) -> str | None:
      if server.type == "s3":
          s3_url = server.location
          # Regular expression to match the S3 bucket name
-         match = re.search(r's3://([^/]+)/', s3_url)
+         match = re.search(r"s3://([^/]+)/", s3_url)
          if match:
              # Return the first group (bucket name)
              return match.group(1)
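
The terraform_converter.py changes are import ordering and quote style only. For reference, extract_bucket_name pulls the bucket out of an s3:// location:

    # Hedged sketch: Server keyword construction is assumed; the attributes
    # used here (type, location) are read in the diff above.
    from datacontract.model.data_contract_specification import Server
    from datacontract.export.terraform_converter import extract_bucket_name

    server = Server(type="s3", location="s3://my-bucket/data/orders/*.json")
    extract_bucket_name(server)  # returns "my-bucket"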