datacontract-cli 0.10.12__py3-none-any.whl → 0.10.14__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datacontract-cli might be problematic.

Files changed (37)
  1. datacontract/cli.py +5 -0
  2. datacontract/data_contract.py +9 -1
  3. datacontract/engines/soda/connections/kafka.py +28 -6
  4. datacontract/export/avro_converter.py +8 -1
  5. datacontract/export/avro_idl_converter.py +1 -0
  6. datacontract/export/bigquery_converter.py +30 -23
  7. datacontract/export/data_caterer_converter.py +148 -0
  8. datacontract/export/dcs_exporter.py +6 -0
  9. datacontract/export/exporter.py +5 -1
  10. datacontract/export/exporter_factory.py +19 -1
  11. datacontract/export/jsonschema_converter.py +13 -2
  12. datacontract/export/{odcs_converter.py → odcs_v2_exporter.py} +4 -4
  13. datacontract/export/odcs_v3_exporter.py +294 -0
  14. datacontract/export/sodacl_converter.py +82 -2
  15. datacontract/export/spark_converter.py +3 -1
  16. datacontract/export/sql_type_converter.py +56 -21
  17. datacontract/imports/iceberg_importer.py +162 -0
  18. datacontract/imports/importer.py +1 -0
  19. datacontract/imports/importer_factory.py +5 -0
  20. datacontract/imports/odcs_importer.py +25 -168
  21. datacontract/imports/odcs_v2_importer.py +177 -0
  22. datacontract/imports/odcs_v3_importer.py +309 -0
  23. datacontract/imports/spark_importer.py +5 -1
  24. datacontract/imports/unity_importer.py +105 -84
  25. datacontract/integration/datamesh_manager.py +1 -1
  26. datacontract/lint/resolve.py +24 -10
  27. datacontract/lint/resources.py +21 -0
  28. datacontract/lint/urls.py +29 -13
  29. datacontract/model/data_contract_specification.py +72 -8
  30. datacontract/model/odcs.py +11 -0
  31. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/METADATA +106 -52
  32. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/RECORD +36 -29
  33. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/WHEEL +1 -1
  34. datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
  35. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/LICENSE +0 -0
  36. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/entry_points.txt +0 -0
  37. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/top_level.txt +0 -0
datacontract/cli.py CHANGED
@@ -244,6 +244,10 @@ def import_(
             help="List of table names to import from the DBML file (repeat for multiple table names, leave empty for all tables in the file)."
         ),
     ] = None,
+    iceberg_table: Annotated[
+        Optional[str],
+        typer.Option(help="Table name to assign to the model created from the Iceberg schema."),
+    ] = None,
 ):
     """
     Create a data contract from the given source location. Prints to stdout.
@@ -259,6 +263,7 @@ def import_(
         dbt_model=dbt_model,
         dbml_schema=dbml_schema,
         dbml_table=dbml_table,
+        iceberg_table=iceberg_table,
     )
     console.print(result.to_yaml())
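Since typer derives option names from parameter names, the new parameter should surface on the import command as --iceberg-table. A minimal, hedged sketch of that mechanism in isolation (the parameter name and help text are copied from the hunk above; the rest is illustrative and not the actual datacontract CLI):

    # Standalone illustration of how typer maps the iceberg_table parameter to --iceberg-table.
    from typing import Annotated, Optional

    import typer

    app = typer.Typer()


    @app.command(name="import")
    def import_(
        iceberg_table: Annotated[
            Optional[str],
            typer.Option(help="Table name to assign to the model created from the Iceberg schema."),
        ] = None,
    ):
        # e.g. `python demo.py --iceberg-table orders` prints: iceberg_table=orders
        print(f"iceberg_table={iceberg_table}")


    if __name__ == "__main__":
        app()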
datacontract/data_contract.py CHANGED
@@ -199,7 +199,15 @@ class DataContract:

         except DataContractException as e:
             run.checks.append(
-                Check(type=e.type, result=e.result, name=e.name, reason=e.reason, engine=e.engine, details="")
+                Check(
+                    type=e.type,
+                    name=e.name,
+                    result=e.result,
+                    reason=e.reason,
+                    model=e.model,
+                    engine=e.engine,
+                    details="",
+                )
             )
             run.log_error(str(e))
         except Exception as e:
datacontract/engines/soda/connections/kafka.py CHANGED
@@ -25,9 +25,10 @@ def create_spark_session(tmp_dir: str):
         SparkSession.builder.appName("datacontract")
         .config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse")
         .config("spark.streaming.stopGracefullyOnShutdown", "true")
+        .config("spark.ui.enabled", "false")
         .config(
             "spark.jars.packages",
-            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,org.apache.spark:spark-avro_2.12:3.5.0",
+            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.2,org.apache.spark:spark-avro_2.12:3.5.2",
         )
         .getOrCreate()
     )
@@ -111,17 +112,38 @@ def get_auth_options():
     """Retrieve Kafka authentication options from environment variables."""
     kafka_sasl_username = os.getenv("DATACONTRACT_KAFKA_SASL_USERNAME")
     kafka_sasl_password = os.getenv("DATACONTRACT_KAFKA_SASL_PASSWORD")
+    kafka_sasl_mechanism = os.getenv("DATACONTRACT_KAFKA_SASL_MECHANISM", "PLAIN").upper()

-    if kafka_sasl_username is None:
+    # Skip authentication if credentials are not provided
+    if not kafka_sasl_username or not kafka_sasl_password:
         return {}

-    return {
-        "kafka.sasl.mechanism": "PLAIN",
-        "kafka.security.protocol": "SASL_SSL",
-        "kafka.sasl.jaas.config": (
+    # SASL mechanisms supported by Kafka
+    jaas_config = {
+        "PLAIN": (
             f"org.apache.kafka.common.security.plain.PlainLoginModule required "
             f'username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
         ),
+        "SCRAM-SHA-256": (
+            f"org.apache.kafka.common.security.scram.ScramLoginModule required "
+            f'username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
+        ),
+        "SCRAM-SHA-512": (
+            f"org.apache.kafka.common.security.scram.ScramLoginModule required "
+            f'username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
+        ),
+        # Add more mechanisms as needed
+    }
+
+    # Validate SASL mechanism
+    if kafka_sasl_mechanism not in jaas_config:
+        raise ValueError(f"Unsupported SASL mechanism: {kafka_sasl_mechanism}")
+
+    # Return config
+    return {
+        "kafka.sasl.mechanism": kafka_sasl_mechanism,
+        "kafka.security.protocol": "SASL_SSL",
+        "kafka.sasl.jaas.config": jaas_config[kafka_sasl_mechanism],
     }

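A hedged sketch of how the reworked get_auth_options() behaves with the new mechanism selection, assuming the package and its optional Kafka/Spark dependencies are installed (environment variable names come from the hunk above; credentials are placeholders):

    import os

    # Placeholder credentials; DATACONTRACT_KAFKA_SASL_MECHANISM defaults to PLAIN when unset.
    os.environ["DATACONTRACT_KAFKA_SASL_USERNAME"] = "demo-user"
    os.environ["DATACONTRACT_KAFKA_SASL_PASSWORD"] = "demo-secret"
    os.environ["DATACONTRACT_KAFKA_SASL_MECHANISM"] = "scram-sha-512"  # upper-cased by the function

    from datacontract.engines.soda.connections.kafka import get_auth_options

    options = get_auth_options()
    assert options["kafka.sasl.mechanism"] == "SCRAM-SHA-512"
    assert options["kafka.security.protocol"] == "SASL_SSL"
    # An unsupported mechanism (e.g. GSSAPI) now raises ValueError instead of silently using PLAIN.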
datacontract/export/avro_converter.py CHANGED
@@ -81,9 +81,16 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
         return "null"
     if field.type in ["string", "varchar", "text"]:
         return "string"
-    elif field.type in ["number", "decimal", "numeric"]:
+    elif field.type in ["number", "numeric"]:
         # https://avro.apache.org/docs/1.11.1/specification/#decimal
         return "bytes"
+    elif field.type in ["decimal"]:
+        typeVal = {"type": "bytes", "logicalType": "decimal"}
+        if field.scale is not None:
+            typeVal["scale"] = field.scale
+        if field.precision is not None:
+            typeVal["precision"] = field.precision
+        return typeVal
     elif field.type in ["float", "double"]:
         return "double"
     elif field.type in ["integer", "int"]:
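For illustration, a hedged sketch of what the new decimal branch returns (assuming Field accepts precision and scale keyword arguments, as the attribute access above suggests):

    from datacontract.export.avro_converter import to_avro_type
    from datacontract.model.data_contract_specification import Field

    # decimal now maps to bytes with the Avro decimal logical type plus any declared precision/scale
    print(to_avro_type(Field(type="decimal", precision=10, scale=2), "price"))
    # -> {'type': 'bytes', 'logicalType': 'decimal', 'scale': 2, 'precision': 10}

    # number/numeric still map to plain bytes, as before
    print(to_avro_type(Field(type="numeric"), "amount"))  # -> 'bytes'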
datacontract/export/avro_idl_converter.py CHANGED
@@ -64,6 +64,7 @@ class AvroIDLProtocol:
     model_types: list[AvroModelType]


+# TODO use DATACONTRACT_TYPES from datacontract/model/data_contract_specification.py
 avro_primitive_types = set(
     [
         "string",
datacontract/export/bigquery_converter.py CHANGED
@@ -44,7 +44,7 @@ def to_fields_array(fields: Dict[str, Field]) -> List[Dict[str, Field]]:


 def to_field(field_name: str, field: Field) -> dict:
-    bq_type = map_type_to_bigquery(field.type, field_name)
+    bq_type = map_type_to_bigquery(field)
     bq_field = {
         "name": field_name,
         "type": bq_type,
@@ -59,10 +59,8 @@ def to_field(field_name: str, field: Field) -> dict:
             # in case the array type is a complex object, we want to copy all its fields
             bq_field["fields"] = to_fields_array(field.items.fields)
         else:
-            # otherwise we make up a structure that gets us a single field of the specified type
-            bq_field["fields"] = to_fields_array(
-                {f"{field_name}_1": Field(type=field.items.type, required=False, description="")}
-            )
+            bq_field["type"] = map_type_to_bigquery(field.items)
+
     # all of these can carry other fields
     elif bq_type.lower() in ["record", "struct"]:
         bq_field["fields"] = to_fields_array(field.fields)
@@ -79,37 +77,46 @@ def to_field(field_name: str, field: Field) -> dict:
     return bq_field


-def map_type_to_bigquery(type_str: str, field_name: str) -> str:
+def map_type_to_bigquery(field: Field) -> str:
     logger = logging.getLogger(__name__)
-    if type_str.lower() in ["string", "varchar", "text"]:
+
+    field_type = field.type
+    if not field_type:
+        return None
+
+    if field.config and "bigqueryType" in field.config:
+        return field.config["bigqueryType"]
+
+    if field_type.lower() in ["string", "varchar", "text"]:
         return "STRING"
-    elif type_str == "bytes":
+    elif field_type.lower() == "bytes":
         return "BYTES"
-    elif type_str.lower() in ["int", "integer"]:
+    elif field_type.lower() in ["int", "integer"]:
         return "INTEGER"
-    elif type_str.lower() in ["long", "bigint"]:
+    elif field_type.lower() in ["long", "bigint"]:
         return "INT64"
-    elif type_str == "float":
-        return "FLOAT"
-    elif type_str == "boolean":
+    elif field_type.lower() == "float":
+        return "FLOAT64"
+    elif field_type.lower() == "boolean":
         return "BOOL"
-    elif type_str.lower() in ["timestamp", "timestamp_tz"]:
+    elif field_type.lower() in ["timestamp", "timestamp_tz"]:
         return "TIMESTAMP"
-    elif type_str == "date":
+    elif field_type.lower() == "date":
         return "DATE"
-    elif type_str == "timestamp_ntz":
+    elif field_type.lower() == "timestamp_ntz":
         return "TIME"
-    elif type_str.lower() in ["number", "decimal", "numeric"]:
+    elif field_type.lower() in ["number", "decimal", "numeric"]:
         return "NUMERIC"
-    elif type_str == "double":
+    elif field_type.lower() == "double":
         return "BIGNUMERIC"
-    elif type_str.lower() in ["object", "record", "array"]:
+    elif field_type.lower() in ["object", "record", "array"]:
         return "RECORD"
-    elif type_str == "struct":
+    elif field_type.lower() == "struct":
         return "STRUCT"
-    elif type_str == "null":
+    elif field_type.lower() == "null":
         logger.info(
-            f"Can't properly map {field_name} to bigquery Schema, as 'null' is not supported as a type. Mapping it to STRING."
+            f"Can't properly map {field.title} to bigquery Schema, as 'null' \
+                is not supported as a type. Mapping it to STRING."
         )
         return "STRING"
     else:
@@ -117,6 +124,6 @@ def map_type_to_bigquery(type_str: str, field_name: str) -> str:
             type="schema",
             result="failed",
             name="Map datacontract type to bigquery data type",
-            reason=f"Unsupported type {type_str} in data contract definition.",
+            reason=f"Unsupported type {field_type} in data contract definition.",
             engine="datacontract",
         )
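A hedged sketch of the reworked mapping, which now receives the whole Field so a bigqueryType entry in the field's config can override the type-based mapping (Field keyword arguments assumed from the attribute access in the hunk above):

    from datacontract.export.bigquery_converter import map_type_to_bigquery
    from datacontract.model.data_contract_specification import Field

    print(map_type_to_bigquery(Field(type="float")))   # FLOAT64 (previously FLOAT)
    print(map_type_to_bigquery(Field(type="string")))  # STRING
    # The new config override takes precedence over the built-in mapping:
    print(map_type_to_bigquery(Field(type="string", config={"bigqueryType": "JSON"})))  # JSON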
datacontract/export/data_caterer_converter.py ADDED
@@ -0,0 +1,148 @@
+from typing import Dict
+
+import yaml
+
+from datacontract.export.exporter import Exporter
+from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field, Server
+
+
+class DataCatererExporter(Exporter):
+    """
+    Exporter class for Data Caterer.
+    Creates a YAML file, based on the data contract, for Data Caterer to generate synthetic data.
+    """
+
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        return to_data_caterer_generate_yaml(data_contract, server)
+
+
+def to_data_caterer_generate_yaml(data_contract_spec: DataContractSpecification, server):
+    generation_task = {"name": data_contract_spec.info.title, "steps": []}
+    server_info = _get_server_info(data_contract_spec, server)
+
+    for model_key, model_value in data_contract_spec.models.items():
+        odcs_table = _to_data_caterer_generate_step(model_key, model_value, server_info)
+        generation_task["steps"].append(odcs_table)
+    return yaml.dump(generation_task, indent=2, sort_keys=False, allow_unicode=True)
+
+
+def _get_server_info(data_contract_spec: DataContractSpecification, server):
+    if server is not None and server in data_contract_spec.servers:
+        return data_contract_spec.servers.get(server)
+    elif server is not None:
+        raise Exception(f"Server name not found in servers list in data contract, server-name={server}")
+    elif len(data_contract_spec.servers.keys()) > 0:
+        return next(iter(data_contract_spec.servers.values()))
+    else:
+        return None
+
+
+def _to_data_caterer_generate_step(model_key, model_value: Model, server: Server) -> dict:
+    step = {
+        "name": model_key,
+        "type": _to_step_type(server),
+        "options": _to_data_source_options(model_key, server),
+        "schema": [],
+    }
+    fields = _to_fields(model_value.fields)
+    if fields:
+        step["schema"] = fields
+    return step
+
+
+def _to_step_type(server: Server):
+    if server is not None and server.type is not None:
+        if server.type in ["s3", "gcs", "azure", "local"]:
+            return server.format
+        else:
+            return server.type
+    else:
+        return "csv"
+
+
+def _to_data_source_options(model_key, server: Server):
+    options = {}
+    if server is not None and server.type is not None:
+        if server.type in ["s3", "gcs", "azure", "local"]:
+            if server.path is not None:
+                options["path"] = server.path
+            elif server.location is not None:
+                options["path"] = server.location
+            else:
+                options["path"] = "/tmp/data_caterer_data"
+        elif server.type == "postgres":
+            options["schema"] = server.schema_
+            options["table"] = model_key
+        elif server.type == "kafka":
+            options["topic"] = server.topic
+
+    return options
+
+
+def _to_fields(fields: Dict[str, Field]) -> list:
+    dc_fields = []
+    for field_name, field in fields.items():
+        column = _to_field(field_name, field)
+        dc_fields.append(column)
+    return dc_fields
+
+
+def _to_field(field_name: str, field: Field) -> dict:
+    dc_field = {"name": field_name}
+    dc_generator_opts = {}
+
+    if field.type is not None:
+        new_type = _to_data_type(field.type)
+        dc_field["type"] = _to_data_type(field.type)
+        if new_type == "object" or new_type == "record" or new_type == "struct":
+            # need to get nested field definitions
+            nested_fields = _to_fields(field.fields)
+            dc_field["schema"] = {"fields": nested_fields}
+
+    if field.enum is not None and len(field.enum) > 0:
+        dc_generator_opts["oneOf"] = field.enum
+    if field.unique is not None and field.unique:
+        dc_generator_opts["isUnique"] = field.unique
+    if field.minLength is not None:
+        dc_generator_opts["minLength"] = field.minLength
+    if field.maxLength is not None:
+        dc_generator_opts["maxLength"] = field.maxLength
+    if field.pattern is not None:
+        dc_generator_opts["regex"] = field.pattern
+    if field.minimum is not None:
+        dc_generator_opts["min"] = field.minimum
+    if field.maximum is not None:
+        dc_generator_opts["max"] = field.maximum
+
+    if len(dc_generator_opts.keys()) > 0:
+        dc_field["generator"] = {"options": dc_generator_opts}
+    return dc_field
+
+
+def _to_data_type(data_type):
+    if data_type == "number" or data_type == "numeric" or data_type == "double":
+        return "double"
+    elif data_type == "decimal" or data_type == "bigint":
+        return "decimal"
+    elif data_type == "int":
+        return "integer"
+    elif data_type == "long":
+        return "long"
+    elif data_type == "float":
+        return "float"
+    elif data_type == "string" or data_type == "text" or data_type == "varchar":
+        return "string"
+    if data_type == "boolean":
+        return "boolean"
+    if data_type == "timestamp" or data_type == "timestamp_tz" or data_type == "timestamp_ntz":
+        return "timestamp"
+    elif data_type == "date":
+        return "date"
+    elif data_type == "array":
+        return "array"
+    elif data_type == "map" or data_type == "object" or data_type == "record" or data_type == "struct":
+        return "struct"
+    elif data_type == "bytes":
+        return "binary"
+    else:
+        return "string"
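A hedged sketch of the new exporter's output for a tiny contract. The Info/Model/Field constructor arguments are assumptions about the spec's pydantic models; with no server given, the step type falls back to "csv" as coded above:

    from datacontract.export.data_caterer_converter import to_data_caterer_generate_yaml
    from datacontract.model.data_contract_specification import DataContractSpecification, Field, Info, Model

    spec = DataContractSpecification(
        info=Info(title="orders"),
        models={"orders": Model(fields={"order_id": Field(type="string", unique=True)})},
    )
    print(to_data_caterer_generate_yaml(spec, server=None))
    # Roughly:
    # name: orders
    # steps:
    # - name: orders
    #   type: csv
    #   options: {}
    #   schema:
    #   - name: order_id
    #     type: string
    #     generator:
    #       options:
    #         isUnique: true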
datacontract/export/dcs_exporter.py ADDED
@@ -0,0 +1,6 @@
+from datacontract.export.exporter import Exporter
+
+
+class DcsExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        return data_contract.to_yaml()
datacontract/export/exporter.py CHANGED
@@ -10,7 +10,7 @@ class Exporter(ABC):
         self.export_format = export_format

     @abstractmethod
-    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict | str:
         pass


@@ -22,6 +22,8 @@ class ExportFormat(str, Enum):
     dbt_sources = "dbt-sources"
     dbt_staging_sql = "dbt-staging-sql"
     odcs = "odcs"
+    odcs_v2 = "odcs_v2"
+    odcs_v3 = "odcs_v3"
     rdf = "rdf"
     avro = "avro"
     protobuf = "protobuf"
@@ -36,6 +38,8 @@ class ExportFormat(str, Enum):
     dbml = "dbml"
     spark = "spark"
     sqlalchemy = "sqlalchemy"
+    data_caterer = "data-caterer"
+    dcs = "dcs"

     @classmethod
     def get_supported_formats(cls):
datacontract/export/exporter_factory.py CHANGED
@@ -62,6 +62,12 @@ exporter_factory.register_lazy_exporter(
     class_name="BigQueryExporter",
 )

+exporter_factory.register_lazy_exporter(
+    name=ExportFormat.data_caterer,
+    module_path="datacontract.export.data_caterer_converter",
+    class_name="DataCatererExporter",
+)
+
 exporter_factory.register_lazy_exporter(
     name=ExportFormat.dbml, module_path="datacontract.export.dbml_converter", class_name="DbmlExporter"
 )
@@ -93,7 +99,15 @@ exporter_factory.register_lazy_exporter(
 )

 exporter_factory.register_lazy_exporter(
-    name=ExportFormat.odcs, module_path="datacontract.export.odcs_converter", class_name="OdcsExporter"
+    name=ExportFormat.odcs_v2, module_path="datacontract.export.odcs_v2_exporter", class_name="OdcsV2Exporter"
+)
+
+exporter_factory.register_lazy_exporter(
+    name=ExportFormat.odcs_v3, module_path="datacontract.export.odcs_v3_exporter", class_name="OdcsV3Exporter"
+)
+
+exporter_factory.register_lazy_exporter(
+    name=ExportFormat.odcs, module_path="datacontract.export.odcs_v3_exporter", class_name="OdcsV3Exporter"
 )

 exporter_factory.register_lazy_exporter(
@@ -149,3 +163,7 @@ exporter_factory.register_lazy_exporter(
     module_path="datacontract.export.sqlalchemy_converter",
     class_name="SQLAlchemyExporter",
 )
+
+exporter_factory.register_lazy_exporter(
+    name=ExportFormat.dcs, module_path="datacontract.export.dcs_exporter", class_name="DcsExporter"
+)
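The practical effect of these registrations, shown as a short sketch against the ExportFormat enum (values taken from the exporter.py hunk above):

    from datacontract.export.exporter import ExportFormat

    print(ExportFormat.odcs_v2.value)       # odcs_v2
    print(ExportFormat.odcs_v3.value)       # odcs_v3
    print(ExportFormat.data_caterer.value)  # data-caterer
    print(ExportFormat.dcs.value)           # dcs
    # ExportFormat.odcs itself is unchanged, but the factory above now resolves it to
    # OdcsV3Exporter instead of the former OdcsExporter.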
datacontract/export/jsonschema_converter.py CHANGED
@@ -36,7 +36,19 @@ def to_property(field: Field) -> dict:
     property = {}
     json_type, json_format = convert_type_format(field.type, field.format)
     if json_type is not None:
-        property["type"] = json_type
+        if not field.required:
+            """
+            From: https://json-schema.org/understanding-json-schema/reference/type
+            The type keyword may either be a string or an array:
+
+            If it's a string, it is the name of one of the basic types above.
+            If it is an array, it must be an array of strings, where each string
+            is the name of one of the basic types, and each element is unique.
+            In this case, the JSON snippet is valid if it matches any of the given types.
+            """
+            property["type"] = [json_type, "null"]
+        else:
+            property["type"] = json_type
     if json_format is not None:
         property["format"] = json_format
     if field.unique:
@@ -50,7 +62,6 @@ def to_property(field: Field) -> dict:
         property["required"] = to_required(field.fields)
     if json_type == "array":
         property["items"] = to_property(field.items)
-
     if field.pattern:
         property["pattern"] = field.pattern
     if field.enum:
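A hedged sketch of the nullable-type behaviour added to to_property (Field keyword arguments assumed):

    from datacontract.export.jsonschema_converter import to_property
    from datacontract.model.data_contract_specification import Field

    print(to_property(Field(type="string", required=False))["type"])  # ['string', 'null']
    print(to_property(Field(type="string", required=True))["type"])   # 'string'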
datacontract/export/{odcs_converter.py → odcs_v2_exporter.py} RENAMED
@@ -6,12 +6,12 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
 from datacontract.export.exporter import Exporter


-class OdcsExporter(Exporter):
+class OdcsV2Exporter(Exporter):
     def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
-        return to_odcs_yaml(data_contract)
+        return to_odcs_v2_yaml(data_contract)


-def to_odcs_yaml(data_contract_spec: DataContractSpecification):
+def to_odcs_v2_yaml(data_contract_spec: DataContractSpecification):
     odcs = {
         "kind": "DataContract",
         "apiVersion": "2.3.0",
@@ -25,7 +25,7 @@ def to_odcs_yaml(data_contract_spec: DataContractSpecification):
     if data_contract_spec.info.contact is not None:
         if data_contract_spec.info.contact.email is not None:
             odcs["productDl"] = data_contract_spec.info.contact.email
-        if data_contract_spec.info.contact.email is not None:
+        if data_contract_spec.info.contact.url is not None:
             odcs["productFeedbackUrl"] = data_contract_spec.info.contact.url

     if data_contract_spec.terms is not None: