datacontract-cli 0.10.9__py3-none-any.whl → 0.10.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datacontract-cli has been flagged as potentially problematic by the registry.

Files changed (32)
  1. datacontract/cli.py +7 -0
  2. datacontract/data_contract.py +16 -9
  3. datacontract/engines/fastjsonschema/check_jsonschema.py +4 -1
  4. datacontract/engines/soda/check_soda_execute.py +5 -2
  5. datacontract/engines/soda/connections/duckdb.py +20 -12
  6. datacontract/engines/soda/connections/snowflake.py +8 -5
  7. datacontract/export/avro_converter.py +1 -1
  8. datacontract/export/dbml_converter.py +41 -19
  9. datacontract/export/exporter.py +1 -1
  10. datacontract/export/jsonschema_converter.py +1 -4
  11. datacontract/export/sodacl_converter.py +1 -1
  12. datacontract/imports/avro_importer.py +142 -8
  13. datacontract/imports/dbt_importer.py +117 -0
  14. datacontract/imports/glue_importer.py +9 -3
  15. datacontract/imports/importer.py +7 -2
  16. datacontract/imports/importer_factory.py +24 -6
  17. datacontract/imports/jsonschema_importer.py +106 -117
  18. datacontract/imports/spark_importer.py +134 -0
  19. datacontract/imports/sql_importer.py +4 -0
  20. datacontract/integration/publish_datamesh_manager.py +10 -5
  21. datacontract/lint/resolve.py +72 -27
  22. datacontract/lint/schema.py +24 -4
  23. datacontract/model/data_contract_specification.py +3 -0
  24. datacontract/templates/datacontract.html +1 -1
  25. datacontract/templates/index.html +1 -1
  26. datacontract/templates/partials/model_field.html +10 -2
  27. {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/METADATA +300 -192
  28. {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/RECORD +32 -30
  29. {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/WHEEL +1 -1
  30. {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/LICENSE +0 -0
  31. {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/entry_points.txt +0 -0
  32. {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/top_level.txt +0 -0
datacontract/imports/glue_importer.py

@@ -14,7 +14,7 @@ class GlueImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
     ) -> dict:
-        return import_glue(data_contract_specification, source, import_args.get("glue_tables"))
+        return import_glue(data_contract_specification, source, import_args.get("glue_table"))
 
 
 def get_glue_database(database_name: str):
@@ -154,7 +154,7 @@ def import_glue(
         for column in table_schema:
             field = create_typed_field(column["Type"])
 
-            # hive partitons are required, but are not primary keys
+            # hive partitions are required, but are not primary keys
            if column.get("Hive"):
                field.required = True
 
@@ -186,7 +186,7 @@ def create_typed_field(dtype: str) -> Field:
     """
     field = Field()
     dtype = dtype.strip().lower().replace(" ", "")
-    if dtype.startswith(("array", "struct")):
+    if dtype.startswith(("array", "struct", "map")):
         orig_dtype: str = dtype
         if dtype.startswith("array"):
             field.type = "array"
@@ -195,6 +195,12 @@ def create_typed_field(dtype: str) -> Field:
             field.type = "struct"
             for f in split_struct(orig_dtype[7:-1]):
                 field.fields[f.split(":", 1)[0].strip()] = create_typed_field(f.split(":", 1)[1])
+        elif dtype.startswith("map"):
+            field.type = "map"
+            key_type = orig_dtype[4:-1].split(",", 1)[0]
+            value_type = orig_dtype[4:-1].split(",", 1)[1]
+            field.keys = create_typed_field(key_type)
+            field.values = create_typed_field(value_type)
     else:
         field.type = map_type_from_sql(dtype)
     return field
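For orientation, a minimal sketch of what the new map branch would produce for a Hive-style type string; the scalar results are assumptions, since the scalar mapping in map_type_from_sql is not part of this hunk:

# Sketch only: exercises the new "map" branch of create_typed_field shown above.
field = create_typed_field("map<string,int>")
# field.type   -> "map"
# field.keys   -> Field(type="string")    (recursive call on the key type)
# field.values -> Field(type="integer")   (assuming map_type_from_sql maps "int" to "integer")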
datacontract/imports/importer.py

@@ -10,7 +10,10 @@ class Importer(ABC):
 
     @abstractmethod
     def import_source(
-        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+        self,
+        data_contract_specification: DataContractSpecification,
+        source: str,
+        import_args: dict,
     ) -> dict:
         pass
 
@@ -18,12 +21,14 @@ class Importer(ABC):
 class ImportFormat(str, Enum):
     sql = "sql"
     avro = "avro"
+    dbt = "dbt"
     glue = "glue"
     jsonschema = "jsonschema"
     bigquery = "bigquery"
     odcs = "odcs"
     unity = "unity"
+    spark = "spark"
 
     @classmethod
-    def get_suported_formats(cls):
+    def get_supported_formats(cls):
         return list(map(lambda c: c.value, cls))
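Callers that list the available formats go through the renamed classmethod; a small sketch of the expected behaviour, with the example output derived from the enum members above:

# Sketch: the renamed classmethod now also reports the new "dbt" and "spark" formats.
from datacontract.imports.importer import ImportFormat

supported = ImportFormat.get_supported_formats()
# e.g. ["sql", "avro", "dbt", "glue", "jsonschema", "bigquery", "odcs", "unity", "spark"]
assert "dbt" in supported and "spark" in supported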
datacontract/imports/importer_factory.py

@@ -18,7 +18,7 @@ class ImporterFactory:
         importers = self.dict_importer.copy()
         importers.update(self.dict_lazy_importer.copy())
         if name not in importers.keys():
-            raise ValueError(f"The '{name}' format is not suportted.")
+            raise ValueError(f"The '{name}' format is not supported.")
         importer_class = importers[name]
         if type(importers[name]) is tuple:
             importer_class = load_module_class(module_path=importers[name][0], class_name=importers[name][1])
@@ -46,7 +46,9 @@ def load_module_class(module_path, class_name):
 
 importer_factory = ImporterFactory()
 importer_factory.register_lazy_importer(
-    name=ImportFormat.avro, module_path="datacontract.imports.avro_importer", class_name="AvroImporter"
+    name=ImportFormat.avro,
+    module_path="datacontract.imports.avro_importer",
+    class_name="AvroImporter",
 )
 importer_factory.register_lazy_importer(
     name=ImportFormat.bigquery,
@@ -54,7 +56,9 @@ importer_factory.register_lazy_importer(
     class_name="BigQueryImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.glue, module_path="datacontract.imports.glue_importer", class_name="GlueImporter"
+    name=ImportFormat.glue,
+    module_path="datacontract.imports.glue_importer",
+    class_name="GlueImporter",
 )
 importer_factory.register_lazy_importer(
     name=ImportFormat.jsonschema,
@@ -62,11 +66,25 @@ importer_factory.register_lazy_importer(
     class_name="JsonSchemaImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.odcs, module_path="datacontract.imports.odcs_importer", class_name="OdcsImporter"
+    name=ImportFormat.odcs,
+    module_path="datacontract.imports.odcs_importer",
+    class_name="OdcsImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.sql, module_path="datacontract.imports.sql_importer", class_name="SqlImporter"
+    name=ImportFormat.sql,
+    module_path="datacontract.imports.sql_importer",
+    class_name="SqlImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.unity, module_path="datacontract.imports.unity_importer", class_name="UnityImporter"
+    name=ImportFormat.unity,
+    module_path="datacontract.imports.unity_importer",
+    class_name="UnityImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.spark,
+    module_path="datacontract.imports.spark_importer",
+    class_name="SparkImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.dbt, module_path="datacontract.imports.dbt_importer", class_name="DbtManifestImporter"
 )
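Third-party code could register additional importers the same way; the sketch below mirrors the registration calls above, with a purely hypothetical format, module, and class:

# Hypothetical registration, shaped like the calls above; none of these names exist in the package.
from datacontract.imports.importer_factory import importer_factory

importer_factory.register_lazy_importer(
    name="parquet",                              # hypothetical format name
    module_path="my_company.parquet_importer",   # hypothetical module
    class_name="ParquetImporter",                # hypothetical Importer subclass
)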
datacontract/imports/jsonschema_importer.py

@@ -10,137 +10,49 @@ from datacontract.model.exceptions import DataContractException
 class JsonSchemaImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) -> dict:
+    ) -> DataContractSpecification:
         return import_jsonschema(data_contract_specification, source)
 
 
-def convert_json_schema_properties(properties, is_definition=False):
-    fields = {}
-    for field_name, field_schema in properties.items():
-        field_kwargs = {}
-        field_type = field_schema.get("type")
-
-        # Determine if the field is required and set the type to the non-null option if applicable
-        if isinstance(field_type, list) and "null" in field_type:
-            field_kwargs["required"] = False
-            non_null_types = [t for t in field_type if t != "null"]
-            if non_null_types:
-                field_type = non_null_types[0]
-            else:
-                field_type = None
-        else:
-            field_kwargs["required"] = True
-
-        # Set the non-null type
-        if field_type:
-            field_kwargs["type"] = field_type
-
-        for key, value in field_schema.items():
-            match key:
-                case "title":
-                    field_kwargs["title"] = value
-                case "type":
-                    pass  # type is already handled above
-                case "format":
-                    field_kwargs["format"] = value
-                case "description":
-                    field_kwargs["description"] = value
-                case "pattern":
-                    field_kwargs["pattern"] = value
-                case "minLength":
-                    field_kwargs["minLength"] = value
-                case "maxLength":
-                    field_kwargs["maxLength"] = value
-                case "minimum":
-                    field_kwargs["minimum"] = value
-                case "exclusiveMinimum":
-                    field_kwargs["exclusiveMinimum"] = value
-                case "maximum":
-                    field_kwargs["maximum"] = value
-                case "exclusiveMaximum":
-                    field_kwargs["exclusiveMaximum"] = value
-                case "enum":
-                    field_kwargs["enum"] = value
-                case "tags":
-                    field_kwargs["tags"] = value
-                case "properties":
-                    field_kwargs["fields"] = convert_json_schema_properties(value)
-                case "items":
-                    field_kwargs["items"] = convert_json_schema_properties(value)
-
-        field = Field(**field_kwargs)
-        fields[field_name] = field
-
-    return fields
-
-
 def import_jsonschema(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
     if data_contract_specification.models is None:
         data_contract_specification.models = {}
 
+    json_schema = load_and_validate_json_schema(source)
+
+    title = json_schema.get("title", "default_model")
+    description = json_schema.get("description")
+    type_ = json_schema.get("type")
+    properties = json_schema.get("properties", {})
+    required_properties = json_schema.get("required", [])
+
+    fields_kwargs = jsonschema_to_args(properties, required_properties)
+    fields = {name: Field(**kwargs) for name, kwargs in fields_kwargs.items()}
+
+    model = Model(description=description, type=type_, title=title, fields=fields)
+    data_contract_specification.models[title] = model
+
+    definitions = json_schema.get("definitions", {})
+    for name, schema in definitions.items():
+        kwargs = schema_to_args(schema)
+        data_contract_specification.definitions[name] = Definition(name=name, **kwargs)
+
+    return data_contract_specification
+
+
+def load_and_validate_json_schema(source):
     try:
         with open(source, "r") as file:
             json_schema = json.loads(file.read())
-            validator = fastjsonschema.compile({})
-            validator(json_schema)
-
-            model = Model(
-                description=json_schema.get("description"),
-                type=json_schema.get("type"),
-                title=json_schema.get("title"),
-                fields=convert_json_schema_properties(json_schema.get("properties", {})),
-            )
-            data_contract_specification.models[json_schema.get("title", "default_model")] = model
-
-            if "definitions" in json_schema:
-                for def_name, def_schema in json_schema["definitions"].items():
-                    definition_kwargs = {}
-
-                    for key, value in def_schema.items():
-                        match key:
-                            case "domain":
-                                definition_kwargs["domain"] = value
-                            case "title":
-                                definition_kwargs["title"] = value
-                            case "description":
-                                definition_kwargs["description"] = value
-                            case "type":
-                                definition_kwargs["type"] = value
-                            case "enum":
-                                definition_kwargs["enum"] = value
-                            case "format":
-                                definition_kwargs["format"] = value
-                            case "minLength":
-                                definition_kwargs["minLength"] = value
-                            case "maxLength":
-                                definition_kwargs["maxLength"] = value
-                            case "pattern":
-                                definition_kwargs["pattern"] = value
-                            case "minimum":
-                                definition_kwargs["minimum"] = value
-                            case "exclusiveMinimum":
-                                definition_kwargs["exclusiveMinimum"] = value
-                            case "maximum":
-                                definition_kwargs["maximum"] = value
-                            case "exclusiveMaximum":
-                                definition_kwargs["exclusiveMaximum"] = value
-                            case "pii":
-                                definition_kwargs["pii"] = value
-                            case "classification":
-                                definition_kwargs["classification"] = value
-                            case "tags":
-                                definition_kwargs["tags"] = value
-                            case "properties":
-                                definition_kwargs["fields"] = convert_json_schema_properties(value, is_definition=True)
-
-                    definition = Definition(name=def_name, **definition_kwargs)
-                    data_contract_specification.definitions[def_name] = definition
+
+        validator = fastjsonschema.compile({})
+        validator(json_schema)
 
     except fastjsonschema.JsonSchemaException as e:
         raise DataContractException(
             type="schema",
             name="Parse json schema",
-            reason=f"Failed to parse json schema from {source}: {e}",
+            reason=f"Failed to validate json schema from {source}: {e}",
             engine="datacontract",
         )
 
@@ -152,5 +64,82 @@ def import_jsonschema(
             engine="datacontract",
             original_exception=e,
         )
+    return json_schema
 
-    return data_contract_specification
+
+def jsonschema_to_args(properties, required_properties):
+    args = {}
+    for property, property_schema in properties.items():
+        is_required = property in required_properties
+        args[property] = schema_to_args(property_schema, is_required)
+
+    return args
+
+
+def schema_to_args(property_schema, is_required: bool = None) -> dict:
+    direct_mappings = {
+        "title",
+        "description",
+        "format",
+        "pattern",
+        "enum",
+        "tags",
+        "pii",
+        "minLength",
+        "maxLength",
+        "minimum",
+        "exclusiveMinimum",
+        "maximum",
+        "exclusiveMaximum",
+    }
+
+    field_kwargs = {key: value for key, value in property_schema.items() if key in direct_mappings}
+
+    if is_required is not None:
+        field_kwargs["required"] = is_required
+
+    property_type = determine_type(property_schema)
+    if property_type is not None:
+        field_kwargs["type"] = property_type
+
+    if property_type == "array":
+        nested_item_type, nested_items = determine_nested_item_type(property_schema)
+
+        if nested_items is not None:
+            field_kwargs["items"] = schema_to_args(nested_item_type)
+
+    nested_properties = property_schema.get("properties")
+    if nested_properties is not None:
+        # recursive call for complex nested properties
+        field_kwargs["fields"] = jsonschema_to_args(nested_properties, property_schema["required"])
+
+    return field_kwargs
+
+
+def determine_nested_item_type(property_schema):
+    nested_items = property_schema.get("items")
+    nested_items_is_list = isinstance(nested_items, list)
+    if nested_items_is_list and len(nested_items) != 1:
+        raise DataContractException(
+            type="schema",
+            name="Parse json schema",
+            reason=f"Union types for arrays are currently not supported ({nested_items})",
+            engine="datacontract",
+        )
+    if nested_items_is_list and len(nested_items) == 1:
+        nested_item_type = nested_items[0]
+    elif not nested_items_is_list and nested_items is not None:
+        nested_item_type = nested_items
+    return nested_item_type, nested_items
+
+
+def determine_type(property_schema):
+    property_type = property_schema.get("type")
+    type_is_list = isinstance(property_type, list)
+    if type_is_list:
+        non_null_types = [t for t in property_type if t != "null"]
+        if non_null_types:
+            property_type = non_null_types[0]
+        else:
+            property_type = None
+    return property_type
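To make the new flow concrete, here is a small sketch of the property-to-kwargs mapping performed by jsonschema_to_args and schema_to_args above; the commented result is an assumption read off the code in this hunk:

# Sketch: properties as they would appear in a JSON Schema document.
properties = {
    "order_id": {"type": "string", "minLength": 8, "description": "Internal order id"},
    "amount": {"type": ["number", "null"]},
}
args = jsonschema_to_args(properties, required_properties=["order_id"])
# Expected (assumed) result:
# {
#     "order_id": {"minLength": 8, "description": "Internal order id", "required": True, "type": "string"},
#     "amount": {"required": False, "type": "number"},
# }
fields = {name: Field(**kwargs) for name, kwargs in args.items()}  # as done in import_jsonschema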
datacontract/imports/spark_importer.py (new file)

@@ -0,0 +1,134 @@
+from pyspark.sql import DataFrame, SparkSession, types
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import (
+    DataContractSpecification,
+    Model,
+    Field,
+    Server,
+)
+
+
+class SparkImporter(Importer):
+    def import_source(
+        self,
+        data_contract_specification: DataContractSpecification,
+        source: str,
+        import_args: dict,
+    ) -> dict:
+        """
+        Imports data from a Spark source into the data contract specification.
+
+        Args:
+            data_contract_specification: The data contract specification object.
+            source: The source string indicating the Spark tables to read.
+            import_args: Additional arguments for the import process.
+
+        Returns:
+            dict: The updated data contract specification.
+        """
+        return import_spark(data_contract_specification, source)
+
+
+def import_spark(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
+    """
+    Reads Spark tables and updates the data contract specification with their schemas.
+
+    Args:
+        data_contract_specification: The data contract specification to update.
+        source: A comma-separated string of Spark temporary views to read.
+
+    Returns:
+        DataContractSpecification: The updated data contract specification.
+    """
+    spark = SparkSession.builder.getOrCreate()
+    data_contract_specification.servers["local"] = Server(type="dataframe")
+    for temp_view in source.split(","):
+        temp_view = temp_view.strip()
+        df = spark.read.table(temp_view)
+        data_contract_specification.models[temp_view] = import_from_spark_df(df)
+    return data_contract_specification
+
+
+def import_from_spark_df(df: DataFrame) -> Model:
+    """
+    Converts a Spark DataFrame into a Model.
+
+    Args:
+        df: The Spark DataFrame to convert.
+
+    Returns:
+        Model: The generated data contract model.
+    """
+    model = Model()
+    schema = df.schema
+
+    for field in schema:
+        model.fields[field.name] = _field_from_spark(field)
+
+    return model
+
+
+def _field_from_spark(spark_field: types.StructField) -> Field:
+    """
+    Converts a Spark StructField into a Field object for the data contract.
+
+    Args:
+        spark_field: The Spark StructField to convert.
+
+    Returns:
+        Field: The corresponding Field object.
+    """
+    field_type = _data_type_from_spark(spark_field.dataType)
+    field = Field()
+    field.type = field_type
+    field.required = not spark_field.nullable
+
+    if field_type == "array":
+        field.items = _field_from_spark(spark_field.dataType.elementType)
+
+    if field_type == "struct":
+        field.fields = {sf.name: _field_from_spark(sf) for sf in spark_field.dataType.fields}
+
+    return field
+
+
+def _data_type_from_spark(spark_type: types.DataType) -> str:
+    """
+    Maps Spark data types to the Data Contract type system.
+
+    Args:
+        spark_type: The Spark data type to map.
+
+    Returns:
+        str: The corresponding Data Contract type.
+    """
+    if isinstance(spark_type, types.StringType):
+        return "string"
+    elif isinstance(spark_type, types.IntegerType):
+        return "integer"
+    elif isinstance(spark_type, types.LongType):
+        return "long"
+    elif isinstance(spark_type, types.FloatType):
+        return "float"
+    elif isinstance(spark_type, types.DoubleType):
+        return "double"
+    elif isinstance(spark_type, types.StructType):
+        return "struct"
+    elif isinstance(spark_type, types.ArrayType):
+        return "array"
+    elif isinstance(spark_type, types.TimestampType):
+        return "timestamp"
+    elif isinstance(spark_type, types.TimestampNTZType):
+        return "timestamp_ntz"
+    elif isinstance(spark_type, types.DateType):
+        return "date"
+    elif isinstance(spark_type, types.BooleanType):
+        return "boolean"
+    elif isinstance(spark_type, types.BinaryType):
+        return "bytes"
+    elif isinstance(spark_type, types.DecimalType):
+        return "decimal"
+    elif isinstance(spark_type, types.NullType):
+        return "null"
+    else:
+        raise ValueError(f"Unsupported Spark type: {spark_type}")
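A usage sketch for the new importer, assuming a local PySpark installation; it registers a temporary view and passes its name to import_spark as defined above:

# Sketch: build a temp view and derive a model from it.
from pyspark.sql import SparkSession
from datacontract.model.data_contract_specification import DataContractSpecification

spark = SparkSession.builder.getOrCreate()
spark.createDataFrame(
    [(1, "shipped")], schema="order_id INT, status STRING"
).createOrReplaceTempView("orders")

spec = import_spark(DataContractSpecification(), "orders")
# spec.servers["local"].type   -> "dataframe"
# spec.models["orders"].fields -> {"order_id": Field(type="integer", ...), "status": Field(type="string", ...)}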
datacontract/imports/sql_importer.py

@@ -64,6 +64,10 @@ def map_type_from_sql(sql_type: str):
         return "integer"
     elif sql_type_normed.startswith("float"):
         return "float"
+    elif sql_type_normed.startswith("decimal"):
+        return "decimal"
+    elif sql_type_normed.startswith("numeric"):
+        return "numeric"
     elif sql_type_normed.startswith("bool"):
         return "boolean"
     elif sql_type_normed.startswith("timestamp"):
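A quick sketch of the effect of the two new branches, assuming sql_type_normed is the lowercased, trimmed form of the input (as the surrounding branches imply):

# Sketch: precision and scale are ignored, only the leading keyword is matched.
assert map_type_from_sql("decimal(10,2)") == "decimal"
assert map_type_from_sql("numeric(12,4)") == "numeric"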
datacontract/integration/publish_datamesh_manager.py

@@ -8,18 +8,23 @@ from datacontract.model.run import Run
 def publish_datamesh_manager(run: Run, publish_url: str):
     try:
         if publish_url is None:
-            url = "https://api.datamesh-manager.com/api/runs"
+            # this url supports Data Mesh Manager and Data Contract Manager
+            url = "https://api.datamesh-manager.com/api/test-results"
         else:
             url = publish_url
-        datamesh_manager_api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
+        api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
+        if api_key is None:
+            api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
 
         if run.dataContractId is None:
             raise Exception("Cannot publish run results, as data contract ID is unknown")
 
-        if datamesh_manager_api_key is None:
-            raise Exception("Cannot publish run results, as DATAMESH_MANAGER_API_KEY is not set")
+        if api_key is None:
+            raise Exception(
+                "Cannot publish run results, as DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY are not set"
+            )
 
-        headers = {"Content-Type": "application/json", "x-api-key": datamesh_manager_api_key}
+        headers = {"Content-Type": "application/json", "x-api-key": api_key}
         request_body = run.model_dump_json()
         # print("Request Body:", request_body)
         response = requests.post(url, data=request_body, headers=headers)
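From the caller's side, the new fallback means either environment variable works; a sketch with a placeholder key:

# Sketch: with no DATAMESH_MANAGER_API_KEY set, the Data Contract Manager key is used instead.
import os

os.environ.pop("DATAMESH_MANAGER_API_KEY", None)
os.environ["DATACONTRACT_MANAGER_API_KEY"] = "dcm_xxx"  # placeholder, not a real key

# publish_datamesh_manager(run, publish_url=None) now posts the run results to
# https://api.datamesh-manager.com/api/test-results with x-api-key taken from the fallback variable.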