datacontract-cli 0.10.21__py3-none-any.whl → 0.10.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datacontract-cli might be problematic.

Files changed (29)
  1. datacontract/breaking/breaking.py +1 -1
  2. datacontract/breaking/breaking_rules.py +1 -1
  3. datacontract/cli.py +5 -5
  4. datacontract/data_contract.py +14 -100
  5. datacontract/engines/data_contract_checks.py +735 -0
  6. datacontract/engines/data_contract_test.py +51 -0
  7. datacontract/engines/soda/check_soda_execute.py +36 -30
  8. datacontract/engines/soda/connections/kafka.py +8 -3
  9. datacontract/export/avro_converter.py +2 -0
  10. datacontract/export/exporter.py +0 -2
  11. datacontract/export/exporter_factory.py +0 -12
  12. datacontract/export/sodacl_converter.py +22 -294
  13. datacontract/export/sql_type_converter.py +7 -2
  14. datacontract/imports/odcs_importer.py +6 -3
  15. datacontract/imports/odcs_v3_importer.py +2 -0
  16. datacontract/imports/sql_importer.py +229 -29
  17. datacontract/model/exceptions.py +4 -1
  18. datacontract/model/run.py +11 -4
  19. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/METADATA +139 -166
  20. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/RECORD +25 -27
  21. datacontract/engines/soda/connections/dask.py +0 -28
  22. datacontract/export/odcs_v2_exporter.py +0 -124
  23. datacontract/imports/odcs_v2_importer.py +0 -177
  24. datacontract/lint/linters/example_model_linter.py +0 -91
  25. /datacontract/{model → breaking}/breaking_change.py +0 -0
  26. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/LICENSE +0 -0
  27. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/WHEEL +0 -0
  28. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/entry_points.txt +0 -0
  29. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/top_level.txt +0 -0

datacontract/engines/data_contract_test.py
@@ -0,0 +1,51 @@
+ import typing
+
+ from datacontract.engines.data_contract_checks import create_checks
+
+ if typing.TYPE_CHECKING:
+     from pyspark.sql import SparkSession
+
+ from datacontract.engines.datacontract.check_that_datacontract_contains_valid_servers_configuration import (
+     check_that_datacontract_contains_valid_server_configuration,
+ )
+ from datacontract.engines.fastjsonschema.check_jsonschema import check_jsonschema
+ from datacontract.engines.soda.check_soda_execute import check_soda_execute
+ from datacontract.model.data_contract_specification import DataContractSpecification
+ from datacontract.model.exceptions import DataContractException
+ from datacontract.model.run import ResultEnum, Run
+
+
+ def execute_data_contract_test(
+     data_contract_specification: DataContractSpecification,
+     run: Run,
+     server_name: str = None,
+     spark: "SparkSession" = None,
+ ):
+     if data_contract_specification.models is None or len(data_contract_specification.models) == 0:
+         raise DataContractException(
+             type="lint",
+             name="Check that data contract contains models",
+             result=ResultEnum.warning,
+             reason="Models block is missing. Skip executing tests.",
+             engine="datacontract",
+         )
+     check_that_datacontract_contains_valid_server_configuration(run, data_contract_specification, server_name)
+     if server_name:
+         server = data_contract_specification.servers.get(server_name)
+     else:
+         server_name = list(data_contract_specification.servers.keys())[0]
+         server = data_contract_specification.servers.get(server_name)
+     run.log_info(f"Running tests for data contract {data_contract_specification.id} with server {server_name}")
+     run.dataContractId = data_contract_specification.id
+     run.dataContractVersion = data_contract_specification.info.version
+     run.dataProductId = server.dataProductId
+     run.outputPortId = server.outputPortId
+     run.server = server_name
+
+     run.checks.extend(create_checks(data_contract_specification, server))
+
+     # TODO check server is supported type for nicer error messages
+     # TODO check server credentials are complete for nicer error messages
+     if server.format == "json" and server.type != "kafka":
+         check_jsonschema(run, data_contract_specification, server)
+     check_soda_execute(run, data_contract_specification, server, spark)
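
A usage sketch (not part of the diff) of how the new entry point in datacontract/engines/data_contract_test.py might be driven; it assumes the caller has already parsed a DataContractSpecification, and reuses Run.create_run() the same way the new SodaExporter further down does:

# Hedged usage sketch: drive the new test entry point with a pre-loaded specification.
from datacontract.engines.data_contract_test import execute_data_contract_test
from datacontract.model.run import Run

def run_contract_tests(spec, server_name=None, spark=None):
    # "spec" is assumed to be a DataContractSpecification parsed elsewhere by the caller
    run = Run.create_run()
    execute_data_contract_test(spec, run, server_name=server_name, spark=spark)
    return run  # carries the pre-created checks plus the results filled in by the engines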

datacontract/engines/soda/check_soda_execute.py
@@ -1,4 +1,5 @@
  import logging
+ import uuid

  from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
  from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration
@@ -13,7 +14,7 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
  from datacontract.model.run import Check, Log, ResultEnum, Run


- def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark, tmp_dir):
+ def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark):
      from soda.common.config_helper import ConfigHelper

      ConfigHelper.get_instance().upsert_value("send_anonymous_usage_stats", False)
@@ -80,8 +81,8 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
          scan.set_data_source_name("datacontract-cli")
      elif server.type == "kafka":
          if spark is None:
-             spark = create_spark_session(tmp_dir)
-         read_kafka_topic(spark, data_contract, server, tmp_dir)
+             spark = create_spark_session()
+         read_kafka_topic(spark, data_contract, server)
          scan.add_spark_session(spark, data_source_name=server.type)
          scan.set_data_source_name(server.type)
      elif server.type == "sqlserver":
@@ -106,37 +107,34 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
          run.log_warn(f"Server type {server.type} not yet supported by datacontract CLI")
          return

-     # Don't check types for json format, as they are checked with json schema
-     # Don't check types for avro format, as they are checked with avro schema
-     # Don't check types for csv format, as they are hard to detect
-     server_type = server.type
-     check_types = server.format != "json" and server.format != "csv" and server.format != "avro"
-
-     sodacl_yaml_str = to_sodacl_yaml(data_contract, server_type, check_types)
+     sodacl_yaml_str = to_sodacl_yaml(run)
      # print("sodacl_yaml_str:\n" + sodacl_yaml_str)
      scan.add_sodacl_yaml_str(sodacl_yaml_str)

      # Execute the scan
-     logging.info("Starting soda scan")
+     logging.info("Starting soda scan with checks:\n" + sodacl_yaml_str)
      scan.execute()
      logging.info("Finished soda scan")

      # pprint.PrettyPrinter(indent=2).pprint(scan.build_scan_results())

      scan_results = scan.get_scan_results()
-     for c in scan_results.get("checks"):
-         check = Check(
-             type="schema",
-             result=to_result(c),
-             reason=", ".join(c.get("outcomeReasons")),
-             name=c.get("name"),
-             model=c.get("table"),
-             field=c.get("column"),
-             engine="soda-core",
-             diagnostics=c.get("diagnostics"),
-         )
-         update_reason(check, c)
-         run.checks.append(check)
+     for scan_result in scan_results.get("checks"):
+         name = scan_result.get("name")
+         check = get_check(run, scan_result)
+         if check is None:
+             check = Check(
+                 id=str(uuid.uuid4()),
+                 category="custom",
+                 type="custom",
+                 name=name,
+                 engine="soda-core",
+             )
+             run.checks.append(check)
+         check.result = to_result(scan_result)
+         check.reason = ", ".join(scan_result.get("outcomeReasons"))
+         check.diagnostics = scan_result.get("diagnostics")
+         update_reason(check, scan_result)

      for log in scan_results.get("logs"):
          run.logs.append(
@@ -152,8 +150,8 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
          run.checks.append(
              Check(
                  type="general",
-                 name="Execute quality checks",
-                 result="warning",
+                 name="Data Contract Tests",
+                 result=ResultEnum.warning,
                  reason="Engine soda-core has errors. See the logs for details.",
                  engine="soda-core",
              )
@@ -161,14 +159,22 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
      return


- def to_result(c) -> str:
+ def get_check(run, scan_result) -> Check | None:
+     check_by_name = next((c for c in run.checks if c.key == scan_result.get("name")), None)
+     if check_by_name is not None:
+         return check_by_name
+
+     return None
+
+
+ def to_result(c) -> ResultEnum:
      soda_outcome = c.get("outcome")
      if soda_outcome == "pass":
-         return "passed"
+         return ResultEnum.passed
      elif soda_outcome == "fail":
-         return "failed"
+         return ResultEnum.failed
      else:
-         return soda_outcome
+         return ResultEnum.unknown


  def update_reason(check, c):
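
The result-handling loop above no longer builds a fresh Check per Soda result; it joins each scan result back to the check that create_checks() pre-registered on the Run, using the check's key, and only falls back to a new "custom" check when nothing matches. A self-contained sketch of that matching rule with made-up data:

# Illustrative sketch of the key-based join performed by get_check (names and data are invented).
from dataclasses import dataclass

@dataclass
class FakeCheck:
    key: str
    result: str | None = None

checks = [FakeCheck(key="orders_id_not_null"), FakeCheck(key="orders_row_count")]
scan_result = {"name": "orders_id_not_null", "outcome": "pass"}

matched = next((c for c in checks if c.key == scan_result["name"]), None)
if matched is not None:
    matched.result = "passed"  # the real code maps pass/fail via to_result() to ResultEnum
print(matched)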

datacontract/engines/soda/connections/kafka.py
@@ -1,12 +1,14 @@
+ import atexit
  import logging
  import os
+ import tempfile

  from datacontract.export.avro_converter import to_avro_schema_json
  from datacontract.model.data_contract_specification import DataContractSpecification, Field, Server
  from datacontract.model.exceptions import DataContractException


- def create_spark_session(tmp_dir: str):
+ def create_spark_session():
      """Create and configure a Spark session."""

      try:
@@ -21,6 +23,9 @@ def create_spark_session(tmp_dir: str):
              original_exception=e,
          )

+     tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract-cli-spark")
+     atexit.register(tmp_dir.cleanup)
+
      spark = (
          SparkSession.builder.appName("datacontract")
          .config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse")
@@ -37,7 +42,7 @@ def create_spark_session(tmp_dir: str):
      return spark


- def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Server, tmp_dir):
+ def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Server):
      """Read and process data from a Kafka topic based on the server configuration."""

      logging.info("Reading data from Kafka server %s topic %s", server.host, server.topic)
@@ -62,7 +67,7 @@ def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Se
              type="test",
              name="Configuring Kafka checks",
              result="warning",
-             reason=f"Kafka format '{server.format}' is not supported. " f"Skip executing tests.",
+             reason=f"Kafka format '{server.format}' is not supported. Skip executing tests.",
              engine="datacontract",
          )
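
create_spark_session now owns its temporary working directory instead of receiving one. A minimal standard-library sketch of that lifecycle (same prefix as above); note that TemporaryDirectory exposes the path via its .name attribute:

# Minimal sketch: the directory lives for the whole process and is removed at interpreter exit.
import atexit
import tempfile

tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract-cli-spark")
atexit.register(tmp_dir.cleanup)
print(tmp_dir.name)  # the filesystem path, e.g. /tmp/datacontract-cli-spark...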

datacontract/export/avro_converter.py
@@ -108,6 +108,8 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
      elif field.type in ["time"]:
          return "long"
      elif field.type in ["object", "record", "struct"]:
+         if field.config is not None and 'namespace' in field.config:
+             return to_avro_record(field_name ,field.fields ,field.description ,field.config['namespace'])
          return to_avro_record(field_name, field.fields, field.description, None)
      elif field.type in ["binary"]:
          return "bytes"
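
A hedged sketch of the new namespace handling in to_avro_type: a record-typed field whose config contains a "namespace" entry should yield an Avro record carrying that namespace. Field is assumed to accept these keyword arguments (it is a Pydantic model in this package); the namespace value is invented:

from datacontract.export.avro_converter import to_avro_type
from datacontract.model.data_contract_specification import Field

address = Field(
    type="object",
    config={"namespace": "com.example.shipping"},  # made-up namespace for illustration
    fields={"street": Field(type="string")},
)
print(to_avro_type(address, "address"))
# expected shape: a record dict named "address" with namespace "com.example.shipping"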

datacontract/export/exporter.py
@@ -25,8 +25,6 @@ class ExportFormat(str, Enum):
      dbt_sources = "dbt-sources"
      dbt_staging_sql = "dbt-staging-sql"
      odcs = "odcs"
-     odcs_v2 = "odcs_v2"
-     odcs_v3 = "odcs_v3"
      rdf = "rdf"
      avro = "avro"
      protobuf = "protobuf"
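
With odcs_v2 and odcs_v3 removed, odcs is the only remaining ODCS export format (registered against the v3 exporter below). A small sketch of what the removal means for callers that resolve format names programmatically (standard str-Enum behaviour):

from datacontract.export.exporter import ExportFormat

print(ExportFormat("odcs"))      # still resolves
try:
    ExportFormat("odcs_v2")      # no longer a member after this release
except ValueError as err:
    print(f"unsupported export format: {err}")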

datacontract/export/exporter_factory.py
@@ -107,18 +107,6 @@ exporter_factory.register_lazy_exporter(
      class_name="JsonSchemaExporter",
  )

- exporter_factory.register_lazy_exporter(
-     name=ExportFormat.odcs_v2,
-     module_path="datacontract.export.odcs_v2_exporter",
-     class_name="OdcsV2Exporter",
- )
-
- exporter_factory.register_lazy_exporter(
-     name=ExportFormat.odcs_v3,
-     module_path="datacontract.export.odcs_v3_exporter",
-     class_name="OdcsV3Exporter",
- )
-
  exporter_factory.register_lazy_exporter(
      name=ExportFormat.odcs,
      module_path="datacontract.export.odcs_v3_exporter",

datacontract/export/sodacl_converter.py
@@ -1,302 +1,30 @@
- from typing import List
- from venv import logger
-
  import yaml

+ from datacontract.engines.data_contract_checks import create_checks
  from datacontract.export.exporter import Exporter
- from datacontract.export.sql_type_converter import convert_to_sql_type
- from datacontract.model.data_contract_specification import DataContractSpecification, Quality
+ from datacontract.model.run import Run


  class SodaExporter(Exporter):
      def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
-         return to_sodacl_yaml(data_contract)
-
-
- def to_sodacl_yaml(
-     data_contract_spec: DataContractSpecification, server_type: str = None, check_types: bool = True
- ) -> str:
-     try:
-         sodacl = {}
-         for model_key, model_value in data_contract_spec.models.items():
-             k, v = to_checks(model_key, model_value, server_type, check_types)
-             sodacl[k] = v
-         add_quality_checks(sodacl, data_contract_spec)
-         sodacl_yaml_str = yaml.dump(sodacl, default_flow_style=False, sort_keys=False)
-         return sodacl_yaml_str
-     except Exception as e:
-         return f"Error: {e}"
-
-
- def to_checks(model_key, model_value, server_type: str, check_types: bool):
-     checks = []
-     model_name = to_model_name(model_key, model_value, server_type)
-     fields = model_value.fields
-
-     quote_field_name = server_type in ["postgres", "sqlserver"]
-
-     for field_name, field in fields.items():
-         checks.append(check_field_is_present(field_name))
-         if check_types and field.type is not None:
-             sql_type = convert_to_sql_type(field, server_type)
-             checks.append(check_field_type(field_name, sql_type))
-         if field.required:
-             checks.append(check_field_required(field_name, quote_field_name))
-         if field.unique:
-             checks.append(check_field_unique(field_name, quote_field_name))
-         if field.minLength is not None:
-             checks.append(check_field_min_length(field_name, field.minLength, quote_field_name))
-         if field.maxLength is not None:
-             checks.append(check_field_max_length(field_name, field.maxLength, quote_field_name))
-         if field.minimum is not None:
-             checks.append(check_field_minimum(field_name, field.minimum, quote_field_name))
-         if field.maximum is not None:
-             checks.append(check_field_maximum(field_name, field.maximum, quote_field_name))
-         if field.exclusiveMinimum is not None:
-             checks.append(check_field_minimum(field_name, field.exclusiveMinimum, quote_field_name))
-             checks.append(check_field_not_equal(field_name, field.exclusiveMinimum, quote_field_name))
-         if field.exclusiveMaximum is not None:
-             checks.append(check_field_maximum(field_name, field.exclusiveMaximum, quote_field_name))
-             checks.append(check_field_not_equal(field_name, field.exclusiveMaximum, quote_field_name))
-         if field.pattern is not None:
-             checks.append(check_field_regex(field_name, field.pattern, quote_field_name))
-         if field.enum is not None and len(field.enum) > 0:
-             checks.append(check_field_enum(field_name, field.enum, quote_field_name))
-         if field.quality is not None and len(field.quality) > 0:
-             quality_list = check_quality_list(model_name, field_name, field.quality)
-             if (quality_list is not None) and len(quality_list) > 0:
-                 checks.append(quality_list)
-         # TODO references: str = None
-         # TODO format
-
-     if model_value.quality is not None and len(model_value.quality) > 0:
-         quality_list = check_quality_list(model_name, None, model_value.quality)
-         if (quality_list is not None) and len(quality_list) > 0:
-             checks.append(quality_list)
-
-     checks_for_model_key = f"checks for {model_name}"
-
-     if quote_field_name:
-         checks_for_model_key = f'checks for "{model_name}"'
-
-     return checks_for_model_key, checks
-
-
- def to_model_name(model_key, model_value, server_type):
-     if server_type == "databricks":
-         if model_value.config is not None and "databricksTable" in model_value.config:
-             return model_value.config["databricksTable"]
-     if server_type == "snowflake":
-         if model_value.config is not None and "snowflakeTable" in model_value.config:
-             return model_value.config["snowflakeTable"]
-     if server_type == "sqlserver":
-         if model_value.config is not None and "sqlserverTable" in model_value.config:
-             return model_value.config["sqlserverTable"]
-     if server_type == "postgres" or server_type == "postgresql":
-         if model_value.config is not None and "postgresTable" in model_value.config:
-             return model_value.config["postgresTable"]
-     return model_key
-
-
- def check_field_is_present(field_name):
-     return {
-         "schema": {
-             "name": f"Check that field {field_name} is present",
-             "fail": {
-                 "when required column missing": [field_name],
-             },
-         }
-     }
-
-
- def check_field_type(field_name: str, type: str):
-     return {
-         "schema": {
-             "name": f"Check that field {field_name} has type {type}",
-             "fail": {"when wrong column type": {field_name: type}},
-         }
-     }
-
-
- def check_field_required(field_name: str, quote_field_name: bool = False):
-     if quote_field_name:
-         field_name = f'"{field_name}"'
-
-     return {f"missing_count({field_name}) = 0": {"name": f"Check that required field {field_name} has no null values"}}
-
-
- def check_field_unique(field_name, quote_field_name: bool = False):
-     if quote_field_name:
-         field_name = f'"{field_name}"'
-     return {
-         f"duplicate_count({field_name}) = 0": {"name": f"Check that unique field {field_name} has no duplicate values"}
-     }
-
-
- def check_field_min_length(field_name, min_length, quote_field_name: bool = False):
-     if quote_field_name:
-         field_name = f'"{field_name}"'
-     return {
-         f"invalid_count({field_name}) = 0": {
-             "name": f"Check that field {field_name} has a min length of {min_length}",
-             "valid min length": min_length,
-         }
-     }
-
-
- def check_field_max_length(field_name, max_length, quote_field_name: bool = False):
-     if quote_field_name:
-         field_name = f'"{field_name}"'
-     return {
-         f"invalid_count({field_name}) = 0": {
-             "name": f"Check that field {field_name} has a max length of {max_length}",
-             "valid max length": max_length,
-         }
-     }
-
-
- def check_field_minimum(field_name, minimum, quote_field_name: bool = False):
-     if quote_field_name:
-         field_name = f'"{field_name}"'
-     return {
-         f"invalid_count({field_name}) = 0": {
-             "name": f"Check that field {field_name} has a minimum of {minimum}",
-             "valid min": minimum,
-         }
-     }
-
-
- def check_field_maximum(field_name, maximum, quote_field_name: bool = False):
-     if quote_field_name:
-         field_name = f'"{field_name}"'
-     return {
-         f"invalid_count({field_name}) = 0": {
-             "name": f"Check that field {field_name} has a maximum of {maximum}",
-             "valid max": maximum,
-         }
-     }
-
-
- def check_field_not_equal(field_name, value, quote_field_name: bool = False):
-     if quote_field_name:
-         field_name = f'"{field_name}"'
-     return {
-         f"invalid_count({field_name}) = 0": {
-             "name": f"Check that field {field_name} is not equal to {value}",
-             "invalid values": [value],
-         }
-     }
-
-
- def check_field_enum(field_name, enum, quote_field_name: bool = False):
-     if quote_field_name:
-         field_name = f'"{field_name}"'
-     return {
-         f"invalid_count({field_name}) = 0": {
-             "name": f"Check that field {field_name} only contains enum values {enum}",
-             "valid values": enum,
-         }
-     }
-
-
- def check_field_regex(field_name, pattern, quote_field_name: bool = False):
-     if quote_field_name:
-         field_name = f'"{field_name}"'
-     return {
-         f"invalid_count({field_name}) = 0": {
-             "name": f"Check that field {field_name} matches regex pattern {pattern}",
-             "valid regex": pattern,
-         }
-     }
-
-
- def check_quality_list(model_name, field_name, quality_list: List[Quality]):
-     checks = {}
-
-     count = 0
-     for quality in quality_list:
-         if quality.type == "sql":
-             if field_name is None:
-                 metric_name = f"{model_name}_quality_sql_{count}"
+         run = Run.create_run()
+         run.checks.extend(create_checks(data_contract, server))
+         return to_sodacl_yaml(run)
+
+
+ def to_sodacl_yaml(run: Run) -> str:
+     sodacl_dict = {}
+     for run_check in run.checks:
+         if run_check.engine != "soda" or run_check.language != "sodacl":
+             continue
+         check_yaml_str = run_check.implementation
+         check_yaml_dict = yaml.safe_load(check_yaml_str)
+         for key, value in check_yaml_dict.items():
+             if key in sodacl_dict:
+                 if isinstance(sodacl_dict[key], list) and isinstance(value, list):
+                     sodacl_dict[key].extend(value)
+                 else:
+                     sodacl_dict[key].update(value)
              else:
-                 metric_name = f"{model_name}_{field_name}_quality_sql_{count}"
-             threshold = to_sodacl_threshold(quality)
-             query = prepare_query(quality, model_name, field_name)
-             if query is None:
-                 logger.warning(f"Quality check {metric_name} has no query")
-                 continue
-             if threshold is None:
-                 logger.warning(f"Quality check {metric_name} has no valid threshold")
-                 continue
-             checks[f"{metric_name} {threshold}"] = {f"{metric_name} query": query}
-             count += 1
-
-     return checks
-
-
- def prepare_query(quality: Quality, model_name: str, field_name: str = None) -> str | None:
-     if quality.query is None:
-         return None
-     if quality.query == "":
-         return None
-
-     query = quality.query
-
-     query = query.replace("{model}", model_name)
-     query = query.replace("{table}", model_name)
-
-     if field_name is not None:
-         query = query.replace("{field}", field_name)
-         query = query.replace("{column}", field_name)
-
-     return query
-
-
- def to_sodacl_threshold(quality: Quality) -> str | None:
-     if quality.mustBe is not None:
-         return f"= {quality.mustBe}"
-     if quality.mustNotBe is not None:
-         return f"!= {quality.mustNotBe}"
-     if quality.mustBeGreaterThan is not None:
-         return f"> {quality.mustBeGreaterThan}"
-     if quality.mustBeGreaterThanOrEqualTo is not None:
-         return f">= {quality.mustBeGreaterThanOrEqualTo}"
-     if quality.mustBeLessThan is not None:
-         return f"< {quality.mustBeLessThan}"
-     if quality.mustBeLessThanOrEqualTo is not None:
-         return f"<= {quality.mustBeLessThanOrEqualTo}"
-     if quality.mustBeBetween is not None:
-         if len(quality.mustBeBetween) != 2:
-             logger.warning(
-                 f"Quality check has invalid mustBeBetween, must have exactly 2 integers in an array: {quality.mustBeBetween}"
-             )
-             return None
-         return f"between {quality.mustBeBetween[0]} and {quality.mustBeBetween[1]}"
-     if quality.mustNotBeBetween is not None:
-         if len(quality.mustNotBeBetween) != 2:
-             logger.warning(
-                 f"Quality check has invalid mustNotBeBetween, must have exactly 2 integers in an array: {quality.mustNotBeBetween}"
-             )
-             return None
-         return f"not between {quality.mustNotBeBetween[0]} and {quality.mustNotBeBetween[1]}"
-     return None
-
-
- # These are deprecated root-level quality specifications, use the model-level and field-level quality fields instead
- def add_quality_checks(sodacl, data_contract_spec):
-     if data_contract_spec.quality is None:
-         return
-     if data_contract_spec.quality.type is None:
-         return
-     if data_contract_spec.quality.type.lower() != "sodacl":
-         return
-     if isinstance(data_contract_spec.quality.specification, str):
-         quality_specification = yaml.safe_load(data_contract_spec.quality.specification)
-     else:
-         quality_specification = data_contract_spec.quality.specification
-     for key, checks in quality_specification.items():
-         if key in sodacl:
-             for check in checks:
-                 sodacl[key].append(check)
-         else:
-             sodacl[key] = checks
+                 sodacl_dict[key] = value
+     return yaml.dump(sodacl_dict)
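
The rewritten to_sodacl_yaml no longer walks the data contract itself; it merges the SodaCL snippets that create_checks() attached to the Run (one YAML string per check in run_check.implementation). A standalone sketch of that merge with made-up snippets:

import yaml

snippets = [
    "checks for orders:\n- missing_count(order_id) = 0\n",
    "checks for orders:\n- duplicate_count(order_id) = 0\n",
]

sodacl_dict: dict = {}
for snippet in snippets:
    for key, value in yaml.safe_load(snippet).items():
        if key in sodacl_dict and isinstance(sodacl_dict[key], list) and isinstance(value, list):
            sodacl_dict[key].extend(value)  # checks for the same model end up under one key
        else:
            sodacl_dict[key] = value

print(yaml.dump(sodacl_dict))
# checks for orders:
# - missing_count(order_id) = 0
# - duplicate_count(order_id) = 0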

datacontract/export/sql_type_converter.py
@@ -142,11 +142,16 @@ def convert_to_dataframe(field: Field) -> None | str:
      if type.lower() in ["boolean"]:
          return "BOOLEAN"
      if type.lower() in ["object", "record", "struct"]:
-         return "STRUCT"
+         nested_fields = []
+         for nested_field_name, nested_field in field.fields.items():
+             nested_field_type = convert_to_dataframe(nested_field)
+             nested_fields.append(f"{nested_field_name}:{nested_field_type}")
+         return f"STRUCT<{','.join(nested_fields)}>"
      if type.lower() in ["bytes"]:
          return "BINARY"
      if type.lower() in ["array"]:
-         return "ARRAY"
+         item_type = convert_to_dataframe(field.items)
+         return f"ARRAY<{item_type}>"
      return None
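
convert_to_dataframe now recurses into nested object and array fields instead of returning bare STRUCT/ARRAY. A made-up, self-contained illustration of the same recursion over plain dicts (the real function operates on the package's Field model):

def to_dataframe_type(field: dict) -> str:
    t = field["type"].lower()
    if t in ("object", "record", "struct"):
        inner = ",".join(f"{name}:{to_dataframe_type(child)}" for name, child in field["fields"].items())
        return f"STRUCT<{inner}>"
    if t == "array":
        return f"ARRAY<{to_dataframe_type(field['items'])}>"
    return {"string": "STRING", "boolean": "BOOLEAN", "bytes": "BINARY"}.get(t, t.upper())

print(to_dataframe_type({
    "type": "object",
    "fields": {
        "street": {"type": "string"},
        "tags": {"type": "array", "items": {"type": "string"}},
    },
}))
# STRUCT<street:STRING,tags:ARRAY<STRING>>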

datacontract/imports/odcs_importer.py
@@ -41,9 +41,12 @@ def import_odcs(data_contract_specification: DataContractSpecification, source:
          )

      if odcs_api_version.startswith("v2."):
-         from datacontract.imports.odcs_v2_importer import import_odcs_v2
-
-         return import_odcs_v2(data_contract_specification, source)
+         raise DataContractException(
+             type="schema",
+             name="Importing ODCS contract",
+             reason=f"Unsupported ODCS API version: {odcs_api_version}",
+             engine="datacontract",
+         )
      elif odcs_api_version.startswith("v3."):
          from datacontract.imports.odcs_v3_importer import import_odcs_v3

datacontract/imports/odcs_v3_importer.py
@@ -287,6 +287,8 @@ def import_fields(


  def map_type(odcs_type: str, custom_mappings: Dict[str, str]) -> str | None:
+     if odcs_type is None:
+         return None
      t = odcs_type.lower()
      if t in DATACONTRACT_TYPES:
          return t
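
A minimal sketch of the effect of the new None guard in map_type (call shape taken from the signature above; the "string" case assumes it is a native datacontract type, which the surrounding code suggests):

from datacontract.imports.odcs_v3_importer import map_type

assert map_type(None, {}) is None          # previously this raised AttributeError on .lower()
assert map_type("string", {}) == "string"  # passes through types already in DATACONTRACT_TYPES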