datacontract-cli 0.9.2__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datacontract-cli might be problematic.
- datacontract/cli.py +4 -2
- datacontract/data_contract.py +69 -19
- datacontract/engines/fastjsonschema/check_jsonschema.py +56 -42
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +1 -1
- datacontract/engines/soda/check_soda_execute.py +25 -1
- datacontract/engines/soda/connections/bigquery.py +18 -0
- datacontract/engines/soda/connections/databricks.py +20 -0
- datacontract/engines/soda/connections/duckdb.py +10 -6
- datacontract/engines/soda/connections/postgres.py +21 -0
- datacontract/export/sodacl_converter.py +2 -2
- datacontract/lint/lint.py +126 -0
- datacontract/lint/linters/__init__.py +0 -0
- datacontract/lint/linters/example_model_linter.py +67 -0
- datacontract/model/data_contract_specification.py +5 -0
- {datacontract_cli-0.9.2.dist-info → datacontract_cli-0.9.4.dist-info}/METADATA +191 -31
- {datacontract_cli-0.9.2.dist-info → datacontract_cli-0.9.4.dist-info}/RECORD +20 -14
- {datacontract_cli-0.9.2.dist-info → datacontract_cli-0.9.4.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.9.2.dist-info → datacontract_cli-0.9.4.dist-info}/WHEEL +0 -0
- {datacontract_cli-0.9.2.dist-info → datacontract_cli-0.9.4.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.9.2.dist-info → datacontract_cli-0.9.4.dist-info}/top_level.txt +0 -0
datacontract/cli.py CHANGED
@@ -87,14 +87,16 @@ def test(
             "Use the key of the server object in the data contract yaml file "
             "to refer to a server, e.g., `production`, or `all` for all "
             "servers (default).")] = "all",
+        examples: Annotated[bool, typer.Option(
+            help="Run the schema and quality tests on the example data within the data contract.")] = None,
         publish: Annotated[str, typer.Option(
-            help="")] = None,
+            help="The url to publish the results after the test")] = None,
 ):
     """
     Run schema and quality tests on configured servers.
     """
     print(f"Testing {location}")
-    run = DataContract(data_contract_file=location, publish_url=publish).test()
+    run = DataContract(data_contract_file=location, publish_url=publish, examples=examples).test()
     _handle_result(run)
 
 
datacontract/data_contract.py CHANGED
@@ -1,5 +1,9 @@
 import json
 import logging
+import tempfile
+from typing import List
+
+import yaml
 
 from datacontract.engines.datacontract.check_that_datacontract_contains_valid_servers_configuration import \
     check_that_datacontract_contains_valid_server_configuration
@@ -11,8 +15,9 @@ from datacontract.export.sodacl_converter import to_sodacl
 from datacontract.integration.publish_datamesh_manager import \
     publish_datamesh_manager
 from datacontract.lint import resolve
+from datacontract.lint.linters.example_model_linter import ExampleModelLinter
 from datacontract.model.data_contract_specification import \
-    DataContractSpecification
+    DataContractSpecification, Server
 from datacontract.model.exceptions import DataContractException
 from datacontract.model.run import \
     Run, Check
@@ -25,13 +30,17 @@ class DataContract:
                  data_contract_str: str = None,
                  data_contract: DataContractSpecification = None,
                  server: str = None,
+                 examples: bool = False,
                  publish_url: str = None,
+                 spark: str = None,
                  ):
         self._data_contract_file = data_contract_file
         self._data_contract_str = data_contract_str
         self._data_contract = data_contract
         self._server = server
+        self._examples = examples
         self._publish_url = publish_url
+        self._spark = spark
 
     def lint(self):
         run = Run.create_run()
@@ -39,14 +48,15 @@ class DataContract:
             run.log_info("Linting data contract")
             data_contract = resolve.resolve_data_contract(self._data_contract_file, self._data_contract_str,
                                                           self._data_contract)
-            run.dataContractId = data_contract.id
-            run.dataContractVersion = data_contract.info.version
             run.checks.append(Check(
                 type="lint",
                 result="passed",
-                name="
-                engine="datacontract"
-
+                name="Data contract is syntactically valid",
+                engine="datacontract"
+            ))
+            run.checks.extend(ExampleModelLinter().lint(data_contract))
+            run.dataContractId = data_contract.id
+            run.dataContractVersion = data_contract.info.version
         except DataContractException as e:
             run.checks.append(Check(
                 type=e.type,
@@ -78,20 +88,27 @@ class DataContract:
 
             check_that_datacontract_contains_valid_server_configuration(run, data_contract, self._server)
             # TODO check yaml contains models
-            server_name = list(data_contract.servers.keys())[0]
-            server = data_contract.servers.get(server_name)
-            run.log_info(f"Running tests for data contract {data_contract.id} with server {server_name}")
-            run.dataContractId = data_contract.id
-            run.dataContractVersion = data_contract.info.version
-            run.dataProductId = server.dataProductId
-            run.outputPortId = server.outputPortId
-            run.server = server_name
 
-
-
-
-
-
+            with tempfile.TemporaryDirectory(prefix="datacontract-cli") as tmp_dir:
+                if self._examples:
+                    server_name = "examples"
+                    server = self._get_examples_server(data_contract, run, tmp_dir)
+                else:
+                    server_name = list(data_contract.servers.keys())[0]
+                    server = data_contract.servers.get(server_name)
+
+                run.log_info(f"Running tests for data contract {data_contract.id} with server {server_name}")
+                run.dataContractId = data_contract.id
+                run.dataContractVersion = data_contract.info.version
+                run.dataProductId = server.dataProductId
+                run.outputPortId = server.outputPortId
+                run.server = server_name
+
+                # 5. check server is supported type
+                # 6. check server credentials are complete
+                if server.format == "json":
+                    check_jsonschema(run, data_contract, server)
+                check_soda_execute(run, data_contract, server, self._spark)
 
         except DataContractException as e:
             run.checks.append(Check(
@@ -121,6 +138,7 @@ class DataContract:
 
         return run
 
+
     def diff(self, other):
         pass
 
@@ -136,3 +154,35 @@ class DataContract:
         else:
             print(f"Export format {export_format} not supported.")
             return ""
+
+    def _get_examples_server(self, data_contract, run, tmp_dir):
+        run.log_info(f"Copying examples to files in temporary directory {tmp_dir}")
+        format = "json"
+        for example in data_contract.examples:
+            format = example.type
+            p = f"{tmp_dir}/{example.model}.{format}"
+            run.log_info(f"Creating example file {p}")
+            with open(p, "w") as f:
+                content = ""
+                if format == "json" and type(example.data) is list:
+                    content = json.dumps(example.data)
+                elif format == "json" and type(example.data) is str:
+                    content = example.data
+                elif format == "yaml" and type(example.data) is list:
+                    content = yaml.dump(example.data)
+                elif format == "yaml" and type(example.data) is str:
+                    content = example.data
+                elif format == "csv":
+                    content = example.data
+                logging.debug(f"Content of example file {p}: {content}")
+                f.write(content)
+        path = f"{tmp_dir}" + "/{model}." + format
+        delimiter = "array"
+        server = Server(
+            type="local",
+            path=path,
+            format=format,
+            delimiter=delimiter,
+        )
+        run.log_info(f"Using {server} for testing the examples")
+        return server
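To make the new constructor flags concrete, here is a minimal usage sketch; the contract path is a placeholder, and the comments reflect the behaviour of the `test()` changes above.

```python
from datacontract.data_contract import DataContract

# With examples=True, test() copies the examples embedded in the contract
# into a temporary directory and runs the checks against a synthetic
# "local" server instead of the servers declared in the contract.
data_contract = DataContract(
    data_contract_file="datacontract.yaml",  # placeholder path
    examples=True,
)
run = data_contract.test()
print(run.result)
```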
datacontract/engines/fastjsonschema/check_jsonschema.py CHANGED
@@ -29,29 +29,40 @@ def validate_json_stream(model_name, validate, json_stream):
         )
 
 
-def read_json_lines(
+def read_json_lines(file):
+    file_content = file.read()
     for line in file_content.splitlines():
         yield json.loads(line)
 
 
-def
-    for line in
+def read_json_lines_content(file_content: str):
+    for line in file_content.splitlines():
         yield json.loads(line)
 
 
 def read_json_array(file):
-    data = json.
+    data = json.load(file)
+    for item in data:
+        yield item
+
+
+def read_json_array_content(file_content: str):
+    data = json.loads(file_content)
     for item in data:
         yield item
 
 
 def read_json_file(file):
-    yield json.
+    yield json.load(file)
+
+
+def read_json_file_content(file_content: str):
+    yield json.loads(file_content)
 
 
 def process_json_file(run, model_name, validate, file, delimiter):
     if delimiter == "new_line":
-        json_stream =
+        json_stream = read_json_lines(file)
     elif delimiter == "array":
         json_stream = read_json_array(file)
     else:
@@ -60,18 +71,23 @@ def process_json_file(run, model_name, validate, file, delimiter):
 
 
 def process_local_file(run, server, model_name, validate):
-
-
+    path = server.path
+    if "{model}" in path:
+        path = path.format(model=model_name)
+
+    if os.path.isdir(path):
+        return process_directory(run, path, server, model_name, validate)
     else:
-
+        logging.info(f"Processing file {path}")
+        with open(path, 'r') as file:
             process_json_file(run, model_name, validate, file, server.delimiter)
 
 
-def process_directory(run, server, model_name, validate):
+def process_directory(run, path, server, model_name, validate):
     success = True
-    for filename in os.listdir(
+    for filename in os.listdir(path):
         if filename.endswith('.json'):  # or make this a parameter
-            file_path = os.path.join(
+            file_path = os.path.join(path, filename)
             with open(file_path, 'r') as file:
                 if not process_json_file(run, model_name, validate, file, server.delimiter):
                     success = False
@@ -82,15 +98,17 @@ def process_directory(run, server, model_name, validate):
 def process_s3_file(server, model_name, validate):
     s3_endpoint_url = server.endpointUrl
     s3_location = server.location
+    if "{model}" in s3_location:
+        s3_location = s3_location.format(model=model_name)
     json_stream = None
 
     for file_content in yield_s3_files(s3_endpoint_url, s3_location):
         if server.delimiter == "new_line":
-            json_stream =
+            json_stream = read_json_lines_content(file_content)
         elif server.delimiter == "array":
-            json_stream =
+            json_stream = read_json_array_content(file_content)
         else:
-            json_stream =
+            json_stream = read_json_file_content(file_content)
 
     if json_stream is None:
         raise DataContractException(
@@ -123,39 +141,35 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
         run.log_warn("jsonschema: No models found. Skip jsonschema checks.")
         return
 
-
-
-
+    for model_name, model in iter(data_contract.models.items()):
+        # Process the model
+        run.log_info(f"jsonschema: Converting model {model_name} to JSON Schema")
+        schema = to_jsonschema(model_name, model)
+        run.log_info(f"jsonschema: {schema}")
 
-
-    run.log_info("jsonschema: Converting model to JSON Schema")
-    model_name, model = next(iter(data_contract.models.items()))
-    schema = to_jsonschema(model_name, model)
-    run.log_info(f"jsonschema: {schema}")
+        validate = fastjsonschema.compile(schema)
 
-
+        # Process files based on server type
+        if server.type == "local":
+            process_local_file(run, server, model_name, validate)
+        elif server.type == "s3":
+            process_s3_file(server, model_name, validate)
+        else:
+            run.checks.append(Check(
+                type="schema",
+                name="Check that JSON has valid schema",
+                model=model_name,
+                result="warn",
+                reason=f"Server type {server.type} not supported",
+                engine="jsonschema",
+            ))
+            return
 
-    # Process files based on server type
-    if server.type == "local":
-        process_local_file(run, server, model_name, validate)
-    elif server.type == "s3":
-        process_s3_file(server, model_name, validate)
-    else:
         run.checks.append(Check(
             type="schema",
             name="Check that JSON has valid schema",
             model=model_name,
-            result="
-            reason=
+            result="passed",
+            reason="All JSON entries are valid.",
             engine="jsonschema",
         ))
-        return
-
-        run.checks.append(Check(
-            type="schema",
-            name="Check that JSON has valid schema",
-            model=model_name,
-            result="passed",
-            reason="All JSON entries are valid.",
-            engine="jsonschema",
-        ))
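The split between file-based and string-based readers above follows the delimiter handling; a standalone sketch (plain stdlib, sample payloads made up) of what the two `*_content` readers yield:

```python
import json

def read_json_lines_content(file_content: str):
    # delimiter "new_line": one JSON document per line
    for line in file_content.splitlines():
        yield json.loads(line)

def read_json_array_content(file_content: str):
    # delimiter "array": a single top-level JSON array
    for item in json.loads(file_content):
        yield item

print(list(read_json_lines_content('{"id": 1}\n{"id": 2}')))    # [{'id': 1}, {'id': 2}]
print(list(read_json_array_content('[{"id": 1}, {"id": 2}]')))  # [{'id': 1}, {'id': 2}]
```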
datacontract/engines/soda/check_soda_execute.py CHANGED
@@ -2,7 +2,13 @@ import logging
 
 from soda.scan import Scan
 
+from datacontract.engines.soda.connections.bigquery import \
+    to_bigquery_soda_configuration
+from datacontract.engines.soda.connections.databricks import \
+    to_databricks_soda_configuration
 from datacontract.engines.soda.connections.duckdb import get_duckdb_connection
+from datacontract.engines.soda.connections.postgres import \
+    to_postgres_soda_configuration
 from datacontract.engines.soda.connections.snowflake import \
     to_snowflake_soda_configuration
 from datacontract.export.sodacl_converter import to_sodacl
@@ -12,7 +18,7 @@ from datacontract.model.run import \
     Run, Check, Log
 
 
-def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server):
+def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark):
     if data_contract is None:
         run.log_warn("Cannot run engine soda-core, as data contract is invalid")
         return
@@ -39,6 +45,24 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
         soda_configuration_str = to_snowflake_soda_configuration(server)
         scan.add_configuration_yaml_str(soda_configuration_str)
         scan.set_data_source_name(server.type)
+    elif server.type == "bigquery":
+        soda_configuration_str = to_bigquery_soda_configuration(server)
+        scan.add_configuration_yaml_str(soda_configuration_str)
+        scan.set_data_source_name(server.type)
+    elif server.type == "postgres":
+        soda_configuration_str = to_postgres_soda_configuration(server)
+        scan.add_configuration_yaml_str(soda_configuration_str)
+        scan.set_data_source_name(server.type)
+    elif server.type == "databricks":
+        if spark is not None:
+            logging.info("Use Spark to connect to data source")
+            scan.add_spark_session(spark, data_source_name=server.type)
+            scan.set_data_source_name(server.type)
+            spark.sql(f"USE {server.catalog}.{server.schema_}")
+        else:
+            soda_configuration_str = to_databricks_soda_configuration(server)
+            scan.add_configuration_yaml_str(soda_configuration_str)
+            scan.set_data_source_name(server.type)
     else:
         run.checks.append(Check(
             type="general",
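A sketch of the new `spark` parameter from the caller's side; the contract path is a placeholder, and without a Spark session the databricks branch above falls back to the configuration built from the DATACONTRACT_DATABRICKS_* environment variables.

```python
from datacontract.data_contract import DataContract

# In a Databricks notebook an active SparkSession named `spark` already
# exists and can be passed straight through; None triggers the
# configuration-based connection instead.
spark = None  # replace with an existing SparkSession when available

run = DataContract(
    data_contract_file="datacontract.yaml",  # placeholder path
    spark=spark,
).test()
```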
datacontract/engines/soda/connections/bigquery.py ADDED
@@ -0,0 +1,18 @@
+import os
+import yaml
+
+# https://docs.soda.io/soda/connect-bigquery.html#authentication-methods
+def to_bigquery_soda_configuration(server):
+    # with service account key, using an external json file
+    soda_configuration = {
+        f"data_source {server.type}": {
+            "type": "bigquery",
+            "account_info_json_path": os.getenv('DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH'),
+            "auth_scopes": ["https://www.googleapis.com/auth/bigquery"],
+            "project_id": server.project,
+            "dataset": server.dataset
+        }
+    }
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
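A hedged sketch of calling this converter; it assumes the Server model also exposes `project` and `dataset` (the converter reads both), and the key path is a placeholder.

```python
import os

from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
from datacontract.model.data_contract_specification import Server

# The converter reads the service-account key path from the environment.
os.environ["DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH"] = "~/service-access-key.json"  # placeholder

server = Server(type="bigquery",
                project="datameshexample-product",
                dataset="datacontract_cli_test_dataset")
# Prints a YAML document with a single "data_source bigquery" block.
print(to_bigquery_soda_configuration(server))
```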
datacontract/engines/soda/connections/databricks.py ADDED
@@ -0,0 +1,20 @@
+import os
+
+import yaml
+
+
+def to_databricks_soda_configuration(server):
+    soda_configuration = {
+        f"data_source {server.type}": {
+            "type": "spark",
+            "method": "databricks",
+            "host": server.host,
+            "catalog": server.catalog,
+            "schema": server.schema_,
+            "http_path": os.getenv('DATACONTRACT_DATABRICKS_HTTP_PATH'),
+            "token": os.getenv('DATACONTRACT_DATABRICKS_TOKEN'),
+        }
+    }
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
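The same kind of sketch for Databricks; the token and HTTP path are placeholders read from the environment, and `schema` is passed via the pydantic alias of `schema_`.

```python
import os

from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration
from datacontract.model.data_contract_specification import Server

os.environ["DATACONTRACT_DATABRICKS_TOKEN"] = "dapi-placeholder"                         # placeholder token
os.environ["DATACONTRACT_DATABRICKS_HTTP_PATH"] = "/sql/1.0/warehouses/b053a3ffffffff"   # placeholder path

server = Server(type="databricks",
                host="dbc-abcdefgh-1234.cloud.databricks.com",
                catalog="acme_catalog_prod",
                schema="orders_latest")  # stored on schema_ via the 'schema' alias
# Prints a YAML "data_source databricks" block with type "spark" / method "databricks".
print(to_databricks_soda_configuration(server))
```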
datacontract/engines/soda/connections/duckdb.py CHANGED
@@ -13,7 +13,11 @@ def get_duckdb_connection(data_contract, server):
     path = server.location
     setup_s3_connection(con, server)
     for model_name in data_contract.models:
-
+        model_path = path
+        if "{model}" in model_path:
+            model_path = model_path.format(model = model_name)
+        logging.info(f"Creating table {model_name} for {model_path}")
+
         if server.format == "json":
             format = "auto"
             if server.delimiter == "new_line":
@@ -21,15 +25,15 @@ def get_duckdb_connection(data_contract, server):
             elif server.delimiter == "array":
                 format = "array"
             con.sql(f"""
-                        CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{
+                        CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{format}', hive_partitioning=1);
                         """)
         elif server.format == "parquet":
             con.sql(f"""
-                        CREATE VIEW "{model_name}" AS SELECT * FROM read_parquet('{
+                        CREATE VIEW "{model_name}" AS SELECT * FROM read_parquet('{model_path}', hive_partitioning=1);
                         """)
         elif server.format == "csv":
             con.sql(f"""
-                        CREATE VIEW "{model_name}" AS SELECT * FROM read_csv_auto('{
+                        CREATE VIEW "{model_name}" AS SELECT * FROM read_csv_auto('{model_path}', hive_partitioning=1);
                         """)
     return con
 
@@ -38,8 +42,8 @@ def setup_s3_connection(con, server):
     s3_region = os.getenv('DATACONTRACT_S3_REGION')
     s3_access_key_id = os.getenv('DATACONTRACT_S3_ACCESS_KEY_ID')
     s3_secret_access_key = os.getenv('DATACONTRACT_S3_SECRET_ACCESS_KEY')
-    con.install_extension("httpfs")
-    con.load_extension("httpfs")
+    # con.install_extension("httpfs")
+    # con.load_extension("httpfs")
     if server.endpointUrl is not None:
         s3_endpoint = server.endpointUrl.removeprefix("http://").removeprefix("https://")
         if server.endpointUrl.startswith("http://"):
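The `{model}` templating added above, shown in isolation; paths and model names are made up.

```python
# The server location may contain a "{model}" placeholder that is filled in
# per model before the DuckDB view is created.
path = "s3://bucket-name/data/{model}/*.json"

for model_name in ["orders", "line_items"]:
    model_path = path
    if "{model}" in model_path:
        model_path = model_path.format(model=model_name)
    print(model_name, "->", model_path)
# orders -> s3://bucket-name/data/orders/*.json
# line_items -> s3://bucket-name/data/line_items/*.json
```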
datacontract/engines/soda/connections/postgres.py ADDED
@@ -0,0 +1,21 @@
+import os
+
+import yaml
+
+
+def to_postgres_soda_configuration(server):
+    # with service account key, using an external json file
+    soda_configuration = {
+        f"data_source {server.type}": {
+            "type": "postgres",
+            "host": server.host,
+            "port": str(server.port),
+            "username": os.getenv('DATACONTRACT_POSTGRES_USERNAME'),
+            "password": os.getenv('DATACONTRACT_POSTGRES_PASSWORD'),
+            "database": server.database,
+            "schema": server.schema_,
+        }
+    }
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
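And the Postgres variant; the credentials are placeholders taken from the environment, and `schema` again maps to `schema_` through its alias.

```python
import os

from datacontract.engines.soda.connections.postgres import to_postgres_soda_configuration
from datacontract.model.data_contract_specification import Server

os.environ["DATACONTRACT_POSTGRES_USERNAME"] = "postgres"          # placeholder
os.environ["DATACONTRACT_POSTGRES_PASSWORD"] = "mysecretpassword"  # placeholder

server = Server(type="postgres", host="localhost", port=5432,
                database="postgres", schema="public")
# Prints a YAML "data_source postgres" block.
print(to_postgres_soda_configuration(server))
```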
datacontract/export/sodacl_converter.py CHANGED
@@ -60,7 +60,7 @@ def check_field_type(field_name: str, type: str):
 
 def check_field_required(field_name):
     return {
-        f"missing_count(
+        f"missing_count({field_name}) = 0": {
             "name": f"Check that required field {field_name} has no null values"
         }
     }
@@ -68,7 +68,7 @@ def check_field_required(field_name):
 
 def check_field_unique(field_name):
     return {
-        f'duplicate_count(
+        f'duplicate_count({field_name}) = 0': {
             "name": f"Check that unique field {field_name} has no duplicate values"
         }
     }
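To show what the corrected builders now emit, here are standalone copies of the two functions applied to an illustrative field name; these dictionaries are later dumped to YAML by the converter.

```python
# Standalone copies of the two corrected builders, to show the SodaCL
# snippets they produce.
def check_field_required(field_name):
    return {
        f"missing_count({field_name}) = 0": {
            "name": f"Check that required field {field_name} has no null values"
        }
    }

def check_field_unique(field_name):
    return {
        f"duplicate_count({field_name}) = 0": {
            "name": f"Check that unique field {field_name} has no duplicate values"
        }
    }

print(check_field_required("order_id"))
print(check_field_unique("order_id"))
```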
datacontract/lint/lint.py ADDED
@@ -0,0 +1,126 @@
+from enum import Enum
+from dataclasses import dataclass, field
+from typing import Sequence, Any
+import abc
+
+from ..model.data_contract_specification import DataContractSpecification
+from datacontract.model.run import Check
+
+"""This module contains linter definitions for linting a data contract.
+
+Lints are quality checks that can succeed, fail, or warn. They are
+distinct from checks such as "valid yaml" or "file not found", which
+will cause the processing of the data contract to stop. Lints can be
+ignored, and are high-level requirements on the format of a data
+contract."""
+
+
+class LintSeverity(Enum):
+    """The severity of a lint message. Generally, lint messages should be
+    emitted with a severity of ERROR. WARNING should be used when the linter
+    cannot determine a lint result, for example, when an unsupported model
+    type is used.
+    """
+    ERROR = 2
+    WARNING = 1
+
+
+@dataclass
+class LinterMessage:
+    """A single linter message with attached severity and optional "model" that
+    caused the message.
+
+    Attributes:
+        outcome: The outcome of the linting, either ERROR or WARNING.
+        message: A message describing the error or warning in more detail.
+        model: The model that caused the lint to fail. Is optional.
+
+    """
+    outcome: LintSeverity
+    message: str
+    model: Any = None
+
+    @classmethod
+    def error(cls, message: str, model=None):
+        return LinterMessage(LintSeverity.ERROR, message, model)
+
+    @classmethod
+    def warning(cls, message: str, model=None):
+        return LinterMessage(LintSeverity.WARNING, message, model)
+
+
+@dataclass
+class LinterResult:
+    """Result of linting a contract. Contains multiple LinterResults from
+    the same linter or lint phase.
+
+    Attributes:
+        linter: The linter that produced these results
+        results: A list of linting results. Multiple identical linting
+            results can be present in the list. An empty list means that
+            the linter ran without producing warnings or errors.
+    """
+    results: Sequence[LinterMessage] = field(default_factory=list)
+
+    def with_warning(self, message, model=None):
+        result = LinterMessage.warning(message, model)
+        return LinterResult(self.results + [result])
+
+    def with_error(self, message, model=None):
+        result = LinterMessage.error(message, model)
+        return LinterResult(self.results + [result])
+
+    def has_errors(self) -> bool:
+        return any(map(lambda result: result.outcome == LintSeverity.ERROR,
+                       self.results))
+
+    def has_warnings(self) -> bool:
+        return any(map(lambda result: result.outcome == LintSeverity.WARNING,
+                       self.results))
+
+    def error_results(self) -> Sequence[LinterMessage]:
+        return [result for result in self.results
+                if result.outcome == LintSeverity.ERROR]
+
+    def warning_results(self) -> Sequence[LinterMessage]:
+        return [result for result in self.results
+                if result.outcome == LintSeverity.WARNING]
+
+    def no_errors_or_warnings(self) -> bool:
+        return len(self.results) == 0
+
+
+class Linter(abc.ABC):
+    @property
+    @abc.abstractmethod
+    def name(self) -> str:
+        pass
+
+    @abc.abstractmethod
+    def lint_implementation(self, contract: DataContractSpecification) -> LinterResult:
+        pass
+
+    def lint(self, contract: DataContractSpecification) -> list[Check]:
+        result = self.lint_implementation(contract)
+        checks = []
+        if not result.error_results():
+            checks.append(Check(
+                type="lint",
+                name=f"Linter '{self.name()}'",
+                result="passed",
+                engine="datacontract"
+            ))
+        else:
+            # All linter messages are treated as warnings. Severity is
+            # currently ignored, but could be used in filtering in the future
+            # Linter messages with level WARNING are currently ignored, but might
+            # be logged or printed in the future.
+            for lint_error in result.error_results():
+                checks.append(Check(
+                    type="lint",
+                    name=f"Linter '{self.name()}'",
+                    result="warning",
+                    engine="datacontract",
+                    reason=lint_error.message
+                ))
+        return checks
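A hypothetical linter built on this framework, to show how `lint_implementation` and the `Check` conversion in `Linter.lint()` fit together; it is illustrative only and not part of the package.

```python
from datacontract.lint.lint import Linter, LinterResult
from datacontract.model.data_contract_specification import DataContractSpecification

class ModelsPresentLinter(Linter):
    """Hypothetical linter: flag contracts that declare no models."""

    def name(self) -> str:
        return "Contract declares at least one model"

    def lint_implementation(self, contract: DataContractSpecification) -> LinterResult:
        result = LinterResult()
        if not contract.models:
            result = result.with_error("Data contract defines no models")
        return result

# Linter.lint(contract) turns the LinterResult into Check objects: a single
# "passed" check when there are no errors, otherwise one "warning" check
# per error message.
```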
datacontract/lint/linters/__init__.py ADDED: File without changes
datacontract/lint/linters/example_model_linter.py ADDED
@@ -0,0 +1,67 @@
+import csv
+import yaml
+import json
+import io
+
+from ..lint import Linter, LinterResult
+from datacontract.model.data_contract_specification import DataContractSpecification, Example
+
+
+class ExampleModelLinter(Linter):
+    def name(self) -> str:
+        return "Example(s) match model"
+
+    @staticmethod
+    def get_example_headers(example: Example) -> list[str]:
+        match example.type:
+            case "csv":
+                dialect = csv.Sniffer().sniff(example.data)
+                data = io.StringIO(example.data)
+                reader = csv.reader(data, dialect=dialect)
+                return next(reader)
+            case "yaml":
+                data = yaml.safe_load(example.data)
+                return data.keys()
+            case "json":
+                data = json.loads(example.data)
+                return data.keys()
+
+    def lint_implementation(
+            self,
+            data_contract_yaml: DataContractSpecification
+    ) -> LinterResult:
+        """Check whether the example(s) match the model."""
+        result = LinterResult()
+        examples = data_contract_yaml.examples
+        models = data_contract_yaml.models
+        examples_with_model = []
+        for (index, example) in enumerate(examples):
+            if example.model not in models:
+                result = result.with_error(
+                    f"Example {index + 1} has non-existent model '{example.model}'")
+            else:
+                examples_with_model.append(
+                    (index, example, models.get(example.model)))
+        for (index, example, model) in examples_with_model:
+            if example.type == "custom":
+                result = result.with_warning(f"Example {index + 1} has type"
+                                             " \"custom\", cannot check model"
+                                             " conformance")
+            elif model.type == "object":
+                result = result.with_warning(
+                    f"Example {index + 1} uses a "
+                    f"model '{example.model}' with type 'object'. Linting is "
+                    "currently only supported for 'table' models")
+            else:
+                headers = self.get_example_headers(example)
+                for example_header in headers:
+                    if example_header not in model.fields:
+                        result = result.with_error(
+                            f"Example {index + 1} has field '{example_header}'"
+                            f" that's not contained in model '{example.model}'")
+                for (field_name, field_value) in model.fields.items():
+                    if field_name not in headers and field_value.required:
+                        result = result.with_error(
+                            f"Example {index + 1} is missing field '{field_name}'"
+                            f" required by model '{example.model}'")
+        return result
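A small sketch of the header extraction this linter relies on; the Example values are illustrative, and constructing `Example` directly with just these three fields is an assumption about the pydantic model.

```python
from datacontract.lint.linters.example_model_linter import ExampleModelLinter
from datacontract.model.data_contract_specification import Example

# CSV headers are sniffed from the example data and later compared
# against model.fields by lint_implementation.
example = Example(type="csv", model="orders",
                  data="order_id,processed_date\n1001,2024-01-01")
print(ExampleModelLinter.get_example_headers(example))  # ['order_id', 'processed_date']
```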
datacontract/model/data_contract_specification.py CHANGED
@@ -23,6 +23,11 @@ class Server(BaseModel):
     account: str = None
     database: str = None
     schema_: str = pydantic.fields.Field(default=None, alias='schema')
+    host: str = None
+    port: int = None
+    catalog: str = None
+    http_path: str = None  # Use ENV variable
+    token: str = None  # Use ENV variable
     dataProductId: str = None
     outputPortId: str = None
 
{datacontract_cli-0.9.2.dist-info → datacontract_cli-0.9.4.dist-info}/METADATA CHANGED
@@ -1,31 +1,36 @@
 Metadata-Version: 2.1
 Name: datacontract-cli
-Version: 0.9.
-Summary:
+Version: 0.9.4
+Summary: Test data contracts
 Author-email: Jochen Christ <jochen.christ@innoq.com>, Stefan Negele <stefan.negele@innoq.com>
 Project-URL: Homepage, https://cli.datacontract.com
 Project-URL: Issues, https://github.com/datacontract/cli/issues
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
-Requires-Python:
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: typer[all] ~=0.9.0
-Requires-Dist: pydantic
+Requires-Dist: pydantic <2.7.0,>=2.5.3
 Requires-Dist: pyyaml ~=6.0.1
 Requires-Dist: requests ~=2.31.0
 Requires-Dist: fastparquet ==2023.10.1
-Requires-Dist: soda-core-
-Requires-Dist: soda-core-
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: soda-core-bigquery ~=3.1.5
+Requires-Dist: soda-core-duckdb ~=3.1.5
+Requires-Dist: soda-core-postgres ~=3.1.5
+Requires-Dist: soda-core-snowflake ~=3.1.5
+Requires-Dist: soda-core-spark[databricks] ~=3.1.5
+Requires-Dist: soda-core-spark-df ~=3.1.5
+Requires-Dist: snowflake-connector-python[pandas] <3.8,>=3.6
+Requires-Dist: duckdb ==0.10.0
 Requires-Dist: fastjsonschema ~=2.19.1
 Requires-Dist: python-dotenv ~=1.0.0
-Requires-Dist: s3fs ==
+Requires-Dist: s3fs ==2024.2.0
 Provides-Extra: dev
 Requires-Dist: pytest ; extra == 'dev'
 Requires-Dist: testcontainers-minio ; extra == 'dev'
+Requires-Dist: testcontainers-postgres ; extra == 'dev'
 
 # Data Contract CLI
 
@@ -44,12 +49,12 @@ It uses data contract YAML files to lint the data contract, connect to data sour
 
 ## Getting started
 
-Let's use [pip](https://pip.pypa.io/en/stable/getting-started/) to install the CLI.
+Let's use [pip](https://pip.pypa.io/en/stable/getting-started/) to install the CLI.
 ```bash
 $ pip3 install datacontract-cli
 ```
 
-Now, let's look at this data contract:
+Now, let's look at this data contract:
 [https://datacontract.com/examples/covid-cases/datacontract.yaml](https://datacontract.com/examples/covid-cases/datacontract.yaml)
 
 We have a _servers_ section with endpoint details to the (public) S3 bucket, _models_ for the structure of the data, and _quality_ attributes that describe the expected freshness and number of rows.
@@ -77,6 +82,9 @@ $ datacontract lint datacontract.yaml
 # execute schema and quality checks
 $ datacontract test datacontract.yaml
 
+# execute schema and quality checks on the examples within the contract
+$ datacontract test --examples datacontract.yaml
+
 # find differences between to data contracts (Coming Soon)
 $ datacontract diff datacontract-v1.yaml datacontract-v2.yaml
 
@@ -124,6 +132,7 @@ Choose the most appropriate installation method for your needs:
 
 ### pip
 Python 3.11 recommended.
+Python 3.12 available as pre-release release candidate for 0.9.3
 
 ```bash
 pip3 install datacontract-cli
@@ -135,17 +144,17 @@ pipx installs into an isolated environment.
 pipx install datacontract-cli
 ```
 
-###
+### Docker
 
 ```bash
-
+docker pull --platform linux/amd64 datacontract/cli
+docker run --rm --platform linux/amd64 -v ${PWD}:/home/datacontract datacontract/cli
 ```
 
-
+Or via an alias that automatically uses the latest version:
 
 ```bash
-docker
-docker run --rm -v ${PWD}:/datacontract datacontract/cli
+alias datacontract='docker run --rm -v "${PWD}:/home/datacontract" --platform linux/amd64 datacontract/cli:latest'
 ```
 
 ## Documentation
@@ -154,7 +163,7 @@ docker run --rm -v ${PWD}:/datacontract datacontract/cli
 
 Data Contract CLI can connect to data sources and run schema and quality tests to verify that the data contract is valid.
 
-```bash
+```bash
 $ datacontract test --server production datacontract.yaml
 ```
 
@@ -168,11 +177,12 @@ The application uses different engines, based on the server `type`.
 | `s3` | `json` | Support for `new_line` delimited JSON files and one JSON record per file. | ✅ | fastjsonschema<br> soda-core-duckdb |
 | `s3` | `csv` | | ✅ | soda-core-duckdb |
 | `s3` | `delta` | | Coming soon | TBD |
-| `postgres` | n/a | |
-| `snowflake` | n/a | | ✅
-| `bigquery` | n/a | |
+| `postgres` | n/a | | ✅ | soda-core-postgres |
+| `snowflake` | n/a | | ✅ | soda-core-snowflake |
+| `bigquery` | n/a | | ✅ | soda-core-bigquery |
 | `redshift` | n/a | | Coming soon | TBD |
-| `databricks` | n/a |
+| `databricks` | n/a | Support for Databricks SQL with Unity catalog and Hive metastore. | ✅ | soda-core-spark |
+| `databricks` | n/a | Support for Spark for programmatic use in Notebooks. | ✅ | soda-core-spark-df |
 | `kafka` | `json` | | Coming soon | TBD |
 | `kafka` | `avro` | | Coming soon | TBD |
 | `kafka` | `protobuf` | | Coming soon | TBD |
@@ -182,32 +192,178 @@ The application uses different engines, based on the server `type`.
 
 Feel free to create an issue, if you need support for an additional type.
 
-###
+### S3
 
-
+Data Contract CLI can test data that is stored in S3 buckets or any S3-compliant endpoints in various formats.
+
+#### Example
 
 datacontract.yaml
-```
+```yaml
 servers:
   production:
     type: s3
     endpointUrl: https://minio.example.com # not needed with AWS S3
     location: s3://bucket-name/path/*/*.json
-    delimiter: new_line # new_line, array, or none
     format: json
+    delimiter: new_line # new_line, array, or none
+```
+
+#### Environment Variables
+
+| Environment Variable | Example | Description |
+|-----------------------------------|-------------------------------|-----------------------|
+| `DATACONTRACT_S3_REGION` | `eu-central-1` | Region of S3 bucket |
+| `DATACONTRACT_S3_ACCESS_KEY_ID` | `AKIAXV5Q5QABCDEFGH` | AWS Access Key ID |
+| `DATACONTRACT_S3_SECRET_ACCESS_KEY` | `93S7LRrJcqLaaaa/XXXXXXXXXXXXX` | AWS Secret Access Key |
+
+
+### Postgres
+
+Data Contract CLI can test data in Postgres or Postgres-compliant databases (e.g., RisingWave).
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  postgres:
+    type: postgres
+    host: localhost
+    port: 5432
+    database: postgres
+    schema: public
+models:
+  my_table_1: # corresponds to a table
+    type: table
+    fields:
+      my_column_1: # corresponds to a column
+        type: varchar
+```
+
+#### Environment Variables
+
+| Environment Variable | Example | Description |
+|----------------------------------|--------------------|-------------|
+| `DATACONTRACT_POSTGRES_USERNAME` | `postgres` | Username |
+| `DATACONTRACT_POSTGRES_PASSWORD` | `mysecretpassword` | Password |
+
+
+### BigQuery
+
+We support authentication to BigQuery using Service Account Key. The used Service Account should include the roles:
+* BigQuery Job User
+* BigQuery Data Viewer
+
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  production:
+    type: bigquery
+    project: datameshexample-product
+    dataset: datacontract_cli_test_dataset
+models:
+  datacontract_cli_test_table: # corresponds to a BigQuery table
+    type: table
+    fields: ...
+```
+
+#### Environment Variables
+
+| Environment Variable | Example | Description |
+|----------------------------------------------|---------------------------|---------------------------------------------------------|
+| `DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH` | `~/service-access-key.json` | Service Access key as saved on key creation by BigQuery |
+
+
+### Databricks
+
+Works with Unity Catalog and Hive metastore.
+
+Needs a running SQL warehouse or compute cluster.
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  production:
+    type: databricks
+    host: dbc-abcdefgh-1234.cloud.databricks.com
+    catalog: acme_catalog_prod
+    schema: orders_latest
+models:
+  orders: # corresponds to a table
+    type: table
+    fields: ...
 ```
 
-Environment
+#### Environment Variables
+
+| Environment Variable | Example | Description |
+|----------------------------------------------|--------------------------------------|-------------------------------------------------------|
+| `DATACONTRACT_DATABRICKS_TOKEN` | `dapia00000000000000000000000000000` | The personal access token to authenticate |
+| `DATACONTRACT_DATABRICKS_HTTP_PATH` | `/sql/1.0/warehouses/b053a3ffffffff` | The HTTP path to the SQL warehouse or compute cluster |
+
+
+### Databricks (programmatic)
+
+Works with Unity Catalog and Hive metastore.
+When running in a notebook or pipeline, the provided `spark` session can be used.
+An additional authentication is not required.
+
+Requires a Databricks Runtime with Python >= 3.10.
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  production:
+    type: databricks
+    host: dbc-abcdefgh-1234.cloud.databricks.com # ignored, always use current host
+    catalog: acme_catalog_prod
+    schema: orders_latest
+models:
+  orders: # corresponds to a table
+    type: table
+    fields: ...
 ```
-
-
-
+
+Notebook
+```python
+%pip install git+https://github.com/datacontract/cli.git
+dbutils.library.restartPython()
+
+from datacontract.data_contract import DataContract
+
+data_contract = DataContract(
+  data_contract_file="/Volumes/acme_catalog_prod/orders_latest/datacontract/datacontract.yaml",
+  spark=spark)
+run = data_contract.test()
+run.result
 ```
 
 
+### Exports
+
+Available export options:
+
+| Type | Description | Status |
+|--------------|------------------------------------------------|--------|
+| `jsonschema` | Export to JSON Schema | ✅ |
+| `sodacl` | Export to SodaCL quality checks in YAML format | ✅ |
+| `dbt` | Export to dbt model in YAML format | TBD |
+| `avro` | Export to AVRO models | TBD |
+| `pydantic` | Export to pydantic models | TBD |
+| `sql` | Export to SQL DDL | TBD |
+| `protobuf` | Export to Protobuf | TBD |
+
 ## Development Setup
 
-Python base interpreter should be 3.11.x
+Python base interpreter should be 3.11.x (unless working on 3.12 release candidate).
 
 ```bash
 # create venv
@@ -237,7 +393,7 @@ Docker Build
 
 ```
 docker build -t datacontract/cli .
-docker run --rm -v ${PWD}:/datacontract datacontract/cli
+docker run --rm -v ${PWD}:/home/datacontract datacontract/cli
 ```
 
 ## Contribution
@@ -251,3 +407,7 @@ We are happy to receive your contributions. Propose your change in an issue or d
 ## Credits
 
 Created by [Stefan Negele](https://www.linkedin.com/in/stefan-negele-573153112/) and [Jochen Christ](https://www.linkedin.com/in/jochenchrist/).
+
+
+
+<a href="https://github.com/datacontract/cli" class="github-corner" aria-label="View source on GitHub"><svg width="80" height="80" viewBox="0 0 250 250" style="fill:#151513; color:#fff; position: absolute; top: 0; border: 0; right: 0;" aria-hidden="true"><path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"></path></svg></a><style>.github-corner:hover .octo-arm{animation:octocat-wave 560ms ease-in-out}@keyframes octocat-wave{0%,100%{transform:rotate(0)}20%,60%{transform:rotate(-25deg)}40%,80%{transform:rotate(10deg)}}@media (max-width:500px){.github-corner:hover .octo-arm{animation:none}.github-corner .octo-arm{animation:octocat-wave 560ms ease-in-out}}</style>
{datacontract_cli-0.9.2.dist-info → datacontract_cli-0.9.4.dist-info}/RECORD CHANGED
@@ -1,31 +1,37 @@
 datacontract/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datacontract/cli.py,sha256=
-datacontract/data_contract.py,sha256=
+datacontract/cli.py,sha256=CaU0B68__T6t_JzcfE4cQ54CCwKkvHnkatl_zVFIFQg,4349
+datacontract/data_contract.py,sha256=BOJ8UuT75o-nQwuE-oHxHosSn6JC1F74OHQcmjiaoCs,7371
 datacontract/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py,sha256=Tj_REcEYl2BtIR_W9k0pjdjE4CvBE-4vpFrGAvvrde4,1557
 datacontract/engines/datacontract/check_that_datacontract_file_exists.py,sha256=V_YJyt1rKkkKhghU359vaAGtC8leIGmwqR4MlrLgCJ4,620
 datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py,sha256=bRoJp8a-Hvdc2OkbcTcS0tr8M7XxNzWbJAUFrc-ceiA,1393
-datacontract/engines/fastjsonschema/check_jsonschema.py,sha256=
-datacontract/engines/fastjsonschema/s3/s3_read_files.py,sha256=
+datacontract/engines/fastjsonschema/check_jsonschema.py,sha256=QuvFkeA-cE2nsHL33wQLb7QuhBQg4AQDQ3pav-iw9uE,5596
+datacontract/engines/fastjsonschema/s3/s3_read_files.py,sha256=sCe028D8q04c2pYlzJuEXWmMZOQJLiaObyLXLe4UzUs,713
 datacontract/engines/soda/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datacontract/engines/soda/check_soda_execute.py,sha256=
+datacontract/engines/soda/check_soda_execute.py,sha256=WPvan3olUY7tao_75Uun7khwYLBTSRX9wtE1vCRmWJA,5572
+datacontract/engines/soda/connections/bigquery.py,sha256=_hNd7Lmo6DjLb3nqVx_pfePwSYp3_3T_hwivVlATEyI,658
 datacontract/engines/soda/connections/dask.py,sha256=iQfu4swHN_QfY9l0TdSbqAQXJvfKMIxGoZ4xiNpi4eY,1497
-datacontract/engines/soda/connections/
+datacontract/engines/soda/connections/databricks.py,sha256=tCVE2Q2BXjuxS5ZmDyH_qN6jigStBsfIikRYMQ5LKVs,561
+datacontract/engines/soda/connections/duckdb.py,sha256=_Tpfo5D1ahOUPHbnEZ1WloeCecQ2LYDUebIU3hnnBDg,2342
+datacontract/engines/soda/connections/postgres.py,sha256=ow21gzxiV2_FyOXrFYeSRefLKwRQR5_qxtOR2T1rdTI,625
 datacontract/engines/soda/connections/snowflake.py,sha256=H941nOQULZKznmarVvZcvJhseMOUwfnMsv1r_P0MMb0,719
 datacontract/export/jsonschema_converter.py,sha256=gceZ-_euhedZzPfpVG8xYI16-ro9wLwAnqfkwDNWDTE,2977
-datacontract/export/sodacl_converter.py,sha256=
+datacontract/export/sodacl_converter.py,sha256=7P6be3GAsUaLchqP6GNibKvpXmpo24D6z1NvOyJCjcI,2836
 datacontract/init/download_datacontract_file.py,sha256=H_234IfZ3xezjgcZ4sb7wSCEZCDUjM1uYsUibHHj4Ow,412
 datacontract/integration/publish_datamesh_manager.py,sha256=_qn4lyUkyrO0IKFzfzWCNBqEH5Ur20M_cpieIPtgRwc,1358
 datacontract/lint/files.py,sha256=DIUetslLuBvvddgza4vEvvUBMSVeJ4I1LHFID0mmMfU,470
+datacontract/lint/lint.py,sha256=4-4vrwy-98e_Za_faPxiWM8npGMzQTQgg74_x9v7cAU,4371
 datacontract/lint/resolve.py,sha256=5VnWG7lV5YgKFdabi5c-G212caMx9g0LrGeanQ2z1r4,3022
 datacontract/lint/schema.py,sha256=9UipDhpY6jQEtC6vKZ44-NcVMbpPXBvs9HZYGQ0gsAM,174
 datacontract/lint/urls.py,sha256=LXg_yzAmG71fJPc_0QeWJ0cKEqkhtZhlZZf1hWMTFNE,1408
-datacontract/
+datacontract/lint/linters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datacontract/lint/linters/example_model_linter.py,sha256=XGXDbNc_LLzwiWstXloJ8dpabxd3xV9IMq5XO3VjKw8,2898
+datacontract/model/data_contract_specification.py,sha256=pTLQH6YoiVrL7L1CrgIh0d0cBPRFV6SUPxGy1Va1TS8,2167
 datacontract/model/exceptions.py,sha256=zhhXnKWTzEyG54N9QDVpE5F986cKuHEXN0OcR5Zy8oc,1090
 datacontract/model/run.py,sha256=AejMAlTex2oh-zQQw6fifWntPnBSLLOB-7VaexG6Ef0,2484
-datacontract_cli-0.9.
-datacontract_cli-0.9.
-datacontract_cli-0.9.
-datacontract_cli-0.9.
-datacontract_cli-0.9.
-datacontract_cli-0.9.
+datacontract_cli-0.9.4.dist-info/LICENSE,sha256=23h64qnSeIZ0DKeziWAKC-zBCt328iSbRbWBrXoYRb4,2210
+datacontract_cli-0.9.4.dist-info/METADATA,sha256=Ks35GB42Js-cwZx-r2x5QfPZLM7etMTKprkc0Ui_Sso,17052
+datacontract_cli-0.9.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datacontract_cli-0.9.4.dist-info/entry_points.txt,sha256=D3Eqy4q_Z6bHauGd4ppIyQglwbrm1AJnLau4Ppbw9Is,54
+datacontract_cli-0.9.4.dist-info/top_level.txt,sha256=VIRjd8EIUrBYWjEXJJjtdUgc0UAJdPZjmLiOR8BRBYM,13
+datacontract_cli-0.9.4.dist-info/RECORD,,
{datacontract_cli-0.9.2.dist-info → datacontract_cli-0.9.4.dist-info}/LICENSE: File without changes
{datacontract_cli-0.9.2.dist-info → datacontract_cli-0.9.4.dist-info}/WHEEL: File without changes
{datacontract_cli-0.9.2.dist-info → datacontract_cli-0.9.4.dist-info}/entry_points.txt: File without changes
{datacontract_cli-0.9.2.dist-info → datacontract_cli-0.9.4.dist-info}/top_level.txt: File without changes