datacontract-cli 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datacontract-cli might be problematic.
- datacontract/cli.py +4 -2
- datacontract/data_contract.py +62 -14
- datacontract/engines/fastjsonschema/check_jsonschema.py +23 -11
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +1 -1
- datacontract/engines/soda/check_soda_execute.py +21 -2
- datacontract/engines/soda/connections/databricks.py +20 -0
- datacontract/engines/soda/connections/postgres.py +21 -0
- datacontract/model/data_contract_specification.py +5 -0
- {datacontract_cli-0.9.3.dist-info → datacontract_cli-0.9.4.dist-info}/METADATA +161 -29
- {datacontract_cli-0.9.3.dist-info → datacontract_cli-0.9.4.dist-info}/RECORD +14 -12
- {datacontract_cli-0.9.3.dist-info → datacontract_cli-0.9.4.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.9.3.dist-info → datacontract_cli-0.9.4.dist-info}/WHEEL +0 -0
- {datacontract_cli-0.9.3.dist-info → datacontract_cli-0.9.4.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.9.3.dist-info → datacontract_cli-0.9.4.dist-info}/top_level.txt +0 -0
datacontract/cli.py
CHANGED
```diff
@@ -87,14 +87,16 @@ def test(
             "Use the key of the server object in the data contract yaml file "
             "to refer to a server, e.g., `production`, or `all` for all "
             "servers (default).")] = "all",
+        examples: Annotated[bool, typer.Option(
+            help="Run the schema and quality tests on the example data within the data contract.")] = None,
         publish: Annotated[str, typer.Option(
-            help="")] = None,
+            help="The url to publish the results after the test")] = None,
 ):
     """
     Run schema and quality tests on configured servers.
     """
     print(f"Testing {location}")
-    run = DataContract(data_contract_file=location, publish_url=publish).test()
+    run = DataContract(data_contract_file=location, publish_url=publish, examples=examples).test()
     _handle_result(run)
 
 
```
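The two new `test` options map one-to-one onto `DataContract` constructor arguments. A minimal sketch of the equivalent programmatic call (the file name is illustrative; assumes the 0.9.4 wheel is installed):

```python
from datacontract.data_contract import DataContract

# Equivalent of `datacontract test --examples datacontract.yaml`: run the
# schema and quality tests against the example data embedded in the
# contract instead of a configured server.
run = DataContract(data_contract_file="datacontract.yaml", examples=True).test()
print(run.result)
```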
datacontract/data_contract.py
CHANGED
```diff
@@ -1,5 +1,9 @@
 import json
 import logging
+import tempfile
+from typing import List
+
+import yaml
 
 from datacontract.engines.datacontract.check_that_datacontract_contains_valid_servers_configuration import \
     check_that_datacontract_contains_valid_server_configuration
@@ -13,7 +17,7 @@ from datacontract.integration.publish_datamesh_manager import \
 from datacontract.lint import resolve
 from datacontract.lint.linters.example_model_linter import ExampleModelLinter
 from datacontract.model.data_contract_specification import \
-    DataContractSpecification
+    DataContractSpecification, Server
 from datacontract.model.exceptions import DataContractException
 from datacontract.model.run import \
     Run, Check
@@ -26,13 +30,17 @@ class DataContract:
                  data_contract_str: str = None,
                  data_contract: DataContractSpecification = None,
                  server: str = None,
+                 examples: bool = False,
                  publish_url: str = None,
+                 spark: str = None,
                  ):
        self._data_contract_file = data_contract_file
        self._data_contract_str = data_contract_str
        self._data_contract = data_contract
        self._server = server
+       self._examples = examples
        self._publish_url = publish_url
+       self._spark = spark
 
    def lint(self):
        run = Run.create_run()
@@ -80,20 +88,27 @@ class DataContract:
 
            check_that_datacontract_contains_valid_server_configuration(run, data_contract, self._server)
            # TODO check yaml contains models
-           server_name = list(data_contract.servers.keys())[0]
-           server = data_contract.servers.get(server_name)
-           run.log_info(f"Running tests for data contract {data_contract.id} with server {server_name}")
-           run.dataContractId = data_contract.id
-           run.dataContractVersion = data_contract.info.version
-           run.dataProductId = server.dataProductId
-           run.outputPortId = server.outputPortId
-           run.server = server_name
 
-
-
-
-
-
+           with tempfile.TemporaryDirectory(prefix="datacontract-cli") as tmp_dir:
+               if self._examples:
+                   server_name = "examples"
+                   server = self._get_examples_server(data_contract, run, tmp_dir)
+               else:
+                   server_name = list(data_contract.servers.keys())[0]
+                   server = data_contract.servers.get(server_name)
+
+               run.log_info(f"Running tests for data contract {data_contract.id} with server {server_name}")
+               run.dataContractId = data_contract.id
+               run.dataContractVersion = data_contract.info.version
+               run.dataProductId = server.dataProductId
+               run.outputPortId = server.outputPortId
+               run.server = server_name
+
+               # 5. check server is supported type
+               # 6. check server credentials are complete
+               if server.format == "json":
+                   check_jsonschema(run, data_contract, server)
+               check_soda_execute(run, data_contract, server, self._spark)
 
        except DataContractException as e:
            run.checks.append(Check(
@@ -123,6 +138,7 @@ class DataContract:
 
        return run
 
+
    def diff(self, other):
        pass
 
@@ -138,3 +154,35 @@ class DataContract:
        else:
            print(f"Export format {export_format} not supported.")
            return ""
+
+   def _get_examples_server(self, data_contract, run, tmp_dir):
+       run.log_info(f"Copying examples to files in temporary directory {tmp_dir}")
+       format = "json"
+       for example in data_contract.examples:
+           format = example.type
+           p = f"{tmp_dir}/{example.model}.{format}"
+           run.log_info(f"Creating example file {p}")
+           with open(p, "w") as f:
+               content = ""
+               if format == "json" and type(example.data) is list:
+                   content = json.dumps(example.data)
+               elif format == "json" and type(example.data) is str:
+                   content = example.data
+               elif format == "yaml" and type(example.data) is list:
+                   content = yaml.dump(example.data)
+               elif format == "yaml" and type(example.data) is str:
+                   content = example.data
+               elif format == "csv":
+                   content = example.data
+               logging.debug(f"Content of example file {p}: {content}")
+               f.write(content)
+       path = f"{tmp_dir}" + "/{model}." + format
+       delimiter = "array"
+       server = Server(
+           type="local",
+           path=path,
+           format=format,
+           delimiter=delimiter,
+       )
+       run.log_info(f"Using {server} for testing the examples")
+       return server
```
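To see what `_get_examples_server` hands back, here is a sketch of the resulting server object for JSON examples (the temp-dir path is a stand-in for the `TemporaryDirectory` created in `test()`): each example is written to `<tmp_dir>/<model>.<type>`, and a `local` server template pointing at those files is returned, so the existing local-file test path does the rest.

```python
from datacontract.model.data_contract_specification import Server

tmp_dir = "/tmp/datacontract-cli-abc123"  # illustrative; created via tempfile in test()
server = Server(
    type="local",
    path=tmp_dir + "/{model}.json",  # "{model}" is substituted per model name later
    format="json",
    delimiter="array",
)
```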
datacontract/engines/fastjsonschema/check_jsonschema.py
CHANGED

```diff
@@ -29,29 +29,40 @@ def validate_json_stream(model_name, validate, json_stream):
     )
 
 
-def read_json_lines(
+def read_json_lines(file):
+    file_content = file.read()
     for line in file_content.splitlines():
         yield json.loads(line)
 
 
-def 
-    for line in 
+def read_json_lines_content(file_content: str):
+    for line in file_content.splitlines():
         yield json.loads(line)
 
 
 def read_json_array(file):
-    data = json.
+    data = json.load(file)
+    for item in data:
+        yield item
+
+
+def read_json_array_content(file_content: str):
+    data = json.loads(file_content)
     for item in data:
         yield item
 
 
 def read_json_file(file):
-    yield json.
+    yield json.load(file)
+
+
+def read_json_file_content(file_content: str):
+    yield json.loads(file_content)
 
 
 def process_json_file(run, model_name, validate, file, delimiter):
     if delimiter == "new_line":
-        json_stream = 
+        json_stream = read_json_lines(file)
     elif delimiter == "array":
         json_stream = read_json_array(file)
     else:
@@ -62,11 +73,12 @@ def process_json_file(run, model_name, validate, file, delimiter):
 def process_local_file(run, server, model_name, validate):
     path = server.path
     if "{model}" in path:
-        path = path.format(model
+        path = path.format(model=model_name)
 
     if os.path.isdir(path):
         return process_directory(run, path, server, model_name, validate)
     else:
+        logging.info(f"Processing file {path}")
         with open(path, 'r') as file:
             process_json_file(run, model_name, validate, file, server.delimiter)
 
@@ -87,16 +99,16 @@ def process_s3_file(server, model_name, validate):
     s3_endpoint_url = server.endpointUrl
     s3_location = server.location
     if "{model}" in s3_location:
-        s3_location = s3_location.format(model
+        s3_location = s3_location.format(model=model_name)
     json_stream = None
 
     for file_content in yield_s3_files(s3_endpoint_url, s3_location):
         if server.delimiter == "new_line":
-            json_stream = 
+            json_stream = read_json_lines_content(file_content)
         elif server.delimiter == "array":
-            json_stream = 
+            json_stream = read_json_array_content(file_content)
         else:
-            json_stream = 
+            json_stream = read_json_file_content(file_content)
 
     if json_stream is None:
         raise DataContractException(
```
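The previously truncated readers are now split into pairs: one variant takes an open file object, the other a raw string (as yielded by `yield_s3_files`). A quick standalone sanity check of the string variant, with the function body copied from the hunk above:

```python
import json

def read_json_lines_content(file_content: str):
    # One JSON document per line (new_line delimiter).
    for line in file_content.splitlines():
        yield json.loads(line)

records = list(read_json_lines_content('{"id": 1}\n{"id": 2}'))
assert records == [{"id": 1}, {"id": 2}]
```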
datacontract/engines/soda/check_soda_execute.py
CHANGED

```diff
@@ -2,7 +2,13 @@ import logging
 
 from soda.scan import Scan
 
+from datacontract.engines.soda.connections.bigquery import \
+    to_bigquery_soda_configuration
+from datacontract.engines.soda.connections.databricks import \
+    to_databricks_soda_configuration
 from datacontract.engines.soda.connections.duckdb import get_duckdb_connection
+from datacontract.engines.soda.connections.postgres import \
+    to_postgres_soda_configuration
 from datacontract.engines.soda.connections.snowflake import \
     to_snowflake_soda_configuration
 from datacontract.export.sodacl_converter import to_sodacl
@@ -10,10 +16,9 @@ from datacontract.model.data_contract_specification import \
     DataContractSpecification, Server
 from datacontract.model.run import \
     Run, Check, Log
-from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
 
 
-def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server):
+def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark):
     if data_contract is None:
         run.log_warn("Cannot run engine soda-core, as data contract is invalid")
         return
@@ -44,6 +49,20 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
         soda_configuration_str = to_bigquery_soda_configuration(server)
         scan.add_configuration_yaml_str(soda_configuration_str)
         scan.set_data_source_name(server.type)
+    elif server.type == "postgres":
+        soda_configuration_str = to_postgres_soda_configuration(server)
+        scan.add_configuration_yaml_str(soda_configuration_str)
+        scan.set_data_source_name(server.type)
+    elif server.type == "databricks":
+        if spark is not None:
+            logging.info("Use Spark to connect to data source")
+            scan.add_spark_session(spark, data_source_name=server.type)
+            scan.set_data_source_name(server.type)
+            spark.sql(f"USE {server.catalog}.{server.schema_}")
+        else:
+            soda_configuration_str = to_databricks_soda_configuration(server)
+            scan.add_configuration_yaml_str(soda_configuration_str)
+            scan.set_data_source_name(server.type)
     else:
         run.checks.append(Check(
             type="general",
```
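The new `spark` parameter lets callers reuse an existing SparkSession rather than opening a new Databricks SQL connection. A sketch of the calling side (in a Databricks notebook, `spark` is the ambient session; the file path is illustrative):

```python
from datacontract.data_contract import DataContract

# The session is passed through DataContract.test() into check_soda_execute,
# which registers it on the Soda scan via scan.add_spark_session(...).
run = DataContract(data_contract_file="datacontract.yaml", spark=spark).test()
```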
datacontract/engines/soda/connections/databricks.py
ADDED

```diff
@@ -0,0 +1,20 @@
+import os
+
+import yaml
+
+
+def to_databricks_soda_configuration(server):
+    soda_configuration = {
+        f"data_source {server.type}": {
+            "type": "spark",
+            "method": "databricks",
+            "host": server.host,
+            "catalog": server.catalog,
+            "schema": server.schema_,
+            "http_path": os.getenv('DATACONTRACT_DATABRICKS_HTTP_PATH'),
+            "token": os.getenv('DATACONTRACT_DATABRICKS_TOKEN'),
+        }
+    }
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
```
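For illustration, the Soda configuration YAML this helper emits for the Databricks server in the README example further down (the environment values are samples; `yaml.dump` sorts keys alphabetically):

```python
import os

from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration
from datacontract.model.data_contract_specification import Server

os.environ["DATACONTRACT_DATABRICKS_HTTP_PATH"] = "/sql/1.0/warehouses/b053a3ffffffff"  # sample
os.environ["DATACONTRACT_DATABRICKS_TOKEN"] = "dapia00000000000000000000000000000"      # sample

server = Server(type="databricks", host="dbc-abcdefgh-1234.cloud.databricks.com",
                catalog="acme_catalog_prod", schema="orders_latest")
print(to_databricks_soda_configuration(server))
# data_source databricks:
#   catalog: acme_catalog_prod
#   host: dbc-abcdefgh-1234.cloud.databricks.com
#   http_path: /sql/1.0/warehouses/b053a3ffffffff
#   method: databricks
#   schema: orders_latest
#   token: dapia00000000000000000000000000000
#   type: spark
```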
datacontract/engines/soda/connections/postgres.py
ADDED

```diff
@@ -0,0 +1,21 @@
+import os
+
+import yaml
+
+
+def to_postgres_soda_configuration(server):
+    # with service account key, using an external json file
+    soda_configuration = {
+        f"data_source {server.type}": {
+            "type": "postgres",
+            "host": server.host,
+            "port": str(server.port),
+            "username": os.getenv('DATACONTRACT_POSTGRES_USERNAME'),
+            "password": os.getenv('DATACONTRACT_POSTGRES_PASSWORD'),
+            "database": server.database,
+            "schema": server.schema_,
+        }
+    }
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
```
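Same pattern for Postgres; a sketch of the emitted configuration for the README's example server (sample credentials; note the port is stringified via `str(server.port)`):

```python
import os

from datacontract.engines.soda.connections.postgres import to_postgres_soda_configuration
from datacontract.model.data_contract_specification import Server

os.environ["DATACONTRACT_POSTGRES_USERNAME"] = "postgres"          # sample
os.environ["DATACONTRACT_POSTGRES_PASSWORD"] = "mysecretpassword"  # sample

server = Server(type="postgres", host="localhost", port=5432,
                database="postgres", schema="public")
print(to_postgres_soda_configuration(server))
# data_source postgres:
#   database: postgres
#   host: localhost
#   password: mysecretpassword
#   port: '5432'
#   schema: public
#   type: postgres
#   username: postgres
```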
datacontract/model/data_contract_specification.py
CHANGED

```diff
@@ -23,6 +23,11 @@ class Server(BaseModel):
     account: str = None
     database: str = None
     schema_: str = pydantic.fields.Field(default=None, alias='schema')
+    host: str = None
+    port: int = None
+    catalog: str = None
+    http_path: str = None  # Use ENV variable
+    token: str = None  # Use ENV variable
     dataProductId: str = None
     outputPortId: str = None
 
```
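With the new fields, a Postgres or Databricks server block parses directly into the model; the existing `schema` alias still maps to `schema_`. A small sketch (assuming the pydantic model accepts the alias at init, as `Field(alias='schema')` above implies):

```python
from datacontract.model.data_contract_specification import Server

server = Server(type="postgres", host="localhost", port=5432,
                database="postgres", schema="public")
assert server.schema_ == "public"
assert server.port == 5432
```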
{datacontract_cli-0.9.3.dist-info → datacontract_cli-0.9.4.dist-info}/METADATA
CHANGED

````diff
@@ -1,14 +1,14 @@
 Metadata-Version: 2.1
 Name: datacontract-cli
-Version: 0.9.3
-Summary: 
+Version: 0.9.4
+Summary: Test data contracts
 Author-email: Jochen Christ <jochen.christ@innoq.com>, Stefan Negele <stefan.negele@innoq.com>
 Project-URL: Homepage, https://cli.datacontract.com
 Project-URL: Issues, https://github.com/datacontract/cli/issues
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
-Requires-Python: >=3.
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: typer[all] ~=0.9.0
@@ -18,15 +18,19 @@ Requires-Dist: requests ~=2.31.0
 Requires-Dist: fastparquet ==2023.10.1
 Requires-Dist: soda-core-bigquery ~=3.1.5
 Requires-Dist: soda-core-duckdb ~=3.1.5
+Requires-Dist: soda-core-postgres ~=3.1.5
 Requires-Dist: soda-core-snowflake ~=3.1.5
+Requires-Dist: soda-core-spark[databricks] ~=3.1.5
+Requires-Dist: soda-core-spark-df ~=3.1.5
 Requires-Dist: snowflake-connector-python[pandas] <3.8,>=3.6
-Requires-Dist: duckdb
+Requires-Dist: duckdb ==0.10.0
 Requires-Dist: fastjsonschema ~=2.19.1
 Requires-Dist: python-dotenv ~=1.0.0
 Requires-Dist: s3fs ==2024.2.0
 Provides-Extra: dev
 Requires-Dist: pytest ; extra == 'dev'
 Requires-Dist: testcontainers-minio ; extra == 'dev'
+Requires-Dist: testcontainers-postgres ; extra == 'dev'
 
 # Data Contract CLI
 
@@ -78,6 +82,9 @@ $ datacontract lint datacontract.yaml
 # execute schema and quality checks
 $ datacontract test datacontract.yaml
 
+# execute schema and quality checks on the examples within the contract
+$ datacontract test --examples datacontract.yaml
+
 # find differences between two data contracts (Coming Soon)
 $ datacontract diff datacontract-v1.yaml datacontract-v2.yaml
 
@@ -137,17 +144,17 @@ pipx installs into an isolated environment.
 pipx install datacontract-cli
 ```
 
-### 
+### Docker
 
 ```bash
-
+docker pull --platform linux/amd64 datacontract/cli
+docker run --rm --platform linux/amd64 -v ${PWD}:/home/datacontract datacontract/cli
 ```
 
-
+Or via an alias that automatically uses the latest version:
 
 ```bash
-docker
-docker run --rm -v ${PWD}:/datacontract datacontract/cli
+alias datacontract='docker run --rm -v "${PWD}:/home/datacontract" --platform linux/amd64 datacontract/cli:latest'
 ```
 
 ## Documentation
@@ -170,11 +177,12 @@ The application uses different engines, based on the server `type`.
 | `s3` | `json` | Support for `new_line` delimited JSON files and one JSON record per file. | ✅ | fastjsonschema<br> soda-core-duckdb |
 | `s3` | `csv` | | ✅ | soda-core-duckdb |
 | `s3` | `delta` | | Coming soon | TBD |
-| `postgres` | n/a | | 
-| `snowflake` | n/a | | ✅ 
-| `bigquery` | n/a 
+| `postgres` | n/a | | ✅ | soda-core-postgres |
+| `snowflake` | n/a | | ✅ | soda-core-snowflake |
+| `bigquery` | n/a | | ✅ | soda-core-bigquery |
 | `redshift` | n/a | | Coming soon | TBD |
-| `databricks` | n/a | 
+| `databricks` | n/a | Support for Databricks SQL with Unity catalog and Hive metastore. | ✅ | soda-core-spark |
+| `databricks` | n/a | Support for Spark for programmatic use in Notebooks. | ✅ | soda-core-spark-df |
 | `kafka` | `json` | | Coming soon | TBD |
 | `kafka` | `avro` | | Coming soon | TBD |
 | `kafka` | `protobuf` | | Coming soon | TBD |
@@ -184,9 +192,11 @@ The application uses different engines, based on the server `type`.
 
 Feel free to create an issue, if you need support for an additional type.
 
-### 
+### S3
+
+Data Contract CLI can test data that is stored in S3 buckets or any S3-compliant endpoints in various formats.
 
-Example
+#### Example
 
 datacontract.yaml
 ```yaml
@@ -195,24 +205,58 @@ servers:
     type: s3
     endpointUrl: https://minio.example.com # not needed with AWS S3
     location: s3://bucket-name/path/*/*.json
-    delimiter: new_line # new_line, array, or none
     format: json
+    delimiter: new_line # new_line, array, or none
 ```
 
-Environment
-
-
-
-
+#### Environment Variables
+
+| Environment Variable | Example | Description |
+|-----------------------------------|-------------------------------|-----------------------|
+| `DATACONTRACT_S3_REGION` | `eu-central-1` | Region of S3 bucket |
+| `DATACONTRACT_S3_ACCESS_KEY_ID` | `AKIAXV5Q5QABCDEFGH` | AWS Access Key ID |
+| `DATACONTRACT_S3_SECRET_ACCESS_KEY` | `93S7LRrJcqLaaaa/XXXXXXXXXXXXX` | AWS Secret Access Key |
+
+
+### Postgres
+
+Data Contract CLI can test data in Postgres or Postgres-compliant databases (e.g., RisingWave).
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  postgres:
+    type: postgres
+    host: localhost
+    port: 5432
+    database: postgres
+    schema: public
+models:
+  my_table_1: # corresponds to a table
+    type: table
+    fields:
+      my_column_1: # corresponds to a column
+        type: varchar
 ```
 
-
+#### Environment Variables
+
+| Environment Variable | Example | Description |
+|----------------------------------|--------------------|-------------|
+| `DATACONTRACT_POSTGRES_USERNAME` | `postgres` | Username |
+| `DATACONTRACT_POSTGRES_PASSWORD` | `mysecretpassword` | Password |
+
+
+### BigQuery
 
 We support authentication to BigQuery using Service Account Key. The used Service Account should include the roles:
 * BigQuery Job User
 * BigQuery Data Viewer
 
-
+
+#### Example
 
 datacontract.yaml
 ```yaml
@@ -227,15 +271,99 @@ models:
     fields: ...
 ```
 
-
-
-
+#### Environment Variables
+
+| Environment Variable | Example | Description |
+|----------------------------------------------|---------------------------|---------------------------------------------------------|
+| `DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH` | `~/service-access-key.json` | Service Access key as saved on key creation by BigQuery |
+
+
+### Databricks
+
+Works with Unity Catalog and Hive metastore.
+
+Needs a running SQL warehouse or compute cluster.
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  production:
+    type: databricks
+    host: dbc-abcdefgh-1234.cloud.databricks.com
+    catalog: acme_catalog_prod
+    schema: orders_latest
+models:
+  orders: # corresponds to a table
+    type: table
+    fields: ...
+```
+
+#### Environment Variables
+
+| Environment Variable | Example | Description |
+|----------------------------------------------|--------------------------------------|-------------------------------------------------------|
+| `DATACONTRACT_DATABRICKS_TOKEN` | `dapia00000000000000000000000000000` | The personal access token to authenticate |
+| `DATACONTRACT_DATABRICKS_HTTP_PATH` | `/sql/1.0/warehouses/b053a3ffffffff` | The HTTP path to the SQL warehouse or compute cluster |
+
+
+### Databricks (programmatic)
+
+Works with Unity Catalog and Hive metastore.
+When running in a notebook or pipeline, the provided `spark` session can be used.
+An additional authentication is not required.
+
+Requires a Databricks Runtime with Python >= 3.10.
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  production:
+    type: databricks
+    host: dbc-abcdefgh-1234.cloud.databricks.com # ignored, always use current host
+    catalog: acme_catalog_prod
+    schema: orders_latest
+models:
+  orders: # corresponds to a table
+    type: table
+    fields: ...
+```
+
+Notebook
+```python
+%pip install git+https://github.com/datacontract/cli.git
+dbutils.library.restartPython()
+
+from datacontract.data_contract import DataContract
+
+data_contract = DataContract(
+    data_contract_file="/Volumes/acme_catalog_prod/orders_latest/datacontract/datacontract.yaml",
+    spark=spark)
+run = data_contract.test()
+run.result
 ```
 
+
+### Exports
+
+Available export options:
+
+| Type | Description | Status |
+|--------------|------------------------------------------------|--------|
+| `jsonschema` | Export to JSON Schema | ✅ |
+| `sodacl` | Export to SodaCL quality checks in YAML format | ✅ |
+| `dbt` | Export to dbt model in YAML format | TBD |
+| `avro` | Export to AVRO models | TBD |
+| `pydantic` | Export to pydantic models | TBD |
+| `sql` | Export to SQL DDL | TBD |
+| `protobuf` | Export to Protobuf | TBD |
+
 ## Development Setup
 
-Python base interpreter should be 3.11.x (unless
-working on 3.12 release candidate).
+Python base interpreter should be 3.11.x (unless working on 3.12 release candidate).
 
 ```bash
 # create venv
@@ -265,7 +393,7 @@ Docker Build
 
 ```
 docker build -t datacontract/cli .
-docker run --rm -v ${PWD}:/datacontract datacontract/cli
+docker run --rm -v ${PWD}:/home/datacontract datacontract/cli
 ```
 
 ## Contribution
@@ -279,3 +407,7 @@ We are happy to receive your contributions. Propose your change in an issue or d
 ## Credits
 
 Created by [Stefan Negele](https://www.linkedin.com/in/stefan-negele-573153112/) and [Jochen Christ](https://www.linkedin.com/in/jochenchrist/).
+
+
+
+<a href="https://github.com/datacontract/cli" class="github-corner" aria-label="View source on GitHub"><svg width="80" height="80" viewBox="0 0 250 250" style="fill:#151513; color:#fff; position: absolute; top: 0; border: 0; right: 0;" aria-hidden="true"><path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"></path></svg></a><style>.github-corner:hover .octo-arm{animation:octocat-wave 560ms ease-in-out}@keyframes octocat-wave{0%,100%{transform:rotate(0)}20%,60%{transform:rotate(-25deg)}40%,80%{transform:rotate(10deg)}}@media (max-width:500px){.github-corner:hover .octo-arm{animation:none}.github-corner .octo-arm{animation:octocat-wave 560ms ease-in-out}}</style>
````
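The new Exports table in the README is backed by `DataContract.export` (see the data_contract.py hunk above, where unsupported formats print a warning and return an empty string). A hedged sketch of programmatic use, assuming the method takes the format name as its argument:

```python
from datacontract.data_contract import DataContract

# Render the contract's quality checks as SodaCL YAML.
sodacl_yaml = DataContract(data_contract_file="datacontract.yaml").export("sodacl")
print(sodacl_yaml)
```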
{datacontract_cli-0.9.3.dist-info → datacontract_cli-0.9.4.dist-info}/RECORD
CHANGED

```diff
@@ -1,17 +1,19 @@
 datacontract/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datacontract/cli.py,sha256=
-datacontract/data_contract.py,sha256=
+datacontract/cli.py,sha256=CaU0B68__T6t_JzcfE4cQ54CCwKkvHnkatl_zVFIFQg,4349
+datacontract/data_contract.py,sha256=BOJ8UuT75o-nQwuE-oHxHosSn6JC1F74OHQcmjiaoCs,7371
 datacontract/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py,sha256=Tj_REcEYl2BtIR_W9k0pjdjE4CvBE-4vpFrGAvvrde4,1557
 datacontract/engines/datacontract/check_that_datacontract_file_exists.py,sha256=V_YJyt1rKkkKhghU359vaAGtC8leIGmwqR4MlrLgCJ4,620
 datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py,sha256=bRoJp8a-Hvdc2OkbcTcS0tr8M7XxNzWbJAUFrc-ceiA,1393
-datacontract/engines/fastjsonschema/check_jsonschema.py,sha256=
-datacontract/engines/fastjsonschema/s3/s3_read_files.py,sha256=
+datacontract/engines/fastjsonschema/check_jsonschema.py,sha256=QuvFkeA-cE2nsHL33wQLb7QuhBQg4AQDQ3pav-iw9uE,5596
+datacontract/engines/fastjsonschema/s3/s3_read_files.py,sha256=sCe028D8q04c2pYlzJuEXWmMZOQJLiaObyLXLe4UzUs,713
 datacontract/engines/soda/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datacontract/engines/soda/check_soda_execute.py,sha256=
+datacontract/engines/soda/check_soda_execute.py,sha256=WPvan3olUY7tao_75Uun7khwYLBTSRX9wtE1vCRmWJA,5572
 datacontract/engines/soda/connections/bigquery.py,sha256=_hNd7Lmo6DjLb3nqVx_pfePwSYp3_3T_hwivVlATEyI,658
 datacontract/engines/soda/connections/dask.py,sha256=iQfu4swHN_QfY9l0TdSbqAQXJvfKMIxGoZ4xiNpi4eY,1497
+datacontract/engines/soda/connections/databricks.py,sha256=tCVE2Q2BXjuxS5ZmDyH_qN6jigStBsfIikRYMQ5LKVs,561
 datacontract/engines/soda/connections/duckdb.py,sha256=_Tpfo5D1ahOUPHbnEZ1WloeCecQ2LYDUebIU3hnnBDg,2342
+datacontract/engines/soda/connections/postgres.py,sha256=ow21gzxiV2_FyOXrFYeSRefLKwRQR5_qxtOR2T1rdTI,625
 datacontract/engines/soda/connections/snowflake.py,sha256=H941nOQULZKznmarVvZcvJhseMOUwfnMsv1r_P0MMb0,719
 datacontract/export/jsonschema_converter.py,sha256=gceZ-_euhedZzPfpVG8xYI16-ro9wLwAnqfkwDNWDTE,2977
 datacontract/export/sodacl_converter.py,sha256=7P6be3GAsUaLchqP6GNibKvpXmpo24D6z1NvOyJCjcI,2836
@@ -24,12 +26,12 @@ datacontract/lint/schema.py,sha256=9UipDhpY6jQEtC6vKZ44-NcVMbpPXBvs9HZYGQ0gsAM,1
 datacontract/lint/urls.py,sha256=LXg_yzAmG71fJPc_0QeWJ0cKEqkhtZhlZZf1hWMTFNE,1408
 datacontract/lint/linters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datacontract/lint/linters/example_model_linter.py,sha256=XGXDbNc_LLzwiWstXloJ8dpabxd3xV9IMq5XO3VjKw8,2898
-datacontract/model/data_contract_specification.py,sha256=
+datacontract/model/data_contract_specification.py,sha256=pTLQH6YoiVrL7L1CrgIh0d0cBPRFV6SUPxGy1Va1TS8,2167
 datacontract/model/exceptions.py,sha256=zhhXnKWTzEyG54N9QDVpE5F986cKuHEXN0OcR5Zy8oc,1090
 datacontract/model/run.py,sha256=AejMAlTex2oh-zQQw6fifWntPnBSLLOB-7VaexG6Ef0,2484
-datacontract_cli-0.9.
-datacontract_cli-0.9.
-datacontract_cli-0.9.
-datacontract_cli-0.9.
-datacontract_cli-0.9.
-datacontract_cli-0.9.
+datacontract_cli-0.9.4.dist-info/LICENSE,sha256=23h64qnSeIZ0DKeziWAKC-zBCt328iSbRbWBrXoYRb4,2210
+datacontract_cli-0.9.4.dist-info/METADATA,sha256=Ks35GB42Js-cwZx-r2x5QfPZLM7etMTKprkc0Ui_Sso,17052
+datacontract_cli-0.9.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datacontract_cli-0.9.4.dist-info/entry_points.txt,sha256=D3Eqy4q_Z6bHauGd4ppIyQglwbrm1AJnLau4Ppbw9Is,54
+datacontract_cli-0.9.4.dist-info/top_level.txt,sha256=VIRjd8EIUrBYWjEXJJjtdUgc0UAJdPZjmLiOR8BRBYM,13
+datacontract_cli-0.9.4.dist-info/RECORD,,
```
The remaining dist-info files (LICENSE, WHEEL, entry_points.txt, top_level.txt) are unchanged between 0.9.3 and 0.9.4.