datacontract-cli 0.10.12__py3-none-any.whl → 0.10.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacontract/cli.py +5 -0
- datacontract/data_contract.py +9 -1
- datacontract/engines/soda/connections/kafka.py +28 -6
- datacontract/export/avro_converter.py +8 -1
- datacontract/export/avro_idl_converter.py +1 -0
- datacontract/export/bigquery_converter.py +30 -23
- datacontract/export/data_caterer_converter.py +148 -0
- datacontract/export/dcs_exporter.py +6 -0
- datacontract/export/exporter.py +5 -1
- datacontract/export/exporter_factory.py +19 -1
- datacontract/export/jsonschema_converter.py +13 -2
- datacontract/export/{odcs_converter.py → odcs_v2_exporter.py} +4 -4
- datacontract/export/odcs_v3_exporter.py +294 -0
- datacontract/export/sodacl_converter.py +82 -2
- datacontract/export/spark_converter.py +3 -1
- datacontract/export/sql_type_converter.py +56 -21
- datacontract/imports/iceberg_importer.py +162 -0
- datacontract/imports/importer.py +1 -0
- datacontract/imports/importer_factory.py +5 -0
- datacontract/imports/odcs_importer.py +25 -168
- datacontract/imports/odcs_v2_importer.py +177 -0
- datacontract/imports/odcs_v3_importer.py +309 -0
- datacontract/imports/spark_importer.py +5 -1
- datacontract/imports/unity_importer.py +105 -84
- datacontract/integration/datamesh_manager.py +1 -1
- datacontract/lint/resolve.py +24 -10
- datacontract/lint/resources.py +21 -0
- datacontract/lint/urls.py +29 -13
- datacontract/model/data_contract_specification.py +72 -8
- datacontract/model/odcs.py +11 -0
- {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/METADATA +106 -52
- {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/RECORD +36 -29
- {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/WHEEL +1 -1
- datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
- {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/top_level.txt +0 -0
datacontract/lint/resources.py
ADDED

@@ -0,0 +1,21 @@
+from datacontract.lint.files import read_file
+from datacontract.lint.urls import fetch_resource
+
+
+def read_resource(location: str) -> str:
+    """
+    Read a resource from a given location.
+
+    If the location is a URL, fetch the resource from the web. API-Keys are supported.
+    Otherwise, read the resource from a local file.
+
+    Args:
+        location (str): The location of the resource, either a URL or a file path.
+
+    Returns:
+        str: The content of the resource.
+    """
+    if location.startswith("http://") or location.startswith("https://"):
+        return fetch_resource(location)
+    else:
+        return read_file(location)
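For reference, a minimal usage sketch of the new `read_resource` helper (the local file name is hypothetical; the URL is the example contract used elsewhere in this package's README):

```python
from datacontract.lint.resources import read_resource

# Local path: dispatches to read_file
local_yaml = read_resource("datacontract.yaml")

# http(s) URL: dispatches to fetch_resource, which may attach an API-key header
remote_yaml = read_resource("https://datacontract.com/examples/orders-latest/datacontract.yaml")
```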
datacontract/lint/urls.py
CHANGED

@@ -1,4 +1,5 @@
 import os
+from urllib.parse import urlparse
 
 import requests
 
@@ -25,16 +26,31 @@ def fetch_resource(url: str):
 
 
 def _set_api_key(headers, url):
-… (the previous 13-line body is collapsed in the source diff)
+    hostname = urlparse(url).hostname
+    if hostname == "datamesh-manager.com" or hostname.endswith(".datamesh-manager.com"):
+        datamesh_manager_api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
+        if datamesh_manager_api_key is None or datamesh_manager_api_key == "":
+            print("Error: Data Mesh Manager API Key is not set. Set env variable DATAMESH_MANAGER_API_KEY.")
+            raise DataContractException(
+                type="lint",
+                name=f"Reading data contract from {url}",
+                reason="Error: Data Mesh Manager API Key is not set. Set env variable DATAMESH_MANAGER_API_KEY.",
+                engine="datacontract",
+                result="error",
+            )
+        headers["x-api-key"] = datamesh_manager_api_key
+    elif hostname == "datacontract-manager.com" or hostname.endswith(".datacontract-manager.com"):
+        datacontract_manager_api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
+        if datacontract_manager_api_key is None or datacontract_manager_api_key == "":
+            print("Error: Data Contract Manager API Key is not set. Set env variable DATACONTRACT_MANAGER_API_KEY.")
+            raise DataContractException(
+                type="lint",
+                name=f"Reading data contract from {url}",
+                reason="Error: Data Contract Manager API Key is not set. Set env variable DATACONTRACT_MANAGER_API_KEY.",
+                engine="datacontract",
+                result="error",
+            )
+        headers["x-api-key"] = datacontract_manager_api_key
+    else:
+        # do nothing
+        pass
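A quick sketch of the new host-based API-key behavior (`_set_api_key` is the private helper shown above; the URLs and key value are illustrative):

```python
import os

from datacontract.lint.urls import _set_api_key

os.environ["DATAMESH_MANAGER_API_KEY"] = "demo-key"  # illustrative value

headers: dict = {}
_set_api_key(headers, "https://app.datamesh-manager.com/dc/orders.yaml")
assert headers["x-api-key"] == "demo-key"

# Hosts other than *.datamesh-manager.com / *.datacontract-manager.com are left untouched
headers = {}
_set_api_key(headers, "https://example.com/datacontract.yaml")
assert "x-api-key" not in headers
```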
datacontract/model/data_contract_specification.py
CHANGED

@@ -4,6 +4,32 @@ from typing import List, Dict, Optional, Any
 import pydantic as pyd
 import yaml
 
+DATACONTRACT_TYPES = [
+    "string",
+    "text",
+    "varchar",
+    "number",
+    "decimal",
+    "numeric",
+    "int",
+    "integer",
+    "long",
+    "bigint",
+    "float",
+    "double",
+    "boolean",
+    "timestamp",
+    "timestamp_tz",
+    "timestamp_ntz",
+    "date",
+    "array",
+    "bytes",
+    "object",
+    "record",
+    "struct",
+    "null",
+]
+
 
 class Contact(pyd.BaseModel):
     name: str = None
@@ -15,6 +41,14 @@ class Contact(pyd.BaseModel):
     )
 
 
+class ServerRole(pyd.BaseModel):
+    name: str = None
+    description: str = None
+    model_config = pyd.ConfigDict(
+        extra="allow",
+    )
+
+
 class Server(pyd.BaseModel):
     type: str = None
     description: str = None
@@ -38,6 +72,7 @@ class Server(pyd.BaseModel):
     dataProductId: str = None
     outputPortId: str = None
     driver: str = None
+    roles: List[ServerRole] = None
 
     model_config = pyd.ConfigDict(
         extra="allow",
@@ -83,19 +118,40 @@ class Definition(pyd.BaseModel):
     )
 
 
+class Quality(pyd.BaseModel):
+    type: str = None
+    description: str = None
+    query: str = None
+    dialect: str = None
+    mustBe: int = None
+    mustNotBe: int = None
+    mustBeGreaterThan: int = None
+    mustBeGreaterThanOrEqualTo: int = None
+    mustBeLessThan: int = None
+    mustBeLessThanOrEqualTo: int = None
+    mustBeBetween: List[int] = None
+    mustNotBeBetween: List[int] = None
+    engine: str = None
+    implementation: str | Dict[str, Any] = None
+
+    model_config = pyd.ConfigDict(
+        extra="allow",
+    )
+
+
 class Field(pyd.BaseModel):
     ref: str = pyd.Field(default=None, alias="$ref")
     ref_obj: Definition = pyd.Field(default=None, exclude=True)
-    title: str = None
+    title: str | None = None
     type: str = None
     format: str = None
     required: bool = None
     primary: bool = None
-    unique: bool = None
+    unique: bool | None = None
     references: str = None
-    description: str = None
-    pii: bool = None
-    classification: str = None
+    description: str | None = None
+    pii: bool | None = None
+    classification: str | None = None
     pattern: str = None
     minLength: int = None
     maxLength: int = None
@@ -103,8 +159,8 @@ class Field(pyd.BaseModel):
     exclusiveMinimum: int = None
     maximum: int = None
     exclusiveMaximum: int = None
-    enum: List[str] = []
-    tags: List[str] = []
+    enum: List[str] | None = []
+    tags: List[str] | None = []
     links: Dict[str, str] = {}
     fields: Dict[str, "Field"] = {}
     items: "Field" = None
@@ -113,7 +169,9 @@ class Field(pyd.BaseModel):
     precision: int = None
     scale: int = None
     example: str = None
-
+    examples: List[Any] | None = None
+    quality: List[Quality] | None = []
+    config: Dict[str, Any] | None = None
 
     model_config = pyd.ConfigDict(
        extra="allow",
@@ -126,7 +184,13 @@ class Model(pyd.BaseModel):
     namespace: Optional[str] = None
     title: Optional[str] = None
     fields: Dict[str, Field] = {}
+    quality: List[Quality] | None = []
     config: Dict[str, Any] = None
+    tags: List[str] | None = None
+
+    model_config = pyd.ConfigDict(
+        extra="allow",
+    )
 
 
 class Info(pyd.BaseModel):
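The new `Quality`, `examples`, and `Field.config` attributes are plain pydantic fields, so they compose programmatically as well as in YAML. A minimal sketch (values are illustrative, not from the release):

```python
from datacontract.model.data_contract_specification import Field, Quality

# An illustrative quality check; the attribute names mirror the Quality model above
row_count_check = Quality(
    type="sql",
    description="Orders should not be empty",
    query="SELECT COUNT(*) FROM orders",
    mustBeGreaterThan=0,
)

order_id = Field(
    type="varchar",
    required=True,
    quality=[row_count_check],
)

# extra="allow" keeps unknown keys, so custom metadata round-trips
print(order_id.model_dump(exclude_none=True))
```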
datacontract/model/odcs.py
ADDED

@@ -0,0 +1,11 @@
+def is_open_data_contract_standard(odcs: dict) -> bool:
+    """
+    Check if the given dictionary is an OpenDataContractStandard.
+
+    Args:
+        odcs (dict): The dictionary to check.
+
+    Returns:
+        bool: True if the dictionary is an OpenDataContractStandard, False otherwise.
+    """
+    return odcs.get("kind") == "DataContract" and odcs.get("apiVersion", "").startswith("v3")
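This predicate gives the importer a way to detect ODCS v3 documents. A quick behavioral sketch (the example dicts are illustrative):

```python
from datacontract.model.odcs import is_open_data_contract_standard

# ODCS v3 documents declare kind and a v3 apiVersion
assert is_open_data_contract_standard({"kind": "DataContract", "apiVersion": "v3.0.0"})

# Anything else (e.g. an ODCS v2 or Data Contract Specification file) is rejected
assert not is_open_data_contract_standard({"apiVersion": "v2.2.2"})
assert not is_open_data_contract_standard({"dataContractSpecification": "1.1.0"})
```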
{datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datacontract-cli
-Version: 0.10.12
+Version: 0.10.14
 Summary: The datacontract CLI is an open source command-line tool for working with Data Contracts. It uses data contract YAML files to lint the data contract, connect to data sources and execute schema and quality tests, detect breaking changes, and export to different formats. The tool is written in Python. It can be used as a standalone CLI tool, in a CI/CD pipeline, or directly as a Python library.
 Author-email: Jochen Christ <jochen.christ@innoq.com>, Stefan Negele <stefan.negele@innoq.com>, Simon Harrer <simon.harrer@innoq.com>
 Project-URL: Homepage, https://cli.datacontract.com
@@ -12,34 +12,35 @@ Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: typer<0.13,>=0.12
-Requires-Dist: pydantic<2.
+Requires-Dist: pydantic<2.10.0,>=2.8.2
 Requires-Dist: pyyaml~=6.0.1
 Requires-Dist: requests<2.33,>=2.31
-Requires-Dist: fastapi==0.
-Requires-Dist: uvicorn==0.30.
+Requires-Dist: fastapi==0.115.2
+Requires-Dist: uvicorn==0.30.6
 Requires-Dist: fastjsonschema<2.21.0,>=2.19.1
 Requires-Dist: fastparquet==2024.5.0
-Requires-Dist: python-multipart==0.0.
-Requires-Dist: rich
-Requires-Dist: simple-ddl-parser==1.
-Requires-Dist: duckdb==1.
+Requires-Dist: python-multipart==0.0.12
+Requires-Dist: rich<13.10,>=13.7
+Requires-Dist: simple-ddl-parser==1.7.1
+Requires-Dist: duckdb==1.1.2
 Requires-Dist: soda-core-duckdb<3.4.0,>=3.3.1
 Requires-Dist: setuptools>=60
 Requires-Dist: python-dotenv~=1.0.0
 Requires-Dist: rdflib==7.0.0
 Requires-Dist: opentelemetry-exporter-otlp-proto-grpc~=1.16
 Requires-Dist: opentelemetry-exporter-otlp-proto-http~=1.16
-Requires-Dist: boto3<1.35.
+Requires-Dist: boto3<1.35.45,>=1.34.41
 Requires-Dist: jinja-partials>=0.2.1
 Provides-Extra: all
-Requires-Dist: datacontract-cli[bigquery,databricks,dbml,dbt,kafka,postgres,s3,snowflake,sqlserver,trino]; extra == "all"
+Requires-Dist: datacontract-cli[bigquery,databricks,dbml,dbt,iceberg,kafka,postgres,s3,snowflake,sqlserver,trino]; extra == "all"
 Provides-Extra: avro
 Requires-Dist: avro==1.12.0; extra == "avro"
 Provides-Extra: bigquery
 Requires-Dist: soda-core-bigquery<3.4.0,>=3.3.1; extra == "bigquery"
 Provides-Extra: databricks
 Requires-Dist: soda-core-spark-df<3.4.0,>=3.3.1; extra == "databricks"
-Requires-Dist: databricks-sql-connector<3.
+Requires-Dist: databricks-sql-connector<3.6.0,>=3.1.2; extra == "databricks"
+Requires-Dist: databricks-sdk<0.36.0,>=0.32.0; extra == "databricks"
 Requires-Dist: soda-core-spark[databricks]<3.4.0,>=3.3.1; extra == "databricks"
 Provides-Extra: dbml
 Requires-Dist: pydbml>=1.1.1; extra == "dbml"
@@ -49,7 +50,7 @@ Provides-Extra: dev
 Requires-Dist: datacontract-cli[all]; extra == "dev"
 Requires-Dist: httpx==0.27.2; extra == "dev"
 Requires-Dist: kafka-python; extra == "dev"
-Requires-Dist: moto==5.0.
+Requires-Dist: moto==5.0.18; extra == "dev"
 Requires-Dist: pandas>=2.1.0; extra == "dev"
 Requires-Dist: pre-commit<3.9.0,>=3.7.1; extra == "dev"
 Requires-Dist: pyarrow>=12.0.0; extra == "dev"
@@ -57,15 +58,17 @@ Requires-Dist: pytest; extra == "dev"
 Requires-Dist: pytest-xdist; extra == "dev"
 Requires-Dist: pymssql==2.3.1; extra == "dev"
 Requires-Dist: ruff; extra == "dev"
-Requires-Dist: testcontainers[kafka,minio,mssql,postgres]==4.8.
-Requires-Dist: trino==0.
+Requires-Dist: testcontainers[kafka,minio,mssql,postgres]==4.8.2; extra == "dev"
+Requires-Dist: trino==0.330.0; extra == "dev"
+Provides-Extra: iceberg
+Requires-Dist: pyiceberg==0.7.1; extra == "iceberg"
 Provides-Extra: kafka
 Requires-Dist: datacontract-cli[avro]; extra == "kafka"
 Requires-Dist: soda-core-spark-df<3.4.0,>=3.3.1; extra == "kafka"
 Provides-Extra: postgres
 Requires-Dist: soda-core-postgres<3.4.0,>=3.3.1; extra == "postgres"
 Provides-Extra: s3
-Requires-Dist: s3fs==2024.
+Requires-Dist: s3fs==2024.9.0; extra == "s3"
 Provides-Extra: snowflake
 Requires-Dist: snowflake-connector-python[pandas]<3.13,>=3.6; extra == "snowflake"
 Requires-Dist: soda-core-snowflake<3.4.0,>=3.3.1; extra == "snowflake"
@@ -84,8 +87,8 @@ Requires-Dist: soda-core-trino<3.4.0,>=3.3.1; extra == "trino"
 <a href="https://datacontract.com/slack" rel="nofollow"><img src="https://img.shields.io/badge/slack-join_chat-white.svg?logo=slack&style=social" alt="Slack Status" data-canonical-src="https://img.shields.io/badge/slack-join_chat-white.svg?logo=slack&style=social" style="max-width: 100%;"></a>
 </p>
 
-The `datacontract` CLI is an open
-It uses data contract YAML files to lint the data contract, connect to data sources and execute schema and quality tests, detect breaking changes, and export to different formats. The tool is written in Python. It can be used as a standalone CLI tool, in a CI/CD pipeline, or directly as a Python library.
+The `datacontract` CLI is an open-source command-line tool for working with data contracts.
+It uses data contract YAML files as [Data Contract Specification](https://datacontract.com/) or [ODCS](https://bitol-io.github.io/open-data-contract-standard/latest/) to lint the data contract, connect to data sources and execute schema and quality tests, detect breaking changes, and export to different formats. The tool is written in Python. It can be used as a standalone CLI tool, in a CI/CD pipeline, or directly as a Python library.
 
 
 
@@ -97,15 +100,15 @@ Let's look at this data contract:
 
 We have a _servers_ section with endpoint details to the S3 bucket, _models_ for the structure of the data, _servicelevels_ and _quality_ attributes that describe the expected freshness and number of rows.
 
-This data contract contains all information to connect to S3 and check that the actual data meets the defined schema and quality requirements. We can use this information to test if the actual data
+This data contract contains all information to connect to S3 and check that the actual data meets the defined schema and quality requirements. We can use this information to test if the actual data product in S3 is compliant to the data contract.
 
-Let's use [pip](https://pip.pypa.io/en/stable/getting-started/) to install the CLI (or use the [Docker image](#docker),
+Let's use [pip](https://pip.pypa.io/en/stable/getting-started/) to install the CLI (or use the [Docker image](#docker)),
 ```bash
 $ python3 -m pip install datacontract-cli[all]
 ```
 
 
-
+now, let's run the tests:
 
 ```bash
 $ datacontract test https://datacontract.com/examples/orders-latest/datacontract.yaml
@@ -143,7 +146,7 @@ Testing https://datacontract.com/examples/orders-latest/datacontract.yaml
 
 Voilà, the CLI tested that the _datacontract.yaml_ itself is valid, all records comply with the schema, and all quality attributes are met.
 
-We can also use the datacontract.yaml to export in many [formats](#format), e.g., to SQL:
+We can also use the datacontract.yaml to export in many [formats](#format), e.g., to generate a SQL DDL:
 
 ```bash
 $ datacontract export --format sql https://datacontract.com/examples/orders-latest/datacontract.yaml
@@ -190,7 +193,7 @@ $ datacontract test datacontract.yaml
 # execute schema and quality checks on the examples within the contract
 $ datacontract test --examples datacontract.yaml
 
-# export data contract as html (other formats: avro, dbt, dbt-sources, dbt-staging-sql, jsonschema,
+# export data contract as html (other formats: avro, dbt, dbt-sources, dbt-staging-sql, jsonschema, odcs_v2, odcs_v3, rdf, sql, sodacl, terraform, ...)
 $ datacontract export --format html datacontract.yaml > datacontract.html
 
 # import avro (other formats: sql, glue, bigquery...)
@@ -223,8 +226,7 @@ if not run.has_passed():
 Choose the most appropriate installation method for your needs:
 
 ### pip
-Python 3.11
-Python 3.12 available as pre-release release candidate for 0.9.3
+Python 3.10, 3.11, and 3.12 are supported. We recommend to use Python 3.11.
 
 ```bash
 python3 -m pip install datacontract-cli[all]
@@ -238,17 +240,22 @@ pipx install datacontract-cli[all]
 
 ### Docker
 
+You can also use our Docker image to run the CLI tool. It is also convenient for CI/CD pipelines.
+
 ```bash
 docker pull datacontract/cli
 docker run --rm -v ${PWD}:/home/datacontract datacontract/cli
 ```
 
-
+You can create an alias for the Docker command to make it easier to use:
 
 ```bash
 alias datacontract='docker run --rm -v "${PWD}:/home/datacontract" datacontract/cli:latest'
 ```
 
+_Note:_ The output of Docker command line messages is limited to 80 columns and may include line breaks. Don't pipe docker output to files if you want to export code. Use the `--output` option instead.
+
+
 
 ## Optional Dependencies
 
@@ -741,10 +748,11 @@ servers:
 
 #### Environment Variables
 
-| Environment Variable
-|
-| `DATACONTRACT_KAFKA_SASL_USERNAME`
-| `DATACONTRACT_KAFKA_SASL_PASSWORD`
+| Environment Variable                | Example | Description                                                                      |
+|-------------------------------------|---------|----------------------------------------------------------------------------------|
+| `DATACONTRACT_KAFKA_SASL_USERNAME`  | `xxx`   | The SASL username (key).                                                         |
+| `DATACONTRACT_KAFKA_SASL_PASSWORD`  | `xxx`   | The SASL password (secret).                                                      |
+| `DATACONTRACT_KAFKA_SASL_MECHANISM` | `PLAIN` | Default `PLAIN`. Other supported mechanisms: `SCRAM-SHA-256` and `SCRAM-SHA-512` |
 
 
 ### Postgres
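A sketch of how these variables can be supplied when testing a Kafka server from Python (credentials are placeholders; `DataContract(...).test()` and `run.has_passed()` are the library calls shown earlier in this README):

```python
import os

from datacontract.data_contract import DataContract

# Placeholder credentials; DATACONTRACT_KAFKA_SASL_MECHANISM is new in this
# release and defaults to PLAIN (SCRAM-SHA-256 and SCRAM-SHA-512 also work)
os.environ["DATACONTRACT_KAFKA_SASL_USERNAME"] = "my-api-key"
os.environ["DATACONTRACT_KAFKA_SASL_PASSWORD"] = "my-api-secret"
os.environ["DATACONTRACT_KAFKA_SASL_MECHANISM"] = "SCRAM-SHA-256"

run = DataContract(data_contract_file="datacontract.yaml").test()
if not run.has_passed():
    print("Data quality validation failed.")
```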
@@ -799,6 +807,10 @@ models:
   fields:
     my_column_1: # corresponds to a column
       type: varchar
+    my_column_2: # corresponds to a column with custom trino type
+      type: object
+      config:
+        trinoType: row(en_us varchar, pt_br varchar)
 ```
 
 #### Environment Variables
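The `config.trinoType` key above rides on the new `Field.config` attribute from this release. A minimal sketch of the same column built programmatically (the column name mirrors the YAML example):

```python
from datacontract.model.data_contract_specification import Field

# Same column as my_column_2 above: a custom Trino row type carried in config
my_column_2 = Field(
    type="object",
    config={"trinoType": "row(en_us varchar, pt_br varchar)"},
)
print(my_column_2.model_dump(exclude_none=True))
```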
@@ -825,7 +837,7 @@ models:
 │ * --format [jsonschema|pydantic-model|sodacl|dbt|dbt-sources|db The export format. [default: None] [required] │
 │ t-staging-sql|odcs|rdf|avro|protobuf|great-expectati │
 │ ons|terraform|avro-idl|sql|sql-query|html|go|bigquer │
-│ y|dbml|spark|sqlalchemy]
+│ y|dbml|spark|sqlalchemy|data-caterer|dcs] │
 │ --output PATH Specify the file path where the exported data will be │
 │ saved. If no path is provided, the output will be │
 │ printed to stdout. │
@@ -857,26 +869,30 @@ Available export options:
 
 | Type | Description | Status |
 |----------------------|---------------------------------------------------------|--------|
-| `html` | Export to HTML | ✅
-| `jsonschema` | Export to JSON Schema | ✅
-… (18 further table rows are truncated in the source diff)
+| `html` | Export to HTML | ✅ |
+| `jsonschema` | Export to JSON Schema | ✅ |
+| `odcs_v2` | Export to Open Data Contract Standard (ODCS) V2 | ✅ |
+| `odcs_v3` | Export to Open Data Contract Standard (ODCS) V3 | ✅ |
+| `odcs` | Export to Open Data Contract Standard (ODCS) V3 | ✅ |
+| `sodacl` | Export to SodaCL quality checks in YAML format | ✅ |
+| `dbt` | Export to dbt models in YAML format | ✅ |
+| `dbt-sources` | Export to dbt sources in YAML format | ✅ |
+| `dbt-staging-sql` | Export to dbt staging SQL models | ✅ |
+| `rdf` | Export data contract to RDF representation in N3 format | ✅ |
+| `avro` | Export to AVRO models | ✅ |
+| `protobuf` | Export to Protobuf | ✅ |
+| `terraform` | Export to terraform resources | ✅ |
+| `sql` | Export to SQL DDL | ✅ |
+| `sql-query` | Export to SQL Query | ✅ |
+| `great-expectations` | Export to Great Expectations Suites in JSON Format | ✅ |
+| `bigquery` | Export to BigQuery Schemas | ✅ |
+| `go` | Export to Go types | ✅ |
+| `pydantic-model` | Export to pydantic models | ✅ |
+| `DBML` | Export to a DBML Diagram description | ✅ |
+| `spark` | Export to a Spark StructType | ✅ |
+| `sqlalchemy` | Export to SQLAlchemy Models | ✅ |
+| `data-caterer` | Export to Data Caterer in YAML format | ✅ |
+| `dcs` | Export to Data Contract Specification in YAML format | ✅ |
 | Missing something? | Please create an issue on GitHub | TBD |
 
 #### Great Expectations
@@ -940,6 +956,20 @@ To specify custom Avro properties in your data contract, you can define them wit
 
 >NOTE: At this moment, we just support [logicalType](https://avro.apache.org/docs/1.11.0/spec.html#Logical+Types) and [default](https://avro.apache.org/docs/1.11.0/spec.htm)
 
+#### Data Caterer
+
+The export function converts the data contract to a data generation task in YAML format that can be
+ingested by [Data Caterer](https://github.com/data-catering/data-caterer). This gives you the
+ability to generate production-like data in any environment based off your data contract.
+
+```shell
+datacontract export datacontract.yaml --format data-caterer --model orders
+```
+
+You can further customise the way data is generated via adding
+[additional metadata in the YAML](https://data.catering/setup/generator/data-generator/)
+to suit your needs.
+
 #### Example Configuration
 
 ```yaml
@@ -980,7 +1010,7 @@ models:
 
 ╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
 │ * --format [sql|avro|dbt|glue|jsonschema|bigquery|odcs The format of the source file. │
-│ |unity|spark]
+│ |unity|spark|iceberg] [default: None] │
 │ [required] │
 │ --source TEXT The path to the file or Glue Database that │
 │ should be imported. │
@@ -1010,6 +1040,8 @@ models:
 │ file (repeat for multiple table names, leave │
 │ empty for all tables in the file). │
 │ [default: None] │
+│ --iceberg-table TEXT Table name to assign to the model created │
+│ from the Iceberg schema. [default: None] │
 │ --help Show this message and exit. │
 ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
 ```
@@ -1035,12 +1067,25 @@ Available import options:
 | `spark` | Import from Spark StructTypes | ✅ |
 | `dbml` | Import from DBML models | ✅ |
 | `protobuf` | Import from Protobuf schemas | TBD |
+| `iceberg` | Import from an Iceberg JSON Schema Definition | partial |
 | Missing something? | Please create an issue on GitHub | TBD |
 
 
+#### ODCS
+
+Import from Open Data Contract Standard (ODCS) v2 or v3.
+The importer automatically detects the ODCS version and imports the data contract.
+
+Examples:
+
+```bash
+# Example import from ODCS
+datacontract import --format odcs --source my_data_contract.odcs.yaml
+```
+
 #### BigQuery
 
-
+BigQuery data can either be imported off of JSON Files generated from the table descriptions or directly from the Bigquery API. In case you want to use JSON Files, specify the `source` parameter with a path to the JSON File.
 
 To import from the Bigquery API, you have to _omit_ `source` and instead need to provide `bigquery-project` and `bigquery-dataset`. Additionally you may specify `bigquery-table` to enumerate the tables that should be imported. If no tables are given, _all_ available tables of the dataset will be imported.
@@ -1152,6 +1197,15 @@ datacontract import --format dbml --source <file_path> --dbml-table <table_name_
 datacontract import --format dbml --source <file_path> --dbml-table <table_name_1> --dbml-schema <schema_1>
 ```
 
+#### Iceberg
+
+Importing from an [Iceberg Table Json Schema Definition](https://iceberg.apache.org/spec/#appendix-c-json-serialization). Specify location of json files using the `source` parameter.
+
+Examples:
+
+```bash
+datacontract import --format iceberg --source ./tests/fixtures/iceberg/simple_schema.json --iceberg-table test-table
+```
 
 ### breaking
 