datacontract-cli 0.9.2__tar.gz → 0.9.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacontract-cli might be problematic.
- {datacontract-cli-0.9.2/datacontract_cli.egg-info → datacontract-cli-0.9.4}/PKG-INFO +191 -31
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/README.md +177 -22
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/cli.py +4 -2
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/data_contract.py +69 -19
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/fastjsonschema/check_jsonschema.py +56 -42
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/fastjsonschema/s3/s3_read_files.py +1 -1
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/soda/check_soda_execute.py +25 -1
- datacontract-cli-0.9.4/datacontract/engines/soda/connections/bigquery.py +18 -0
- datacontract-cli-0.9.4/datacontract/engines/soda/connections/databricks.py +20 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/soda/connections/duckdb.py +10 -6
- datacontract-cli-0.9.4/datacontract/engines/soda/connections/postgres.py +21 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/export/sodacl_converter.py +2 -2
- datacontract-cli-0.9.4/datacontract/lint/lint.py +126 -0
- datacontract-cli-0.9.4/datacontract/lint/linters/__init__.py +0 -0
- datacontract-cli-0.9.4/datacontract/lint/linters/example_model_linter.py +67 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/model/data_contract_specification.py +5 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4/datacontract_cli.egg-info}/PKG-INFO +191 -31
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract_cli.egg-info/SOURCES.txt +15 -1
- datacontract-cli-0.9.4/datacontract_cli.egg-info/requires.txt +21 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/pyproject.toml +15 -10
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_example_parquet.py +0 -1
- datacontract-cli-0.9.4/tests/test_examples_bigquery.py +25 -0
- datacontract-cli-0.9.4/tests/test_examples_databricks.py +25 -0
- datacontract-cli-0.9.4/tests/test_examples_examples_csv.py +25 -0
- datacontract-cli-0.9.4/tests/test_examples_examples_inline.py +25 -0
- datacontract-cli-0.9.4/tests/test_examples_examples_json.py +25 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_examples_local_json.py +5 -2
- datacontract-cli-0.9.4/tests/test_examples_postgres.py +63 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_examples_s3_json_complex.py +2 -1
- datacontract-cli-0.9.4/tests/test_examples_s3_json_multiple_models.py +56 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_export_sodacl.py +2 -2
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_lint.py +1 -2
- datacontract-cli-0.9.4/tests/test_linters.py +108 -0
- datacontract-cli-0.9.2/datacontract_cli.egg-info/requires.txt +0 -16
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/LICENSE +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/__init__.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/__init__.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/datacontract/check_that_datacontract_file_exists.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/soda/__init__.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/soda/connections/dask.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/soda/connections/snowflake.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/export/jsonschema_converter.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/init/download_datacontract_file.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/integration/publish_datamesh_manager.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/lint/files.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/lint/resolve.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/lint/schema.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/lint/urls.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/model/exceptions.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/model/run.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract_cli.egg-info/dependency_links.txt +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract_cli.egg-info/entry_points.txt +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract_cli.egg-info/top_level.txt +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/setup.cfg +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_cli.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_download_datacontract_file.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_examples_s3_csv.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_examples_s3_json.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_examples_s3_json_remote.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_examples_snowflake.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_export_jsonschema.py +0 -0
- {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_integration_datameshmanager.py +0 -0
{datacontract-cli-0.9.2/datacontract_cli.egg-info → datacontract-cli-0.9.4}/PKG-INFO

@@ -1,31 +1,36 @@
 Metadata-Version: 2.1
 Name: datacontract-cli
-Version: 0.9.2
-Summary:
+Version: 0.9.4
+Summary: Test data contracts
 Author-email: Jochen Christ <jochen.christ@innoq.com>, Stefan Negele <stefan.negele@innoq.com>
 Project-URL: Homepage, https://cli.datacontract.com
 Project-URL: Issues, https://github.com/datacontract/cli/issues
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
-Requires-Python:
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: typer[all]~=0.9.0
-Requires-Dist: pydantic
+Requires-Dist: pydantic<2.7.0,>=2.5.3
 Requires-Dist: pyyaml~=6.0.1
 Requires-Dist: requests~=2.31.0
 Requires-Dist: fastparquet==2023.10.1
-Requires-Dist: soda-core-
-Requires-Dist: soda-core-
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: soda-core-bigquery~=3.1.5
+Requires-Dist: soda-core-duckdb~=3.1.5
+Requires-Dist: soda-core-postgres~=3.1.5
+Requires-Dist: soda-core-snowflake~=3.1.5
+Requires-Dist: soda-core-spark[databricks]~=3.1.5
+Requires-Dist: soda-core-spark-df~=3.1.5
+Requires-Dist: snowflake-connector-python[pandas]<3.8,>=3.6
+Requires-Dist: duckdb==0.10.0
 Requires-Dist: fastjsonschema~=2.19.1
 Requires-Dist: python-dotenv~=1.0.0
-Requires-Dist: s3fs==
+Requires-Dist: s3fs==2024.2.0
 Provides-Extra: dev
 Requires-Dist: pytest; extra == "dev"
 Requires-Dist: testcontainers-minio; extra == "dev"
+Requires-Dist: testcontainers-postgres; extra == "dev"

 # Data Contract CLI

@@ -44,12 +49,12 @@ It uses data contract YAML files to lint the data contract, connect to data sour

 ## Getting started

-Let's use [pip](https://pip.pypa.io/en/stable/getting-started/) to install the CLI.
+Let's use [pip](https://pip.pypa.io/en/stable/getting-started/) to install the CLI.
 ```bash
 $ pip3 install datacontract-cli
 ```

-Now, let's look at this data contract:
+Now, let's look at this data contract:
 [https://datacontract.com/examples/covid-cases/datacontract.yaml](https://datacontract.com/examples/covid-cases/datacontract.yaml)

 We have a _servers_ section with endpoint details to the (public) S3 bucket, _models_ for the structure of the data, and _quality_ attributes that describe the expected freshness and number of rows.

@@ -77,6 +82,9 @@ $ datacontract lint datacontract.yaml
 # execute schema and quality checks
 $ datacontract test datacontract.yaml

+# execute schema and quality checks on the examples within the contract
+$ datacontract test --examples datacontract.yaml
+
 # find differences between to data contracts (Coming Soon)
 $ datacontract diff datacontract-v1.yaml datacontract-v2.yaml

@@ -124,6 +132,7 @@ Choose the most appropriate installation method for your needs:

 ### pip
 Python 3.11 recommended.
+Python 3.12 available as pre-release release candidate for 0.9.3

 ```bash
 pip3 install datacontract-cli

@@ -135,17 +144,17 @@ pipx installs into an isolated environment.
 pipx install datacontract-cli
 ```

-###
+### Docker

 ```bash
-
+docker pull --platform linux/amd64 datacontract/cli
+docker run --rm --platform linux/amd64 -v ${PWD}:/home/datacontract datacontract/cli
 ```

-
+Or via an alias that automatically uses the latest version:

 ```bash
-docker
-docker run --rm -v ${PWD}:/datacontract datacontract/cli
+alias datacontract='docker run --rm -v "${PWD}:/home/datacontract" --platform linux/amd64 datacontract/cli:latest'
 ```

 ## Documentation

@@ -154,7 +163,7 @@ docker run --rm -v ${PWD}:/datacontract datacontract/cli

 Data Contract CLI can connect to data sources and run schema and quality tests to verify that the data contract is valid.

-```bash
+```bash
 $ datacontract test --server production datacontract.yaml
 ```

@@ -168,11 +177,12 @@ The application uses different engines, based on the server `type`.
 | `s3` | `json` | Support for `new_line` delimited JSON files and one JSON record per file. | ✅ | fastjsonschema<br> soda-core-duckdb |
 | `s3` | `csv` | | ✅ | soda-core-duckdb |
 | `s3` | `delta` | | Coming soon | TBD |
-| `postgres` | n/a | |
-| `snowflake` | n/a | | ✅
-| `bigquery` | n/a | |
+| `postgres` | n/a | | ✅ | soda-core-postgres |
+| `snowflake` | n/a | | ✅ | soda-core-snowflake |
+| `bigquery` | n/a | | ✅ | soda-core-bigquery |
 | `redshift` | n/a | | Coming soon | TBD |
-| `databricks` | n/a |
+| `databricks` | n/a | Support for Databricks SQL with Unity catalog and Hive metastore. | ✅ | soda-core-spark |
+| `databricks` | n/a | Support for Spark for programmatic use in Notebooks. | ✅ | soda-core-spark-df |
 | `kafka` | `json` | | Coming soon | TBD |
 | `kafka` | `avro` | | Coming soon | TBD |
 | `kafka` | `protobuf` | | Coming soon | TBD |

@@ -182,32 +192,178 @@ The application uses different engines, based on the server `type`.

 Feel free to create an issue, if you need support for an additional type.

-###
+### S3

-
+Data Contract CLI can test data that is stored in S3 buckets or any S3-compliant endpoints in various formats.
+
+#### Example

 datacontract.yaml
-```
+```yaml
 servers:
   production:
     type: s3
     endpointUrl: https://minio.example.com # not needed with AWS S3
     location: s3://bucket-name/path/*/*.json
-    delimiter: new_line # new_line, array, or none
     format: json
+    delimiter: new_line # new_line, array, or none
+```
+
+#### Environment Variables
+
+| Environment Variable | Example | Description |
+|-----------------------------------|-------------------------------|-----------------------|
+| `DATACONTRACT_S3_REGION` | `eu-central-1` | Region of S3 bucket |
+| `DATACONTRACT_S3_ACCESS_KEY_ID` | `AKIAXV5Q5QABCDEFGH` | AWS Access Key ID |
+| `DATACONTRACT_S3_SECRET_ACCESS_KEY` | `93S7LRrJcqLaaaa/XXXXXXXXXXXXX` | AWS Secret Access Key |
+
+
+### Postgres
+
+Data Contract CLI can test data in Postgres or Postgres-compliant databases (e.g., RisingWave).
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  postgres:
+    type: postgres
+    host: localhost
+    port: 5432
+    database: postgres
+    schema: public
+models:
+  my_table_1: # corresponds to a table
+    type: table
+    fields:
+      my_column_1: # corresponds to a column
+        type: varchar
+```
+
+#### Environment Variables
+
+| Environment Variable | Example | Description |
+|----------------------------------|--------------------|-------------|
+| `DATACONTRACT_POSTGRES_USERNAME` | `postgres` | Username |
+| `DATACONTRACT_POSTGRES_PASSWORD` | `mysecretpassword` | Password |
+
+
+### BigQuery
+
+We support authentication to BigQuery using Service Account Key. The used Service Account should include the roles:
+* BigQuery Job User
+* BigQuery Data Viewer
+
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  production:
+    type: bigquery
+    project: datameshexample-product
+    dataset: datacontract_cli_test_dataset
+models:
+  datacontract_cli_test_table: # corresponds to a BigQuery table
+    type: table
+    fields: ...
+```
+
+#### Environment Variables
+
+| Environment Variable | Example | Description |
+|----------------------------------------------|---------------------------|---------------------------------------------------------|
+| `DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH` | `~/service-access-key.json` | Service Access key as saved on key creation by BigQuery |
+
+
+### Databricks
+
+Works with Unity Catalog and Hive metastore.
+
+Needs a running SQL warehouse or compute cluster.
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  production:
+    type: databricks
+    host: dbc-abcdefgh-1234.cloud.databricks.com
+    catalog: acme_catalog_prod
+    schema: orders_latest
+models:
+  orders: # corresponds to a table
+    type: table
+    fields: ...
 ```

-Environment
+#### Environment Variables
+
+| Environment Variable | Example | Description |
+|----------------------------------------------|--------------------------------------|-------------------------------------------------------|
+| `DATACONTRACT_DATABRICKS_TOKEN` | `dapia00000000000000000000000000000` | The personal access token to authenticate |
+| `DATACONTRACT_DATABRICKS_HTTP_PATH` | `/sql/1.0/warehouses/b053a3ffffffff` | The HTTP path to the SQL warehouse or compute cluster |
+
+
+### Databricks (programmatic)
+
+Works with Unity Catalog and Hive metastore.
+When running in a notebook or pipeline, the provided `spark` session can be used.
+An additional authentication is not required.
+
+Requires a Databricks Runtime with Python >= 3.10.
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  production:
+    type: databricks
+    host: dbc-abcdefgh-1234.cloud.databricks.com # ignored, always use current host
+    catalog: acme_catalog_prod
+    schema: orders_latest
+models:
+  orders: # corresponds to a table
+    type: table
+    fields: ...
 ```
-
-
-
+
+Notebook
+```python
+%pip install git+https://github.com/datacontract/cli.git
+dbutils.library.restartPython()
+
+from datacontract.data_contract import DataContract
+
+data_contract = DataContract(
+  data_contract_file="/Volumes/acme_catalog_prod/orders_latest/datacontract/datacontract.yaml",
+  spark=spark)
+run = data_contract.test()
+run.result
 ```


+### Exports
+
+Available export options:
+
+| Type | Description | Status |
+|--------------|------------------------------------------------|--------|
+| `jsonschema` | Export to JSON Schema | ✅ |
+| `sodacl` | Export to SodaCL quality checks in YAML format | ✅ |
+| `dbt` | Export to dbt model in YAML format | TBD |
+| `avro` | Export to AVRO models | TBD |
+| `pydantic` | Export to pydantic models | TBD |
+| `sql` | Export to SQL DDL | TBD |
+| `protobuf` | Export to Protobuf | TBD |
+
 ## Development Setup

-Python base interpreter should be 3.11.x
+Python base interpreter should be 3.11.x (unless working on 3.12 release candidate).

 ```bash
 # create venv

@@ -237,7 +393,7 @@ Docker Build

 ```
 docker build -t datacontract/cli .
-docker run --rm -v ${PWD}:/datacontract datacontract/cli
+docker run --rm -v ${PWD}:/home/datacontract datacontract/cli
 ```

 ## Contribution

@@ -251,3 +407,7 @@ We are happy to receive your contributions. Propose your change in an issue or d
 ## Credits

 Created by [Stefan Negele](https://www.linkedin.com/in/stefan-negele-573153112/) and [Jochen Christ](https://www.linkedin.com/in/jochenchrist/).
+
+
+
+<a href="https://github.com/datacontract/cli" class="github-corner" aria-label="View source on GitHub"><svg width="80" height="80" viewBox="0 0 250 250" style="fill:#151513; color:#fff; position: absolute; top: 0; border: 0; right: 0;" aria-hidden="true"><path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"></path></svg></a><style>.github-corner:hover .octo-arm{animation:octocat-wave 560ms ease-in-out}@keyframes octocat-wave{0%,100%{transform:rotate(0)}20%,60%{transform:rotate(-25deg)}40%,80%{transform:rotate(10deg)}}@media (max-width:500px){.github-corner:hover .octo-arm{animation:none}.github-corner .octo-arm{animation:octocat-wave 560ms ease-in-out}}</style>
{datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/README.md

@@ -15,12 +15,12 @@ It uses data contract YAML files to lint the data contract, connect to data sour

 ## Getting started

-Let's use [pip](https://pip.pypa.io/en/stable/getting-started/) to install the CLI.
+Let's use [pip](https://pip.pypa.io/en/stable/getting-started/) to install the CLI.
 ```bash
 $ pip3 install datacontract-cli
 ```

-Now, let's look at this data contract:
+Now, let's look at this data contract:
 [https://datacontract.com/examples/covid-cases/datacontract.yaml](https://datacontract.com/examples/covid-cases/datacontract.yaml)

 We have a _servers_ section with endpoint details to the (public) S3 bucket, _models_ for the structure of the data, and _quality_ attributes that describe the expected freshness and number of rows.

@@ -48,6 +48,9 @@ $ datacontract lint datacontract.yaml
 # execute schema and quality checks
 $ datacontract test datacontract.yaml

+# execute schema and quality checks on the examples within the contract
+$ datacontract test --examples datacontract.yaml
+
 # find differences between to data contracts (Coming Soon)
 $ datacontract diff datacontract-v1.yaml datacontract-v2.yaml

@@ -95,6 +98,7 @@ Choose the most appropriate installation method for your needs:

 ### pip
 Python 3.11 recommended.
+Python 3.12 available as pre-release release candidate for 0.9.3

 ```bash
 pip3 install datacontract-cli

@@ -106,17 +110,17 @@ pipx installs into an isolated environment.
 pipx install datacontract-cli
 ```

-###
+### Docker

 ```bash
-
+docker pull --platform linux/amd64 datacontract/cli
+docker run --rm --platform linux/amd64 -v ${PWD}:/home/datacontract datacontract/cli
 ```

-
+Or via an alias that automatically uses the latest version:

 ```bash
-docker
-docker run --rm -v ${PWD}:/datacontract datacontract/cli
+alias datacontract='docker run --rm -v "${PWD}:/home/datacontract" --platform linux/amd64 datacontract/cli:latest'
 ```

 ## Documentation

@@ -125,7 +129,7 @@ docker run --rm -v ${PWD}:/datacontract datacontract/cli

 Data Contract CLI can connect to data sources and run schema and quality tests to verify that the data contract is valid.

-```bash
+```bash
 $ datacontract test --server production datacontract.yaml
 ```

@@ -139,11 +143,12 @@ The application uses different engines, based on the server `type`.
 | `s3` | `json` | Support for `new_line` delimited JSON files and one JSON record per file. | ✅ | fastjsonschema<br> soda-core-duckdb |
 | `s3` | `csv` | | ✅ | soda-core-duckdb |
 | `s3` | `delta` | | Coming soon | TBD |
-| `postgres` | n/a | |
-| `snowflake` | n/a | | ✅
-| `bigquery` | n/a | |
+| `postgres` | n/a | | ✅ | soda-core-postgres |
+| `snowflake` | n/a | | ✅ | soda-core-snowflake |
+| `bigquery` | n/a | | ✅ | soda-core-bigquery |
 | `redshift` | n/a | | Coming soon | TBD |
-| `databricks` | n/a |
+| `databricks` | n/a | Support for Databricks SQL with Unity catalog and Hive metastore. | ✅ | soda-core-spark |
+| `databricks` | n/a | Support for Spark for programmatic use in Notebooks. | ✅ | soda-core-spark-df |
 | `kafka` | `json` | | Coming soon | TBD |
 | `kafka` | `avro` | | Coming soon | TBD |
 | `kafka` | `protobuf` | | Coming soon | TBD |

@@ -153,32 +158,178 @@ The application uses different engines, based on the server `type`.

 Feel free to create an issue, if you need support for an additional type.

-###
+### S3

-
+Data Contract CLI can test data that is stored in S3 buckets or any S3-compliant endpoints in various formats.
+
+#### Example

 datacontract.yaml
-```
+```yaml
 servers:
   production:
     type: s3
     endpointUrl: https://minio.example.com # not needed with AWS S3
     location: s3://bucket-name/path/*/*.json
-    delimiter: new_line # new_line, array, or none
     format: json
+    delimiter: new_line # new_line, array, or none
+```
+
+#### Environment Variables
+
+| Environment Variable | Example | Description |
+|-----------------------------------|-------------------------------|-----------------------|
+| `DATACONTRACT_S3_REGION` | `eu-central-1` | Region of S3 bucket |
+| `DATACONTRACT_S3_ACCESS_KEY_ID` | `AKIAXV5Q5QABCDEFGH` | AWS Access Key ID |
+| `DATACONTRACT_S3_SECRET_ACCESS_KEY` | `93S7LRrJcqLaaaa/XXXXXXXXXXXXX` | AWS Secret Access Key |
+
+
+### Postgres
+
+Data Contract CLI can test data in Postgres or Postgres-compliant databases (e.g., RisingWave).
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  postgres:
+    type: postgres
+    host: localhost
+    port: 5432
+    database: postgres
+    schema: public
+models:
+  my_table_1: # corresponds to a table
+    type: table
+    fields:
+      my_column_1: # corresponds to a column
+        type: varchar
+```
+
+#### Environment Variables
+
+| Environment Variable | Example | Description |
+|----------------------------------|--------------------|-------------|
+| `DATACONTRACT_POSTGRES_USERNAME` | `postgres` | Username |
+| `DATACONTRACT_POSTGRES_PASSWORD` | `mysecretpassword` | Password |
+
+
+### BigQuery
+
+We support authentication to BigQuery using Service Account Key. The used Service Account should include the roles:
+* BigQuery Job User
+* BigQuery Data Viewer
+
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  production:
+    type: bigquery
+    project: datameshexample-product
+    dataset: datacontract_cli_test_dataset
+models:
+  datacontract_cli_test_table: # corresponds to a BigQuery table
+    type: table
+    fields: ...
+```
+
+#### Environment Variables
+
+| Environment Variable | Example | Description |
+|----------------------------------------------|---------------------------|---------------------------------------------------------|
+| `DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH` | `~/service-access-key.json` | Service Access key as saved on key creation by BigQuery |
+
+
+### Databricks
+
+Works with Unity Catalog and Hive metastore.
+
+Needs a running SQL warehouse or compute cluster.
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  production:
+    type: databricks
+    host: dbc-abcdefgh-1234.cloud.databricks.com
+    catalog: acme_catalog_prod
+    schema: orders_latest
+models:
+  orders: # corresponds to a table
+    type: table
+    fields: ...
 ```

-Environment
+#### Environment Variables
+
+| Environment Variable | Example | Description |
+|----------------------------------------------|--------------------------------------|-------------------------------------------------------|
+| `DATACONTRACT_DATABRICKS_TOKEN` | `dapia00000000000000000000000000000` | The personal access token to authenticate |
+| `DATACONTRACT_DATABRICKS_HTTP_PATH` | `/sql/1.0/warehouses/b053a3ffffffff` | The HTTP path to the SQL warehouse or compute cluster |
+
+
+### Databricks (programmatic)
+
+Works with Unity Catalog and Hive metastore.
+When running in a notebook or pipeline, the provided `spark` session can be used.
+An additional authentication is not required.
+
+Requires a Databricks Runtime with Python >= 3.10.
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  production:
+    type: databricks
+    host: dbc-abcdefgh-1234.cloud.databricks.com # ignored, always use current host
+    catalog: acme_catalog_prod
+    schema: orders_latest
+models:
+  orders: # corresponds to a table
+    type: table
+    fields: ...
 ```
-
-
-
+
+Notebook
+```python
+%pip install git+https://github.com/datacontract/cli.git
+dbutils.library.restartPython()
+
+from datacontract.data_contract import DataContract
+
+data_contract = DataContract(
+  data_contract_file="/Volumes/acme_catalog_prod/orders_latest/datacontract/datacontract.yaml",
+  spark=spark)
+run = data_contract.test()
+run.result
 ```


+### Exports
+
+Available export options:
+
+| Type | Description | Status |
+|--------------|------------------------------------------------|--------|
+| `jsonschema` | Export to JSON Schema | ✅ |
+| `sodacl` | Export to SodaCL quality checks in YAML format | ✅ |
+| `dbt` | Export to dbt model in YAML format | TBD |
+| `avro` | Export to AVRO models | TBD |
+| `pydantic` | Export to pydantic models | TBD |
+| `sql` | Export to SQL DDL | TBD |
+| `protobuf` | Export to Protobuf | TBD |
+
 ## Development Setup

-Python base interpreter should be 3.11.x
+Python base interpreter should be 3.11.x (unless working on 3.12 release candidate).

 ```bash
 # create venv

@@ -208,7 +359,7 @@ Docker Build

 ```
 docker build -t datacontract/cli .
-docker run --rm -v ${PWD}:/datacontract datacontract/cli
+docker run --rm -v ${PWD}:/home/datacontract datacontract/cli
 ```

 ## Contribution

@@ -222,3 +373,7 @@ We are happy to receive your contributions. Propose your change in an issue or d
 ## Credits

 Created by [Stefan Negele](https://www.linkedin.com/in/stefan-negele-573153112/) and [Jochen Christ](https://www.linkedin.com/in/jochenchrist/).
+
+
+
+<a href="https://github.com/datacontract/cli" class="github-corner" aria-label="View source on GitHub"><svg width="80" height="80" viewBox="0 0 250 250" style="fill:#151513; color:#fff; position: absolute; top: 0; border: 0; right: 0;" aria-hidden="true"><path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"></path></svg></a><style>.github-corner:hover .octo-arm{animation:octocat-wave 560ms ease-in-out}@keyframes octocat-wave{0%,100%{transform:rotate(0)}20%,60%{transform:rotate(-25deg)}40%,80%{transform:rotate(10deg)}}@media (max-width:500px){.github-corner:hover .octo-arm{animation:none}.github-corner .octo-arm{animation:octocat-wave 560ms ease-in-out}}</style>
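The Postgres, BigQuery, and Databricks sections added above pass credentials through `DATACONTRACT_*` environment variables, while the notebook example shows the programmatic `DataContract` API. As a rough illustration only — a minimal sketch that assumes the documented variables are read from the process environment at test time, with placeholder credentials — the two could be combined like this for the Postgres case:

```python
# Sketch only: combines the documented DATACONTRACT_POSTGRES_* variables with the
# programmatic API shown in the notebook example above. The credentials are
# placeholders, and the environment-variable handling is assumed from the README
# tables, not taken from the library source.
import os

from datacontract.data_contract import DataContract

os.environ["DATACONTRACT_POSTGRES_USERNAME"] = "postgres"
os.environ["DATACONTRACT_POSTGRES_PASSWORD"] = "mysecretpassword"

data_contract = DataContract(data_contract_file="datacontract.yaml")
run = data_contract.test()
print(run.result)
```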
{datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/cli.py

@@ -87,14 +87,16 @@ def test(
              "Use the key of the server object in the data contract yaml file "
              "to refer to a server, e.g., `production`, or `all` for all "
              "servers (default).")] = "all",
+    examples: Annotated[bool, typer.Option(
+        help="Run the schema and quality tests on the example data within the data contract.")] = None,
     publish: Annotated[str, typer.Option(
-        help="")] = None,
+        help="The url to publish the results after the test")] = None,
 ):
     """
     Run schema and quality tests on configured servers.
     """
     print(f"Testing {location}")
-    run = DataContract(data_contract_file=location, publish_url=publish).test()
+    run = DataContract(data_contract_file=location, publish_url=publish, examples=examples).test()
     _handle_result(run)
 
 
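The `datacontract/cli.py` hunk above forwards the new `--examples` flag and the `--publish` URL straight into the `DataContract` constructor. A hedged sketch of the equivalent call from Python, using a placeholder publish endpoint rather than a documented one:

```python
# Mirrors the CLI wiring shown in the hunk above: `examples` toggles testing the
# example data inside the contract, `publish_url` is where results are posted
# after the test. The URL below is a placeholder, not a documented endpoint.
from datacontract.data_contract import DataContract

run = DataContract(
    data_contract_file="datacontract.yaml",
    publish_url="https://example.com/datacontract/test-results",  # placeholder
    examples=True,
).test()
print(run.result)
```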