datacontract-cli 0.9.2__tar.gz → 0.9.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datacontract-cli might be problematic. Click here for more details.

Files changed (64) hide show
  1. {datacontract-cli-0.9.2/datacontract_cli.egg-info → datacontract-cli-0.9.4}/PKG-INFO +191 -31
  2. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/README.md +177 -22
  3. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/cli.py +4 -2
  4. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/data_contract.py +69 -19
  5. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/fastjsonschema/check_jsonschema.py +56 -42
  6. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/fastjsonschema/s3/s3_read_files.py +1 -1
  7. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/soda/check_soda_execute.py +25 -1
  8. datacontract-cli-0.9.4/datacontract/engines/soda/connections/bigquery.py +18 -0
  9. datacontract-cli-0.9.4/datacontract/engines/soda/connections/databricks.py +20 -0
  10. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/soda/connections/duckdb.py +10 -6
  11. datacontract-cli-0.9.4/datacontract/engines/soda/connections/postgres.py +21 -0
  12. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/export/sodacl_converter.py +2 -2
  13. datacontract-cli-0.9.4/datacontract/lint/lint.py +126 -0
  14. datacontract-cli-0.9.4/datacontract/lint/linters/__init__.py +0 -0
  15. datacontract-cli-0.9.4/datacontract/lint/linters/example_model_linter.py +67 -0
  16. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/model/data_contract_specification.py +5 -0
  17. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4/datacontract_cli.egg-info}/PKG-INFO +191 -31
  18. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract_cli.egg-info/SOURCES.txt +15 -1
  19. datacontract-cli-0.9.4/datacontract_cli.egg-info/requires.txt +21 -0
  20. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/pyproject.toml +15 -10
  21. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_example_parquet.py +0 -1
  22. datacontract-cli-0.9.4/tests/test_examples_bigquery.py +25 -0
  23. datacontract-cli-0.9.4/tests/test_examples_databricks.py +25 -0
  24. datacontract-cli-0.9.4/tests/test_examples_examples_csv.py +25 -0
  25. datacontract-cli-0.9.4/tests/test_examples_examples_inline.py +25 -0
  26. datacontract-cli-0.9.4/tests/test_examples_examples_json.py +25 -0
  27. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_examples_local_json.py +5 -2
  28. datacontract-cli-0.9.4/tests/test_examples_postgres.py +63 -0
  29. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_examples_s3_json_complex.py +2 -1
  30. datacontract-cli-0.9.4/tests/test_examples_s3_json_multiple_models.py +56 -0
  31. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_export_sodacl.py +2 -2
  32. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_lint.py +1 -2
  33. datacontract-cli-0.9.4/tests/test_linters.py +108 -0
  34. datacontract-cli-0.9.2/datacontract_cli.egg-info/requires.txt +0 -16
  35. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/LICENSE +0 -0
  36. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/__init__.py +0 -0
  37. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/__init__.py +0 -0
  38. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +0 -0
  39. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/datacontract/check_that_datacontract_file_exists.py +0 -0
  40. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -0
  41. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/soda/__init__.py +0 -0
  42. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/soda/connections/dask.py +0 -0
  43. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/engines/soda/connections/snowflake.py +0 -0
  44. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/export/jsonschema_converter.py +0 -0
  45. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/init/download_datacontract_file.py +0 -0
  46. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/integration/publish_datamesh_manager.py +0 -0
  47. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/lint/files.py +0 -0
  48. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/lint/resolve.py +0 -0
  49. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/lint/schema.py +0 -0
  50. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/lint/urls.py +0 -0
  51. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/model/exceptions.py +0 -0
  52. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract/model/run.py +0 -0
  53. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract_cli.egg-info/dependency_links.txt +0 -0
  54. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract_cli.egg-info/entry_points.txt +0 -0
  55. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/datacontract_cli.egg-info/top_level.txt +0 -0
  56. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/setup.cfg +0 -0
  57. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_cli.py +0 -0
  58. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_download_datacontract_file.py +0 -0
  59. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_examples_s3_csv.py +0 -0
  60. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_examples_s3_json.py +0 -0
  61. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_examples_s3_json_remote.py +0 -0
  62. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_examples_snowflake.py +0 -0
  63. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_export_jsonschema.py +0 -0
  64. {datacontract-cli-0.9.2 → datacontract-cli-0.9.4}/tests/test_integration_datameshmanager.py +0 -0
@@ -1,31 +1,36 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datacontract-cli
3
- Version: 0.9.2
4
- Summary: Validate data contracts
3
+ Version: 0.9.4
4
+ Summary: Test data contracts
5
5
  Author-email: Jochen Christ <jochen.christ@innoq.com>, Stefan Negele <stefan.negele@innoq.com>
6
6
  Project-URL: Homepage, https://cli.datacontract.com
7
7
  Project-URL: Issues, https://github.com/datacontract/cli/issues
8
8
  Classifier: Programming Language :: Python :: 3
9
9
  Classifier: License :: OSI Approved :: MIT License
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: ~=3.11
11
+ Requires-Python: >=3.10
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: typer[all]~=0.9.0
15
- Requires-Dist: pydantic~=2.5.3
15
+ Requires-Dist: pydantic<2.7.0,>=2.5.3
16
16
  Requires-Dist: pyyaml~=6.0.1
17
17
  Requires-Dist: requests~=2.31.0
18
18
  Requires-Dist: fastparquet==2023.10.1
19
- Requires-Dist: soda-core-duckdb~=3.1.3
20
- Requires-Dist: soda-core-snowflake~=3.1.3
21
- Requires-Dist: snowflake-connector-python[pandas]~=3.6.0
22
- Requires-Dist: duckdb~=0.9.2
19
+ Requires-Dist: soda-core-bigquery~=3.1.5
20
+ Requires-Dist: soda-core-duckdb~=3.1.5
21
+ Requires-Dist: soda-core-postgres~=3.1.5
22
+ Requires-Dist: soda-core-snowflake~=3.1.5
23
+ Requires-Dist: soda-core-spark[databricks]~=3.1.5
24
+ Requires-Dist: soda-core-spark-df~=3.1.5
25
+ Requires-Dist: snowflake-connector-python[pandas]<3.8,>=3.6
26
+ Requires-Dist: duckdb==0.10.0
23
27
  Requires-Dist: fastjsonschema~=2.19.1
24
28
  Requires-Dist: python-dotenv~=1.0.0
25
- Requires-Dist: s3fs==2023.12.2
29
+ Requires-Dist: s3fs==2024.2.0
26
30
  Provides-Extra: dev
27
31
  Requires-Dist: pytest; extra == "dev"
28
32
  Requires-Dist: testcontainers-minio; extra == "dev"
33
+ Requires-Dist: testcontainers-postgres; extra == "dev"
29
34
 
30
35
  # Data Contract CLI
31
36
 
@@ -44,12 +49,12 @@ It uses data contract YAML files to lint the data contract, connect to data sour
44
49
 
45
50
  ## Getting started
46
51
 
47
- Let's use [pip](https://pip.pypa.io/en/stable/getting-started/) to install the CLI.
52
+ Let's use [pip](https://pip.pypa.io/en/stable/getting-started/) to install the CLI.
48
53
  ```bash
49
54
  $ pip3 install datacontract-cli
50
55
  ```
51
56
 
52
- Now, let's look at this data contract:
57
+ Now, let's look at this data contract:
53
58
  [https://datacontract.com/examples/covid-cases/datacontract.yaml](https://datacontract.com/examples/covid-cases/datacontract.yaml)
54
59
 
55
60
  We have a _servers_ section with endpoint details to the (public) S3 bucket, _models_ for the structure of the data, and _quality_ attributes that describe the expected freshness and number of rows.
@@ -77,6 +82,9 @@ $ datacontract lint datacontract.yaml
77
82
  # execute schema and quality checks
78
83
  $ datacontract test datacontract.yaml
79
84
 
85
+ # execute schema and quality checks on the examples within the contract
86
+ $ datacontract test --examples datacontract.yaml
87
+
80
88
  # find differences between two data contracts (Coming Soon)
81
89
  $ datacontract diff datacontract-v1.yaml datacontract-v2.yaml
82
90
 
@@ -124,6 +132,7 @@ Choose the most appropriate installation method for your needs:
124
132
 
125
133
  ### pip
126
134
  Python 3.11 recommended.
135
+ Python 3.12 available as a pre-release candidate for 0.9.3
127
136
 
128
137
  ```bash
129
138
  pip3 install datacontract-cli
@@ -135,17 +144,17 @@ pipx installs into an isolated environment.
135
144
  pipx install datacontract-cli
136
145
  ```
137
146
 
138
- ### Homebrew (coming soon)
147
+ ### Docker
139
148
 
140
149
  ```bash
141
- brew install datacontract/brew/datacontract
150
+ docker pull --platform linux/amd64 datacontract/cli
151
+ docker run --rm --platform linux/amd64 -v ${PWD}:/home/datacontract datacontract/cli
142
152
  ```
143
153
 
144
- ### Docker (coming soon)
154
+ Or via an alias that automatically uses the latest version:
145
155
 
146
156
  ```bash
147
- docker pull datacontract/cli
148
- docker run --rm -v ${PWD}:/datacontract datacontract/cli
157
+ alias datacontract='docker run --rm -v "${PWD}:/home/datacontract" --platform linux/amd64 datacontract/cli:latest'
149
158
  ```
150
159
 
151
160
  ## Documentation
@@ -154,7 +163,7 @@ docker run --rm -v ${PWD}:/datacontract datacontract/cli
154
163
 
155
164
  Data Contract CLI can connect to data sources and run schema and quality tests to verify that the data contract is valid.
156
165
 
157
- ```bash
166
+ ```bash
158
167
  $ datacontract test --server production datacontract.yaml
159
168
  ```
160
169
 
@@ -168,11 +177,12 @@ The application uses different engines, based on the server `type`.
168
177
  | `s3` | `json` | Support for `new_line` delimited JSON files and one JSON record per file. | ✅ | fastjsonschema<br> soda-core-duckdb |
169
178
  | `s3` | `csv` | | ✅ | soda-core-duckdb |
170
179
  | `s3` | `delta` | | Coming soon | TBD |
171
- | `postgres` | n/a | | Coming soon | TBD |
172
- | `snowflake` | n/a | | ✅ | soda-core-snowflake |
173
- | `bigquery` | n/a | | Coming soon | TBD |
180
+ | `postgres` | n/a | | | soda-core-postgres |
181
+ | `snowflake` | n/a | | ✅ | soda-core-snowflake |
182
+ | `bigquery` | n/a | | | soda-core-bigquery |
174
183
  | `redshift` | n/a | | Coming soon | TBD |
175
- | `databricks` | n/a | | Coming soon | TBD |
184
+ | `databricks` | n/a | Support for Databricks SQL with Unity Catalog and Hive metastore. | | soda-core-spark |
185
+ | `databricks` | n/a | Support for Spark for programmatic use in Notebooks. | ✅ | soda-core-spark-df |
176
186
  | `kafka` | `json` | | Coming soon | TBD |
177
187
  | `kafka` | `avro` | | Coming soon | TBD |
178
188
  | `kafka` | `protobuf` | | Coming soon | TBD |
@@ -182,32 +192,178 @@ The application uses different engines, based on the server `type`.
182
192
 
183
193
  Feel free to create an issue, if you need support for an additional type.
184
194
 
185
- ### Server Type S3
195
+ ### S3
186
196
 
187
- Example:
197
+ Data Contract CLI can test data that is stored in S3 buckets or any S3-compliant endpoints in various formats.
198
+
199
+ #### Example
188
200
 
189
201
  datacontract.yaml
190
- ```
202
+ ```yaml
191
203
  servers:
192
204
  production:
193
205
  type: s3
194
206
  endpointUrl: https://minio.example.com # not needed with AWS S3
195
207
  location: s3://bucket-name/path/*/*.json
196
- delimiter: new_line # new_line, array, or none
197
208
  format: json
209
+ delimiter: new_line # new_line, array, or none
210
+ ```
211
+
212
+ #### Environment Variables
213
+
214
+ | Environment Variable | Example | Description |
215
+ |-----------------------------------|-------------------------------|-----------------------|
216
+ | `DATACONTRACT_S3_REGION` | `eu-central-1` | Region of S3 bucket |
217
+ | `DATACONTRACT_S3_ACCESS_KEY_ID` | `AKIAXV5Q5QABCDEFGH` | AWS Access Key ID |
218
+ | `DATACONTRACT_S3_SECRET_ACCESS_KEY` | `93S7LRrJcqLaaaa/XXXXXXXXXXXXX` | AWS Secret Access Key |
219
+
220
+
221
+ ### Postgres
222
+
223
+ Data Contract CLI can test data in Postgres or Postgres-compliant databases (e.g., RisingWave).
224
+
225
+ #### Example
226
+
227
+ datacontract.yaml
228
+ ```yaml
229
+ servers:
230
+ postgres:
231
+ type: postgres
232
+ host: localhost
233
+ port: 5432
234
+ database: postgres
235
+ schema: public
236
+ models:
237
+ my_table_1: # corresponds to a table
238
+ type: table
239
+ fields:
240
+ my_column_1: # corresponds to a column
241
+ type: varchar
242
+ ```
243
+
244
+ #### Environment Variables
245
+
246
+ | Environment Variable | Example | Description |
247
+ |----------------------------------|--------------------|-------------|
248
+ | `DATACONTRACT_POSTGRES_USERNAME` | `postgres` | Username |
249
+ | `DATACONTRACT_POSTGRES_PASSWORD` | `mysecretpassword` | Password |
250
+
251
+
252
+ ### BigQuery
253
+
254
+ We support authentication to BigQuery using Service Account Key. The used Service Account should include the roles:
255
+ * BigQuery Job User
256
+ * BigQuery Data Viewer
257
+
258
+
259
+ #### Example
260
+
261
+ datacontract.yaml
262
+ ```yaml
263
+ servers:
264
+ production:
265
+ type: bigquery
266
+ project: datameshexample-product
267
+ dataset: datacontract_cli_test_dataset
268
+ models:
269
+ datacontract_cli_test_table: # corresponds to a BigQuery table
270
+ type: table
271
+ fields: ...
272
+ ```
273
+
274
+ #### Environment Variables
275
+
276
+ | Environment Variable | Example | Description |
277
+ |----------------------------------------------|---------------------------|---------------------------------------------------------|
278
+ | `DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH` | `~/service-access-key.json` | Service Access key as saved on key creation by BigQuery |
279
+
280
+
281
+ ### Databricks
282
+
283
+ Works with Unity Catalog and Hive metastore.
284
+
285
+ Needs a running SQL warehouse or compute cluster.
286
+
287
+ #### Example
288
+
289
+ datacontract.yaml
290
+ ```yaml
291
+ servers:
292
+ production:
293
+ type: databricks
294
+ host: dbc-abcdefgh-1234.cloud.databricks.com
295
+ catalog: acme_catalog_prod
296
+ schema: orders_latest
297
+ models:
298
+ orders: # corresponds to a table
299
+ type: table
300
+ fields: ...
198
301
  ```
199
302
 
200
- Environment variables
303
+ #### Environment Variables
304
+
305
+ | Environment Variable | Example | Description |
306
+ |----------------------------------------------|--------------------------------------|-------------------------------------------------------|
307
+ | `DATACONTRACT_DATABRICKS_TOKEN` | `dapia00000000000000000000000000000` | The personal access token to authenticate |
308
+ | `DATACONTRACT_DATABRICKS_HTTP_PATH` | `/sql/1.0/warehouses/b053a3ffffffff` | The HTTP path to the SQL warehouse or compute cluster |
309
+
310
+
311
+ ### Databricks (programmatic)
312
+
313
+ Works with Unity Catalog and Hive metastore.
314
+ When running in a notebook or pipeline, the provided `spark` session can be used.
315
+ An additional authentication is not required.
316
+
317
+ Requires a Databricks Runtime with Python >= 3.10.
318
+
319
+ #### Example
320
+
321
+ datacontract.yaml
322
+ ```yaml
323
+ servers:
324
+ production:
325
+ type: databricks
326
+ host: dbc-abcdefgh-1234.cloud.databricks.com # ignored, always use current host
327
+ catalog: acme_catalog_prod
328
+ schema: orders_latest
329
+ models:
330
+ orders: # corresponds to a table
331
+ type: table
332
+ fields: ...
201
333
  ```
202
- export DATACONTRACT_S3_REGION=eu-central-1
203
- export DATACONTRACT_S3_ACCESS_KEY_ID=AKIAXV5Q5QABCDEFGH
204
- export DATACONTRACT_S3_SECRET_ACCESS_KEY=93S7LRrJcqLkdb2/XXXXXXXXXXXXX
334
+
335
+ Notebook
336
+ ```python
337
+ %pip install git+https://github.com/datacontract/cli.git
338
+ dbutils.library.restartPython()
339
+
340
+ from datacontract.data_contract import DataContract
341
+
342
+ data_contract = DataContract(
343
+ data_contract_file="/Volumes/acme_catalog_prod/orders_latest/datacontract/datacontract.yaml",
344
+ spark=spark)
345
+ run = data_contract.test()
346
+ run.result
205
347
  ```
206
348
 
207
349
 
350
+ ### Exports
351
+
352
+ Available export options:
353
+
354
+ | Type | Description | Status |
355
+ |--------------|------------------------------------------------|--------|
356
+ | `jsonschema` | Export to JSON Schema | ✅ |
357
+ | `sodacl` | Export to SodaCL quality checks in YAML format | ✅ |
358
+ | `dbt` | Export to dbt model in YAML format | TBD |
359
+ | `avro` | Export to AVRO models | TBD |
360
+ | `pydantic` | Export to pydantic models | TBD |
361
+ | `sql` | Export to SQL DDL | TBD |
362
+ | `protobuf` | Export to Protobuf | TBD |
363
+
208
364
  ## Development Setup
209
365
 
210
- Python base interpreter should be 3.11.x
366
+ Python base interpreter should be 3.11.x (unless working on 3.12 release candidate).
211
367
 
212
368
  ```bash
213
369
  # create venv
@@ -237,7 +393,7 @@ Docker Build
237
393
 
238
394
  ```
239
395
  docker build -t datacontract/cli .
240
- docker run --rm -v ${PWD}:/datacontract datacontract/cli
396
+ docker run --rm -v ${PWD}:/home/datacontract datacontract/cli
241
397
  ```
242
398
 
243
399
  ## Contribution
@@ -251,3 +407,7 @@ We are happy to receive your contributions. Propose your change in an issue or d
251
407
  ## Credits
252
408
 
253
409
  Created by [Stefan Negele](https://www.linkedin.com/in/stefan-negele-573153112/) and [Jochen Christ](https://www.linkedin.com/in/jochenchrist/).
410
+
411
+
412
+
413
+ <a href="https://github.com/datacontract/cli" class="github-corner" aria-label="View source on GitHub"><svg width="80" height="80" viewBox="0 0 250 250" style="fill:#151513; color:#fff; position: absolute; top: 0; border: 0; right: 0;" aria-hidden="true"><path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"></path></svg></a><style>.github-corner:hover .octo-arm{animation:octocat-wave 560ms ease-in-out}@keyframes octocat-wave{0%,100%{transform:rotate(0)}20%,60%{transform:rotate(-25deg)}40%,80%{transform:rotate(10deg)}}@media (max-width:500px){.github-corner:hover .octo-arm{animation:none}.github-corner .octo-arm{animation:octocat-wave 560ms ease-in-out}}</style>
@@ -15,12 +15,12 @@ It uses data contract YAML files to lint the data contract, connect to data sour
15
15
 
16
16
  ## Getting started
17
17
 
18
- Let's use [pip](https://pip.pypa.io/en/stable/getting-started/) to install the CLI.
18
+ Let's use [pip](https://pip.pypa.io/en/stable/getting-started/) to install the CLI.
19
19
  ```bash
20
20
  $ pip3 install datacontract-cli
21
21
  ```
22
22
 
23
- Now, let's look at this data contract:
23
+ Now, let's look at this data contract:
24
24
  [https://datacontract.com/examples/covid-cases/datacontract.yaml](https://datacontract.com/examples/covid-cases/datacontract.yaml)
25
25
 
26
26
  We have a _servers_ section with endpoint details to the (public) S3 bucket, _models_ for the structure of the data, and _quality_ attributes that describe the expected freshness and number of rows.
@@ -48,6 +48,9 @@ $ datacontract lint datacontract.yaml
48
48
  # execute schema and quality checks
49
49
  $ datacontract test datacontract.yaml
50
50
 
51
+ # execute schema and quality checks on the examples within the contract
52
+ $ datacontract test --examples datacontract.yaml
53
+
51
54
  # find differences between two data contracts (Coming Soon)
52
55
  $ datacontract diff datacontract-v1.yaml datacontract-v2.yaml
53
56
 
@@ -95,6 +98,7 @@ Choose the most appropriate installation method for your needs:
95
98
 
96
99
  ### pip
97
100
  Python 3.11 recommended.
101
+ Python 3.12 available as a pre-release candidate for 0.9.3
98
102
 
99
103
  ```bash
100
104
  pip3 install datacontract-cli
@@ -106,17 +110,17 @@ pipx installs into an isolated environment.
106
110
  pipx install datacontract-cli
107
111
  ```
108
112
 
109
- ### Homebrew (coming soon)
113
+ ### Docker
110
114
 
111
115
  ```bash
112
- brew install datacontract/brew/datacontract
116
+ docker pull --platform linux/amd64 datacontract/cli
117
+ docker run --rm --platform linux/amd64 -v ${PWD}:/home/datacontract datacontract/cli
113
118
  ```
114
119
 
115
- ### Docker (coming soon)
120
+ Or via an alias that automatically uses the latest version:
116
121
 
117
122
  ```bash
118
- docker pull datacontract/cli
119
- docker run --rm -v ${PWD}:/datacontract datacontract/cli
123
+ alias datacontract='docker run --rm -v "${PWD}:/home/datacontract" --platform linux/amd64 datacontract/cli:latest'
120
124
  ```
121
125
 
122
126
  ## Documentation
@@ -125,7 +129,7 @@ docker run --rm -v ${PWD}:/datacontract datacontract/cli
125
129
 
126
130
  Data Contract CLI can connect to data sources and run schema and quality tests to verify that the data contract is valid.
127
131
 
128
- ```bash
132
+ ```bash
129
133
  $ datacontract test --server production datacontract.yaml
130
134
  ```
131
135
 
@@ -139,11 +143,12 @@ The application uses different engines, based on the server `type`.
139
143
  | `s3` | `json` | Support for `new_line` delimited JSON files and one JSON record per file. | ✅ | fastjsonschema<br> soda-core-duckdb |
140
144
  | `s3` | `csv` | | ✅ | soda-core-duckdb |
141
145
  | `s3` | `delta` | | Coming soon | TBD |
142
- | `postgres` | n/a | | Coming soon | TBD |
143
- | `snowflake` | n/a | | ✅ | soda-core-snowflake |
144
- | `bigquery` | n/a | | Coming soon | TBD |
146
+ | `postgres` | n/a | | | soda-core-postgres |
147
+ | `snowflake` | n/a | | ✅ | soda-core-snowflake |
148
+ | `bigquery` | n/a | | | soda-core-bigquery |
145
149
  | `redshift` | n/a | | Coming soon | TBD |
146
- | `databricks` | n/a | | Coming soon | TBD |
150
+ | `databricks` | n/a | Support for Databricks SQL with Unity Catalog and Hive metastore. | | soda-core-spark |
151
+ | `databricks` | n/a | Support for Spark for programmatic use in Notebooks. | ✅ | soda-core-spark-df |
147
152
  | `kafka` | `json` | | Coming soon | TBD |
148
153
  | `kafka` | `avro` | | Coming soon | TBD |
149
154
  | `kafka` | `protobuf` | | Coming soon | TBD |
@@ -153,32 +158,178 @@ The application uses different engines, based on the server `type`.
153
158
 
154
159
  Feel free to create an issue, if you need support for an additional type.
155
160
 
156
- ### Server Type S3
161
+ ### S3
157
162
 
158
- Example:
163
+ Data Contract CLI can test data that is stored in S3 buckets or any S3-compliant endpoints in various formats.
164
+
165
+ #### Example
159
166
 
160
167
  datacontract.yaml
161
- ```
168
+ ```yaml
162
169
  servers:
163
170
  production:
164
171
  type: s3
165
172
  endpointUrl: https://minio.example.com # not needed with AWS S3
166
173
  location: s3://bucket-name/path/*/*.json
167
- delimiter: new_line # new_line, array, or none
168
174
  format: json
175
+ delimiter: new_line # new_line, array, or none
176
+ ```
177
+
178
+ #### Environment Variables
179
+
180
+ | Environment Variable | Example | Description |
181
+ |-----------------------------------|-------------------------------|-----------------------|
182
+ | `DATACONTRACT_S3_REGION` | `eu-central-1` | Region of S3 bucket |
183
+ | `DATACONTRACT_S3_ACCESS_KEY_ID` | `AKIAXV5Q5QABCDEFGH` | AWS Access Key ID |
184
+ | `DATACONTRACT_S3_SECRET_ACCESS_KEY` | `93S7LRrJcqLaaaa/XXXXXXXXXXXXX` | AWS Secret Access Key |
185
+
186
+
187
+ ### Postgres
188
+
189
+ Data Contract CLI can test data in Postgres or Postgres-compliant databases (e.g., RisingWave).
190
+
191
+ #### Example
192
+
193
+ datacontract.yaml
194
+ ```yaml
195
+ servers:
196
+ postgres:
197
+ type: postgres
198
+ host: localhost
199
+ port: 5432
200
+ database: postgres
201
+ schema: public
202
+ models:
203
+ my_table_1: # corresponds to a table
204
+ type: table
205
+ fields:
206
+ my_column_1: # corresponds to a column
207
+ type: varchar
208
+ ```
209
+
210
+ #### Environment Variables
211
+
212
+ | Environment Variable | Example | Description |
213
+ |----------------------------------|--------------------|-------------|
214
+ | `DATACONTRACT_POSTGRES_USERNAME` | `postgres` | Username |
215
+ | `DATACONTRACT_POSTGRES_PASSWORD` | `mysecretpassword` | Password |
216
+
217
+
218
+ ### BigQuery
219
+
220
+ We support authentication to BigQuery using Service Account Key. The used Service Account should include the roles:
221
+ * BigQuery Job User
222
+ * BigQuery Data Viewer
223
+
224
+
225
+ #### Example
226
+
227
+ datacontract.yaml
228
+ ```yaml
229
+ servers:
230
+ production:
231
+ type: bigquery
232
+ project: datameshexample-product
233
+ dataset: datacontract_cli_test_dataset
234
+ models:
235
+ datacontract_cli_test_table: # corresponds to a BigQuery table
236
+ type: table
237
+ fields: ...
238
+ ```
239
+
240
+ #### Environment Variables
241
+
242
+ | Environment Variable | Example | Description |
243
+ |----------------------------------------------|---------------------------|---------------------------------------------------------|
244
+ | `DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH` | `~/service-access-key.json` | Service Access key as saved on key creation by BigQuery |
245
+
246
+
247
+ ### Databricks
248
+
249
+ Works with Unity Catalog and Hive metastore.
250
+
251
+ Needs a running SQL warehouse or compute cluster.
252
+
253
+ #### Example
254
+
255
+ datacontract.yaml
256
+ ```yaml
257
+ servers:
258
+ production:
259
+ type: databricks
260
+ host: dbc-abcdefgh-1234.cloud.databricks.com
261
+ catalog: acme_catalog_prod
262
+ schema: orders_latest
263
+ models:
264
+ orders: # corresponds to a table
265
+ type: table
266
+ fields: ...
169
267
  ```
170
268
 
171
- Environment variables
269
+ #### Environment Variables
270
+
271
+ | Environment Variable | Example | Description |
272
+ |----------------------------------------------|--------------------------------------|-------------------------------------------------------|
273
+ | `DATACONTRACT_DATABRICKS_TOKEN` | `dapia00000000000000000000000000000` | The personal access token to authenticate |
274
+ | `DATACONTRACT_DATABRICKS_HTTP_PATH` | `/sql/1.0/warehouses/b053a3ffffffff` | The HTTP path to the SQL warehouse or compute cluster |
275
+
276
+
277
+ ### Databricks (programmatic)
278
+
279
+ Works with Unity Catalog and Hive metastore.
280
+ When running in a notebook or pipeline, the provided `spark` session can be used.
281
+ An additional authentication is not required.
282
+
283
+ Requires a Databricks Runtime with Python >= 3.10.
284
+
285
+ #### Example
286
+
287
+ datacontract.yaml
288
+ ```yaml
289
+ servers:
290
+ production:
291
+ type: databricks
292
+ host: dbc-abcdefgh-1234.cloud.databricks.com # ignored, always use current host
293
+ catalog: acme_catalog_prod
294
+ schema: orders_latest
295
+ models:
296
+ orders: # corresponds to a table
297
+ type: table
298
+ fields: ...
172
299
  ```
173
- export DATACONTRACT_S3_REGION=eu-central-1
174
- export DATACONTRACT_S3_ACCESS_KEY_ID=AKIAXV5Q5QABCDEFGH
175
- export DATACONTRACT_S3_SECRET_ACCESS_KEY=93S7LRrJcqLkdb2/XXXXXXXXXXXXX
300
+
301
+ Notebook
302
+ ```python
303
+ %pip install git+https://github.com/datacontract/cli.git
304
+ dbutils.library.restartPython()
305
+
306
+ from datacontract.data_contract import DataContract
307
+
308
+ data_contract = DataContract(
309
+ data_contract_file="/Volumes/acme_catalog_prod/orders_latest/datacontract/datacontract.yaml",
310
+ spark=spark)
311
+ run = data_contract.test()
312
+ run.result
176
313
  ```
177
314
 
178
315
 
316
+ ### Exports
317
+
318
+ Available export options:
319
+
320
+ | Type | Description | Status |
321
+ |--------------|------------------------------------------------|--------|
322
+ | `jsonschema` | Export to JSON Schema | ✅ |
323
+ | `sodacl` | Export to SodaCL quality checks in YAML format | ✅ |
324
+ | `dbt` | Export to dbt model in YAML format | TBD |
325
+ | `avro` | Export to AVRO models | TBD |
326
+ | `pydantic` | Export to pydantic models | TBD |
327
+ | `sql` | Export to SQL DDL | TBD |
328
+ | `protobuf` | Export to Protobuf | TBD |
329
+
179
330
  ## Development Setup
180
331
 
181
- Python base interpreter should be 3.11.x
332
+ Python base interpreter should be 3.11.x (unless working on 3.12 release candidate).
182
333
 
183
334
  ```bash
184
335
  # create venv
@@ -208,7 +359,7 @@ Docker Build
208
359
 
209
360
  ```
210
361
  docker build -t datacontract/cli .
211
- docker run --rm -v ${PWD}:/datacontract datacontract/cli
362
+ docker run --rm -v ${PWD}:/home/datacontract datacontract/cli
212
363
  ```
213
364
 
214
365
  ## Contribution
@@ -222,3 +373,7 @@ We are happy to receive your contributions. Propose your change in an issue or d
222
373
  ## Credits
223
374
 
224
375
  Created by [Stefan Negele](https://www.linkedin.com/in/stefan-negele-573153112/) and [Jochen Christ](https://www.linkedin.com/in/jochenchrist/).
376
+
377
+
378
+
379
+ <a href="https://github.com/datacontract/cli" class="github-corner" aria-label="View source on GitHub"><svg width="80" height="80" viewBox="0 0 250 250" style="fill:#151513; color:#fff; position: absolute; top: 0; border: 0; right: 0;" aria-hidden="true"><path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"></path></svg></a><style>.github-corner:hover .octo-arm{animation:octocat-wave 560ms ease-in-out}@keyframes octocat-wave{0%,100%{transform:rotate(0)}20%,60%{transform:rotate(-25deg)}40%,80%{transform:rotate(10deg)}}@media (max-width:500px){.github-corner:hover .octo-arm{animation:none}.github-corner .octo-arm{animation:octocat-wave 560ms ease-in-out}}</style>
@@ -87,14 +87,16 @@ def test(
87
87
  "Use the key of the server object in the data contract yaml file "
88
88
  "to refer to a server, e.g., `production`, or `all` for all "
89
89
  "servers (default).")] = "all",
90
+ examples: Annotated[bool, typer.Option(
91
+ help="Run the schema and quality tests on the example data within the data contract.")] = None,
90
92
  publish: Annotated[str, typer.Option(
91
- help="")] = None,
93
+ help="The url to publish the results after the test")] = None,
92
94
  ):
93
95
  """
94
96
  Run schema and quality tests on configured servers.
95
97
  """
96
98
  print(f"Testing {location}")
97
- run = DataContract(data_contract_file=location, publish_url=publish).test()
99
+ run = DataContract(data_contract_file=location, publish_url=publish, examples=examples).test()
98
100
  _handle_result(run)
99
101
 
100
102