datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. datacontract/__init__.py +13 -0
  2. datacontract/api.py +260 -0
  3. datacontract/breaking/breaking.py +242 -12
  4. datacontract/breaking/breaking_rules.py +37 -1
  5. datacontract/catalog/catalog.py +80 -0
  6. datacontract/cli.py +387 -117
  7. datacontract/data_contract.py +216 -353
  8. datacontract/engines/data_contract_checks.py +1041 -0
  9. datacontract/engines/data_contract_test.py +113 -0
  10. datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
  11. datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
  12. datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
  13. datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
  14. datacontract/engines/soda/check_soda_execute.py +100 -56
  15. datacontract/engines/soda/connections/athena.py +79 -0
  16. datacontract/engines/soda/connections/bigquery.py +8 -1
  17. datacontract/engines/soda/connections/databricks.py +12 -3
  18. datacontract/engines/soda/connections/duckdb_connection.py +241 -0
  19. datacontract/engines/soda/connections/kafka.py +206 -113
  20. datacontract/engines/soda/connections/snowflake.py +8 -5
  21. datacontract/engines/soda/connections/sqlserver.py +43 -0
  22. datacontract/engines/soda/connections/trino.py +26 -0
  23. datacontract/export/avro_converter.py +72 -8
  24. datacontract/export/avro_idl_converter.py +31 -25
  25. datacontract/export/bigquery_converter.py +130 -0
  26. datacontract/export/custom_converter.py +40 -0
  27. datacontract/export/data_caterer_converter.py +161 -0
  28. datacontract/export/dbml_converter.py +148 -0
  29. datacontract/export/dbt_converter.py +141 -54
  30. datacontract/export/dcs_exporter.py +6 -0
  31. datacontract/export/dqx_converter.py +126 -0
  32. datacontract/export/duckdb_type_converter.py +57 -0
  33. datacontract/export/excel_exporter.py +923 -0
  34. datacontract/export/exporter.py +100 -0
  35. datacontract/export/exporter_factory.py +216 -0
  36. datacontract/export/go_converter.py +105 -0
  37. datacontract/export/great_expectations_converter.py +257 -36
  38. datacontract/export/html_exporter.py +86 -0
  39. datacontract/export/iceberg_converter.py +188 -0
  40. datacontract/export/jsonschema_converter.py +71 -16
  41. datacontract/export/markdown_converter.py +337 -0
  42. datacontract/export/mermaid_exporter.py +110 -0
  43. datacontract/export/odcs_v3_exporter.py +375 -0
  44. datacontract/export/pandas_type_converter.py +40 -0
  45. datacontract/export/protobuf_converter.py +168 -68
  46. datacontract/export/pydantic_converter.py +6 -0
  47. datacontract/export/rdf_converter.py +13 -6
  48. datacontract/export/sodacl_converter.py +36 -188
  49. datacontract/export/spark_converter.py +245 -0
  50. datacontract/export/sql_converter.py +37 -3
  51. datacontract/export/sql_type_converter.py +269 -8
  52. datacontract/export/sqlalchemy_converter.py +170 -0
  53. datacontract/export/terraform_converter.py +7 -2
  54. datacontract/imports/avro_importer.py +246 -26
  55. datacontract/imports/bigquery_importer.py +221 -0
  56. datacontract/imports/csv_importer.py +143 -0
  57. datacontract/imports/dbml_importer.py +112 -0
  58. datacontract/imports/dbt_importer.py +240 -0
  59. datacontract/imports/excel_importer.py +1111 -0
  60. datacontract/imports/glue_importer.py +288 -0
  61. datacontract/imports/iceberg_importer.py +172 -0
  62. datacontract/imports/importer.py +51 -0
  63. datacontract/imports/importer_factory.py +128 -0
  64. datacontract/imports/json_importer.py +325 -0
  65. datacontract/imports/jsonschema_importer.py +146 -0
  66. datacontract/imports/odcs_importer.py +60 -0
  67. datacontract/imports/odcs_v3_importer.py +516 -0
  68. datacontract/imports/parquet_importer.py +81 -0
  69. datacontract/imports/protobuf_importer.py +264 -0
  70. datacontract/imports/spark_importer.py +262 -0
  71. datacontract/imports/sql_importer.py +274 -35
  72. datacontract/imports/unity_importer.py +219 -0
  73. datacontract/init/init_template.py +20 -0
  74. datacontract/integration/datamesh_manager.py +86 -0
  75. datacontract/lint/resolve.py +271 -49
  76. datacontract/lint/resources.py +21 -0
  77. datacontract/lint/schema.py +53 -17
  78. datacontract/lint/urls.py +32 -12
  79. datacontract/model/data_contract_specification/__init__.py +1 -0
  80. datacontract/model/exceptions.py +4 -1
  81. datacontract/model/odcs.py +24 -0
  82. datacontract/model/run.py +49 -29
  83. datacontract/output/__init__.py +0 -0
  84. datacontract/output/junit_test_results.py +135 -0
  85. datacontract/output/output_format.py +10 -0
  86. datacontract/output/test_results_writer.py +79 -0
  87. datacontract/py.typed +0 -0
  88. datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
  89. datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
  90. datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
  91. datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
  92. datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
  93. datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
  94. datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
  95. datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
  96. datacontract/templates/datacontract.html +139 -294
  97. datacontract/templates/datacontract_odcs.html +685 -0
  98. datacontract/templates/index.html +236 -0
  99. datacontract/templates/partials/datacontract_information.html +86 -0
  100. datacontract/templates/partials/datacontract_servicelevels.html +253 -0
  101. datacontract/templates/partials/datacontract_terms.html +51 -0
  102. datacontract/templates/partials/definition.html +25 -0
  103. datacontract/templates/partials/example.html +27 -0
  104. datacontract/templates/partials/model_field.html +144 -0
  105. datacontract/templates/partials/quality.html +49 -0
  106. datacontract/templates/partials/server.html +211 -0
  107. datacontract/templates/style/output.css +491 -72
  108. datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
  109. datacontract_cli-0.10.37.dist-info/RECORD +119 -0
  110. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
  111. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
  112. datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
  113. datacontract/engines/soda/connections/dask.py +0 -28
  114. datacontract/engines/soda/connections/duckdb.py +0 -76
  115. datacontract/export/csv_type_converter.py +0 -36
  116. datacontract/export/html_export.py +0 -66
  117. datacontract/export/odcs_converter.py +0 -102
  118. datacontract/init/download_datacontract_file.py +0 -17
  119. datacontract/integration/publish_datamesh_manager.py +0 -33
  120. datacontract/integration/publish_opentelemetry.py +0 -107
  121. datacontract/lint/lint.py +0 -141
  122. datacontract/lint/linters/description_linter.py +0 -34
  123. datacontract/lint/linters/example_model_linter.py +0 -91
  124. datacontract/lint/linters/field_pattern_linter.py +0 -34
  125. datacontract/lint/linters/field_reference_linter.py +0 -38
  126. datacontract/lint/linters/notice_period_linter.py +0 -55
  127. datacontract/lint/linters/quality_schema_linter.py +0 -52
  128. datacontract/lint/linters/valid_constraints_linter.py +0 -99
  129. datacontract/model/data_contract_specification.py +0 -141
  130. datacontract/web.py +0 -14
  131. datacontract_cli-0.10.0.dist-info/METADATA +0 -951
  132. datacontract_cli-0.10.0.dist-info/RECORD +0 -66
  133. /datacontract/{model → breaking}/breaking_change.py +0 -0
  134. /datacontract/{lint/linters → export}/__init__.py +0 -0
  135. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
  136. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,2235 @@
1
+ Metadata-Version: 2.4
2
+ Name: datacontract-cli
3
+ Version: 0.10.37
4
+ Summary: The datacontract CLI is an open source command-line tool for working with Data Contracts. It uses data contract YAML files to lint the data contract, connect to data sources and execute schema and quality tests, detect breaking changes, and export to different formats. The tool is written in Python. It can be used as a standalone CLI tool, in a CI/CD pipeline, or directly as a Python library.
5
+ Author-email: Jochen Christ <jochen.christ@innoq.com>, Stefan Negele <stefan.negele@innoq.com>, Simon Harrer <simon.harrer@innoq.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://cli.datacontract.com
8
+ Project-URL: Issues, https://github.com/datacontract/datacontract-cli/issues
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: typer<0.20,>=0.15.1
15
+ Requires-Dist: pydantic<2.13.0,>=2.8.2
16
+ Requires-Dist: pyyaml~=6.0.1
17
+ Requires-Dist: requests<2.33,>=2.31
18
+ Requires-Dist: fastjsonschema<2.22.0,>=2.19.1
19
+ Requires-Dist: fastparquet<2025.0.0,>=2024.5.0
20
+ Requires-Dist: numpy<2.0.0,>=1.26.4
21
+ Requires-Dist: python-multipart<1.0.0,>=0.0.20
22
+ Requires-Dist: rich<15.0,>=13.7
23
+ Requires-Dist: sqlglot<28.0.0,>=26.6.0
24
+ Requires-Dist: duckdb<2.0.0,>=1.0.0
25
+ Requires-Dist: soda-core-duckdb<3.6.0,>=3.3.20
26
+ Requires-Dist: setuptools>=60
27
+ Requires-Dist: python-dotenv<2.0.0,>=1.0.0
28
+ Requires-Dist: boto3<2.0.0,>=1.34.41
29
+ Requires-Dist: Jinja2<4.0.0,>=3.1.5
30
+ Requires-Dist: jinja_partials<1.0.0,>=0.2.1
31
+ Requires-Dist: datacontract-specification<2.0.0,>=1.2.3
32
+ Requires-Dist: open-data-contract-standard<4.0.0,>=3.0.5
33
+ Provides-Extra: avro
34
+ Requires-Dist: avro==1.12.0; extra == "avro"
35
+ Provides-Extra: bigquery
36
+ Requires-Dist: soda-core-bigquery<3.6.0,>=3.3.20; extra == "bigquery"
37
+ Provides-Extra: csv
38
+ Requires-Dist: pandas>=2.0.0; extra == "csv"
39
+ Provides-Extra: excel
40
+ Requires-Dist: openpyxl<4.0.0,>=3.1.5; extra == "excel"
41
+ Provides-Extra: databricks
42
+ Requires-Dist: soda-core-spark-df<3.6.0,>=3.3.20; extra == "databricks"
43
+ Requires-Dist: soda-core-spark[databricks]<3.6.0,>=3.3.20; extra == "databricks"
44
+ Requires-Dist: databricks-sql-connector<4.2.0,>=3.7.0; extra == "databricks"
45
+ Requires-Dist: databricks-sdk<0.68.0; extra == "databricks"
46
+ Requires-Dist: pyspark<4.0.0,>=3.5.5; extra == "databricks"
47
+ Provides-Extra: iceberg
48
+ Requires-Dist: pyiceberg==0.9.1; extra == "iceberg"
49
+ Provides-Extra: kafka
50
+ Requires-Dist: datacontract-cli[avro]; extra == "kafka"
51
+ Requires-Dist: soda-core-spark-df<3.6.0,>=3.3.20; extra == "kafka"
52
+ Requires-Dist: pyspark<4.0.0,>=3.5.5; extra == "kafka"
53
+ Provides-Extra: postgres
54
+ Requires-Dist: soda-core-postgres<3.6.0,>=3.3.20; extra == "postgres"
55
+ Provides-Extra: s3
56
+ Requires-Dist: s3fs<2026.0.0,>=2025.2.0; extra == "s3"
57
+ Requires-Dist: aiobotocore<2.26.0,>=2.17.0; extra == "s3"
58
+ Provides-Extra: snowflake
59
+ Requires-Dist: snowflake-connector-python[pandas]<4.1,>=3.6; extra == "snowflake"
60
+ Requires-Dist: soda-core-snowflake<3.6.0,>=3.3.20; extra == "snowflake"
61
+ Provides-Extra: sqlserver
62
+ Requires-Dist: soda-core-sqlserver<3.6.0,>=3.3.20; extra == "sqlserver"
63
+ Provides-Extra: athena
64
+ Requires-Dist: soda-core-athena<3.6.0,>=3.3.20; extra == "athena"
65
+ Provides-Extra: trino
66
+ Requires-Dist: soda-core-trino<3.6.0,>=3.3.20; extra == "trino"
67
+ Provides-Extra: dbt
68
+ Requires-Dist: dbt-core>=1.8.0; extra == "dbt"
69
+ Provides-Extra: dbml
70
+ Requires-Dist: pydbml>=1.1.1; extra == "dbml"
71
+ Provides-Extra: parquet
72
+ Requires-Dist: pyarrow>=18.1.0; extra == "parquet"
73
+ Provides-Extra: rdf
74
+ Requires-Dist: rdflib==7.0.0; extra == "rdf"
75
+ Provides-Extra: api
76
+ Requires-Dist: fastapi==0.116.1; extra == "api"
77
+ Requires-Dist: uvicorn==0.38.0; extra == "api"
78
+ Provides-Extra: protobuf
79
+ Requires-Dist: grpcio-tools>=1.53; extra == "protobuf"
80
+ Provides-Extra: all
81
+ Requires-Dist: datacontract-cli[api,athena,bigquery,csv,databricks,dbml,dbt,excel,iceberg,kafka,parquet,postgres,protobuf,rdf,s3,snowflake,sqlserver,trino]; extra == "all"
82
+ Provides-Extra: dev
83
+ Requires-Dist: datacontract-cli[all]; extra == "dev"
84
+ Requires-Dist: httpx==0.28.1; extra == "dev"
85
+ Requires-Dist: kafka-python; extra == "dev"
86
+ Requires-Dist: moto==5.1.13; extra == "dev"
87
+ Requires-Dist: pandas>=2.1.0; extra == "dev"
88
+ Requires-Dist: pre-commit<4.4.0,>=3.7.1; extra == "dev"
89
+ Requires-Dist: pytest; extra == "dev"
90
+ Requires-Dist: pytest-xdist; extra == "dev"
91
+ Requires-Dist: pymssql==2.3.8; extra == "dev"
92
+ Requires-Dist: ruff; extra == "dev"
93
+ Requires-Dist: testcontainers[kafka,minio,mssql,postgres]==4.12.0; extra == "dev"
94
+ Requires-Dist: trino==0.336.0; extra == "dev"
95
+ Dynamic: license-file
96
+
97
+ # Data Contract CLI
98
+
99
+ <p>
100
+ <a href="https://github.com/datacontract/datacontract-cli/actions/workflows/ci.yaml?query=branch%3Amain">
101
+ <img alt="Test Workflow" src="https://img.shields.io/github/actions/workflow/status/datacontract/datacontract-cli/ci.yaml?branch=main"></a>
102
+ <a href="https://github.com/datacontract/datacontract-cli">
103
+ <img alt="Stars" src="https://img.shields.io/github/stars/datacontract/datacontract-cli" /></a>
104
+ <a href="https://datacontract.com/slack" rel="nofollow"><img src="https://img.shields.io/badge/slack-join_chat-white.svg?logo=slack&amp;style=social" alt="Slack Status" data-canonical-src="https://img.shields.io/badge/slack-join_chat-white.svg?logo=slack&amp;style=social" style="max-width: 100%;"></a>
105
+ </p>
106
+
107
+ The `datacontract` CLI is an open-source command-line tool for working with data contracts.
108
+ It uses data contract YAML files as [Data Contract Specification](https://datacontract.com/) or [ODCS](https://bitol-io.github.io/open-data-contract-standard/latest/) to lint the data contract, connect to data sources and execute schema and quality tests, detect breaking changes, and export to different formats. The tool is written in Python. It can be used as a standalone CLI tool, in a CI/CD pipeline, or directly as a Python library.
109
+
110
+ ![Main features of the Data Contract CLI](datacontractcli.png)
111
+
112
+
113
+ ## Getting started
114
+
115
+ Let's look at this data contract:
116
+ [https://datacontract.com/examples/orders-latest/datacontract.yaml](https://datacontract.com/examples/orders-latest/datacontract.yaml)
117
+
118
+ We have a _servers_ section with endpoint details for the S3 bucket, _models_ for the structure of the data, and _servicelevels_ and _quality_ attributes that describe the expected freshness and number of rows.
119
+
120
+ This data contract contains all information to connect to S3 and check that the actual data meets the defined schema and quality requirements. We can use this information to test if the actual data product in S3 is compliant with the data contract.
121
+
122
+ Let's use [uv](https://docs.astral.sh/uv/) to install the CLI (or use the [Docker image](#docker)),
123
+ ```bash
124
+ $ uv tool install --python python3.11 'datacontract-cli[all]'
125
+ ```
126
+
127
+
128
+ now, let's run the tests:
129
+
130
+ ```bash
131
+ $ datacontract test https://datacontract.com/examples/orders-latest/datacontract.yaml
132
+
133
+ # returns:
134
+ Testing https://datacontract.com/examples/orders-latest/datacontract.yaml
135
+ ╭────────┬─────────────────────────────────────────────────────────────────────┬───────────────────────────────┬─────────╮
136
+ │ Result │ Check │ Field │ Details │
137
+ ├────────┼─────────────────────────────────────────────────────────────────────┼───────────────────────────────┼─────────┤
138
+ │ passed │ Check that JSON has valid schema │ orders │ │
139
+ │ passed │ Check that JSON has valid schema │ line_items │ │
140
+ │ passed │ Check that field order_id is present │ orders │ │
141
+ │ passed │ Check that field order_timestamp is present │ orders │ │
142
+ │ passed │ Check that field order_total is present │ orders │ │
143
+ │ passed │ Check that field customer_id is present │ orders │ │
144
+ │ passed │ Check that field customer_email_address is present │ orders │ │
145
+ │ passed │ row_count >= 5000 │ orders │ │
146
+ │ passed │ Check that required field order_id has no null values │ orders.order_id │ │
147
+ │ passed │ Check that unique field order_id has no duplicate values │ orders.order_id │ │
148
+ │ passed │ duplicate_count(order_id) = 0 │ orders.order_id │ │
149
+ │ passed │ Check that required field order_timestamp has no null values │ orders.order_timestamp │ │
150
+ │ passed │ freshness(order_timestamp) < 24h │ orders.order_timestamp │ │
151
+ │ passed │ Check that required field order_total has no null values │ orders.order_total │ │
152
+ │ passed │ Check that required field customer_email_address has no null values │ orders.customer_email_address │ │
153
+ │ passed │ Check that field lines_item_id is present │ line_items │ │
154
+ │ passed │ Check that field order_id is present │ line_items │ │
155
+ │ passed │ Check that field sku is present │ line_items │ │
156
+ │ passed │ values in (order_id) must exist in orders (order_id) │ line_items.order_id │ │
157
+ │ passed │ row_count >= 5000 │ line_items │ │
158
+ │ passed │ Check that required field lines_item_id has no null values │ line_items.lines_item_id │ │
159
+ │ passed │ Check that unique field lines_item_id has no duplicate values │ line_items.lines_item_id │ │
160
+ ╰────────┴─────────────────────────────────────────────────────────────────────┴───────────────────────────────┴─────────╯
161
+ 🟢 data contract is valid. Run 22 checks. Took 6.739514 seconds.
162
+ ```
163
+
164
+ Voilà, the CLI tested that the _datacontract.yaml_ itself is valid, all records comply with the schema, and all quality attributes are met.
165
+
166
+ We can also use the datacontract.yaml to export in many [formats](#format), e.g., to generate a SQL DDL:
167
+
168
+ ```bash
169
+ $ datacontract export --format sql https://datacontract.com/examples/orders-latest/datacontract.yaml
170
+
171
+ # returns:
172
+ -- Data Contract: urn:datacontract:checkout:orders-latest
173
+ -- SQL Dialect: snowflake
174
+ CREATE TABLE orders (
175
+ order_id TEXT not null primary key,
176
+ order_timestamp TIMESTAMP_TZ not null,
177
+ order_total NUMBER not null,
178
+ customer_id TEXT,
179
+ customer_email_address TEXT not null,
180
+ processed_timestamp TIMESTAMP_TZ not null
181
+ );
182
+ CREATE TABLE line_items (
183
+ lines_item_id TEXT not null primary key,
184
+ order_id TEXT,
185
+ sku TEXT
186
+ );
187
+ ```
188
+
189
+ Or generate an HTML export:
190
+
191
+ ```bash
192
+ $ datacontract export --format html https://datacontract.com/examples/orders-latest/datacontract.yaml > datacontract.html
193
+ ```
194
+
195
+ which will create this [HTML export](https://datacontract.com/examples/orders-latest/datacontract.html).
196
+
197
+
198
+ ## Usage
199
+
200
+ ```bash
201
+ # create a new data contract from example and write it to datacontract.yaml
202
+ $ datacontract init datacontract.yaml
203
+
204
+ # lint the datacontract.yaml
205
+ $ datacontract lint datacontract.yaml
206
+
207
+ # execute schema and quality checks (define credentials as environment variables)
208
+ $ datacontract test datacontract.yaml
209
+
210
+ # export data contract as html (other formats: avro, dbt, dbt-sources, dbt-staging-sql, jsonschema, odcs, rdf, sql, sodacl, terraform, ...)
211
+ $ datacontract export --format html datacontract.yaml --output datacontract.html
212
+
213
+ # export data contract to ODCS
214
+ $ datacontract export --format odcs datacontract.yaml --output odcs.yaml
215
+
216
+ # import ODCS to data contract
217
+ $ datacontract import --format odcs odcs.yaml --output datacontract.yaml
218
+
219
+ # import sql (other formats: avro, glue, bigquery, jsonschema, excel ...)
220
+ $ datacontract import --format sql --source my-ddl.sql --dialect postgres --output datacontract.yaml
221
+
222
+ # import from Excel template
223
+ $ datacontract import --format excel --source odcs.xlsx --output datacontract.yaml
224
+
225
+ # export to Excel template
226
+ $ datacontract export --format excel --output odcs.xlsx datacontract.yaml
227
+
228
+ # find differences between two data contracts
229
+ $ datacontract diff datacontract-v1.yaml datacontract-v2.yaml
230
+
231
+ # find differences between two data contracts categorized into error, warning, and info.
232
+ $ datacontract changelog datacontract-v1.yaml datacontract-v2.yaml
233
+
234
+ # fail pipeline on breaking changes. Uses changelog internally and shows only errors and warnings.
235
+ $ datacontract breaking datacontract-v1.yaml datacontract-v2.yaml
236
+ ```
237
+
238
+ ## Programmatic (Python)
239
+ ```python
240
+ from datacontract.data_contract import DataContract
241
+
242
+ data_contract = DataContract(data_contract_file="datacontract.yaml")
243
+ run = data_contract.test()
244
+ if not run.has_passed():
245
+     print("Data quality validation failed.")
246
+     # Abort pipeline, alert, or take corrective actions...
247
+ ```
248
+
249
+ ## How to
250
+
251
+ - [How to integrate Data Contract CLI in your CI/CD pipeline as a GitHub Action](https://github.com/datacontract/datacontract-action/)
252
+ - [How to run the Data Contract CLI API to test data contracts with POST requests](https://cli.datacontract.com/API)
253
+ - [How to run Data Contract CLI in a Databricks pipeline](https://www.datamesh-architecture.com/howto/build-a-dataproduct-with-databricks#test-the-data-product)
254
+
255
+
256
+ ## Installation
257
+
258
+ Choose the most appropriate installation method for your needs:
259
+
260
+ ### uv
261
+
262
+ If you have [uv](https://docs.astral.sh/uv/) installed, you can run datacontract-cli directly without installing:
263
+
264
+ ```
265
+ uv run --with 'datacontract-cli[all]' datacontract --version
266
+ ```
267
+
268
+ ### pip
269
+ Python 3.10, 3.11, and 3.12 are supported. We recommend using Python 3.11.
270
+
271
+ ```bash
272
+ python3 -m pip install 'datacontract-cli[all]'
273
+ datacontract --version
274
+ ```
275
+
276
+ ### pip with venv
277
+
278
+ Typically it is better to install the application in a virtual environment for your projects:
279
+
280
+ ```bash
281
+ cd my-project
282
+ python3.11 -m venv venv
283
+ source venv/bin/activate
284
+ pip install 'datacontract-cli[all]'
285
+ datacontract --version
286
+ ```
287
+
288
+ ### pipx
289
+
290
+ pipx installs into an isolated environment.
291
+
292
+ ```bash
293
+ pipx install 'datacontract-cli[all]'
294
+ datacontract --version
295
+ ```
296
+
297
+ ### Docker
298
+
299
+ You can also use our Docker image to run the CLI tool. This is especially convenient for CI/CD pipelines.
300
+
301
+ ```bash
302
+ docker pull datacontract/cli
303
+ docker run --rm -v ${PWD}:/home/datacontract datacontract/cli
304
+ ```
305
+
306
+ You can create an alias for the Docker command to make it easier to use:
307
+
308
+ ```bash
309
+ alias datacontract='docker run --rm -v "${PWD}:/home/datacontract" datacontract/cli:latest'
310
+ ```
311
+
312
+ _Note:_ The output of the Docker command line is limited to 80 columns and may include line breaks. Don't pipe Docker output to files if you want to export code; use the `--output` option instead.
313
+
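+ For example, a hypothetical invocation that writes the export into the mounted working directory (arguments are appended to the image's `datacontract` entrypoint, as implied by the alias above):
+
+ ```bash
+ docker run --rm -v "${PWD}:/home/datacontract" datacontract/cli export --format html --output datacontract.html datacontract.yaml
+ ```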
314
+
315
+
316
+ ## Optional Dependencies (Extras)
317
+
318
+ The CLI tool defines several optional dependencies (also known as extras) that can be installed for use with specific server types.
319
+ With _all_, all server dependencies are included.
320
+
321
+ ```bash
322
+ pip install datacontract-cli[all]
323
+ ```
324
+
325
+ A list of available extras:
326
+
327
+ | Dependency | Installation Command |
328
+ |-------------------------|--------------------------------------------|
329
+ | Amazon Athena | `pip install datacontract-cli[athena]` |
330
+ | Avro Support | `pip install datacontract-cli[avro]` |
331
+ | Google BigQuery | `pip install datacontract-cli[bigquery]` |
332
+ | Databricks Integration | `pip install datacontract-cli[databricks]` |
333
+ | Iceberg | `pip install datacontract-cli[iceberg]` |
334
+ | Kafka Integration | `pip install datacontract-cli[kafka]` |
335
+ | PostgreSQL Integration | `pip install datacontract-cli[postgres]` |
336
+ | S3 Integration | `pip install datacontract-cli[s3]` |
337
+ | Snowflake Integration | `pip install datacontract-cli[snowflake]` |
338
+ | Microsoft SQL Server | `pip install datacontract-cli[sqlserver]` |
339
+ | Trino | `pip install datacontract-cli[trino]` |
340
+ | dbt | `pip install datacontract-cli[dbt]` |
341
+ | DBML | `pip install datacontract-cli[dbml]` |
342
+ | Parquet | `pip install datacontract-cli[parquet]` |
343
+ | RDF | `pip install datacontract-cli[rdf]` |
344
+ | API (run as web server) | `pip install datacontract-cli[api]` |
345
+ | protobuf | `pip install datacontract-cli[protobuf]` |
346
+
347
+
348
+ ## Documentation
349
+
350
+ Commands
351
+
352
+ - [init](#init)
353
+ - [lint](#lint)
354
+ - [test](#test)
355
+ - [export](#export)
356
+ - [import](#import)
357
+ - [breaking](#breaking)
358
+ - [changelog](#changelog)
359
+ - [diff](#diff)
360
+ - [catalog](#catalog)
361
+ - [publish](#publish)
362
+ - [api](#api)
363
+
364
+ ### init
365
+ ```
366
+
367
+ Usage: datacontract init [OPTIONS] [LOCATION]
368
+
369
+ Create an empty data contract.
370
+
371
+ ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮
372
+ │ location [LOCATION] The location of the data contract file to create. │
373
+ │ [default: datacontract.yaml] │
374
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
375
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮
376
+ │ --template TEXT URL of a template or data contract [default: None] │
377
+ │ --overwrite --no-overwrite Replace the existing datacontract.yaml │
378
+ │ [default: no-overwrite] │
379
+ │ --help Show this message and exit. │
380
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
381
+
382
+ ```
383
+
384
+ ### lint
385
+ ```
386
+
387
+ Usage: datacontract lint [OPTIONS] [LOCATION]
388
+
389
+ Validate that the datacontract.yaml is correctly formatted.
390
+
391
+ ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮
392
+ │ location [LOCATION] The location (url or path) of the data contract yaml. │
393
+ │ [default: datacontract.yaml] │
394
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
395
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮
396
+ │ --schema TEXT The location (url or path) of the Data Contract Specification │
397
+ │ JSON Schema │
398
+ │ [default: None] │
399
+ │ --output PATH Specify the file path where the test results should be written │
400
+ │ to (e.g., './test-results/TEST-datacontract.xml'). If no path is │
401
+ │ provided, the output will be printed to stdout. │
402
+ │ [default: None] │
403
+ │ --output-format [junit] The target format for the test results. [default: None] │
404
+ │ --help Show this message and exit. │
405
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
406
+
407
+ ```
408
+
409
+ ### test
410
+ ```
411
+
412
+ Usage: datacontract test [OPTIONS] [LOCATION]
413
+
414
+ Run schema and quality tests on configured servers.
415
+
416
+ ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮
417
+ │ location [LOCATION] The location (url or path) of the data contract yaml. │
418
+ │ [default: datacontract.yaml] │
419
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
420
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮
421
+ │ --schema TEXT The location (url or path) of │
422
+ │ the Data Contract Specification │
423
+ │ JSON Schema │
424
+ │ [default: None] │
425
+ │ --server TEXT The server configuration to run │
426
+ │ the schema and quality tests. │
427
+ │ Use the key of the server object │
428
+ │ in the data contract yaml file │
429
+ │ to refer to a server, e.g., │
430
+ │ `production`, or `all` for all │
431
+ │ servers (default). │
432
+ │ [default: all] │
433
+ │ --publish-test-results --no-publish-test-results Publish the results after the │
434
+ │ test │
435
+ │ [default: │
436
+ │ no-publish-test-results] │
437
+ │ --publish TEXT DEPRECATED. The url to publish │
438
+ │ the results after the test. │
439
+ │ [default: None] │
440
+ │ --output PATH Specify the file path where the │
441
+ │ test results should be written │
442
+ │ to (e.g., │
443
+ │ './test-results/TEST-datacontra… │
444
+ │ [default: None] │
445
+ │ --output-format [junit] The target format for the test │
446
+ │ results. │
447
+ │ [default: None] │
448
+ │ --logs --no-logs Print logs [default: no-logs] │
449
+ │ --ssl-verification --no-ssl-verification SSL verification when publishing │
450
+ │ the data contract. │
451
+ │ [default: ssl-verification] │
452
+ │ --help Show this message and exit. │
453
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
454
+
455
+ ```
456
+
457
+ Data Contract CLI connects to a data source and runs schema and quality tests to verify that the data contract is valid.
458
+
459
+ ```bash
460
+ $ datacontract test --server production datacontract.yaml
461
+ ```
462
+
463
+ To connect to the databases, the `server` block in the datacontract.yaml is used to set up the connection.
464
+ In addition, credentials, such as usernames and passwords, may be defined with environment variables.
465
+
466
+ The application uses different engines, based on the server `type`.
467
+ Internally, it connects with DuckDB, Spark, or a native connection and executes most tests with _soda-core_ and _fastjsonschema_.
468
+
469
+ Credentials are provided with environment variables.
470
+
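+ For example, a hypothetical S3 setup (the variable names are documented in the [s3](#S3) section below; the values are placeholders):
+
+ ```bash
+ export DATACONTRACT_S3_ACCESS_KEY_ID=AKIAXV5Q5QABCDEFGH
+ export DATACONTRACT_S3_SECRET_ACCESS_KEY=93S7LRrJcqLaaaa/XXXXXXXXXXXXX
+ datacontract test --server production datacontract.yaml
+ ```
+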
471
+ Supported server types:
472
+
473
+ - [s3](#S3)
474
+ - [athena](#athena)
475
+ - [bigquery](#bigquery)
476
+ - [azure](#azure)
477
+ - [sqlserver](#sqlserver)
478
+ - [databricks](#databricks)
479
+ - [databricks (programmatic)](#databricks-programmatic)
480
+ - [dataframe (programmatic)](#dataframe-programmatic)
481
+ - [snowflake](#snowflake)
482
+ - [kafka](#kafka)
483
+ - [postgres](#postgres)
484
+ - [trino](#trino)
485
+ - [api](#api)
486
+ - [local](#local)
487
+
488
+ Supported formats:
489
+
490
+ - parquet
491
+ - json
492
+ - csv
493
+ - delta
494
+ - iceberg (coming soon)
495
+
496
+ Feel free to create an [issue](https://github.com/datacontract/datacontract-cli/issues) if you need support for additional types and formats.
497
+
498
+ #### S3
499
+
500
+ Data Contract CLI can test data that is stored in S3 buckets or any S3-compliant endpoints in various formats.
501
+
502
+ - CSV
503
+ - JSON
504
+ - Delta
505
+ - Parquet
506
+ - Iceberg (coming soon)
507
+
508
+ ##### Examples
509
+
510
+ ###### JSON
511
+
512
+ datacontract.yaml
513
+ ```yaml
514
+ servers:
515
+ production:
516
+ type: s3
517
+ endpointUrl: https://minio.example.com # not needed with AWS S3
518
+ location: s3://bucket-name/path/*/*.json
519
+ format: json
520
+ delimiter: new_line # new_line, array, or none
521
+ ```
522
+
523
+ ###### Delta Tables
524
+
525
+ datacontract.yaml
526
+ ```yaml
527
+ servers:
528
+ production:
529
+ type: s3
530
+ endpointUrl: https://minio.example.com # not needed with AWS S3
531
+ location: s3://bucket-name/path/table.delta # path to the Delta table folder containing parquet data files and the _delta_log
532
+ format: delta
533
+ ```
534
+
535
+ ##### Environment Variables
536
+
537
+ | Environment Variable | Example | Description |
538
+ |-------------------------------------|---------------------------------|----------------------------------------|
539
+ | `DATACONTRACT_S3_REGION` | `eu-central-1` | Region of S3 bucket |
540
+ | `DATACONTRACT_S3_ACCESS_KEY_ID` | `AKIAXV5Q5QABCDEFGH` | AWS Access Key ID |
541
+ | `DATACONTRACT_S3_SECRET_ACCESS_KEY` | `93S7LRrJcqLaaaa/XXXXXXXXXXXXX` | AWS Secret Access Key |
542
+ | `DATACONTRACT_S3_SESSION_TOKEN` | `AQoDYXdzEJr...` | AWS temporary session token (optional) |
543
+
544
+
545
+ #### Athena
546
+
547
+ Data Contract CLI can test data in AWS Athena stored in S3.
548
+ Supports different file formats, such as Iceberg, Parquet, JSON, CSV...
549
+
550
+ ##### Example
551
+
552
+ datacontract.yaml
553
+ ```yaml
554
+ servers:
555
+ athena:
556
+ type: athena
557
+ catalog: awsdatacatalog # awsdatacatalog is the default setting
558
+ schema: icebergdemodb # in Athena, this is called "database"
559
+ regionName: eu-central-1
560
+ stagingDir: s3://my-bucket/athena-results/
561
+ models:
562
+ my_table: # corresponds to a table or view name
563
+ type: table
564
+ fields:
565
+ my_column_1: # corresponds to a column
566
+ type: string
567
+ config:
568
+ physicalType: varchar
569
+ ```
570
+
571
+ ##### Environment Variables
572
+
573
+ | Environment Variable | Example | Description |
574
+ |-------------------------------------|---------------------------------|----------------------------------------|
575
+ | `DATACONTRACT_S3_REGION` | `eu-central-1` | Region of Athena service |
576
+ | `DATACONTRACT_S3_ACCESS_KEY_ID` | `AKIAXV5Q5QABCDEFGH` | AWS Access Key ID |
577
+ | `DATACONTRACT_S3_SECRET_ACCESS_KEY` | `93S7LRrJcqLaaaa/XXXXXXXXXXXXX` | AWS Secret Access Key |
578
+ | `DATACONTRACT_S3_SESSION_TOKEN` | `AQoDYXdzEJr...` | AWS temporary session token (optional) |
579
+
580
+
581
+ #### Google Cloud Storage (GCS)
582
+
583
+ The [S3](#S3) integration also works with files on Google Cloud Storage through its [interoperability](https://cloud.google.com/storage/docs/interoperability).
584
+ Use `https://storage.googleapis.com` as the endpoint URL.
585
+
586
+ ##### Example
587
+
588
+ datacontract.yaml
589
+ ```yaml
590
+ servers:
591
+ production:
592
+ type: s3
593
+ endpointUrl: https://storage.googleapis.com
594
+ location: s3://bucket-name/path/*/*.json # use s3:// schema instead of gs://
595
+ format: json
596
+ delimiter: new_line # new_line, array, or none
597
+ ```
598
+
599
+ ##### Environment Variables
600
+
601
+ | Environment Variable | Example | Description |
602
+ |-------------------------------------|----------------|------------------------------------------------------------------------------------------|
603
+ | `DATACONTRACT_S3_ACCESS_KEY_ID` | `GOOG1EZZZ...` | The GCS [HMAC Key](https://cloud.google.com/storage/docs/authentication/hmackeys) Key ID |
604
+ | `DATACONTRACT_S3_SECRET_ACCESS_KEY` | `PDWWpb...` | The GCS [HMAC Key](https://cloud.google.com/storage/docs/authentication/hmackeys) Secret |
605
+
606
+
607
+ #### BigQuery
608
+
609
+ We support authentication to BigQuery using a Service Account Key. The Service Account should include the following roles:
610
+ * BigQuery Job User
611
+ * BigQuery Data Viewer
612
+
613
+
614
+ ##### Example
615
+
616
+ datacontract.yaml
617
+ ```yaml
618
+ servers:
619
+ production:
620
+ type: bigquery
621
+ project: datameshexample-product
622
+ dataset: datacontract_cli_test_dataset
623
+ models:
624
+ datacontract_cli_test_table: # corresponds to a BigQuery table
625
+ type: table
626
+ fields: ...
627
+ ```
628
+
629
+ ##### Environment Variables
630
+
631
+ | Environment Variable | Example | Description |
632
+ |----------------------------------------------|---------------------------|---------------------------------------------------------|
633
+ | `DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH` | `~/service-access-key.json` | Service Access key as saved on key creation by BigQuery. If this environment variable isn't set, the cli tries to use `GOOGLE_APPLICATION_CREDENTIALS` as a fallback, so if you have that set for using their Python library anyway, it should work seamlessly. |
634
+
635
+
636
+ #### Azure
637
+
638
+ Data Contract CLI can test data that is stored in Azure Blob storage or Azure Data Lake Storage (Gen2) (ADLS) in various formats.
639
+
640
+ ##### Example
641
+
642
+ datacontract.yaml
643
+ ```yaml
644
+ servers:
645
+ production:
646
+ type: azure
647
+ storageAccount: datameshdatabricksdemo
648
+ location: abfss://dataproducts/inventory_events/*.parquet
649
+ format: parquet
650
+ ```
651
+
652
+ ##### Environment Variables
653
+
654
+ Authentication works with an Azure Service Principal (SPN), i.e., an App Registration with a secret.
655
+
656
+ | Environment Variable | Example | Description |
657
+ |------------------------------------|----------------------------------------|------------------------------------------------------|
658
+ | `DATACONTRACT_AZURE_TENANT_ID` | `79f5b80f-10ff-40b9-9d1f-774b42d605fc` | The Azure Tenant ID |
659
+ | `DATACONTRACT_AZURE_CLIENT_ID` | `3cf7ce49-e2e9-4cbc-a922-4328d4a58622` | The ApplicationID / ClientID of the app registration |
660
+ | `DATACONTRACT_AZURE_CLIENT_SECRET` | `yZK8Q~GWO1MMXXXXXXXXXXXXX` | The Client Secret value |
661
+
662
+
663
+
664
+ #### Sqlserver
665
+
666
+ Data Contract CLI can test data in MS SQL Server (including Azure SQL, Synapse Analytics SQL Pool).
667
+
668
+ ##### Example
669
+
670
+ datacontract.yaml
671
+ ```yaml
672
+ servers:
673
+ production:
674
+ type: sqlserver
675
+ host: localhost
676
+ port: 5432
677
+ database: tempdb
678
+ schema: dbo
679
+ driver: ODBC Driver 18 for SQL Server
680
+ models:
681
+ my_table_1: # corresponds to a table
682
+ type: table
683
+ fields:
684
+ my_column_1: # corresponds to a column
685
+ type: varchar
686
+ ```
687
+
688
+ ##### Environment Variables
689
+
690
+ | Environment Variable | Example| Description |
691
+ |---------------------------------------------------|--------|----------------------------------------------|
692
+ | `DATACONTRACT_SQLSERVER_USERNAME` | `root` | Username |
693
+ | `DATACONTRACT_SQLSERVER_PASSWORD` | `toor` | Password |
694
+ | `DATACONTRACT_SQLSERVER_TRUSTED_CONNECTION` | `True` | Use Windows authentication instead of login |
695
+ | `DATACONTRACT_SQLSERVER_TRUST_SERVER_CERTIFICATE` | `True` | Trust self-signed certificate |
696
+ | `DATACONTRACT_SQLSERVER_ENCRYPTED_CONNECTION` | `True` | Use SSL |
697
+
698
+
699
+
700
+
701
+ #### Databricks
702
+
703
+ Works with Unity Catalog and Hive metastore.
704
+
705
+ Needs a running SQL warehouse or compute cluster.
706
+
707
+ ##### Example
708
+
709
+ datacontract.yaml
710
+ ```yaml
711
+ servers:
712
+ production:
713
+ type: databricks
714
+ catalog: acme_catalog_prod
715
+ schema: orders_latest
716
+ models:
717
+ orders: # corresponds to a table
718
+ type: table
719
+ fields: ...
720
+ ```
721
+
722
+ ##### Environment Variables
723
+
724
+ | Environment Variable | Example | Description |
725
+ |-------------------------------------------|--------------------------------------|-----------------------------------------------------------|
726
+ | `DATACONTRACT_DATABRICKS_TOKEN` | `dapia00000000000000000000000000000` | The personal access token to authenticate |
727
+ | `DATACONTRACT_DATABRICKS_HTTP_PATH` | `/sql/1.0/warehouses/b053a3ffffffff` | The HTTP path to the SQL warehouse or compute cluster |
728
+ | `DATACONTRACT_DATABRICKS_SERVER_HOSTNAME` | `dbc-abcdefgh-1234.cloud.databricks.com` | The host name of the SQL warehouse or compute cluster |
729
+
730
+
731
+ #### Databricks (programmatic)
732
+
733
+ Works with Unity Catalog and Hive metastore.
734
+ When running in a notebook or pipeline, the provided `spark` session can be used.
735
+ Additional authentication is not required.
736
+
737
+ Requires a Databricks Runtime with Python >= 3.10.
738
+
739
+ ##### Example
740
+
741
+ datacontract.yaml
742
+ ```yaml
743
+ servers:
744
+ production:
745
+ type: databricks
746
+ host: dbc-abcdefgh-1234.cloud.databricks.com # ignored, always use current host
747
+ catalog: acme_catalog_prod
748
+ schema: orders_latest
749
+ models:
750
+ orders: # corresponds to a table
751
+ type: table
752
+ fields: ...
753
+ ```
754
+
755
+ ##### Installing on Databricks Compute
756
+
757
+ **Important:** When using Databricks LTS ML runtimes (15.4, 16.4), installing via `%pip install` in notebooks can cause issues.
758
+
759
+ **Recommended approach:** Use Databricks' native library management instead:
760
+
761
+ 1. **Create or configure your compute cluster:**
762
+ - Navigate to **Compute** in the Databricks workspace
763
+ - Create a new cluster or select an existing one
764
+ - Go to the **Libraries** tab
765
+
766
+ 2. **Add the datacontract-cli library:**
767
+ - Click **Install new**
768
+ - Select **PyPI** as the library source
769
+ - Enter package name: `datacontract-cli[databricks]`
770
+ - Click **Install**
771
+
772
+ 3. **Restart the cluster** to apply the library installation
773
+
774
+ 4. **Use in your notebook** without additional installation:
775
+ ```python
776
+ from datacontract.data_contract import DataContract
777
+
778
+ data_contract = DataContract(
779
+ data_contract_file="/Volumes/acme_catalog_prod/orders_latest/datacontract/datacontract.yaml",
780
+ spark=spark)
781
+ run = data_contract.test()
782
+ run.result
783
+ ```
784
+
785
+ Databricks' library management properly resolves dependencies during cluster initialization, rather than at runtime in the notebook.
786
+
787
+ #### Dataframe (programmatic)
788
+
789
+ Works with Spark DataFrames.
790
+ DataFrames need to be created as named temporary views.
791
+ Multiple temporary views are supported if your data contract contains multiple models.
792
+
793
+ Testing DataFrames is useful for validating your datasets in a pipeline before writing them to a data source.
794
+
795
+ ##### Example
796
+
797
+ datacontract.yaml
798
+ ```yaml
799
+ servers:
800
+ production:
801
+ type: dataframe
802
+ models:
803
+ my_table: # corresponds to a temporary view
804
+ type: table
805
+ fields: ...
806
+ ```
807
+
808
+ Example code
809
+ ```python
810
+ from datacontract.data_contract import DataContract
811
+
812
+ df.createOrReplaceTempView("my_table")
813
+
814
+ data_contract = DataContract(
815
+ data_contract_file="datacontract.yaml",
816
+ spark=spark,
817
+ )
818
+ run = data_contract.test()
819
+ assert run.result == "passed"
820
+ ```
821
+
822
+
823
+ #### Snowflake
824
+
825
+ Data Contract CLI can test data in Snowflake.
826
+
827
+ ##### Example
828
+
829
+ datacontract.yaml
830
+ ```yaml
831
+
832
+ servers:
833
+ snowflake:
834
+ type: snowflake
835
+ account: abcdefg-xn12345
836
+ database: ORDER_DB
837
+ schema: ORDERS_PII_V2
838
+ models:
839
+ my_table_1: # corresponds to a table
840
+ type: table
841
+ fields:
842
+ my_column_1: # corresponds to a column
843
+ type: varchar
844
+ ```
845
+
846
+ ##### Environment Variables
847
+ All [parameters supported by Soda](https://docs.soda.io/soda/connect-snowflake.html), uppercased and prefixed with `DATACONTRACT_SNOWFLAKE_`.
848
+ For example:
849
+
850
+ | Soda parameter | Environment Variable |
851
+ |----------------------|---------------------------------------------|
852
+ | `username` | `DATACONTRACT_SNOWFLAKE_USERNAME` |
853
+ | `password` | `DATACONTRACT_SNOWFLAKE_PASSWORD` |
854
+ | `warehouse` | `DATACONTRACT_SNOWFLAKE_WAREHOUSE` |
855
+ | `role` | `DATACONTRACT_SNOWFLAKE_ROLE` |
856
+ | `connection_timeout` | `DATACONTRACT_SNOWFLAKE_CONNECTION_TIMEOUT` |
857
+
858
+ Beware that the parameters:
859
+ * `account`
860
+ * `database`
861
+ * `schema`
862
+
863
+ are obtained from the `servers` section of the YAML file.
864
+ E.g. from the example above:
865
+ ```yaml
866
+ servers:
867
+ snowflake:
868
+ account: abcdefg-xn12345
869
+ database: ORDER_DB
870
+ schema: ORDERS_PII_V2
871
+ ```
872
+
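+ A hypothetical example of setting the prefixed variables before running the tests (values are placeholders):
+
+ ```bash
+ export DATACONTRACT_SNOWFLAKE_USERNAME=my_user
+ export DATACONTRACT_SNOWFLAKE_PASSWORD=my_password
+ export DATACONTRACT_SNOWFLAKE_WAREHOUSE=COMPUTE_WH
+ export DATACONTRACT_SNOWFLAKE_ROLE=ANALYST
+ datacontract test --server snowflake datacontract.yaml
+ ```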
873
+
874
+ #### Kafka
875
+
876
+ Kafka support is currently considered experimental.
877
+
878
+ ##### Example
879
+
880
+ datacontract.yaml
881
+ ```yaml
882
+ servers:
883
+ production:
884
+ type: kafka
885
+ host: abc-12345.eu-central-1.aws.confluent.cloud:9092
886
+ topic: my-topic-name
887
+ format: json
888
+ ```
889
+
890
+ ##### Environment Variables
891
+
892
+ | Environment Variable | Example | Description |
893
+ |-------------------------------------|---------|----------------------------------------------------------------------------------|
894
+ | `DATACONTRACT_KAFKA_SASL_USERNAME` | `xxx` | The SASL username (key). |
895
+ | `DATACONTRACT_KAFKA_SASL_PASSWORD` | `xxx` | The SASL password (secret). |
896
+ | `DATACONTRACT_KAFKA_SASL_MECHANISM` | `PLAIN` | Default `PLAIN`. Other supported mechanisms: `SCRAM-SHA-256` and `SCRAM-SHA-512` |
897
+
898
+
899
+ #### Postgres
900
+
901
+ Data Contract CLI can test data in Postgres or Postgres-compliant databases (e.g., RisingWave).
902
+
903
+ ##### Example
904
+
905
+ datacontract.yaml
906
+ ```yaml
907
+ servers:
908
+ postgres:
909
+ type: postgres
910
+ host: localhost
911
+ port: 5432
912
+ database: postgres
913
+ schema: public
914
+ models:
915
+ my_table_1: # corresponds to a table
916
+ type: table
917
+ fields:
918
+ my_column_1: # corresponds to a column
919
+ type: varchar
920
+ ```
921
+
922
+ ##### Environment Variables
923
+
924
+ | Environment Variable | Example | Description |
925
+ |----------------------------------|--------------------|-------------|
926
+ | `DATACONTRACT_POSTGRES_USERNAME` | `postgres` | Username |
927
+ | `DATACONTRACT_POSTGRES_PASSWORD` | `mysecretpassword` | Password |
928
+
929
+
930
+ #### Trino
931
+
932
+ Data Contract CLI can test data in Trino.
933
+
934
+ ##### Example
935
+
936
+ datacontract.yaml
937
+ ```yaml
938
+ servers:
939
+ trino:
940
+ type: trino
941
+ host: localhost
942
+ port: 8080
943
+ catalog: my_catalog
944
+ schema: my_schema
945
+ models:
946
+ my_table_1: # corresponds to a table
947
+ type: table
948
+ fields:
949
+ my_column_1: # corresponds to a column
950
+ type: varchar
951
+ my_column_2: # corresponds to a column with custom trino type
952
+ type: object
953
+ config:
954
+ trinoType: row(en_us varchar, pt_br varchar)
955
+ ```
956
+
957
+ ##### Environment Variables
958
+
959
+ | Environment Variable | Example | Description |
960
+ |-------------------------------|--------------------|-------------|
961
+ | `DATACONTRACT_TRINO_USERNAME` | `trino` | Username |
962
+ | `DATACONTRACT_TRINO_PASSWORD` | `mysecretpassword` | Password |
963
+
964
+
965
+ #### API
966
+
967
+ Data Contract CLI can test APIs that return data in JSON format.
968
+ Currently, only GET requests are supported.
969
+
970
+ ##### Example
971
+
972
+ datacontract.yaml
973
+ ```yaml
974
+ servers:
975
+ api:
976
+ type: "api"
977
+ location: "https://api.example.com/path"
978
+ delimiter: none # new_line, array, or none (default)
979
+
980
+ models:
981
+ my_object: # corresponds to the root element of the JSON response
982
+ type: object
983
+ fields:
984
+ field1:
985
+ type: string
986
+ fields2:
987
+ type: number
988
+ ```
989
+
990
+ ##### Environment Variables
991
+
992
+ | Environment Variable | Example | Description |
993
+ |-----------------------------------------|------------------|---------------------------------------------------|
994
+ | `DATACONTRACT_API_HEADER_AUTHORIZATION` | `Bearer <token>` | The value for the `authorization` header. Optional. |
995
+
996
+
997
+ #### Local
998
+
999
+ Data Contract CLI can test local files in parquet, json, csv, or delta format.
1000
+
1001
+ ##### Example
1002
+
1003
+ datacontract.yaml
1004
+ ```yaml
1005
+ servers:
1006
+ local:
1007
+ type: local
1008
+ path: ./*.parquet
1009
+ format: parquet
1010
+ models:
1011
+ my_table_1: # corresponds to a table
1012
+ type: table
1013
+ fields:
1014
+ my_column_1: # corresponds to a column
1015
+ type: varchar
1016
+ my_column_2: # corresponds to a column
1017
+ type: string
1018
+ ```
1019
+
1020
+
1021
+ ### export
1022
+ ```
1023
+
1024
+ Usage: datacontract export [OPTIONS] [LOCATION]
1025
+
1026
+ Convert data contract to a specific format. Saves to file specified by `output` option if present,
1027
+ otherwise prints to stdout.
1028
+
1029
+ ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮
1030
+ │ location [LOCATION] The location (url or path) of the data contract yaml. │
1031
+ │ [default: datacontract.yaml] │
1032
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
1033
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮
1034
+ │ * --format [jsonschema|pydantic-model|sodacl|db The export format. [default: None] │
1035
+ │ t|dbt-sources|dbt-staging-sql|odcs|r [required] │
1036
+ │ df|avro|protobuf|great-expectations| │
1037
+ │ terraform|avro-idl|sql|sql-query|mer │
1038
+ │ maid|html|go|bigquery|dbml|spark|sql │
1039
+ │ alchemy|data-caterer|dcs|markdown|ic │
1040
+ │ eberg|custom|excel|dqx] │
1041
+ │ --output PATH Specify the file path where the │
1042
+ │ exported data will be saved. If no │
1043
+ │ path is provided, the output will be │
1044
+ │ printed to stdout. │
1045
+ │ [default: None] │
1046
+ │ --server TEXT The server name to export. │
1047
+ │ [default: None] │
1048
+ │ --model TEXT Use the key of the model in the data │
1049
+ │ contract yaml file to refer to a │
1050
+ │ model, e.g., `orders`, or `all` for │
1051
+ │ all models (default). │
1052
+ │ [default: all] │
1053
+ │ --schema TEXT The location (url or path) of the │
1054
+ │ Data Contract Specification JSON │
1055
+ │ Schema │
1056
+ │ [default: None] │
1057
+ │ --engine TEXT [engine] The engine used for great │
1058
+ │ expection run. │
1059
+ │ [default: None] │
1060
+ │ --template PATH The file path or URL of a template. │
1061
+ │ For Excel format: path/URL to custom │
1062
+ │ Excel template. For custom format: │
1063
+ │ path to Jinja template. │
1064
+ │ [default: None] │
1065
+ │ --help Show this message and exit. │
1066
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
1067
+ ╭─ RDF Options ────────────────────────────────────────────────────────────────────────────────────╮
1068
+ │ --rdf-base TEXT [rdf] The base URI used to generate the RDF graph. [default: None] │
1069
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
1070
+ ╭─ SQL Options ────────────────────────────────────────────────────────────────────────────────────╮
1071
+ │ --sql-server-type TEXT [sql] The server type to determine the sql dialect. By default, │
1072
+ │ it uses 'auto' to automatically detect the sql dialect via the │
1073
+ │ specified servers in the data contract. │
1074
+ │ [default: auto] │
1075
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
1076
+
1077
+ ```
1078
+
1079
+ ```bash
1080
+ # Example export data contract as HTML
1081
+ datacontract export --format html --output datacontract.html
1082
+ ```
1083
+
1084
+ Available export options:
1085
+
1086
+ | Type | Description | Status |
1087
+ |----------------------|---------------------------------------------------------|---------|
1088
+ | `html` | Export to HTML | ✅ |
1089
+ | `jsonschema` | Export to JSON Schema | ✅ |
1090
+ | `odcs` | Export to Open Data Contract Standard (ODCS) V3 | ✅ |
1091
+ | `sodacl` | Export to SodaCL quality checks in YAML format | ✅ |
1092
+ | `dbt` | Export to dbt models in YAML format | ✅ |
1093
+ | `dbt-sources` | Export to dbt sources in YAML format | ✅ |
1094
+ | `dbt-staging-sql` | Export to dbt staging SQL models | ✅ |
1095
+ | `rdf` | Export data contract to RDF representation in N3 format | ✅ |
1096
+ | `avro` | Export to AVRO models | ✅ |
1097
+ | `protobuf` | Export to Protobuf | ✅ |
1098
+ | `terraform` | Export to terraform resources | ✅ |
1099
+ | `sql` | Export to SQL DDL | ✅ |
1100
+ | `sql-query` | Export to SQL Query | ✅ |
1101
+ | `great-expectations` | Export to Great Expectations Suites in JSON Format | ✅ |
1102
+ | `bigquery` | Export to BigQuery Schemas | ✅ |
1103
+ | `go` | Export to Go types | ✅ |
1104
+ | `pydantic-model` | Export to pydantic models | ✅ |
1105
+ | `DBML` | Export to a DBML Diagram description | ✅ |
1106
+ | `spark` | Export to a Spark StructType | ✅ |
1107
+ | `sqlalchemy` | Export to SQLAlchemy Models | ✅ |
1108
+ | `data-caterer` | Export to Data Caterer in YAML format | ✅ |
1109
+ | `dcs` | Export to Data Contract Specification in YAML format | ✅ |
1110
+ | `markdown` | Export to Markdown | ✅ |
1111
+ | `iceberg` | Export to an Iceberg JSON Schema Definition | partial |
1112
+ | `excel` | Export to ODCS Excel Template | ✅ |
1113
+ | `custom` | Export to Custom format with Jinja | ✅ |
1114
+ | `dqx` | Export to DQX in YAML format | ✅ |
1115
+ | Missing something? | Please create an issue on GitHub | TBD |
1116
+
1117
+ #### SQL
1118
+
1119
+ The `export` function converts a given data contract into a SQL data definition language (DDL).
1120
+
1121
+ ```shell
1122
+ datacontract export datacontract.yaml --format sql --output output.sql
1123
+ ```
1124
+
1125
+ If you are using Databricks and an error is thrown when trying to deploy the SQL DDLs with `variant` columns, set the following property.
1126
+
1127
+ ```shell
1128
+ spark.conf.set("spark.databricks.delta.schema.typeCheck.enabled", "false")
1129
+ ```
1130
+
1131
+ #### Great Expectations
1132
+
1133
+ The `export` function transforms a specified data contract into a comprehensive Great Expectations JSON suite.
1134
+ If the contract includes multiple models, you need to specify the name of the model you wish to export.
1135
+
1136
+ ```shell
1137
+ datacontract export datacontract.yaml --format great-expectations --model orders
1138
+ ```
1139
+
1140
+ The export creates a list of expectations by utilizing:
1141
+
1142
+ - The data from the Model definition with a fixed mapping
1143
+ - The expectations provided in the quality field for each model (see the [Great Expectations Gallery](https://greatexpectations.io/expectations/))
1144
+
1145
+ ##### Additional Arguments
1146
+
1147
+ To further customize the export, the following optional arguments are available:
1148
+
1149
+ - **`suite_name`**: The name of the expectation suite. This suite groups all generated expectations and provides a convenient identifier within Great Expectations. If not provided, a default suite name will be generated based on the model name(s).
1150
+
1151
+ - **`engine`**: Specifies the engine used to run Great Expectations checks. Accepted values are:
1152
+ - `pandas` — Use this when working with in-memory data frames through the Pandas library.
1153
+ - `spark` — Use this for working with Spark dataframes.
1154
+ - `sql` — Use this for working with SQL databases.
1155
+
1156
+ - **`sql_server_type`**: Specifies the type of SQL server to connect with when `engine` is set to `sql`.
1157
+
1158
+ Providing `sql_server_type` ensures that the appropriate SQL dialect and connection settings are applied during the expectation validation.
1159
+
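+ For example, a hypothetical invocation (`orders` and `postgres` are placeholder values):
+
+ ```bash
+ datacontract export --format great-expectations --model orders --engine sql --sql-server-type postgres datacontract.yaml
+ ```
+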
1160
+ #### RDF
1161
+
1162
+ The `export` function converts a given data contract into an RDF representation. You have the option to
1163
+ add a base URL (`--rdf-base`) which will be used as the default prefix to resolve relative IRIs inside the document.
1164
+
1165
+ ```shell
1166
+ datacontract export --format rdf --rdf-base https://www.example.com/ datacontract.yaml
1167
+ ```
1168
+
1169
+ The data contract is mapped onto the following concepts of a yet-to-be-defined Data Contract
1170
+ Ontology named https://datacontract.com/DataContractSpecification/ :
1171
+ - DataContract
1172
+ - Server
1173
+ - Model
1174
+
1175
+ Having the data contract inside an RDF graph gives us access to the following use cases:
1176
+ - Interoperability with other data contract specification formats
1177
+ - Store data contracts inside a knowledge graph
1178
+ - Enhance a semantic search to find and retrieve data contracts
1179
+ - Linking model elements to already established ontologies and knowledge
1180
+ - Using the full power of OWL to reason about the graph structure of data contracts
1181
+ - Apply graph algorithms on multiple data contracts (Find similar data contracts, find "gatekeeper"
1182
+ data products, find the true domain owner of a field attribute)
1183
+
1184
+ #### DBML
1185
+
1186
+ The export function converts the logical data types of the data contract into the specific ones of a concrete database
1187
+ if a server is selected via the `--server` option (based on the `type` of that server). If no server is selected, the
1188
+ logical data types are exported.
1189
+
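+ For example, to export DBML with the physical types of a specific server defined in the contract (`production` is an illustrative server name):
+
+ ```shell
+ datacontract export datacontract.yaml --format dbml --server production --output output.dbml
+ ```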
1190
+ #### DBT & DBT-SOURCES
1191
+
1192
+ The export function converts the data contract to dbt models in YAML format, with support for SQL dialects.
1193
+ If a server is selected via the `--server` option (based on the `type` of that server), then the dbt column `data_types` match the expected data types of the server.
1194
+ If no server is selected, then it defaults to `snowflake`.
1195
+
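+ For example (the server name `production` is illustrative and must match a server defined in the contract):
+
+ ```shell
+ # dbt models for the data provider
+ datacontract export datacontract.yaml --format dbt --server production --output models.yaml
+ # dbt sources for the data consumer
+ datacontract export datacontract.yaml --format dbt-sources --server production --output sources.yaml
+ ```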
1196
+ #### Spark
1197
+
1198
+ The export function converts the data contract specification into a Spark `StructType` schema. The returned value is the Python code representation of the model schemas.
1199
+ Spark DataFrame schemas are defined as `StructType`. For more details about Spark data types, please see [the Spark documentation](https://spark.apache.org/docs/latest/sql-ref-datatypes.html).
1200
+
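+ For example, to print the generated StructType definition for a contract:
+
+ ```shell
+ datacontract export datacontract.yaml --format spark
+ ```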
1201
+ #### Avro
1202
+
1203
+ The export function converts the data contract specification into an Avro schema. It supports specifying custom Avro properties for logicalTypes and default values.
1204
+
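+ For example, to write the Avro schema to a file (the output file name is illustrative):
+
+ ```shell
+ datacontract export datacontract.yaml --format avro --output orders.avsc
+ ```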
1205
+ ##### Custom Avro Properties
1206
+
1207
+ We support a **config map on field level**. A config map may include any additional key-value pairs and supports multiple server type bindings.
1208
+
1209
+ To specify custom Avro properties in your data contract, you can define them within the `config` section of your field definition. Below is an example of how to structure your YAML configuration to include custom Avro properties, such as `avroLogicalType` and `avroDefault`.
1210
+
1211
+ >NOTE: At the moment, we only support [logicalType](https://avro.apache.org/docs/1.11.0/spec.html#Logical+Types) and [default](https://avro.apache.org/docs/1.11.0/spec.html)
1212
+
1213
+ #### Example Configuration
1214
+
1215
+ ```yaml
1216
+ models:
1217
+ orders:
1218
+ fields:
1219
+ my_field_1:
1220
+ description: Example for AVRO with Timestamp (microsecond precision) https://avro.apache.org/docs/current/spec.html#Local+timestamp+%28microsecond+precision%29
1221
+ type: long
1222
+ example: 1672534861000000 # Equivalent to 2023-01-01 01:01:01 in microseconds
1223
+ required: true
1224
+ config:
1225
+ avroLogicalType: local-timestamp-micros
1226
+ avroDefault: 1672534861000000
1227
+ ```
1228
+
1229
+ #### Explanation
1230
+
1231
+ - **models**: The top-level key that contains different models (tables or objects) in your data contract.
1232
+ - **orders**: A specific model name. Replace this with the name of your model.
1233
+ - **fields**: The fields within the model. Each field can have various properties defined.
1234
+ - **my_field_1**: The name of a specific field. Replace this with your field name.
1235
+ - **description**: A textual description of the field.
1236
+ - **type**: The data type of the field. In this example, it is `long`.
1237
+ - **example**: An example value for the field.
1238
+ - **required**: Whether this is a required field (as opposed to optional/nullable).
1239
+ - **config**: Section to specify custom Avro properties.
1240
+ - **avroLogicalType**: Specifies the logical type of the field in Avro. In this example, it is `local-timestamp-micros`.
1241
+ - **avroDefault**: Specifies the default value for the field in Avro. In this example, it is `1672534861000000`, which corresponds to `2023-01-01 01:01:01 UTC`.
1242
+
1243
+ #### Data Caterer
1244
+
1245
+ The export function converts the data contract to a data generation task in YAML format that can be
1246
+ ingested by [Data Caterer](https://github.com/data-catering/data-caterer). This gives you the
1247
+ ability to generate production-like data in any environment based on your data contract.
1248
+
1249
+ ```shell
1250
+ datacontract export datacontract.yaml --format data-caterer --model orders
1251
+ ```
1252
+
1253
+ You can further customise the way data is generated by adding
1254
+ [additional metadata in the YAML](https://data.catering/setup/generator/data-generator/)
1255
+ to suit your needs.
1256
+
1257
+ #### Iceberg
1258
+
1259
+ Exports to an [Iceberg Table Json Schema Definition](https://iceberg.apache.org/spec/#appendix-c-json-serialization).
1260
+
1261
+ This export only supports a single model at a time because Iceberg's schema definition describes a single table and the exporter maps one model to one table. Use the `--model` flag
1262
+ to limit your contract export to a single model.
1263
+
1264
+ ```bash
1265
+ $ datacontract export --format iceberg --model orders https://datacontract.com/examples/orders-latest/datacontract.yaml --output /tmp/orders_iceberg.json
1266
+
1267
+ $ cat /tmp/orders_iceberg.json | jq '.'
1268
+ {
1269
+ "type": "struct",
1270
+ "fields": [
1271
+ {
1272
+ "id": 1,
1273
+ "name": "order_id",
1274
+ "type": "string",
1275
+ "required": true
1276
+ },
1277
+ {
1278
+ "id": 2,
1279
+ "name": "order_timestamp",
1280
+ "type": "timestamptz",
1281
+ "required": true
1282
+ },
1283
+ {
1284
+ "id": 3,
1285
+ "name": "order_total",
1286
+ "type": "long",
1287
+ "required": true
1288
+ },
1289
+ {
1290
+ "id": 4,
1291
+ "name": "customer_id",
1292
+ "type": "string",
1293
+ "required": false
1294
+ },
1295
+ {
1296
+ "id": 5,
1297
+ "name": "customer_email_address",
1298
+ "type": "string",
1299
+ "required": true
1300
+ },
1301
+ {
1302
+ "id": 6,
1303
+ "name": "processed_timestamp",
1304
+ "type": "timestamptz",
1305
+ "required": true
1306
+ }
1307
+ ],
1308
+ "schema-id": 0,
1309
+ "identifier-field-ids": [
1310
+ 1
1311
+ ]
1312
+ }
1313
+ ```
1314
+
1315
+ #### Custom
1316
+
1317
+ The export function converts the data contract specification into a custom format using Jinja. You can specify the path to a Jinja template with the `--template` argument, allowing you to output files in any format.
1318
+
1319
+ ```shell
1320
+ datacontract export --format custom --template template.txt datacontract.yaml
1321
+ ```
1322
+
1323
+ ##### Jinja variables
1324
+
1325
+ You can directly use the Data Contract Specification as template variables.
1326
+
1327
+ ```shell
1328
+ $ cat template.txt
1329
+ title: {{ data_contract.info.title }}
1330
+
1331
+ $ datacontract export --format custom --template template.txt datacontract.yaml
1332
+ title: Orders Latest
1333
+ ```
1334
+
1335
+ ##### Example Jinja Templates
1336
+
1337
+ ###### Customized dbt model
1338
+
1339
+ You can export dbt models containing any logic you need.
1340
+
1341
+ Below is an example of a dbt staging layer that converts a field of `type: timestamp` to a `DATETIME` type with time zone conversion.
1342
+
1343
+ template.sql
1344
+
1345
+ {% raw %}
1346
+ ```sql
1347
+ {%- for model_name, model in data_contract.models.items() %}
1348
+ {#- Export only the first model #}
1349
+ {%- if loop.first -%}
1350
+ SELECT
1351
+ {%- for field_name, field in model.fields.items() %}
1352
+ {%- if field.type == "timestamp" %}
1353
+ DATETIME({{ field_name }}, "Asia/Tokyo") AS {{ field_name }},
1354
+ {%- else %}
1355
+ {{ field_name }} AS {{ field_name }},
1356
+ {%- endif %}
1357
+ {%- endfor %}
1358
+ FROM
1359
+ {{ "{{" }} ref('{{ model_name }}') {{ "}}" }}
1360
+ {%- endif %}
1361
+ {%- endfor %}
1362
+ ```
1363
+ {% endraw %}
1364
+
1365
+ command
1366
+
1367
+ ```shell
1368
+ datacontract export --format custom --template template.sql --output output.sql datacontract.yaml
1369
+ ```
1370
+
1371
+ output.sql
1372
+
1373
+ ```sql
1374
+ SELECT
1375
+ order_id AS order_id,
1376
+ DATETIME(order_timestamp, "Asia/Tokyo") AS order_timestamp,
1377
+ order_total AS order_total,
1378
+ customer_id AS customer_id,
1379
+ customer_email_address AS customer_email_address,
1380
+ DATETIME(processed_timestamp, "Asia/Tokyo") AS processed_timestamp,
1381
+ FROM
1382
+ {{ ref('orders') }}
1383
+ ```
1384
+
1385
+ #### ODCS Excel Template
1386
+
1387
+ The `export` function converts a data contract into an ODCS (Open Data Contract Standard) Excel template. This creates a user-friendly Excel spreadsheet that can be used for authoring, sharing, and managing data contracts using the familiar Excel interface.
1388
+
1389
+ ```shell
1390
+ datacontract export --format excel --output datacontract.xlsx datacontract.yaml
1391
+ ```
1392
+
1393
+ The Excel format enables:
1394
+ - **User-friendly authoring**: Create and edit data contracts in Excel's familiar interface
1395
+ - **Easy sharing**: Distribute data contracts as standard Excel files
1396
+ - **Collaboration**: Enable non-technical stakeholders to contribute to data contract definitions
1397
+ - **Round-trip conversion**: Import Excel templates back to YAML data contracts
1398
+
1399
+ For more information about the Excel template structure, visit the [ODCS Excel Template repository](https://github.com/datacontract/open-data-contract-standard-excel-template).
1400
+
1401
+ ### import
1402
+ ```
1403
+
1404
+ Usage: datacontract import [OPTIONS]
1405
+
1406
+ Create a data contract from the given source location. Saves to file specified by `output` option
1407
+ if present, otherwise prints to stdout.
1408
+
1409
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮
1410
+ │ * --format [sql|avro|dbt|dbml|glue|jsonsc The format of the source file. │
1411
+ │ hema|json|bigquery|odcs|unity| [default: None] │
1412
+ │ spark|iceberg|parquet|csv|prot [required] │
1413
+ │ obuf|excel] │
1414
+ │ --output PATH Specify the file path where │
1415
+ │ the Data Contract will be │
1416
+ │ saved. If no path is provided, │
1417
+ │ the output will be printed to │
1418
+ │ stdout. │
1419
+ │ [default: None] │
1420
+ │ --source TEXT The path to the file that │
1421
+ │ should be imported. │
1422
+ │ [default: None] │
1423
+ │ --spec [datacontract_specification|od The format of the data │
1424
+ │ cs] contract to import. │
1425
+ │ [default: │
1426
+ │ datacontract_specification] │
1427
+ │ --dialect TEXT The SQL dialect to use when │
1428
+ │ importing SQL files, e.g., │
1429
+ │ postgres, tsql, bigquery. │
1430
+ │ [default: None] │
1431
+ │ --glue-table TEXT List of table ids to import │
1432
+ │ from the Glue Database (repeat │
1433
+ │ for multiple table ids, leave │
1434
+ │ empty for all tables in the │
1435
+ │ dataset). │
1436
+ │ [default: None] │
1437
+ │ --bigquery-project TEXT The bigquery project id. │
1438
+ │ [default: None] │
1439
+ │ --bigquery-dataset TEXT The bigquery dataset id. │
1440
+ │ [default: None] │
1441
+ │ --bigquery-table TEXT List of table ids to import │
1442
+ │ from the bigquery API (repeat │
1443
+ │ for multiple table ids, leave │
1444
+ │ empty for all tables in the │
1445
+ │ dataset). │
1446
+ │ [default: None] │
1447
+ │ --unity-table-full-name TEXT Full name of a table in the │
1448
+ │ unity catalog │
1449
+ │ [default: None] │
1450
+ │ --dbt-model TEXT List of models names to import │
1451
+ │ from the dbt manifest file │
1452
+ │ (repeat for multiple models │
1453
+ │ names, leave empty for all │
1454
+ │ models in the dataset). │
1455
+ │ [default: None] │
1456
+ │ --dbml-schema TEXT List of schema names to import │
1457
+ │ from the DBML file (repeat for │
1458
+ │ multiple schema names, leave │
1459
+ │ empty for all tables in the │
1460
+ │ file). │
1461
+ │ [default: None] │
1462
+ │ --dbml-table TEXT List of table names to import │
1463
+ │ from the DBML file (repeat for │
1464
+ │ multiple table names, leave │
1465
+ │ empty for all tables in the │
1466
+ │ file). │
1467
+ │ [default: None] │
1468
+ │ --iceberg-table TEXT Table name to assign to the │
1469
+ │ model created from the Iceberg │
1470
+ │ schema. │
1471
+ │ [default: None] │
1472
+ │ --template TEXT The location (url or path) of │
1473
+ │ the Data Contract │
1474
+ │ Specification Template │
1475
+ │ [default: None] │
1476
+ │ --schema TEXT The location (url or path) of │
1477
+ │ the Data Contract │
1478
+ │ Specification JSON Schema │
1479
+ │ [default: None] │
1480
+ │ --owner TEXT The owner or team responsible │
1481
+ │ for managing the data │
1482
+ │ contract. │
1483
+ │ [default: None] │
1484
+ │ --id TEXT The identifier for the the │
1485
+ │ data contract. │
1486
+ │ [default: None] │
1487
+ │ --help Show this message and exit. │
1488
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
1489
+
1490
+ ```
1491
+
1492
+ Example:
1493
+ ```bash
1494
+ # Example import from SQL DDL
1495
+ datacontract import --format sql --source my_ddl.sql --dialect postgres
1496
+ # To save to file
1497
+ datacontract import --format sql --source my_ddl.sql --dialect postgres --output datacontract.yaml
1498
+ ```
1499
+
1500
+ Available import options:
1501
+
1502
+ | Type | Description | Status |
1503
+ |--------------------|------------------------------------------------|--------|
1504
+ | `avro` | Import from AVRO schemas | ✅ |
1505
+ | `bigquery` | Import from BigQuery Schemas | ✅ |
1506
+ | `csv` | Import from CSV File | ✅ |
1507
+ | `dbml` | Import from DBML models | ✅ |
1508
+ | `dbt` | Import from dbt models | ✅ |
1509
+ | `excel` | Import from ODCS Excel Template | ✅ |
1510
+ | `glue` | Import from AWS Glue DataCatalog | ✅ |
1511
+ | `iceberg` | Import from an Iceberg JSON Schema Definition | partial |
1512
+ | `jsonschema` | Import from JSON Schemas | ✅ |
1513
+ | `odcs` | Import from Open Data Contract Standard (ODCS) | ✅ |
1514
+ | `parquet` | Import from Parquet File Metadata | ✅ |
1515
+ | `protobuf` | Import from Protobuf schemas | ✅ |
1516
+ | `spark` | Import from Spark StructTypes, Variant | ✅ |
1517
+ | `sql` | Import from SQL DDL | ✅ |
1518
+ | `unity` | Import from Databricks Unity Catalog | partial |
1519
1520
+ | Missing something? | Please create an issue on GitHub | TBD |
1521
+
1522
+
1523
+ #### ODCS
1524
+
1525
+ Import from Open Data Contract Standard (ODCS) v2 or v3.
1526
+ The importer automatically detects the ODCS version and imports the data contract.
1527
+
1528
+ Examples:
1529
+
1530
+ ```bash
1531
+ # Example import from ODCS
1532
+ datacontract import --format odcs --source my_data_contract.odcs.yaml
1533
+ ```
1534
+
1535
+ #### BigQuery
1536
+
1537
+ BigQuery data can either be imported from JSON files generated from the table descriptions or directly from the BigQuery API. If you want to use JSON files, specify the `source` parameter with a path to the JSON file.
1538
+
1539
+ To import from the BigQuery API, you have to _omit_ `source` and instead provide `bigquery-project` and `bigquery-dataset`. Additionally, you may specify `bigquery-table` to enumerate the tables that should be imported. If no tables are given, _all_ available tables of the dataset will be imported.
1540
+
1541
+ For providing authentication to the client, please see [the Google documentation](https://cloud.google.com/docs/authentication/provide-credentials-adc#how-to) or the one [about authorizing client libraries](https://cloud.google.com/bigquery/docs/authentication#client-libs).
1542
+
1543
+ Examples:
1544
+
1545
+ ```bash
1546
+ # Example import from Bigquery JSON
1547
+ datacontract import --format bigquery --source my_bigquery_table.json
1548
+ ```
1549
+
1550
+ ```bash
1551
+ # Example import from Bigquery API with specifying the tables to import
1552
+ datacontract import --format bigquery --bigquery-project <project_id> --bigquery-dataset <dataset_id> --bigquery-table <tableid_1> --bigquery-table <tableid_2> --bigquery-table <tableid_3>
1553
+ ```
1554
+
1555
+ ```bash
1556
+ # Example import from Bigquery API importing all tables in the dataset
1557
+ datacontract import --format bigquery --bigquery-project <project_id> --bigquery-dataset <dataset_id>
1558
+ ```
1559
+
1560
+ #### Unity Catalog
1561
+ ```bash
1562
+ # Example import from a Unity Catalog JSON file
1563
+ datacontract import --format unity --source my_unity_table.json
1564
+ ```
1565
+
1566
+ ```bash
1567
+ # Example import single table from Unity Catalog via HTTP endpoint using PAT
1568
+ export DATACONTRACT_DATABRICKS_SERVER_HOSTNAME="https://xyz.cloud.databricks.com"
1569
+ export DATACONTRACT_DATABRICKS_TOKEN=<token>
1570
+ datacontract import --format unity --unity-table-full-name <table_full_name>
1571
+ ```
1572
+ Please refer to the [Databricks documentation](https://docs.databricks.com/aws/en/dev-tools/auth/unified-auth) on how to set up a profile.
1573
+ ```bash
1574
+ # Example import single table from Unity Catalog via HTTP endpoint using Profile
1575
+ export DATACONTRACT_DATABRICKS_PROFILE="my-profile"
1576
+ datacontract import --format unity --unity-table-full-name <table_full_name>
1577
+ ```
1578
+
1579
+ #### dbt
1580
+
1581
+ Importing from a dbt manifest file.
1582
+ You may give the `dbt-model` parameter to enumerate the models that should be imported. If no models are given, _all_ available models from the manifest will be imported.
1583
+
1584
+ Examples:
1585
+
1586
+ ```bash
1587
+ # Example import from dbt manifest with specifying the tables to import
1588
+ datacontract import --format dbt --source <manifest_path> --dbt-model <model_name_1> --dbt-model <model_name_2> --dbt-model <model_name_3>
1589
+ ```
1590
+
1591
+ ```bash
1592
+ # Example import from dbt manifest importing all tables in the database
1593
+ datacontract import --format dbt --source <manifest_path>
1594
+ ```
1595
+
1596
+ #### Excel
1597
+
1598
+ Importing from [ODCS Excel Template](https://github.com/datacontract/open-data-contract-standard-excel-template).
1599
+
1600
+ Examples:
1601
+
1602
+ ```bash
1603
+ # Example import from ODCS Excel Template
1604
+ datacontract import --format excel --source odcs.xlsx
1605
+ ```
1606
+
1607
+ #### Glue
1608
+
1609
+ Importing from Glue reads the necessary data directly from the AWS API.
1610
+ You may give the `glue-table` parameter to enumerate the tables that should be imported. If no tables are given, _all_ available tables of the database will be imported.
1611
+
1612
+ Examples:
1613
+
1614
+ ```bash
1615
+ # Example import from AWS Glue with specifying the tables to import
1616
+ datacontract import --format glue --source <database_name> --glue-table <table_name_1> --glue-table <table_name_2> --glue-table <table_name_3>
1617
+ ```
1618
+
1619
+ ```bash
1620
+ # Example import from AWS Glue importing all tables in the database
1621
+ datacontract import --format glue --source <database_name>
1622
+ ```
1623
+
1624
+ #### Spark
1625
+
1626
+ When importing from a Spark table or view, it must be created or accessible in the Spark context. Specify the list of tables in the `source` parameter. If the `source` tables are registered as tables in Databricks and have table-level descriptions, these descriptions will also be added to the Data Contract Specification.
1627
+
1628
+ ```bash
1629
+ # Example: Import Spark table(s) from Spark context
1630
+ datacontract import --format spark --source "users,orders"
1631
+ ```
1632
+
1633
+ ```bash
1634
+ # Example: Import Spark table
1635
+ DataContract.import_from_source("spark", "users")
1636
+ DataContract.import_from_source(format = "spark", source = "users")
1637
+
1638
+ # Example: Import Spark dataframe
1639
+ DataContract.import_from_source("spark", "users", dataframe = df_user)
1640
+ DataContract.import_from_source(format = "spark", source = "users", dataframe = df_user)
1641
+
1642
+ # Example: Import Spark table + table description
1643
+ DataContract.import_from_source("spark", "users", description = "description")
1644
+ DataContract.import_from_source(format = "spark", source = "users", description = "description")
1645
+
1646
+ # Example: Import Spark dataframe + table description
1647
+ DataContract.import_from_source("spark", "users", dataframe = df_user, description = "description")
1648
+ DataContract.import_from_source(format = "spark", source = "users", dataframe = df_user, description = "description")
1649
+ ```
1650
+
1651
+ #### DBML
1652
+
1653
+ Importing from DBML Documents.
1654
+ **NOTE:** Since DBML does _not_ have strict requirements on the types of columns, this import _may_ create invalid data contracts, as not all field types can be properly mapped. In this case, you will have to adapt the generated document manually.
1655
+ We also assume that the description for models and fields is stored in a Note within the DBML model.
1656
+
1657
+ You may give the `dbml-table` or `dbml-schema` parameter to enumerate the tables or schemas that should be imported.
1658
+ If no tables are given, _all_ available tables of the source will be imported. Likewise, if no schema is given, _all_ schemas are imported.
1659
+
1660
+ Examples:
1661
+
1662
+ ```bash
1663
+ # Example import from DBML file, importing everything
1664
+ datacontract import --format dbml --source <file_path>
1665
+ ```
1666
+
1667
+ ```bash
1668
+ # Example import from DBML file, filtering for tables from specific schemas
1669
+ datacontract import --format dbml --source <file_path> --dbml-schema <schema_1> --dbml-schema <schema_2>
1670
+ ```
1671
+
1672
+ ```bash
1673
+ # Example import from DBML file, filtering for tables with specific names
1674
+ datacontract import --format dbml --source <file_path> --dbml-table <table_name_1> --dbml-table <table_name_2>
1675
+ ```
1676
+
1677
+ ```bash
1678
+ # Example import from DBML file, filtering for tables with specific names from a specific schema
1679
+ datacontract import --format dbml --source <file_path> --dbml-table <table_name_1> --dbml-schema <schema_1>
1680
+ ```
1681
+
1682
+ #### Iceberg
1683
+
1684
+ Importing from an [Iceberg Table Json Schema Definition](https://iceberg.apache.org/spec/#appendix-c-json-serialization). Specify the location of the JSON file using the `source` parameter.
1685
+
1686
+ Examples:
1687
+
1688
+ ```bash
1689
+ datacontract import --format iceberg --source ./tests/fixtures/iceberg/simple_schema.json --iceberg-table test-table
1690
+ ```
1691
+
1692
+ #### CSV
1693
+
1694
+ Importing from a CSV file. Specify the file in the `source` parameter. Encoding and CSV dialect are detected automatically.
1695
+
1696
+ Example:
1697
+
1698
+ ```bash
1699
+ datacontract import --format csv --source "test.csv"
1700
+ ```
1701
+
1702
+ #### protobuf
1703
+
1704
+ Importing from a Protobuf file. Specify the file in the `source` parameter.
1705
+
1706
+ Example:
1707
+
1708
+ ```bash
1709
+ datacontract import --format protobuf --source "test.proto"
1710
+ ```
1711
+
1712
+
1713
+ ### breaking
1714
+ ```
1715
+
1716
+ Usage: datacontract breaking [OPTIONS] LOCATION_OLD LOCATION_NEW
1717
+
1718
+ Identifies breaking changes between data contracts. Prints to stdout.
1719
+
1720
+ ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮
1721
+ │ * location_old TEXT The location (url or path) of the old data contract yaml. │
1722
+ │ [default: None] │
1723
+ │ [required] │
1724
+ │ * location_new TEXT The location (url or path) of the new data contract yaml. │
1725
+ │ [default: None] │
1726
+ │ [required] │
1727
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
1728
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮
1729
+ │ --help Show this message and exit. │
1730
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
1731
+
1732
+ ```
1733
+
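+ Example (the file names are placeholders for the old and new versions of a contract):
+
+ ```bash
+ datacontract breaking datacontract-v1.yaml datacontract-v2.yaml
+ ```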
1734
+ ### changelog
1735
+ ```
1736
+
1737
+ Usage: datacontract changelog [OPTIONS] LOCATION_OLD LOCATION_NEW
1738
+
1739
+ Generate a changelog between data contracts. Prints to stdout.
1740
+
1741
+ ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮
1742
+ │ * location_old TEXT The location (url or path) of the old data contract yaml. │
1743
+ │ [default: None] │
1744
+ │ [required] │
1745
+ │ * location_new TEXT The location (url or path) of the new data contract yaml. │
1746
+ │ [default: None] │
1747
+ │ [required] │
1748
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
1749
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮
1750
+ │ --help Show this message and exit. │
1751
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
1752
+
1753
+ ```
1754
+
1755
+ ### diff
1756
+ ```
1757
+
1758
+ Usage: datacontract diff [OPTIONS] LOCATION_OLD LOCATION_NEW
1759
+
1760
+ PLACEHOLDER. Currently works as 'changelog' does.
1761
+
1762
+ ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮
1763
+ │ * location_old TEXT The location (url or path) of the old data contract yaml. │
1764
+ │ [default: None] │
1765
+ │ [required] │
1766
+ │ * location_new TEXT The location (url or path) of the new data contract yaml. │
1767
+ │ [default: None] │
1768
+ │ [required] │
1769
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
1770
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮
1771
+ │ --help Show this message and exit. │
1772
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
1773
+
1774
+ ```
1775
+
1776
+ ### catalog
1777
+ ```
1778
+
1779
+ Usage: datacontract catalog [OPTIONS]
1780
+
1781
+ Create a html catalog of data contracts.
1782
+
1783
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮
1784
+ │ --files TEXT Glob pattern for the data contract files to include in the catalog. │
1785
+ │ Applies recursively to any subfolders. │
1786
+ │ [default: *.yaml] │
1787
+ │ --output TEXT Output directory for the catalog html files. [default: catalog/] │
1788
+ │ --schema TEXT The location (url or path) of the Data Contract Specification JSON Schema │
1789
+ │ [default: None] │
1790
+ │ --help Show this message and exit. │
1791
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
1792
+
1793
+ ```
1794
+
1795
+ Examples:
1796
+
1797
+ ```
1798
+ # create a catalog right in the current folder
1799
+ datacontract catalog --output "."
1800
+
1801
+ # Create a catalog based on a filename convention
1802
+ datacontract catalog --files "*.odcs.yaml"
1803
+ ```
1804
+
1805
+ ### publish
1806
+ ```
1807
+
1808
+ Usage: datacontract publish [OPTIONS] [LOCATION]
1809
+
1810
+ Publish the data contract to the Data Mesh Manager.
1811
+
1812
+ ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────╮
1813
+ │ location [LOCATION] The location (url or path) of the data contract yaml. │
1814
+ │ [default: datacontract.yaml] │
1815
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
1816
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮
1817
+ │ --schema TEXT The location (url or path) of the Data │
1818
+ │ Contract Specification JSON Schema │
1819
+ │ [default: None] │
1820
+ │ --ssl-verification --no-ssl-verification SSL verification when publishing the data │
1821
+ │ contract. │
1822
+ │ [default: ssl-verification] │
1823
+ │ --help Show this message and exit. │
1824
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
1825
+
1826
+ ```
1827
+
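+ Example, assuming the Data Mesh Manager as the target and the API key set as an environment variable (see the Integrations section below):
+
+ ```bash
+ export DATAMESH_MANAGER_API_KEY=xxx
+ datacontract publish datacontract.yaml
+ ```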
1828
+ ### api
1829
+ ```
1830
+
1831
+ Usage: datacontract api [OPTIONS]
1832
+
1833
+ Start the datacontract CLI as server application with REST API.
1834
+ The OpenAPI documentation as Swagger UI is available on http://localhost:4242. You can execute the
1835
+ commands directly from the Swagger UI.
1836
+ To protect the API, you can set the environment variable DATACONTRACT_CLI_API_KEY to a secret API
1837
+ key. To authenticate, requests must include the header 'x-api-key' with the correct API key. This
1838
+ is highly recommended, as data contract tests may be subject to SQL injections or leak sensitive
1839
+ information.
1840
+ To connect to servers (such as a Snowflake data source), set the credentials as environment
1841
+ variables as documented in https://cli.datacontract.com/#test
1842
+ It is possible to run the API with extra arguments for `uvicorn.run()` as keyword arguments, e.g.:
1843
+ `datacontract api --port 1234 --root_path /datacontract`.
1844
+
1845
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮
1846
+ │ --port INTEGER Bind socket to this port. [default: 4242] │
1847
+ │ --host TEXT Bind socket to this host. Hint: For running in docker, set it to 0.0.0.0 │
1848
+ │ [default: 127.0.0.1] │
1849
+ │ --help Show this message and exit. │
1850
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
1851
+
1852
+ ```
1853
+
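+ A minimal sketch of a protected local setup (the endpoint path in the curl call is illustrative, see the Swagger UI for the actual routes):
+
+ ```bash
+ # Protect the API with an API key and start the server
+ export DATACONTRACT_CLI_API_KEY=mysecret
+ datacontract api --port 4242
+
+ # In a second terminal, call the API with the key in the 'x-api-key' header
+ curl -H "x-api-key: mysecret" http://localhost:4242/
+ ```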
1854
+ ## Integrations
1855
+
1856
+ | Integration | Option | Description |
1857
+ |-----------------------|------------------------------|---------------------------------------------------------------------------------------------------------------|
1858
+ | Data Mesh Manager | `--publish` | Push full results to the [Data Mesh Manager API](https://api.datamesh-manager.com/swagger/index.html) |
1859
+ | Data Contract Manager | `--publish` | Push full results to the [Data Contract Manager API](https://api.datacontract-manager.com/swagger/index.html) |
1860
+
1861
+ ### Integration with Data Mesh Manager
1862
+
1863
+ If you use [Data Mesh Manager](https://datamesh-manager.com/) or [Data Contract Manager](https://datacontract-manager.com/), you can use the data contract URL and append the `--publish` option to send and display the test results. Set an environment variable for your API key.
1864
+
1865
+ ```bash
1866
+ # Fetch current data contract, execute tests on production, and publish result to data mesh manager
1867
+ $ export DATAMESH_MANAGER_API_KEY=xxx
1868
+ $ datacontract test https://demo.datamesh-manager.com/demo279750347121/datacontracts/4df9d6ee-e55d-4088-9598-b635b2fdcbbc/datacontract.yaml \
1869
+ --server production \
1870
+ --publish https://api.datamesh-manager.com/api/test-results
1871
+ ```
1872
+
1873
+ ## Best Practices
1874
+
1875
+ We share best practices in using the Data Contract CLI.
1876
+
1877
+ ### Data-first Approach
1878
+
1879
+ Create a data contract based on the actual data. This is the fastest way to get started and to get feedback from the data consumers.
1880
+
1881
+ 1. Use an existing physical schema (e.g., SQL DDL) as a starting point to define your logical data model in the contract. Double check right after the import whether the actual data meets the imported logical data model. Just to be sure.
1882
+ ```bash
1883
+ $ datacontract import --format sql --source ddl.sql
1884
+ $ datacontract test
1885
+ ```
1886
+
1887
+ 2. Add quality checks and additional type constraints one by one to the contract and make sure the
1888
+ data still adheres to the contract.
1889
+ ```bash
1890
+ $ datacontract test
1891
+ ```
1892
+
1893
+ 3. Validate that the `datacontract.yaml` is correctly formatted and adheres to the Data Contract Specification.
1894
+ ```bash
1895
+ $ datacontract lint
1896
+ ```
1897
+
1898
+ 4. Set up a CI pipeline that executes daily for continuous quality checks. You can also report the
1899
+ test results to tools like [Data Mesh Manager](https://datamesh-manager.com)
1900
+ ```bash
1901
+ $ datacontract test --publish https://api.datamesh-manager.com/api/test-results
1902
+ ```
1903
+
1904
+ ### Contract-First
1905
+
1906
+ Create a data contract based on the requirements from use cases.
1907
+
1908
+ 1. Start with a `datacontract.yaml` template.
1909
+ ```bash
1910
+ $ datacontract init
1911
+ ```
1912
+
1913
+ 2. Create the model and quality guarantees based on your business requirements. Fill in the terms,
1914
+ descriptions, etc. Validate that your `datacontract.yaml` is correctly formatted.
1915
+ ```bash
1916
+ $ datacontract lint
1917
+ ```
1918
+
1919
+ 3. Use the export function to start building the providing data product as well as the integration
1920
+ into the consuming data products.
1921
+ ```bash
1922
+ # data provider
1923
+ $ datacontract export --format dbt
1924
+ # data consumer
1925
+ $ datacontract export --format dbt-sources
1926
+ $ datacontract export --format dbt-staging-sql
1927
+ ```
1928
+
1929
+ 4. Test that your data product implementation adheres to the contract.
1930
+ ```bash
1931
+ $ datacontract test
1932
+ ```
1933
+
1934
+ ### Schema Evolution
1935
+
1936
+ #### Non-breaking Changes
1937
+ Examples: adding models or fields
1938
+
1939
+ - Add the models or fields in the datacontract.yaml
1940
+ - Increment the minor version of the datacontract.yaml on any change. Simply edit the datacontract.yaml for this.
1941
+ - You need a policy that these changes are non-breaking. That means that one cannot use the star expression in SQL to query a table under contract. Make the consequences known.
1942
+ - Fail the build in the Pull Request if a datacontract.yaml accidentally adds a breaking change despite only a minor version change
1943
+ ```bash
1944
+ $ datacontract breaking datacontract-from-pr.yaml datacontract-from-main.yaml
1945
+ ```
1946
+ - Create a changelog of this minor change.
1947
+ ```bash
1948
+ $ datacontract changelog datacontract-from-pr.yaml datacontract-from-main.yaml
1949
+ ```
1950
+ #### Breaking Changes
1951
+ Examples: Removing or renaming models and fields.
1952
+
1953
+ - Remove or rename models and fields in the datacontract.yaml, and any other change that might be part of this new major version of this data contract.
1954
+ - Increment the major version of the datacontract.yaml for this and create a new file for the major version. The reason is that one needs to offer an upgrade path for the data consumers from the old to the new major version.
1955
+ - As data consumers need to migrate, try to reduce the frequency of major versions by making multiple breaking changes together if possible.
1956
+ - Be aware of the notice period in the data contract as this is the minimum amount of time you have to offer both the old and the new version for a migration path.
1957
+ - Do not fear making breaking changes with data contracts. It's okay to do them in this controlled way. Really!
1958
+ - Create a changelog of this major change.
1959
+ ```bash
1960
+ $ datacontract changelog datacontract-from-pr.yaml datacontract-from-main.yaml
1961
+ ```
1962
+
1963
+ ## Customizing Exporters and Importers
1964
+
1965
+ ### Custom Exporter
1966
+ Using the exporter factory to add a new custom exporter
1967
+ ```python
1968
+
1969
+ from datacontract.data_contract import DataContract
1970
+ from datacontract.export.exporter import Exporter
1971
+ from datacontract.export.exporter_factory import exporter_factory
1972
+
1973
+
1974
+ # Create a custom class that implements export method
1975
+ class CustomExporter(Exporter):
1976
+ def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
1977
+ result = {
1978
+ "title": data_contract.info.title,
1979
+ "version": data_contract.info.version,
1980
+ "description": data_contract.info.description,
1981
+ "email": data_contract.info.contact.email,
1982
+ "url": data_contract.info.contact.url,
1983
+ "model": model,
1984
+ "model_columns": ", ".join(list(data_contract.models.get(model).fields.keys())),
1985
+ "export_args": export_args,
1986
+ "custom_args": export_args.get("custom_arg", ""),
1987
+ }
1988
+ return result
1989
+
1990
+
1991
+ # Register the new custom class into factory
1992
+ exporter_factory.register_exporter("custom_exporter", CustomExporter)
1993
+
1994
+
1995
+ if __name__ == "__main__":
1996
+ # Create a DataContract instance
1997
+ data_contract = DataContract(
1998
+ data_contract_file="/path/datacontract.yaml"
1999
+ )
2000
+ # Call export
2001
+ result = data_contract.export(
2002
+ export_format="custom_exporter", model="orders", server="production", custom_arg="my_custom_arg"
2003
+ )
2004
+ print(result)
2005
+
2006
+ ```
2007
+ Output
2008
+ ```python
2009
+ {
2010
+ 'title': 'Orders Unit Test',
2011
+ 'version': '1.0.0',
2012
+ 'description': 'The orders data contract',
2013
+ 'email': 'team-orders@example.com',
2014
+ 'url': 'https://wiki.example.com/teams/checkout',
2015
+ 'model': 'orders',
2016
+ 'model_columns': 'order_id, order_total, order_status',
2017
+ 'export_args': {'server': 'production', 'custom_arg': 'my_custom_arg'},
2018
+ 'custom_args': 'my_custom_arg'
2019
+ }
2020
+ ```
2021
+
2022
+ ### Custom Importer
2023
+ Using the importer factory to add a new custom importer
2024
+ ```python
2025
+
2026
+ from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
2027
+ from datacontract.data_contract import DataContract
2028
+ from datacontract.imports.importer import Importer
2029
+ from datacontract.imports.importer_factory import importer_factory
2030
+
2031
+ import json
2032
+
2033
+ # Create a custom class that implements import_source method
2034
+ class CustomImporter(Importer):
2035
+ def import_source(
2036
+ self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
2037
+ ) -> dict:
2038
+ source_dict = json.loads(source)
2039
+ data_contract_specification.id = source_dict.get("id_custom")
2040
+ data_contract_specification.info.title = source_dict.get("title")
2041
+ data_contract_specification.info.version = source_dict.get("version")
2042
+ data_contract_specification.info.description = source_dict.get("description_from_app")
2043
+
2044
+ for model in source_dict.get("models", []):
2045
+ fields = {}
2046
+ for column in model.get('columns'):
2047
+ field = Field(
2048
+ description=column.get('column_description'),
2049
+ type=column.get('type')
2050
+ )
2051
+ fields[column.get('name')] = field
2052
+
2053
+ dc_model = Model(
2054
+ description=model.get('description'),
2055
+ fields= fields
2056
+ )
2057
+
2058
+ data_contract_specification.models[model.get('name')] = dc_model
2059
+ return data_contract_specification
2060
+
2061
+
2062
+ # Register the new custom class into factory
2063
+ importer_factory.register_importer("custom_company_importer", CustomImporter)
2064
+
2065
+
2066
+ if __name__ == "__main__":
2067
+ # Get a custom data from other app
2068
+ json_from_custom_app = '''
2069
+ {
2070
+ "id_custom": "uuid-custom",
2071
+ "version": "0.0.2",
2072
+ "title": "my_custom_imported_data",
2073
+ "description_from_app": "Custom contract description",
2074
+ "models": [
2075
+ {
2076
+ "name": "model1",
2077
+ "description": "model description from app",
2078
+ "columns": [
2079
+ {
2080
+ "name": "columnA",
2081
+ "type": "varchar",
2082
+ "column_description": "my_column description"
2083
+ },
2084
+ {
2085
+ "name": "columnB",
2086
+ "type": "varchar",
2087
+ "column_description": "my_columnB description"
2088
+ }
2089
+ ]
2090
+ }
2091
+ ]
2092
+ }
2093
+ '''
2094
+ # Create a DataContract instance
2095
+ data_contract = DataContract()
2096
+
2097
+ # Call import_from_source
2098
+ result = data_contract.import_from_source(
2099
+ format="custom_company_importer",
2100
+ data_contract_specification=DataContract.init(),
2101
+ source=json_from_custom_app
2102
+ )
2103
+ print(result.to_yaml() )
2104
+ ```
2105
+ Output
2106
+
2107
+ ```yaml
2108
+ dataContractSpecification: 1.2.1
2109
+ id: uuid-custom
2110
+ info:
2111
+ title: my_custom_imported_data
2112
+ version: 0.0.2
2113
+ description: Custom contract description
2114
+ models:
2115
+ model1:
2116
+ fields:
2117
+ columnA:
2118
+ type: varchar
2119
+ description: my_column description
2120
+ columnB:
2121
+ type: varchar
2122
+ description: my_columnB description
2123
+
2124
+ ```
2125
+ ## Development Setup
2126
+
2127
+ - Install [uv](https://docs.astral.sh/uv/)
2128
+ - Python base interpreter should be 3.11.x.
2129
+ - Docker engine must be running to execute the tests.
2130
+
2131
+ ```bash
2132
+ # make sure uv is installed
2133
+ uv python pin 3.11
2134
+ uv venv
2135
+ uv pip install -e '.[dev]'
2136
+ uv run ruff check
2137
+ uv run pytest
2138
+ ```
2139
+
2140
+ ### Troubleshooting
2141
+
2142
+ #### Windows: Some tests fail
2143
+
2144
+ Run the tests in WSL. (We need to fix the paths in the tests so that they work on plain Windows; contributions are appreciated.)
2145
+
2146
+ #### PyCharm does not pick up the `.venv`
2147
+
2148
+ This [uv issue](https://github.com/astral-sh/uv/issues/12545) might be relevant.
2149
+
2150
+ Try to sync all groups:
2151
+
2152
+ ```
2153
+ uv sync --all-groups --all-extras
2154
+ ```
2155
+
2156
+ #### Errors in tests that use PySpark (e.g. test_test_kafka.py)
2157
+
2158
+ Ensure you have JDK 17 or 21 installed. Java 25 causes issues.
2159
+
2160
+ ```
2161
+ java --version
2162
+ ```
2163
+
2164
+
2165
+ ### Docker Build
2166
+
2167
+ ```bash
2168
+ docker build -t datacontract/cli .
2169
+ docker run --rm -v ${PWD}:/home/datacontract datacontract/cli
2170
+ ```
2171
+
2172
+ #### Docker compose integration
2173
+
2174
+ We've included a [docker-compose.yml](./docker-compose.yml) configuration to simplify the build, test, and deployment of the image.
2175
+
2176
+ ##### Building the Image with Docker Compose
2177
+
2178
+ To build the Docker image using Docker Compose, run the following command:
2179
+
2180
+ ```bash
2181
+ docker compose build
2182
+ ```
2183
+
2184
+ This command utilizes the `docker-compose.yml` to build the image, leveraging predefined settings such as the build context and Dockerfile location. This approach streamlines the image creation process, avoiding the need for manual build specifications each time.
2185
+
2186
+ #### Testing the Image
2187
+
2188
+ After building the image, you can test it directly with Docker Compose:
2189
+
2190
+ ```bash
2191
+ docker compose run --rm datacontract --version
2192
+ ```
2193
+
2194
+ This command runs the container momentarily to check the version of the `datacontract` CLI. The `--rm` flag ensures that the container is automatically removed after the command executes, keeping your environment clean.
2195
+
2196
+
2197
+ ## Release Steps
2198
+
2199
+ 1. Update the version in `pyproject.toml`
2200
+ 2. Have a look at the `CHANGELOG.md`
2201
+ 3. Create release commit manually
2202
+ 4. Execute `./release`
2203
+ 5. Wait until GitHub Release is created
2204
+ 6. Add the release notes to the GitHub Release
2205
+
2206
+ ## Contribution
2207
+
2208
+ We are happy to receive your contributions. Propose your change in an issue or directly create a pull request with your improvements.
2209
+
2210
+ ## Companies using this tool
2211
+
2212
+ - [Entropy Data](https://www.entropy-data.com)
2213
+ - [INNOQ](https://innoq.com)
2214
+ - [Data Catering](https://data.catering/)
2215
+ - [Oliver Wyman](https://www.oliverwyman.com/)
2216
+ - And many more. To add your company, please create a pull request.
2217
+
2218
+ ## Related Tools
2219
+
2220
+ - [Data Contract Manager](https://www.datacontract-manager.com/) is a commercial tool to manage data contracts. It contains a web UI, access management, and data governance for a full enterprise data marketplace.
2221
+ - [Data Contract GPT](https://gpt.datacontract.com) is a custom GPT that can help you write data contracts.
2222
+ - [Data Contract Editor](https://editor.datacontract.com) is an editor for Data Contracts, including a live html preview.
2223
+ - [Data Contract Playground](https://data-catering.github.io/data-contract-playground/) allows you to validate and export your data contract to different formats within your browser.
2224
+
2225
+ ## License
2226
+
2227
+ [MIT License](LICENSE)
2228
+
2229
+ ## Credits
2230
+
2231
+ Created by [Stefan Negele](https://www.linkedin.com/in/stefan-negele-573153112/), [Jochen Christ](https://www.linkedin.com/in/jochenchrist/), and [Simon Harrer]().
2232
+
2233
+
2234
+
2235
+ <a href="https://github.com/datacontract/datacontract-cli" class="github-corner" aria-label="View source on GitHub"><svg width="80" height="80" viewBox="0 0 250 250" style="fill:#151513; color:#fff; position: absolute; top: 0; border: 0; right: 0;" aria-hidden="true"><path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"></path></svg></a><style>.github-corner:hover .octo-arm{animation:octocat-wave 560ms ease-in-out}@keyframes octocat-wave{0%,100%{transform:rotate(0)}20%,60%{transform:rotate(-25deg)}40%,80%{transform:rotate(10deg)}}@media (max-width:500px){.github-corner:hover .octo-arm{animation:none}.github-corner .octo-arm{animation:octocat-wave 560ms ease-in-out}}</style>