datacontract-cli 0.10.5__py3-none-any.whl → 0.10.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacontract-cli might be problematic.
- datacontract/cli.py +8 -24
- datacontract/data_contract.py +33 -4
- datacontract/engines/soda/check_soda_execute.py +9 -0
- datacontract/engines/soda/connections/kafka.py +3 -0
- datacontract/export/avro_converter.py +14 -19
- datacontract/imports/glue_importer.py +4 -3
- datacontract/web.py +39 -2
- {datacontract_cli-0.10.5.dist-info → datacontract_cli-0.10.7.dist-info}/METADATA +139 -67
- {datacontract_cli-0.10.5.dist-info → datacontract_cli-0.10.7.dist-info}/RECORD +13 -13
- {datacontract_cli-0.10.5.dist-info → datacontract_cli-0.10.7.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.5.dist-info → datacontract_cli-0.10.7.dist-info}/WHEEL +0 -0
- {datacontract_cli-0.10.5.dist-info → datacontract_cli-0.10.7.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.5.dist-info → datacontract_cli-0.10.7.dist-info}/top_level.txt +0 -0
datacontract/cli.py
CHANGED
@@ -13,7 +13,7 @@ from typer.core import TyperGroup
 from typing_extensions import Annotated
 
 from datacontract.catalog.catalog import create_index_html, create_data_contract_html
-from datacontract.data_contract import DataContract
+from datacontract.data_contract import DataContract, ExportFormat
 from datacontract.init.download_datacontract_file import download_datacontract_file, FileExistsException
 from datacontract.publish.publish import publish_to_datamesh_manager
 
@@ -141,28 +141,6 @@ def test(
     _handle_result(run)
 
 
-class ExportFormat(str, Enum):
-    jsonschema = "jsonschema"
-    pydantic_model = "pydantic-model"
-    sodacl = "sodacl"
-    dbt = "dbt"
-    dbt_sources = "dbt-sources"
-    dbt_staging_sql = "dbt-staging-sql"
-    odcs = "odcs"
-    rdf = "rdf"
-    avro = "avro"
-    protobuf = "protobuf"
-    great_expectations = "great-expectations"
-    terraform = "terraform"
-    avro_idl = "avro-idl"
-    sql = "sql"
-    sql_query = "sql-query"
-    html = "html"
-    go = "go"
-    bigquery = "bigquery"
-    dbml = "dbml"
-
-
 @app.command()
 def export(
     format: Annotated[ExportFormat, typer.Option(help="The export format.")],
@@ -231,6 +209,12 @@ def import_(
     source: Annotated[
         Optional[str], typer.Option(help="The path to the file or Glue Database that should be imported.")
     ] = None,
+    glue_table: Annotated[
+        Optional[List[str]],
+        typer.Option(
+            help="List of table ids to import from the Glue Database (repeat for multiple table ids, leave empty for all tables in the dataset)."
+        ),
+    ] = None,
     bigquery_project: Annotated[Optional[str], typer.Option(help="The bigquery project id.")] = None,
     bigquery_dataset: Annotated[Optional[str], typer.Option(help="The bigquery dataset id.")] = None,
     bigquery_table: Annotated[
@@ -243,7 +227,7 @@ def import_(
     """
     Create a data contract from the given source location. Prints to stdout.
    """
-    result = DataContract().import_from_source(format, source, bigquery_table, bigquery_project, bigquery_dataset)
+    result = DataContract().import_from_source(format, source, glue_table, bigquery_table, bigquery_project, bigquery_dataset)
     console.print(result.to_yaml())
 
 
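With `ExportFormat` removed from the CLI module, programmatic callers now import it from `datacontract.data_contract` instead. A minimal hedged sketch of that usage (the contract file path is a placeholder):

```python
# Hedged sketch: ExportFormat now lives in datacontract.data_contract,
# so library code no longer needs to import it from datacontract.cli.
from datacontract.data_contract import DataContract, ExportFormat

data_contract = DataContract(data_contract_file="datacontract.yaml")  # placeholder path
print(data_contract.export(export_format=ExportFormat.avro))  # export() returns the exported document as a string
```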
datacontract/data_contract.py
CHANGED
@@ -2,6 +2,7 @@ import json
 import logging
 import tempfile
 import typing
+from enum import Enum
 
 import yaml
 from pyspark.sql import SparkSession
@@ -49,6 +50,28 @@ from datacontract.model.exceptions import DataContractException
 from datacontract.model.run import Run, Check
 
 
+class ExportFormat(str, Enum):
+    jsonschema = "jsonschema"
+    pydantic_model = "pydantic-model"
+    sodacl = "sodacl"
+    dbt = "dbt"
+    dbt_sources = "dbt-sources"
+    dbt_staging_sql = "dbt-staging-sql"
+    odcs = "odcs"
+    rdf = "rdf"
+    avro = "avro"
+    protobuf = "protobuf"
+    great_expectations = "great-expectations"
+    terraform = "terraform"
+    avro_idl = "avro-idl"
+    sql = "sql"
+    sql_query = "sql-query"
+    html = "html"
+    go = "go"
+    bigquery = "bigquery"
+    dbml = "dbml"
+
+
 class DataContract:
     def __init__(
         self,
@@ -195,10 +218,13 @@ class DataContract:
             run.outputPortId = server.outputPortId
             run.server = server_name
 
-            #
-
+            # TODO check server is supported type for nicer error messages
+
+            # TODO check server credentials are complete for nicer error messages
+
             if server.format == "json" and server.type != "kafka":
                 check_jsonschema(run, data_contract, server)
+
             check_soda_execute(run, data_contract, server, self._spark, tmp_dir)
 
         except DataContractException as e:
@@ -275,7 +301,9 @@ class DataContract:
             inline_quality=self._inline_quality,
         )
 
-    def export(
+    def export(
+        self, export_format: ExportFormat, model: str = "all", rdf_base: str = None, sql_server_type: str = "auto"
+    ) -> str:
         data_contract = resolve.resolve_data_contract(
             self._data_contract_file,
             self._data_contract_str,
@@ -422,6 +450,7 @@ class DataContract:
         self,
         format: str,
         source: typing.Optional[str] = None,
+        glue_tables: typing.Optional[typing.List[str]] = None,
         bigquery_tables: typing.Optional[typing.List[str]] = None,
         bigquery_project: typing.Optional[str] = None,
         bigquery_dataset: typing.Optional[str] = None,
@@ -433,7 +462,7 @@ class DataContract:
         elif format == "avro":
             data_contract_specification = import_avro(data_contract_specification, source)
         elif format == "glue":
-            data_contract_specification = import_glue(data_contract_specification, source)
+            data_contract_specification = import_glue(data_contract_specification, source, glue_tables)
         elif format == "jsonschema":
             data_contract_specification = import_jsonschema(data_contract_specification, source)
         elif format == "bigquery":
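The extended `import_from_source` signature above can also be driven from Python. A hedged sketch (database and table names are placeholders, and a Glue import requires AWS credentials in the environment):

```python
# Hedged sketch of the new glue_tables parameter shown in the diff above.
# "orders_db" and the table names are placeholders; AWS credentials must be configured.
from datacontract.data_contract import DataContract

spec = DataContract().import_from_source(
    "glue",                                # format
    "orders_db",                           # source: the Glue database name
    glue_tables=["orders", "line_items"],  # omit to import all tables of the database
)
print(spec.to_yaml())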
datacontract/engines/soda/check_soda_execute.py
CHANGED
@@ -64,6 +64,15 @@ def check_soda_execute(
         soda_configuration_str = to_databricks_soda_configuration(server)
         scan.add_configuration_yaml_str(soda_configuration_str)
         scan.set_data_source_name(server.type)
+    elif server.type == "dataframe":
+        if spark is None:
+            run.log_warn("Server type dataframe only works with the Python library and requires a Spark session, "
+                         "please provide one with the DataContract class")
+            return
+        else:
+            logging.info("Use Spark to connect to data source")
+            scan.add_spark_session(spark, data_source_name="datacontract-cli")
+            scan.set_data_source_name("datacontract-cli")
     elif server.type == "kafka":
         if spark is None:
             spark = create_spark_session(tmp_dir)
datacontract/engines/soda/connections/kafka.py
CHANGED
@@ -1,3 +1,4 @@
+import logging
 import os
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import col, expr, from_json
@@ -44,6 +45,8 @@ def create_spark_session(tmp_dir: str) -> SparkSession:
 
 def read_kafka_topic(spark: SparkSession, data_contract: DataContractSpecification, server: Server, tmp_dir):
     """Read and process data from a Kafka topic based on the server configuration."""
+
+    logging.info("Reading data from Kafka server %s topic %s", server.host, server.topic)
     df = (
         spark.read.format("kafka")
         .options(**get_auth_options())
datacontract/export/avro_converter.py
CHANGED
@@ -34,13 +34,8 @@ def to_avro_field(field, field_name):
     if field.description is not None:
         avro_field["doc"] = field.description
     avro_field["type"] = to_avro_type(field, field_name)
-    # add logical type definitions for any of the date type fields
-    if field.type in ["timestamp", "timestamp_tz", "timestamp_ntz", "date"]:
-        avro_field["logicalType"] = to_avro_logical_type(field.type)
 
     if field.config:
-        if "avroLogicalType" in field.config:
-            avro_field["logicalType"] = field.config["avroLogicalType"]
         if "avroDefault" in field.config:
             avro_field["default"] = field.config["avroDefault"]
 
@@ -48,6 +43,17 @@ def to_avro_field(field, field_name):
 
 
 def to_avro_type(field: Field, field_name: str) -> str | dict:
+    if field.config:
+        if "avroLogicalType" in field.config and "avroType" in field.config:
+            return {"type": field.config["avroType"], "logicalType": field.config["avroLogicalType"]}
+        if "avroLogicalType" in field.config:
+            if field.config["avroLogicalType"] in ["timestamp-millis", "timestamp-micros", "local-timestamp-millis", "local-timestamp-micros", "time-micros"]:
+                return {"type": "long", "logicalType": field.config["avroLogicalType"]}
+            if field.config["avroLogicalType"] in ["time-millis", "date"]:
+                return {"type": "int", "logicalType": field.config["avroLogicalType"]}
+        if "avroType" in field.config:
+            return field.config["avroLogicalType"]
+
     if field.type is None:
         return "null"
     if field.type in ["string", "varchar", "text"]:
@@ -64,11 +70,11 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
     elif field.type in ["boolean"]:
         return "boolean"
     elif field.type in ["timestamp", "timestamp_tz"]:
-        return "long"
+        return {"type": "long", "logicalType": "timestamp-millis"}
     elif field.type in ["timestamp_ntz"]:
-        return "long"
+        return {"type": "long", "logicalType": "local-timestamp-millis"}
     elif field.type in ["date"]:
-        return "int"
+        return {"type": "int", "logicalType": "date"}
     elif field.type in ["time"]:
         return "long"
     elif field.type in ["object", "record", "struct"]:
@@ -82,14 +88,3 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
         return "null"
     else:
         return "bytes"
-
-
-def to_avro_logical_type(type: str) -> str:
-    if type in ["timestamp", "timestamp_tz"]:
-        return "timestamp-millis"
-    elif type in ["timestamp_ntz"]:
-        return "local-timestamp-millis"
-    elif type in ["date"]:
-        return "date"
-    else:
-        return ""
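The practical effect of the converter change above is that temporal fields now carry Avro logical types by default instead of bare primitives. A hedged illustration of the expected output for a `timestamp` field (the field name is made up):

```python
# Hedged illustration of the new default mapping from the diff above:
# a data contract field of type "timestamp" is now exported with a logicalType.
expected_avro_field = {
    "name": "created_at",  # placeholder field name
    "type": {"type": "long", "logicalType": "timestamp-millis"},
}
```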
datacontract/imports/glue_importer.py
CHANGED
@@ -107,7 +107,7 @@ def get_glue_table_schema(database_name: str, table_name: str):
     return table_schema
 
 
-def import_glue(data_contract_specification: DataContractSpecification, source: str):
+def import_glue(data_contract_specification: DataContractSpecification, source: str, table_names: List[str]):
     """Import the schema of a Glue database."""
 
     catalogid, location_uri = get_glue_database(source)
@@ -116,13 +116,14 @@ def import_glue(data_contract_specification: DataContractSpecification, source:
     if catalogid is None:
         return data_contract_specification
 
-
+    if table_names is None:
+        table_names = get_glue_tables(source)
 
     data_contract_specification.servers = {
         "production": Server(type="glue", account=catalogid, database=source, location=location_uri),
     }
 
-    for table_name in
+    for table_name in table_names:
         if data_contract_specification.models is None:
             data_contract_specification.models = {}
 
datacontract/web.py
CHANGED
@@ -1,8 +1,10 @@
-from typing import Annotated, Union
+from typing import Annotated, Union, Optional
 
+import typer
 from fastapi import FastAPI, File
 
-from datacontract.data_contract import DataContract
+from datacontract.data_contract import DataContract, ExportFormat
+from fastapi.responses import PlainTextResponse
 
 app = FastAPI()
 
@@ -12,3 +14,38 @@ def lint(file: Annotated[bytes, File()], linters: Union[str, set[str]] = "all"):
     data_contract = DataContract(data_contract_str=str(file, encoding="utf-8"))
     lint_result = data_contract.lint(enabled_linters=linters)
     return {"result": lint_result.result, "checks": lint_result.checks}
+
+
+@app.post("/export", response_class=PlainTextResponse)
+def export(
+    file: Annotated[bytes, File()],
+    export_format: Annotated[ExportFormat, typer.Option(help="The export format.")],
+    server: Annotated[str, typer.Option(help="The server name to export.")] = None,
+    model: Annotated[
+        str,
+        typer.Option(
+            help="Use the key of the model in the data contract yaml file "
+            "to refer to a model, e.g., `orders`, or `all` for all "
+            "models (default)."
+        ),
+    ] = "all",
+    rdf_base: Annotated[
+        Optional[str],
+        typer.Option(help="[rdf] The base URI used to generate the RDF graph.", rich_help_panel="RDF Options"),
+    ] = None,
+    sql_server_type: Annotated[
+        Optional[str],
+        typer.Option(
+            help="[sql] The server type to determine the sql dialect. By default, it uses 'auto' to automatically detect the sql dialect via the specified servers in the data contract.",
+            rich_help_panel="SQL Options",
+        ),
+    ] = "auto",
+):
+    result = DataContract(data_contract_str=str(file, encoding="utf-8"), server=server).export(
+        export_format=export_format,
+        model=model,
+        rdf_base=rdf_base,
+        sql_server_type=sql_server_type,
+    )
+
+    return result
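A hedged sketch of calling the new `/export` endpoint added in web.py above. With these annotations FastAPI should bind `file` as a multipart form field and the remaining options as query parameters, but that binding is an assumption, and the contract file path is a placeholder:

```python
# Hedged sketch: exercising the new /export endpoint with FastAPI's TestClient (requires httpx).
from fastapi.testclient import TestClient

from datacontract.web import app

client = TestClient(app)
with open("datacontract.yaml", "rb") as f:  # placeholder contract file
    response = client.post(
        "/export",
        params={"export_format": "jsonschema", "model": "all"},  # assumed to bind as query parameters
        files={"file": f},  # bound to the Annotated[bytes, File()] parameter
    )
print(response.text)  # the exported document as plain text
```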
{datacontract_cli-0.10.5.dist-info → datacontract_cli-0.10.7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datacontract-cli
-Version: 0.10.5
+Version: 0.10.7
 Summary: Test data contracts
 Author-email: Jochen Christ <jochen.christ@innoq.com>, Stefan Negele <stefan.negele@innoq.com>
 Project-URL: Homepage, https://cli.datacontract.com
@@ -49,10 +49,11 @@ Requires-Dist: pytest ; extra == 'dev'
 Requires-Dist: pytest-xdist ; extra == 'dev'
 Requires-Dist: moto ; extra == 'dev'
 Requires-Dist: pymssql ==2.3.0 ; extra == 'dev'
-Requires-Dist:
-Requires-Dist: testcontainers
-Requires-Dist: testcontainers
-Requires-Dist: testcontainers
+Requires-Dist: kafka-python ; extra == 'dev'
+Requires-Dist: testcontainers ~=4.5.0 ; extra == 'dev'
+Requires-Dist: testcontainers[minio] ; extra == 'dev'
+Requires-Dist: testcontainers[postgres] ; extra == 'dev'
+Requires-Dist: testcontainers[kafka] ; extra == 'dev'
 Requires-Dist: testcontainers[mssql] ; extra == 'dev'
 
 # Data Contract CLI
@@ -333,8 +334,10 @@ Supported server types:
 - [s3](#S3)
 - [bigquery](#bigquery)
 - [azure](#azure)
+- [sqlserver](#sqlserver)
 - [databricks](#databricks)
 - [databricks (programmatic)](#databricks-programmatic)
+- [dataframr (programmatic)](#dataframe-programmatic)
 - [snowflake](#snowflake)
 - [kafka](#kafka)
 - [postgres](#postgres)
@@ -448,6 +451,43 @@ Authentication works with an Azure Service Principal (SPN) aka App Registration
 
 
 
+### Sqlserver
+
+Data Contract CLI can test data in MS SQL Server (including Azure SQL, Synapse Analytics SQL Pool).
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  production:
+    type: sqlserver
+    host: localhost
+    port: 5432
+    database: tempdb
+    schema: dbo
+    driver: ODBC Driver 18 for SQL Server
+models:
+  my_table_1: # corresponds to a table
+    type: table
+    fields:
+      my_column_1: # corresponds to a column
+        type: varchar
+```
+
+#### Environment Variables
+
+| Environment Variable | Example | Description |
+|----------------------------------|--------------------|-------------|
+| `DATACONTRACT_SQLSERVER_USERNAME` | `root` | Username |
+| `DATACONTRACT_SQLSERVER_PASSWORD` | `toor` | Password |
+| `DATACONTRACT_SQLSERVER_TRUSTED_CONNECTION` | `True` | Use windows authentication, instead of login |
+| `DATACONTRACT_SQLSERVER_TRUST_SERVER_CERTIFICATE` | `True` | Trust self-signed certificate |
+| `DATACONTRACT_SQLSERVER_ENCRYPTED_CONNECTION` | `True` | Use SSL |
+
+
+
+
 ### Databricks
 
 Works with Unity Catalog and Hive metastore.
@@ -516,6 +556,41 @@ run = data_contract.test()
 run.result
 ```
 
+### Dataframe (programmatic)
+
+Works with Spark DataFrames.
+DataFrames need to be created as named temporary views.
+Multiple temporary views are suppored if your data contract contains multiple models.
+
+Testing DataFrames is useful to test your datasets in a pipeline before writing them to a data source.
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  production:
+    type: dataframe
+models:
+  my_table: # corresponds to a temporary view
+    type: table
+    fields: ...
+```
+
+Example code
+```python
+from datacontract.data_contract import DataContract
+
+df.createOrReplaceTempView("my_table")
+
+data_contract = DataContract(
+    data_contract_file="datacontract.yaml",
+    spark=spark,
+)
+run = data_contract.test()
+assert run.result == "passed"
+```
+
 
 ### Snowflake
 
@@ -608,41 +683,6 @@ models:
 
 
 
-### Postgres
-
-Data Contract CLI can test data in Postgres or Postgres-compliant databases (e.g., RisingWave).
-
-#### Example
-
-datacontract.yaml
-```yaml
-servers:
-  postgres:
-    type: sqlserver
-    host: localhost
-    port: 5432
-    database: tempdb
-    schema: dbo
-    driver: ODBC Driver 18 for SQL Server
-models:
-  my_table_1: # corresponds to a table
-    type: table
-    fields:
-      my_column_1: # corresponds to a column
-        type: varchar
-```
-
-#### Environment Variables
-
-| Environment Variable | Example | Description |
-|----------------------------------|--------------------|-------------|
-| `DATACONTRACT_SQLSERVER_USERNAME` | `root` | Username |
-| `DATACONTRACT_SQLSERVER_PASSWORD` | `toor` | Password |
-| `DATACONTRACT_SQLSERVER_TRUSTED_CONNECTION` | `True` | Use windows authentication, instead of login |
-| `DATACONTRACT_SQLSERVER_TRUST_SERVER_CERTIFICATE` | `True` | Trust self-signed certificate |
-| `DATACONTRACT_SQLSERVER_ENCRYPTED_CONNECTION` | `True` | Use SSL |
-
-
 
 ### export
 
@@ -802,41 +842,30 @@ models:
 ```
 Usage: datacontract import [OPTIONS]
 
-Create a data contract from the given source location. Prints to stdout.
-
-╭─ Options
-│ * --format [sql|avro|glue|bigquery|jsonschema] The format of the source file. [default: None] [required]
-│ --source TEXT
-│
-│ --
-│
-│
-│
-│ --
-
+Create a data contract from the given source location. Prints to stdout.
+
+╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+│ * --format [sql|avro|glue|bigquery|jsonschema] The format of the source file. [default: None] [required] │
+│ --source TEXT The path to the file or Glue Database that should be imported. │
+│ [default: None] │
+│ --glue-table TEXT List of table ids to import from the Glue Database (repeat for │
+│ multiple table ids, leave empty for all tables in the dataset). │
+│ [default: None] │
+│ --bigquery-project TEXT The bigquery project id. [default: None] │
+│ --bigquery-dataset TEXT The bigquery dataset id. [default: None] │
+│ --bigquery-table TEXT List of table ids to import from the bigquery API (repeat for │
+│ multiple table ids, leave empty for all tables in the dataset). │
+│ [default: None] │
+│ --help Show this message and exit. │
+╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
 ```
 
-As shown, some options are only relevant in certain conditions: For `format` Bigtable we support to directly read off the Bigtable APIs.
-In this case there's no need to specify `source` but instead `bt-project-id`, `bt-dataset-id` and `table` must be specified.
-
-For providing authentication to the Client, please see [the google documentation](https://cloud.google.com/docs/authentication/provide-credentials-adc#how-to) or the one [about authorizing client libraries](https://cloud.google.com/bigquery/docs/authentication#client-libs).
-
 Example:
 ```bash
 # Example import from SQL DDL
 datacontract import --format sql --source my_ddl.sql
 ```
 
-```bash
-# Example import from Bigquery JSON
-datacontract import --format bigquery --source my_bigquery_table.json
-```
-
-```bash
-# Example import from Bigquery API
-datacontract import --format bigquery --btProjectId <project_id> --btDatasetId <dataset_id> --table <tableid_1> --table <tableid_2> --table <tableid_3>
-```
-
 Available import options:
 
 | Type | Description | Status |
@@ -852,6 +881,49 @@ Available import options:
 | Missing something? | Please create an issue on GitHub | TBD |
 
 
+#### BigQuery
+
+Bigquery data can either be imported off of JSON Files generated from the table descriptions or directly from the Bigquery API. In case you want to use JSON Files, specify the `source` parameter with a path to the JSON File.
+
+To import from the Bigquery API, you have to _omit_ `source` and instead need to provide `bigquery-project` and `bigquery-dataset`. Additionally you may specify `bigquery-table` to enumerate the tables that should be imported. If no tables are given, _all_ available tables of the dataset will be imported.
+
+For providing authentication to the Client, please see [the google documentation](https://cloud.google.com/docs/authentication/provide-credentials-adc#how-to) or the one [about authorizing client libraries](https://cloud.google.com/bigquery/docs/authentication#client-libs).
+
+Examples:
+
+```bash
+# Example import from Bigquery JSON
+datacontract import --format bigquery --source my_bigquery_table.json
+```
+
+```bash
+# Example import from Bigquery API with specifying the tables to import
+datacontract import --format bigquery --bigquery-project <project_id> --bigquery-dataset <dataset_id> --bigquery-table <tableid_1> --bigquery-table <tableid_2> --bigquery-table <tableid_3>
+```
+
+```bash
+# Example import from Bigquery API importing all tables in the dataset
+datacontract import --format bigquery --bigquery-project <project_id> --bigquery-dataset <dataset_id>
+```
+
+### Glue
+
+Importing from Glue reads the necessary Data directly off of the AWS API.
+You may give the `glue-table` parameter to enumerate the tables that should be imported. If no tables are given, _all_ available tables of the database will be imported.
+
+Examples:
+
+```bash
+# Example import from AWS Glue with specifying the tables to import
+datacontract import --format glue --source <database_name> --glue-table <table_name_1> --glue-table <table_name_2> --glue-table <table_name_3>
+```
+
+```bash
+# Example import from AWS Glue importing all tables in the database
+datacontract import --format glue --source <database_name>
+```
+
+
 ### breaking
 
 ```
{datacontract_cli-0.10.5.dist-info → datacontract_cli-0.10.7.dist-info}/RECORD
CHANGED
@@ -1,8 +1,8 @@
 datacontract/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datacontract/cli.py,sha256
-datacontract/data_contract.py,sha256=
+datacontract/cli.py,sha256=-PFT-P03aiT5RlodXPGBKVqz2etlnnzMLoNqDrsOXKE,12767
+datacontract/data_contract.py,sha256=bsvf-_nSFKStcPPdKkSprD4KUecON4EBIk2dvwbkxWw,21444
 datacontract/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datacontract/web.py,sha256=
+datacontract/web.py,sha256=toFF9L3ueToMIa5v0BjS-PyI-w1htNlIBV8O8QLrigM,1834
 datacontract/breaking/breaking.py,sha256=l0ZwUX8G4QK_0nLRLJHArpwgTWTD7WQYMuAp3l_y1bY,12184
 datacontract/breaking/breaking_rules.py,sha256=Qj7XbsyD-BynNwhrNiETTk7_Hs_tHrtrZmSU634zEJI,3157
 datacontract/catalog/catalog.py,sha256=GMBGZPVo5lGZQDmdWCwWb5-upXlUz6R6eTMg9xI9a3s,2640
@@ -13,16 +13,16 @@ datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py,sha256
 datacontract/engines/fastjsonschema/check_jsonschema.py,sha256=rXGfGDu9-RJomj9WcVe8vEfluR25vn2rOS7BeOVQ0XA,5748
 datacontract/engines/fastjsonschema/s3/s3_read_files.py,sha256=iupiyqBa1dzgT2BtVGna-BjC5rqe6MTLs2QRp8GTs7M,665
 datacontract/engines/soda/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datacontract/engines/soda/check_soda_execute.py,sha256=
+datacontract/engines/soda/check_soda_execute.py,sha256=zGIIkT-m65kP9_eQauHwfri1Poos6DkNx9Lr0_k6pu8,7456
 datacontract/engines/soda/connections/bigquery.py,sha256=Ao0KaJe4R28auU_4umxvVaLB6ZHEbKaNoYZ-RfAUmeo,662
 datacontract/engines/soda/connections/dask.py,sha256=Yy6Et2n_vDVsdjtqyBWDSZt7mnjPzPk_MZ-92VZHfnY,1496
 datacontract/engines/soda/connections/databricks.py,sha256=lpMju-o_TzLZeF0EEVwePPr8JahqvFnj5xRYjF15fc8,561
 datacontract/engines/soda/connections/duckdb.py,sha256=2wzUWnK7CLi7EJTT2Mh8Arv2pg6XGToe_9DdvLu0cNY,5585
-datacontract/engines/soda/connections/kafka.py,sha256=
+datacontract/engines/soda/connections/kafka.py,sha256=vh7z-4ZsmpXiYcogf3oTkagrAPcq6HG2SccnxNwFeVQ,5635
 datacontract/engines/soda/connections/postgres.py,sha256=9GTF4Es3M5vb7ocSGqAxXmslvkS5CjsPQGIuo020CFc,626
 datacontract/engines/soda/connections/snowflake.py,sha256=y1t2a1DWY4_tr5k-X5_nhLE6v1rfCwTahzhtHR91x9A,719
 datacontract/engines/soda/connections/sqlserver.py,sha256=RzGLbCUdRyfmDcqtM_AB9WZ-Xk-XYX91nkXpVNpYbvc,1440
-datacontract/export/avro_converter.py,sha256
+datacontract/export/avro_converter.py,sha256=KC4TLeacHSXDeEPGEuUO8wvMH5cwoSOLuep5VCBNmww,3320
 datacontract/export/avro_idl_converter.py,sha256=_2acoImuBqNqEt97OpBSewWT_w3aDBOdNosuy0gbkSY,9576
 datacontract/export/bigquery_converter.py,sha256=XSVX7aVqyhBrOI-_BiPz9gtZXoT6wd5XucHaoJfWOCo,3802
 datacontract/export/csv_type_converter.py,sha256=ZZuJwBgQnafZC7PPvAXsBf2IajPJq8TYZ1l8Qq0GYeI,1290
@@ -42,7 +42,7 @@ datacontract/export/sql_type_converter.py,sha256=DSIyBhRxU-Jo8NihwozE9Q_CZauBCoY
 datacontract/export/terraform_converter.py,sha256=-xIIspVrvCyB2AVf1vd7bVGkWI3iiMUHX1btM_o1h-g,1943
 datacontract/imports/avro_importer.py,sha256=3QTnGNps-g1dxnJjLOLr8vk64jRNebHgN1EHrdcMiXc,5559
 datacontract/imports/bigquery_importer.py,sha256=HLotmmwCSe2sGBCI57gPQa3WyeTHA8h1yaDxp25TtLQ,6802
-datacontract/imports/glue_importer.py,sha256=
+datacontract/imports/glue_importer.py,sha256=HpS5E7774YqjF4hUItgtb5NVwA6OTwwjQbf-IyEejS8,5331
 datacontract/imports/jsonschema_importer.py,sha256=f_x8DbWo423b6xcSIvY7jbk_rs2FM4lMMF75fSRE3sk,6329
 datacontract/imports/sql_importer.py,sha256=tCSayA0YK_zr_R_KElfz0GOQwP0Tzz3TdBHAICnPN84,2419
 datacontract/init/download_datacontract_file.py,sha256=pj_4mhWKlEtfueWohDgkb1nyuG5ERDipUDszxKwpZUs,413
@@ -76,9 +76,9 @@ datacontract/templates/partials/example.html,sha256=F1dWbHDIXQScgfs4OVgqM1lR4uV4
 datacontract/templates/partials/model_field.html,sha256=Xy8fauErx61HuTSPLLQHgAHfX3_ilRbE0c17-nX9jAY,5072
 datacontract/templates/partials/server.html,sha256=CINXVsdVAzDzrABtBOw4lD2qBUWb8kwcOsgZi_y8ZeU,4505
 datacontract/templates/style/output.css,sha256=lfmd1Kmrtq8F5entx2o-yy9XOAZN3WkWMKNvbEtSO1k,24999
-datacontract_cli-0.10.
-datacontract_cli-0.10.
-datacontract_cli-0.10.
-datacontract_cli-0.10.
-datacontract_cli-0.10.
-datacontract_cli-0.10.
+datacontract_cli-0.10.7.dist-info/LICENSE,sha256=23h64qnSeIZ0DKeziWAKC-zBCt328iSbRbWBrXoYRb4,2210
+datacontract_cli-0.10.7.dist-info/METADATA,sha256=-S9oqCD7uZWLX30vxB6LThbZo8Ocv3hlEWUbTa_CBMI,70579
+datacontract_cli-0.10.7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+datacontract_cli-0.10.7.dist-info/entry_points.txt,sha256=D3Eqy4q_Z6bHauGd4ppIyQglwbrm1AJnLau4Ppbw9Is,54
+datacontract_cli-0.10.7.dist-info/top_level.txt,sha256=VIRjd8EIUrBYWjEXJJjtdUgc0UAJdPZjmLiOR8BRBYM,13
+datacontract_cli-0.10.7.dist-info/RECORD,,
{datacontract_cli-0.10.5.dist-info → datacontract_cli-0.10.7.dist-info}/LICENSE
File without changes
{datacontract_cli-0.10.5.dist-info → datacontract_cli-0.10.7.dist-info}/WHEEL
File without changes
{datacontract_cli-0.10.5.dist-info → datacontract_cli-0.10.7.dist-info}/entry_points.txt
File without changes
{datacontract_cli-0.10.5.dist-info → datacontract_cli-0.10.7.dist-info}/top_level.txt
File without changes