datacontract-cli 0.10.18__py3-none-any.whl → 0.10.20__py3-none-any.whl
This diff shows the content of publicly released package versions as published to their respective public registries. It is provided for informational purposes only.
This version of datacontract-cli has been flagged as potentially problematic.
- datacontract/cli.py +22 -30
- datacontract/data_contract.py +7 -8
- datacontract/engines/soda/connections/duckdb.py +22 -9
- datacontract/export/data_caterer_converter.py +20 -7
- datacontract/export/sodacl_converter.py +21 -4
- datacontract/export/sql_type_converter.py +7 -2
- datacontract/imports/csv_importer.py +89 -0
- datacontract/imports/importer.py +1 -0
- datacontract/imports/importer_factory.py +5 -0
- datacontract/init/init_template.py +20 -0
- datacontract/integration/datamesh_manager.py +5 -10
- datacontract/lint/linters/field_reference_linter.py +10 -1
- datacontract/lint/resolve.py +22 -1
- datacontract/lint/schema.py +10 -3
- datacontract/lint/urls.py +9 -5
- datacontract/model/data_contract_specification.py +2 -0
- datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
- datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
- datacontract/templates/datacontract.html +20 -1
- datacontract/templates/partials/definition.html +15 -5
- datacontract/templates/partials/model_field.html +9 -0
- datacontract/web.py +170 -36
- {datacontract_cli-0.10.18.dist-info → datacontract_cli-0.10.20.dist-info}/METADATA +448 -297
- {datacontract_cli-0.10.18.dist-info → datacontract_cli-0.10.20.dist-info}/RECORD +29 -25
- datacontract/init/download_datacontract_file.py +0 -17
- {datacontract_cli-0.10.18.dist-info → datacontract_cli-0.10.20.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.18.dist-info → datacontract_cli-0.10.20.dist-info}/WHEEL +0 -0
- {datacontract_cli-0.10.18.dist-info → datacontract_cli-0.10.20.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.18.dist-info → datacontract_cli-0.10.20.dist-info}/top_level.txt +0 -0
datacontract/cli.py
CHANGED
@@ -1,9 +1,9 @@
+import os
 from importlib import metadata
 from pathlib import Path
 from typing import Iterable, List, Optional

 import typer
-import uvicorn
 from click import Context
 from rich import box
 from rich.console import Console
@@ -11,19 +11,14 @@ from rich.table import Table
 from typer.core import TyperGroup
 from typing_extensions import Annotated

-from datacontract import web
 from datacontract.catalog.catalog import create_data_contract_html, create_index_html
 from datacontract.data_contract import DataContract, ExportFormat
 from datacontract.imports.importer import ImportFormat
-from datacontract.init.download_datacontract_file import (
-    FileExistsException,
-    download_datacontract_file,
-)
+from datacontract.init.init_template import get_init_template
 from datacontract.integration.datamesh_manager import (
     publish_data_contract_to_datamesh_manager,
 )
-
-DEFAULT_DATA_CONTRACT_SCHEMA_URL = "https://datacontract.com/datacontract.schema.json"
+from datacontract.lint.resolve import resolve_data_contract_dict

 console = Console()

@@ -70,24 +65,21 @@ def common(
 @app.command()
 def init(
     location: Annotated[
-        str,
-        typer.Argument(help="The location (url or path) of the data contract yaml to create."),
+        str, typer.Argument(help="The location of the data contract file to create.")
     ] = "datacontract.yaml",
-    template: Annotated[
-        str, typer.Option(help="URL of a template or data contract")
-    ] = "https://datacontract.com/datacontract.init.yaml",
+    template: Annotated[str, typer.Option(help="URL of a template or data contract")] = None,
     overwrite: Annotated[bool, typer.Option(help="Replace the existing datacontract.yaml")] = False,
 ):
     """
-    Download a datacontract.yaml template and write it to file.
+    Create an empty data contract.
     """
-    try:
-        download_datacontract_file(location, template, overwrite)
-    except FileExistsException:
+    if not overwrite and os.path.exists(location):
         console.print("File already exists, use --overwrite to overwrite")
         raise typer.Exit(code=1)
-    else:
-        console.print("📄 data contract written to " + location)
+    template_str = get_init_template(template)
+    with open(location, "w") as f:
+        f.write(template_str)
+    console.print("📄 data contract written to " + location)


 @app.command()
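The reworked init no longer fetches https://datacontract.com/datacontract.init.yaml by default. A minimal sketch of the new behavior via typer's test runner (file name illustrative, datacontract-cli installed):

from typer.testing import CliRunner

from datacontract.cli import app

runner = CliRunner()
result = runner.invoke(app, ["init", "datacontract.yaml"])  # writes the bundled template
result = runner.invoke(app, ["init", "datacontract.yaml"])  # runs again without --overwrite
assert result.exit_code == 1  # "File already exists, use --overwrite to overwrite"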
@@ -99,7 +91,7 @@ def lint(
     schema: Annotated[
         str,
         typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
-    ] = DEFAULT_DATA_CONTRACT_SCHEMA_URL,
+    ] = None,
 ):
     """
     Validate that the datacontract.yaml is correctly formatted.

@@ -117,7 +109,7 @@ def test(
     schema: Annotated[
         str,
         typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
-    ] = DEFAULT_DATA_CONTRACT_SCHEMA_URL,
+    ] = None,
     server: Annotated[
         str,
         typer.Option(
@@ -135,7 +127,7 @@ def test(
     logs: Annotated[bool, typer.Option(help="Print logs")] = False,
     ssl_verification: Annotated[
         bool,
-        typer.Option(help="SSL verification when publishing the test results."),
+        typer.Option(help="SSL verification when publishing the data contract."),
     ] = True,
 ):
     """
@@ -150,6 +142,7 @@ def test(
         publish_url=publish,
         server=server,
         examples=examples,
+        ssl_verification=ssl_verification,
     ).test()
     if logs:
         _print_logs(run)
@@ -197,7 +190,7 @@ def export(
     schema: Annotated[
         str,
         typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
-    ] = DEFAULT_DATA_CONTRACT_SCHEMA_URL,
+    ] = None,
     # TODO: this should be a subcommand
     engine: Annotated[
         Optional[str],

@@ -284,7 +277,7 @@ def import_(
     schema: Annotated[
         str,
         typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
-    ] = DEFAULT_DATA_CONTRACT_SCHEMA_URL,
+    ] = None,
 ):
     """
     Create a data contract from the given source location. Saves to file specified by `output` option if present, otherwise prints to stdout.

@@ -321,7 +314,7 @@ def publish(
     schema: Annotated[
         str,
         typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
-    ] = DEFAULT_DATA_CONTRACT_SCHEMA_URL,
+    ] = None,
     ssl_verification: Annotated[
         bool,
         typer.Option(help="SSL verification when publishing the data contract."),
@@ -331,9 +324,7 @@ def publish(
     Publish the data contract to the Data Mesh Manager.
     """
     publish_data_contract_to_datamesh_manager(
-        data_contract_specification=DataContract(
-            data_contract_file=location, schema_location=schema
-        ).get_data_contract_specification(),
+        data_contract_dict=resolve_data_contract_dict(location),
         ssl_verification=ssl_verification,
     )

@@ -350,7 +341,7 @@ def catalog(
     schema: Annotated[
         str,
         typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
-    ] = DEFAULT_DATA_CONTRACT_SCHEMA_URL,
+    ] = None,
 ):
     """
     Create an html catalog of data contracts.
@@ -449,8 +440,9 @@ def serve(
     """
     Start the datacontract web server.
     """
+    import uvicorn

-    uvicorn.run(web.app, port=port, host=host)
+    uvicorn.run("datacontract.web:app", port=port, host=host, reload=True)


 def _handle_result(run):
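uvicorn is now imported lazily inside serve, so the CLI starts without loading the web dependencies, and the app is passed as an import string, which uvicorn requires for reload=True. A minimal equivalent of the new call (port and host values illustrative):

import uvicorn

# The string target lets uvicorn's reloader re-import datacontract.web on changes.
uvicorn.run("datacontract.web:app", port=4242, host="127.0.0.1", reload=True)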
datacontract/data_contract.py
CHANGED
@@ -22,6 +22,7 @@ from datacontract.engines.soda.check_soda_execute import check_soda_execute
 from datacontract.export.exporter import ExportFormat
 from datacontract.export.exporter_factory import exporter_factory
 from datacontract.imports.importer_factory import importer_factory
+from datacontract.init.init_template import get_init_template
 from datacontract.integration.datamesh_manager import publish_test_results_to_datamesh_manager
 from datacontract.lint import resolve
 from datacontract.lint.linters.description_linter import DescriptionLinter
@@ -36,8 +37,6 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
 from datacontract.model.exceptions import DataContractException
 from datacontract.model.run import Check, Run

-DEFAULT_DATA_CONTRACT_TEMPLATE_URL = "https://datacontract.com/datacontract.init.yaml"
-

 class DataContract:
     def __init__(
@@ -52,6 +51,7 @@ class DataContract:
         spark: "SparkSession" = None,
         inline_definitions: bool = True,
         inline_quality: bool = True,
+        ssl_verification: bool = True,
     ):
         self._data_contract_file = data_contract_file
         self._data_contract_str = data_contract_str

@@ -63,6 +63,7 @@ class DataContract:
         self._spark = spark
         self._inline_definitions = inline_definitions
         self._inline_quality = inline_quality
+        self._ssl_verification = ssl_verification
         self.all_linters = {
             ExampleModelLinter(),
             QualityUsesSchemaLinter(),
@@ -74,10 +75,9 @@ class DataContract:
         }

     @classmethod
-    def init(
-        cls, template: str = DEFAULT_DATA_CONTRACT_TEMPLATE_URL, schema: str = None
-    ) -> DataContractSpecification:
-        return resolve.resolve_data_contract(data_contract_location=template, schema_location=schema)
+    def init(cls, template: typing.Optional[str], schema: typing.Optional[str] = None) -> DataContractSpecification:
+        template_str = get_init_template(template)
+        return resolve.resolve_data_contract(data_contract_str=template_str, schema_location=schema)

     def lint(self, enabled_linters: typing.Union[str, set[str]] = "all") -> Run:
         """Lint the data contract by deserializing the contract and checking the schema, as well as calling the configured linters.
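A sketch of the reworked classmethod, per the diff above: template=None now resolves to the bundled init template instead of the removed DEFAULT_DATA_CONTRACT_TEMPLATE_URL, so it works offline; a URL or local path still resolves through get_init_template:

from datacontract.data_contract import DataContract

spec = DataContract.init(template=None)  # bundled template, no network call
spec = DataContract.init(template="https://datacontract.com/datacontract.init.yaml")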
@@ -231,7 +231,7 @@ class DataContract:
         run.finish()

         if self._publish_url is not None:
-            publish_test_results_to_datamesh_manager(run, self._publish_url)
+            publish_test_results_to_datamesh_manager(run, self._publish_url, self._ssl_verification)

         return run
@@ -352,7 +352,6 @@ class DataContract:
         schema: typing.Optional[str] = None,
         **kwargs,
     ) -> DataContractSpecification:
-        template = DEFAULT_DATA_CONTRACT_TEMPLATE_URL if template is None else template
         data_contract_specification_initial = DataContract.init(template=template, schema=schema)

         return importer_factory.create(format).import_source(
datacontract/engines/soda/connections/duckdb.py
CHANGED

@@ -146,6 +146,7 @@ def setup_azure_connection(con, server):
     tenant_id = os.getenv("DATACONTRACT_AZURE_TENANT_ID")
     client_id = os.getenv("DATACONTRACT_AZURE_CLIENT_ID")
     client_secret = os.getenv("DATACONTRACT_AZURE_CLIENT_SECRET")
+    storage_account = server.storageAccount

     if tenant_id is None:
         raise ValueError("Error: Environment variable DATACONTRACT_AZURE_TENANT_ID is not set")

@@ -157,12 +158,24 @@ def setup_azure_connection(con, server):
     con.install_extension("azure")
     con.load_extension("azure")

-    con.sql(f"""
-    CREATE SECRET azure_spn (
-        TYPE AZURE,
-        PROVIDER SERVICE_PRINCIPAL,
-        TENANT_ID '{tenant_id}',
-        CLIENT_ID '{client_id}',
-        CLIENT_SECRET '{client_secret}'
-    );
-    """)
+    if storage_account is not None:
+        con.sql(f"""
+        CREATE SECRET azure_spn (
+            TYPE AZURE,
+            PROVIDER SERVICE_PRINCIPAL,
+            TENANT_ID '{tenant_id}',
+            CLIENT_ID '{client_id}',
+            CLIENT_SECRET '{client_secret}',
+            ACCOUNT_NAME '{storage_account}'
+        );
+        """)
+    else:
+        con.sql(f"""
+        CREATE SECRET azure_spn (
+            TYPE AZURE,
+            PROVIDER SERVICE_PRINCIPAL,
+            TENANT_ID '{tenant_id}',
+            CLIENT_ID '{client_id}',
+            CLIENT_SECRET '{client_secret}'
+        );
+        """)
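What the new branch sends to DuckDB when the server declares a storageAccount (credential values illustrative); the azure extension scopes the secret to that account via ACCOUNT_NAME:

import duckdb

con = duckdb.connect()
con.install_extension("azure")
con.load_extension("azure")
# Generated by setup_azure_connection when server.storageAccount is set
con.sql("""
    CREATE SECRET azure_spn (
        TYPE AZURE,
        PROVIDER SERVICE_PRINCIPAL,
        TENANT_ID 'my-tenant-id',
        CLIENT_ID 'my-client-id',
        CLIENT_SECRET 'my-client-secret',
        ACCOUNT_NAME 'mystorageaccount'
    );
""")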
datacontract/export/data_caterer_converter.py
CHANGED

@@ -42,11 +42,11 @@ def _to_data_caterer_generate_step(model_key, model_value: Model, server: Server
         "name": model_key,
         "type": _to_step_type(server),
         "options": _to_data_source_options(model_key, server),
-        "schema": {"fields": []},
+        "fields": [],
     }
     fields = _to_fields(model_value.fields)
     if fields:
-        step["schema"]["fields"] = fields
+        step["fields"] = fields
     return step


@@ -97,16 +97,29 @@ def _to_field(field_name: str, field: Field) -> dict:
     if new_type == "object" or new_type == "record" or new_type == "struct":
         # need to get nested field definitions
         nested_fields = _to_fields(field.fields)
-        dc_field["schema"] = {"fields": nested_fields}
+        dc_field["fields"] = nested_fields
+    elif new_type == "array":
+        if field.items is not None and field.items.type is not None:
+            dc_generator_opts["arrayType"] = _to_data_type(field.items.type)
+        else:
+            dc_generator_opts["arrayType"] = "string"

     if field.enum is not None and len(field.enum) > 0:
         dc_generator_opts["oneOf"] = field.enum
     if field.unique is not None and field.unique:
         dc_generator_opts["isUnique"] = field.unique
+    if field.primaryKey is not None and field.primaryKey:
+        dc_generator_opts["isPrimaryKey"] = field.primaryKey
     if field.minLength is not None:
-        dc_generator_opts["minLen"] = field.minLength
+        if field.type is not None and field.type == "array":
+            dc_generator_opts["arrayMinLen"] = field.minLength
+        else:
+            dc_generator_opts["minLen"] = field.minLength
     if field.maxLength is not None:
-        dc_generator_opts["maxLen"] = field.maxLength
+        if field.type is not None and field.type == "array":
+            dc_generator_opts["arrayMaxLen"] = field.maxLength
+        else:
+            dc_generator_opts["maxLen"] = field.maxLength
     if field.pattern is not None:
         dc_generator_opts["regex"] = field.pattern
     if field.minimum is not None:

@@ -115,7 +128,7 @@ def _to_field(field_name: str, field: Field) -> dict:
         dc_generator_opts["max"] = field.maximum

     if len(dc_generator_opts.keys()) > 0:
-        dc_field["generator"] = {"options": dc_generator_opts}
+        dc_field["options"] = dc_generator_opts
     return dc_field


@@ -124,7 +137,7 @@ def _to_data_type(data_type):
         return "double"
     elif data_type == "decimal" or data_type == "bigint":
         return "decimal"
-    elif data_type == "int":
+    elif data_type == "int" or data_type == "integer":
         return "integer"
     elif data_type == "long":
         return "long"
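Illustrative shape of a generated step after the move to a flat "fields" list; the model name and option values are invented, and the per-field generator options now sit under "options" directly:

step = {
    "name": "orders",
    "type": "csv",
    "options": {"path": "/data/orders.csv"},
    "fields": [
        {"name": "order_id", "type": "string", "options": {"isUnique": True, "isPrimaryKey": True}},
        {"name": "tags", "type": "array", "options": {"arrayType": "string", "arrayMaxLen": 5}},
    ],
}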
datacontract/export/sodacl_converter.py
CHANGED

@@ -30,6 +30,7 @@ def to_sodacl_yaml(

 def to_checks(model_key, model_value, server_type: str, check_types: bool):
     checks = []
+    model_name = to_model_name(model_key, model_value, server_type)
     fields = model_value.fields

     quote_field_name = server_type in ["postgres", "sqlserver"]

@@ -62,25 +63,41 @@ def to_checks(model_key, model_value, server_type: str, check_types: bool):
         if field.enum is not None and len(field.enum) > 0:
             checks.append(check_field_enum(field_name, field.enum, quote_field_name))
         if field.quality is not None and len(field.quality) > 0:
-            quality_list = check_quality_list(model_key, field_name, field.quality)
+            quality_list = check_quality_list(model_name, field_name, field.quality)
             if (quality_list is not None) and len(quality_list) > 0:
                 checks.append(quality_list)
         # TODO references: str = None
         # TODO format

     if model_value.quality is not None and len(model_value.quality) > 0:
-        quality_list = check_quality_list(model_key, None, model_value.quality)
+        quality_list = check_quality_list(model_name, None, model_value.quality)
         if (quality_list is not None) and len(quality_list) > 0:
             checks.append(quality_list)

-    checks_for_model_key = f"checks for {model_key}"
+    checks_for_model_key = f"checks for {model_name}"

     if quote_field_name:
-        checks_for_model_key = f'checks for "{model_key}"'
+        checks_for_model_key = f'checks for "{model_name}"'

     return checks_for_model_key, checks


+def to_model_name(model_key, model_value, server_type):
+    if server_type == "databricks":
+        if model_value.config is not None and "databricksTable" in model_value.config:
+            return model_value.config["databricksTable"]
+    if server_type == "snowflake":
+        if model_value.config is not None and "snowflakeTable" in model_value.config:
+            return model_value.config["snowflakeTable"]
+    if server_type == "sqlserver":
+        if model_value.config is not None and "sqlserverTable" in model_value.config:
+            return model_value.config["sqlserverTable"]
+    if server_type == "postgres" or server_type == "postgresql":
+        if model_value.config is not None and "postgresTable" in model_value.config:
+            return model_value.config["postgresTable"]
+    return model_key
+
+
 def check_field_is_present(field_name):
     return {
         "schema": {
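A quick sketch of the effect of to_model_name (SimpleNamespace stands in for the Model type, names invented): a server-specific table name in config now overrides the model key in the generated "checks for …" block:

from types import SimpleNamespace

model = SimpleNamespace(config={"snowflakeTable": "ORDERS_V1"})
assert to_model_name("orders", model, "snowflake") == "ORDERS_V1"  # config wins
assert to_model_name("orders", model, "databricks") == "orders"    # no matching config key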
datacontract/export/sql_type_converter.py
CHANGED

@@ -182,11 +182,16 @@ def convert_to_databricks(field: Field) -> None | str:
     if type.lower() in ["boolean"]:
         return "BOOLEAN"
     if type.lower() in ["object", "record", "struct"]:
-        return "STRUCT"
+        nested_fields = []
+        for nested_field_name, nested_field in field.fields.items():
+            nested_field_type = convert_to_databricks(nested_field)
+            nested_fields.append(f"{nested_field_name} {nested_field_type}")
+        return f"STRUCT<{', '.join(nested_fields)}>"
     if type.lower() in ["bytes"]:
         return "BINARY"
     if type.lower() in ["array"]:
-        return "ARRAY"
+        item_type = convert_to_databricks(field.items)
+        return f"ARRAY<{item_type}>"
     return None
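Illustrative result of the new recursion (field names invented): struct and array columns expand to full Databricks type strings instead of bare STRUCT/ARRAY placeholders:

from datacontract.export.sql_type_converter import convert_to_databricks
from datacontract.model.data_contract_specification import Field

address = Field(
    type="struct",
    fields={
        "street": Field(type="string"),
        "tags": Field(type="array", items=Field(type="string")),
    },
)
print(convert_to_databricks(address))  # STRUCT<street STRING, tags ARRAY<STRING>>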
datacontract/imports/csv_importer.py
ADDED

@@ -0,0 +1,89 @@
+import os
+
+import clevercsv
+
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import DataContractSpecification, Example, Field, Model, Server
+
+
+class CsvImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> DataContractSpecification:
+        return import_csv(data_contract_specification, self.import_format, source)
+
+
+def import_csv(data_contract_specification: DataContractSpecification, format: str, source: str):
+    include_example = False
+
+    # detect encoding and dialect
+    encoding = clevercsv.encoding.get_encoding(source)
+    with open(source, "r", newline="") as fp:
+        dialect = clevercsv.Sniffer().sniff(fp.read(10000))
+
+    # using auto detecting of the format and encoding
+    df = clevercsv.read_dataframe(source)
+
+    if data_contract_specification.models is None:
+        data_contract_specification.models = {}
+
+    # use the file name as table name
+    table_name = os.path.splitext(os.path.basename(source))[0]
+
+    if data_contract_specification.servers is None:
+        data_contract_specification.servers = {}
+
+    data_contract_specification.servers["production"] = Server(
+        type="local", path=source, format="csv", delimiter=dialect.delimiter
+    )
+
+    fields = {}
+    for column, dtype in df.dtypes.items():
+        field = Field()
+        field.type = map_type_from_pandas(dtype.name)
+        fields[column] = field
+
+    data_contract_specification.models[table_name] = Model(
+        type="table",
+        description=f"Csv file with encoding {encoding}",
+        fields=fields,
+    )
+
+    # multiline data is not correctly handled by yaml dump
+    if include_example:
+        if data_contract_specification.examples is None:
+            data_contract_specification.examples = []
+
+        # read first 10 lines with the detected encoding
+        with open(source, "r", encoding=encoding) as csvfile:
+            lines = csvfile.readlines()[:10]
+
+        data_contract_specification.examples.append(Example(type="csv", model=table_name, data="".join(lines)))
+
+    return data_contract_specification
+
+
+def map_type_from_pandas(sql_type: str):
+    if sql_type is None:
+        return None
+
+    sql_type_normed = sql_type.lower().strip()
+
+    if sql_type_normed == "object":
+        return "string"
+    elif sql_type_normed.startswith("str"):
+        return "string"
+    elif sql_type_normed.startswith("int"):
+        return "integer"
+    elif sql_type_normed.startswith("float"):
+        return "float"
+    elif sql_type_normed.startswith("bool"):
+        return "boolean"
+    elif sql_type_normed.startswith("timestamp"):
+        return "timestamp"
+    elif sql_type_normed == "datetime64":
+        return "date"
+    elif sql_type_normed == "timedelta[ns]":
+        return "timestamp_ntz"
+    else:
+        return "variant"
datacontract/imports/importer_factory.py
CHANGED

@@ -104,3 +104,8 @@ importer_factory.register_lazy_importer(
     module_path="datacontract.imports.parquet_importer",
     class_name="ParquetImporter",
 )
+importer_factory.register_lazy_importer(
+    name=ImportFormat.csv,
+    module_path="datacontract.imports.csv_importer",
+    class_name="CsvImporter",
+)
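With the registration in place, csv resolves like any other import format; the CsvImporter module is only loaded on first use:

from datacontract.imports.importer import ImportFormat
from datacontract.imports.importer_factory import importer_factory

importer = importer_factory.create(ImportFormat.csv)  # lazily imports CsvImporter
# CLI equivalent (flags assumed from the existing import command):
#   datacontract import --format csv --source orders.csv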
datacontract/init/init_template.py
ADDED

@@ -0,0 +1,20 @@
+import importlib.resources as resources
+import logging
+
+import requests
+
+DEFAULT_DATA_CONTRACT_INIT_TEMPLATE = "datacontract-1.1.0.init.yaml"
+
+
+def get_init_template(location: str = None) -> str:
+    if location is None:
+        logging.info("Use default bundled template " + DEFAULT_DATA_CONTRACT_INIT_TEMPLATE)
+        schemas = resources.files("datacontract")
+        template = schemas.joinpath("schemas", DEFAULT_DATA_CONTRACT_INIT_TEMPLATE)
+        with template.open("r") as file:
+            return file.read()
+    elif location.startswith("http://") or location.startswith("https://"):
+        return requests.get(location).text
+    else:
+        with open(location, "r") as file:
+            return file.read()
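The three resolution paths of get_init_template in one sketch (URL and local path illustrative):

from datacontract.init.init_template import get_init_template

get_init_template()  # bundled datacontract-1.1.0.init.yaml, works offline
get_init_template("https://datacontract.com/datacontract.init.yaml")  # fetched via HTTP
get_init_template("./my-template.yaml")  # read from the local filesystem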
datacontract/integration/datamesh_manager.py
CHANGED

@@ -2,11 +2,10 @@ import os

 import requests

-from datacontract.model.data_contract_specification import DataContractSpecification
 from datacontract.model.run import Run


-def publish_test_results_to_datamesh_manager(run: Run, publish_url: str):
+def publish_test_results_to_datamesh_manager(run: Run, publish_url: str, ssl_verification: bool):
     try:
         if publish_url is None:
             # this url supports Data Mesh Manager and Data Contract Manager

@@ -32,7 +31,7 @@ def publish_test_results_to_datamesh_manager(run: Run, publish_url: str):
             url,
             data=request_body,
             headers=headers,
-            verify=True,
+            verify=ssl_verification,
         )
         # print("Status Code:", response.status_code)
         # print("Response Body:", response.text)

@@ -44,9 +43,7 @@ def publish_test_results_to_datamesh_manager(run: Run, publish_url: str):
         run.log_error(f"Failed publishing test results. Error: {str(e)}")


-def publish_data_contract_to_datamesh_manager(
-    data_contract_specification: DataContractSpecification, ssl_verification: bool
-):
+def publish_data_contract_to_datamesh_manager(data_contract_dict: dict, ssl_verification: bool):
     try:
         api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
         host = "https://api.datamesh-manager.com"

@@ -59,13 +56,11 @@ def publish_data_contract_to_datamesh_manager(
             "Cannot publish data contract, as neither DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY is set"
         )
         headers = {"Content-Type": "application/json", "x-api-key": api_key}
-        spec = data_contract_specification
-        id = spec.id
+        id = data_contract_dict["id"]
         url = f"{host}/api/datacontracts/{id}"
-        request_body = spec.model_dump_json().encode("utf-8")
         response = requests.put(
             url=url,
-            data=request_body,
+            json=data_contract_dict,
             headers=headers,
             verify=ssl_verification,
         )
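Sketch of the new publish path (API key and file name illustrative): the resolved YAML dict is sent as JSON, which presumably also keeps custom fields that a model_dump_json round trip through the pydantic model would drop:

import os

from datacontract.integration.datamesh_manager import publish_data_contract_to_datamesh_manager
from datacontract.lint.resolve import resolve_data_contract_dict

os.environ["DATAMESH_MANAGER_API_KEY"] = "..."  # illustrative
publish_data_contract_to_datamesh_manager(
    data_contract_dict=resolve_data_contract_dict("datacontract.yaml"),
    ssl_verification=True,
)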
datacontract/lint/linters/field_reference_linter.py
CHANGED

@@ -22,7 +22,16 @@ class FieldReferenceLinter(Linter):
         for model_name, model in contract.models.items():
             for field_name, field in model.fields.items():
                 if field.references:
-                    (ref_model, ref_field) = field.references.split(".")
+                    reference_hierarchy = field.references.split(".")
+                    if len(reference_hierarchy) != 2:
+                        result = result.with_error(
+                            f"Field '{field_name}' in model '{model_name}'"
+                            f" references must follow the model.field syntax and refer to a field in a model in this data contract."
+                        )
+                        continue
+                    ref_model = reference_hierarchy[0]
+                    ref_field = reference_hierarchy[1]

                     if ref_model not in contract.models:
                         result = result.with_error(
                             f"Field '{field_name}' in model '{model_name}'"
datacontract/lint/resolve.py
CHANGED
@@ -44,6 +44,27 @@ def resolve_data_contract(
     )


+def resolve_data_contract_dict(
+    data_contract_location: str = None,
+    data_contract_str: str = None,
+    data_contract: DataContractSpecification = None,
+) -> dict:
+    if data_contract_location is not None:
+        return _to_yaml(read_resource(data_contract_location))
+    elif data_contract_str is not None:
+        return _to_yaml(data_contract_str)
+    elif data_contract is not None:
+        return data_contract.model_dump()
+    else:
+        raise DataContractException(
+            type="lint",
+            result="failed",
+            name="Check that data contract YAML is valid",
+            reason="Data contract needs to be provided",
+            engine="datacontract",
+        )
+
+
 def resolve_data_contract_from_location(
     location, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False
 ) -> DataContractSpecification:

@@ -231,7 +252,7 @@ def _resolve_data_contract_from_str(
     return spec


-def _to_yaml(data_contract_str):
+def _to_yaml(data_contract_str) -> dict:
     try:
         yaml_dict = yaml.safe_load(data_contract_str)
         return yaml_dict
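The three input channels of resolve_data_contract_dict, checked in this order (values illustrative); with none given, a DataContractException is raised:

from datacontract.lint.resolve import resolve_data_contract_dict

d = resolve_data_contract_dict(data_contract_location="datacontract.yaml")  # path or URL
d = resolve_data_contract_dict(data_contract_str="dataContractSpecification: 1.1.0\nid: my-contract")
# resolve_data_contract_dict(data_contract=spec)  # an already-parsed DataContractSpecification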
datacontract/lint/schema.py
CHANGED
@@ -1,4 +1,6 @@
+import importlib.resources as resources
 import json
+import logging
 import os
 from typing import Any, Dict

@@ -6,6 +8,8 @@ import requests

 from datacontract.model.exceptions import DataContractException

+DEFAULT_DATA_CONTRACT_SCHEMA = "datacontract-1.1.0.schema.json"
+

 def fetch_schema(location: str = None) -> Dict[str, Any]:
     """

@@ -27,9 +31,12 @@ def fetch_schema(location: str = None) -> Dict[str, Any]:

     """
     if location is None:
-        location = "https://datacontract.com/datacontract.schema.json"
-
-    if location.startswith("http://") or location.startswith("https://"):
+        logging.info("Use default bundled schema " + DEFAULT_DATA_CONTRACT_SCHEMA)
+        schemas = resources.files("datacontract")
+        schema_file = schemas.joinpath("schemas", DEFAULT_DATA_CONTRACT_SCHEMA)
+        with schema_file.open("r") as file:
+            schema = json.load(file)
+    elif location.startswith("http://") or location.startswith("https://"):
         response = requests.get(location)
         schema = response.json()
     else: