datacontract-cli 0.10.6__py3-none-any.whl → 0.10.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This release has been flagged as potentially problematic.
- datacontract/cli.py +26 -24
- datacontract/data_contract.py +69 -152
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +13 -1
- datacontract/engines/soda/check_soda_execute.py +11 -0
- datacontract/engines/soda/connections/bigquery.py +8 -1
- datacontract/engines/soda/connections/kafka.py +3 -0
- datacontract/export/__init__.py +0 -0
- datacontract/export/avro_converter.py +28 -21
- datacontract/export/avro_idl_converter.py +29 -22
- datacontract/export/bigquery_converter.py +15 -0
- datacontract/export/dbml_converter.py +9 -0
- datacontract/export/dbt_converter.py +26 -1
- datacontract/export/exporter.py +87 -0
- datacontract/export/exporter_factory.py +52 -0
- datacontract/export/go_converter.py +6 -0
- datacontract/export/great_expectations_converter.py +10 -0
- datacontract/export/html_export.py +6 -0
- datacontract/export/jsonschema_converter.py +24 -16
- datacontract/export/odcs_converter.py +24 -1
- datacontract/export/protobuf_converter.py +6 -0
- datacontract/export/pydantic_converter.py +6 -0
- datacontract/export/rdf_converter.py +9 -0
- datacontract/export/sodacl_converter.py +7 -1
- datacontract/export/sql_converter.py +32 -2
- datacontract/export/sql_type_converter.py +4 -5
- datacontract/export/terraform_converter.py +6 -0
- datacontract/imports/bigquery_importer.py +30 -4
- datacontract/imports/glue_importer.py +13 -3
- datacontract/imports/odcs_importer.py +192 -0
- datacontract/imports/unity_importer.py +138 -0
- datacontract/model/data_contract_specification.py +2 -0
- datacontract/templates/partials/server.html +64 -32
- datacontract/templates/style/output.css +9 -0
- datacontract/web.py +56 -2
- {datacontract_cli-0.10.6.dist-info → datacontract_cli-0.10.8.dist-info}/METADATA +232 -96
- {datacontract_cli-0.10.6.dist-info → datacontract_cli-0.10.8.dist-info}/RECORD +40 -35
- {datacontract_cli-0.10.6.dist-info → datacontract_cli-0.10.8.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.6.dist-info → datacontract_cli-0.10.8.dist-info}/WHEEL +0 -0
- {datacontract_cli-0.10.6.dist-info → datacontract_cli-0.10.8.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.6.dist-info → datacontract_cli-0.10.8.dist-info}/top_level.txt +0 -0
datacontract/cli.py
CHANGED
@@ -5,6 +5,7 @@ from typing import Iterable, Optional
 from typing import List
 
 import typer
+import uvicorn
 from click import Context
 from rich import box
 from rich.console import Console
@@ -12,8 +13,9 @@ from rich.table import Table
 from typer.core import TyperGroup
 from typing_extensions import Annotated
 
+from datacontract import web
 from datacontract.catalog.catalog import create_index_html, create_data_contract_html
-from datacontract.data_contract import DataContract
+from datacontract.data_contract import DataContract, ExportFormat
 from datacontract.init.download_datacontract_file import download_datacontract_file, FileExistsException
 from datacontract.publish.publish import publish_to_datamesh_manager
 
@@ -141,28 +143,6 @@ def test(
     _handle_result(run)
 
 
-class ExportFormat(str, Enum):
-    jsonschema = "jsonschema"
-    pydantic_model = "pydantic-model"
-    sodacl = "sodacl"
-    dbt = "dbt"
-    dbt_sources = "dbt-sources"
-    dbt_staging_sql = "dbt-staging-sql"
-    odcs = "odcs"
-    rdf = "rdf"
-    avro = "avro"
-    protobuf = "protobuf"
-    great_expectations = "great-expectations"
-    terraform = "terraform"
-    avro_idl = "avro-idl"
-    sql = "sql"
-    sql_query = "sql-query"
-    html = "html"
-    go = "go"
-    bigquery = "bigquery"
-    dbml = "dbml"
-
-
 @app.command()
 def export(
     format: Annotated[ExportFormat, typer.Option(help="The export format.")],
@@ -205,6 +185,7 @@ def export(
     result = DataContract(data_contract_file=location, server=server).export(
         export_format=format,
         model=model,
+        server=server,
        rdf_base=rdf_base,
         sql_server_type=sql_server_type,
     )
@@ -223,6 +204,8 @@ class ImportFormat(str, Enum):
    glue = "glue"
    bigquery = "bigquery"
    jsonschema = "jsonschema"
+    odcs="odcs"
+    unity = "unity"
 
 
 @app.command(name="import")
@@ -231,6 +214,12 @@ def import_(
    source: Annotated[
        Optional[str], typer.Option(help="The path to the file or Glue Database that should be imported.")
    ] = None,
+    glue_table: Annotated[
+        Optional[List[str]],
+        typer.Option(
+            help="List of table ids to import from the Glue Database (repeat for multiple table ids, leave empty for all tables in the dataset)."
+        ),
+    ] = None,
    bigquery_project: Annotated[Optional[str], typer.Option(help="The bigquery project id.")] = None,
    bigquery_dataset: Annotated[Optional[str], typer.Option(help="The bigquery dataset id.")] = None,
    bigquery_table: Annotated[
@@ -239,11 +228,12 @@ def import_(
            help="List of table ids to import from the bigquery API (repeat for multiple table ids, leave empty for all tables in the dataset)."
        ),
    ] = None,
+    unity_table_full_name: Annotated[Optional[str], typer.Option(help="Full name of a table in the unity catalog")] = None,
 ):
    """
    Create a data contract from the given source location. Prints to stdout.
    """
-    result = DataContract().import_from_source(format, source, bigquery_table, bigquery_project, bigquery_dataset)
+    result = DataContract().import_from_source(format, source, glue_table, bigquery_table, bigquery_project, bigquery_dataset, unity_table_full_name)
    console.print(result.to_yaml())
 
 
@@ -339,6 +329,18 @@ def diff(
    console.print(result.changelog_str())
 
 
+@app.command()
+def serve(
+    port: Annotated[int, typer.Option(help="Bind socket to this port.")] = 4242,
+    host: Annotated[str, typer.Option(help="Bind socket to this host.")] = "127.0.0.1",
+):
+    """
+    Start the datacontract web server.
+    """
+
+    uvicorn.run(web.app, port=port, host=host)
+
+
 def _handle_result(run):
    _print_table(run)
    if run.result == "passed":
datacontract/data_contract.py
CHANGED
@@ -12,27 +12,15 @@ from datacontract.engines.datacontract.check_that_datacontract_contains_valid_se
 )
 from datacontract.engines.fastjsonschema.check_jsonschema import check_jsonschema
 from datacontract.engines.soda.check_soda_execute import check_soda_execute
-from datacontract.export.
-from datacontract.export.
-from datacontract.export.bigquery_converter import to_bigquery_json
-from datacontract.export.dbml_converter import to_dbml_diagram
-from datacontract.export.dbt_converter import to_dbt_models_yaml, to_dbt_sources_yaml, to_dbt_staging_sql
-from datacontract.export.go_converter import to_go_types
-from datacontract.export.great_expectations_converter import to_great_expectations
-from datacontract.export.html_export import to_html
-from datacontract.export.jsonschema_converter import to_jsonschema_json
-from datacontract.export.odcs_converter import to_odcs_yaml
-from datacontract.export.protobuf_converter import to_protobuf
-from datacontract.export.pydantic_converter import to_pydantic_model_str
-from datacontract.export.rdf_converter import to_rdf_n3
-from datacontract.export.sodacl_converter import to_sodacl_yaml
-from datacontract.export.sql_converter import to_sql_ddl, to_sql_query
-from datacontract.export.terraform_converter import to_terraform
+from datacontract.export.exporter import ExportFormat
+from datacontract.export.exporter_factory import exporter_factory
 from datacontract.imports.avro_importer import import_avro
 from datacontract.imports.bigquery_importer import import_bigquery_from_api, import_bigquery_from_json
 from datacontract.imports.glue_importer import import_glue
 from datacontract.imports.jsonschema_importer import import_jsonschema
+from datacontract.imports.odcs_importer import import_odcs
 from datacontract.imports.sql_importer import import_sql
+from datacontract.imports.unity_importer import import_unity_from_json, import_unity_from_api
 from datacontract.integration.publish_datamesh_manager import publish_datamesh_manager
 from datacontract.integration.publish_opentelemetry import publish_opentelemetry
 from datacontract.lint import resolve
@@ -184,6 +172,9 @@ class DataContract:
            if self._examples:
                server_name = "examples"
                server = self._get_examples_server(data_contract, run, tmp_dir)
+            elif self._server:
+                server_name = self._server
+                server = data_contract.servers.get(server_name)
            else:
                server_name = list(data_contract.servers.keys())[0]
                server = data_contract.servers.get(server_name)
@@ -195,10 +186,13 @@ class DataContract:
            run.outputPortId = server.outputPortId
            run.server = server_name
 
-            #
-
+            # TODO check server is supported type for nicer error messages
+
+            # TODO check server credentials are complete for nicer error messages
+
            if server.format == "json" and server.type != "kafka":
                check_jsonschema(run, data_contract, server)
+
            check_soda_execute(run, data_contract, server, self._spark, tmp_dir)
 
        except DataContractException as e:
@@ -234,6 +228,38 @@ class DataContract:
 
        return run
 
+    def _get_examples_server(self, data_contract, run, tmp_dir):
+        run.log_info(f"Copying examples to files in temporary directory {tmp_dir}")
+        format = "json"
+        for example in data_contract.examples:
+            format = example.type
+            p = f"{tmp_dir}/{example.model}.{format}"
+            run.log_info(f"Creating example file {p}")
+            with open(p, "w") as f:
+                content = ""
+                if format == "json" and isinstance(example.data, list):
+                    content = json.dumps(example.data)
+                elif format == "json" and isinstance(example.data, str):
+                    content = example.data
+                elif format == "yaml" and isinstance(example.data, list):
+                    content = yaml.dump(example.data, allow_unicode=True)
+                elif format == "yaml" and isinstance(example.data, str):
+                    content = example.data
+                elif format == "csv":
+                    content = example.data
+                logging.debug(f"Content of example file {p}: {content}")
+                f.write(content)
+        path = f"{tmp_dir}" + "/{model}." + format
+        delimiter = "array"
+        server = Server(
+            type="local",
+            path=path,
+            format=format,
+            delimiter=delimiter,
+        )
+        run.log_info(f"Using {server} for testing the examples")
+        return server
+
    def breaking(self, other: "DataContract") -> BreakingChanges:
        return self.changelog(other, include_severities=[Severity.ERROR, Severity.WARNING])
@@ -275,7 +301,13 @@ class DataContract:
            inline_quality=self._inline_quality,
        )
 
-    def export(
+    def export(
+        self,
+        export_format: ExportFormat,
+        model: str = "all",
+        sql_server_type: str = "auto",
+        **kwargs,
+    ) -> str:
        data_contract = resolve.resolve_data_contract(
            self._data_contract_file,
            self._data_contract_str,
@@ -283,148 +315,24 @@ class DataContract:
            inline_definitions=True,
            inline_quality=True,
        )
-        if export_format == "jsonschema":
-            model_name, model_value = self._check_models_for_export(data_contract, model, export_format)
-            return to_jsonschema_json(model_name, model_value)
-        if export_format == "sodacl":
-            return to_sodacl_yaml(data_contract)
-        if export_format == "dbt":
-            return to_dbt_models_yaml(data_contract)
-        if export_format == "dbt-sources":
-            return to_dbt_sources_yaml(data_contract, self._server)
-        if export_format == "dbt-staging-sql":
-            model_name, model_value = self._check_models_for_export(data_contract, model, export_format)
-            return to_dbt_staging_sql(data_contract, model_name, model_value)
-        if export_format == "odcs":
-            return to_odcs_yaml(data_contract)
-        if export_format == "rdf":
-            return to_rdf_n3(data_contract, rdf_base)
-        if export_format == "protobuf":
-            return to_protobuf(data_contract)
-        if export_format == "avro":
-            model_name, model_value = self._check_models_for_export(data_contract, model, export_format)
-            return to_avro_schema_json(model_name, model_value)
-        if export_format == "avro-idl":
-            return to_avro_idl(data_contract)
-        if export_format == "terraform":
-            return to_terraform(data_contract)
-        if export_format == "sql":
-            server_type = self._determine_sql_server_type(data_contract, sql_server_type)
-            return to_sql_ddl(data_contract, server_type=server_type)
-        if export_format == "sql-query":
-            model_name, model_value = self._check_models_for_export(data_contract, model, export_format)
-            server_type = self._determine_sql_server_type(data_contract, sql_server_type)
-            return to_sql_query(data_contract, model_name, model_value, server_type)
-        if export_format == "great-expectations":
-            model_name, model_value = self._check_models_for_export(data_contract, model, export_format)
-            return to_great_expectations(data_contract, model_name)
-        if export_format == "pydantic-model":
-            return to_pydantic_model_str(data_contract)
-        if export_format == "html":
-            return to_html(data_contract)
-        if export_format == "go":
-            return to_go_types(data_contract)
-        if export_format == "bigquery":
-            model_name, model_value = self._check_models_for_export(data_contract, model, export_format)
-            found_server = data_contract.servers.get(self._server)
-            if found_server is None:
-                raise RuntimeError(
-                    f"Export to {export_format} requires selecting a bigquery server from the data contract."
-                )
-            if found_server.type != "bigquery":
-                raise RuntimeError(
-                    f"Export to {export_format} requires selecting a bigquery server from the data contract."
-                )
-            return to_bigquery_json(model_name, model_value, found_server)
-        if export_format == "dbml":
-            found_server = data_contract.servers.get(self._server)
-            return to_dbml_diagram(data_contract, found_server)
-        else:
-            print(f"Export format {export_format} not supported.")
-            return ""
-
-    def _determine_sql_server_type(self, data_contract: DataContractSpecification, sql_server_type: str):
-        if sql_server_type == "auto":
-            if data_contract.servers is None or len(data_contract.servers) == 0:
-                raise RuntimeError("Export with server_type='auto' requires servers in the data contract.")
-
-            server_types = set([server.type for server in data_contract.servers.values()])
-            if "snowflake" in server_types:
-                return "snowflake"
-            elif "postgres" in server_types:
-                return "postgres"
-            elif "databricks" in server_types:
-                return "databricks"
-            else:
-                # default to snowflake dialect
-                return "snowflake"
-        else:
-            return sql_server_type
 
-
-
-
-
-
-        run.log_info(f"Creating example file {p}")
-        with open(p, "w") as f:
-            content = ""
-            if format == "json" and isinstance(example.data, list):
-                content = json.dumps(example.data)
-            elif format == "json" and isinstance(example.data, str):
-                content = example.data
-            elif format == "yaml" and isinstance(example.data, list):
-                content = yaml.dump(example.data, allow_unicode=True)
-            elif format == "yaml" and isinstance(example.data, str):
-                content = example.data
-            elif format == "csv":
-                content = example.data
-            logging.debug(f"Content of example file {p}: {content}")
-            f.write(content)
-        path = f"{tmp_dir}" + "/{model}." + format
-        delimiter = "array"
-        server = Server(
-            type="local",
-            path=path,
-            format=format,
-            delimiter=delimiter,
+        return exporter_factory.create(export_format).export(
+            data_contract=data_contract,
+            model=model,
+            server=self._server,
+            sql_server_type=sql_server_type,
+            export_args=kwargs,
        )
-        run.log_info(f"Using {server} for testing the examples")
-        return server
-
-    def _check_models_for_export(
-        self, data_contract: DataContractSpecification, model: str, export_format: str
-    ) -> typing.Tuple[str, str]:
-        if data_contract.models is None:
-            raise RuntimeError(f"Export to {export_format} requires models in the data contract.")
-
-        model_names = list(data_contract.models.keys())
-
-        if model == "all":
-            if len(data_contract.models.items()) != 1:
-                raise RuntimeError(
-                    f"Export to {export_format} is model specific. Specify the model via --model $MODEL_NAME. Available models: {model_names}"
-                )
-
-            model_name, model_value = next(iter(data_contract.models.items()))
-        else:
-            model_name = model
-            model_value = data_contract.models.get(model_name)
-            if model_value is None:
-                raise RuntimeError(
-                    f"Model {model_name} not found in the data contract. Available models: {model_names}"
-                )
-
-        return model_name, model_value
 
    def import_from_source(
        self,
        format: str,
        source: typing.Optional[str] = None,
+        glue_tables: typing.Optional[typing.List[str]] = None,
        bigquery_tables: typing.Optional[typing.List[str]] = None,
        bigquery_project: typing.Optional[str] = None,
        bigquery_dataset: typing.Optional[str] = None,
+        unity_table_full_name: typing.Optional[str] = None
    ) -> DataContractSpecification:
        data_contract_specification = DataContract.init()
 
@@ -433,7 +341,7 @@ class DataContract:
        elif format == "avro":
            data_contract_specification = import_avro(data_contract_specification, source)
        elif format == "glue":
-            data_contract_specification = import_glue(data_contract_specification, source)
+            data_contract_specification = import_glue(data_contract_specification, source, glue_tables)
        elif format == "jsonschema":
            data_contract_specification = import_jsonschema(data_contract_specification, source)
        elif format == "bigquery":
@@ -443,6 +351,15 @@ class DataContract:
            data_contract_specification = import_bigquery_from_api(
                data_contract_specification, bigquery_tables, bigquery_project, bigquery_dataset
            )
+        elif format == "odcs":
+            data_contract_specification = import_odcs(data_contract_specification, source)
+        elif format == "unity":
+            if source is not None:
+                data_contract_specification = import_unity_from_json(data_contract_specification, source)
+            else:
+                data_contract_specification = import_unity_from_api(
+                    data_contract_specification, unity_table_full_name
+                )
        else:
            print(f"Import format {format} not supported.")
 
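To illustrate the refactored entry points, a hedged usage sketch based only on the signatures visible in this diff; the contract file name, model name, and unity table name are placeholders:

# Hedged sketch: calling the refactored export() and import_from_source().
# File name, model name, and unity table name below are placeholders.
from datacontract.data_contract import DataContract
from datacontract.export.exporter import ExportFormat

# export() now delegates to exporter_factory; extra keyword arguments such as
# rdf_base are passed through as export_args.
avro_schema = DataContract(data_contract_file="datacontract.yaml", server="production").export(
    export_format=ExportFormat.avro,
    model="orders",
)

# import_from_source() gained glue_tables and unity_table_full_name parameters.
spec = DataContract().import_from_source(
    format="unity",
    unity_table_full_name="my_catalog.my_schema.my_table",
)
print(spec.to_yaml())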
datacontract/engines/fastjsonschema/s3/s3_read_files.py
CHANGED
@@ -1,7 +1,7 @@
 import logging
 import os
 
-import
+from datacontract.model.exceptions import DataContractException
 
 
 def yield_s3_files(s3_endpoint_url, s3_location):
@@ -14,6 +14,18 @@ def yield_s3_files(s3_endpoint_url, s3_location):
 
 
 def s3_fs(s3_endpoint_url):
+    try:
+        import s3fs
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="s3 extra missing",
+            reason="Install the extra datacontract-cli\[s3] to use s3",
+            engine="datacontract",
+            original_exception=e,
+        )
+
    aws_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
    aws_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
    return s3fs.S3FileSystem(
datacontract/engines/soda/check_soda_execute.py
CHANGED
@@ -64,6 +64,17 @@ def check_soda_execute(
        soda_configuration_str = to_databricks_soda_configuration(server)
        scan.add_configuration_yaml_str(soda_configuration_str)
        scan.set_data_source_name(server.type)
+    elif server.type == "dataframe":
+        if spark is None:
+            run.log_warn(
+                "Server type dataframe only works with the Python library and requires a Spark session, "
+                "please provide one with the DataContract class"
+            )
+            return
+        else:
+            logging.info("Use Spark to connect to data source")
+            scan.add_spark_session(spark, data_source_name="datacontract-cli")
+            scan.set_data_source_name("datacontract-cli")
    elif server.type == "kafka":
        if spark is None:
            spark = create_spark_session(tmp_dir)
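The new dataframe server type only works when a Spark session is supplied to the DataContract class. A hedged sketch; the constructor keyword and the model/view name are assumptions, not shown in this diff:

# Hedged sketch: testing a data contract against an in-memory Spark DataFrame.
# Assumes the DataContract constructor accepts the session via a `spark` argument
# and that the contract defines a model named "orders" served as a temp view.
from pyspark.sql import SparkSession

from datacontract.data_contract import DataContract

spark = SparkSession.builder.getOrCreate()
spark.createDataFrame([(1, "shipped")], ["order_id", "status"]).createOrReplaceTempView("orders")

run = DataContract(data_contract_file="datacontract.yaml", spark=spark).test()
print(run.result)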
datacontract/engines/soda/connections/bigquery.py
CHANGED
@@ -6,10 +6,17 @@ import yaml
 # https://docs.soda.io/soda/connect-bigquery.html#authentication-methods
 def to_bigquery_soda_configuration(server):
    # with service account key, using an external json file
+
+    # check for our own environment variable first
+    account_info = os.getenv("DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH")
+    if account_info is None:
+        # but as a fallback look for the default google one
+        account_info = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+
    soda_configuration = {
        f"data_source {server.type}": {
            "type": "bigquery",
-            "account_info_json_path":
+            "account_info_json_path": account_info,
            "auth_scopes": ["https://www.googleapis.com/auth/bigquery"],
            "project_id": server.project,
            "dataset": server.dataset,
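Given the new credential lookup above, a hedged sketch of pointing the Soda BigQuery connection at a service account key before running a test; the key path and the server name are placeholders:

# Hedged sketch: the connection now reads DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH
# first and falls back to GOOGLE_APPLICATION_CREDENTIALS (see the hunk above).
import os

from datacontract.data_contract import DataContract

os.environ["DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH"] = "/path/to/service-account.json"  # placeholder
run = DataContract(data_contract_file="datacontract.yaml", server="bigquery").test()  # server name assumed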
datacontract/engines/soda/connections/kafka.py
CHANGED
@@ -1,3 +1,4 @@
+import logging
 import os
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import col, expr, from_json
@@ -44,6 +45,8 @@ def create_spark_session(tmp_dir: str) -> SparkSession:
 
 def read_kafka_topic(spark: SparkSession, data_contract: DataContractSpecification, server: Server, tmp_dir):
    """Read and process data from a Kafka topic based on the server configuration."""
+
+    logging.info("Reading data from Kafka server %s topic %s", server.host, server.topic)
    df = (
        spark.read.format("kafka")
        .options(**get_auth_options())
datacontract/export/__init__.py
File without changes
datacontract/export/avro_converter.py
CHANGED
@@ -1,8 +1,15 @@
 import json
 
+from datacontract.export.exporter import Exporter, _check_models_for_export
 from datacontract.model.data_contract_specification import Field
 
 
+class AvroExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        model_name, model_value = _check_models_for_export(data_contract, model, self.export_format)
+        return to_avro_schema_json(model_name, model_value)
+
+
 def to_avro_schema(model_name, model) -> dict:
    return to_avro_record(model_name, model.fields, model.description, model.namespace)
 
@@ -34,13 +41,8 @@ def to_avro_field(field, field_name):
    if field.description is not None:
        avro_field["doc"] = field.description
    avro_field["type"] = to_avro_type(field, field_name)
-    # add logical type definitions for any of the date type fields
-    if field.type in ["timestamp", "timestamp_tz", "timestamp_ntz", "date"]:
-        avro_field["logicalType"] = to_avro_logical_type(field.type)
 
    if field.config:
-        if "avroLogicalType" in field.config:
-            avro_field["logicalType"] = field.config["avroLogicalType"]
        if "avroDefault" in field.config:
            avro_field["default"] = field.config["avroDefault"]
 
@@ -48,6 +50,23 @@ def to_avro_field(field, field_name):
 
 
 def to_avro_type(field: Field, field_name: str) -> str | dict:
+    if field.config:
+        if "avroLogicalType" in field.config and "avroType" in field.config:
+            return {"type": field.config["avroType"], "logicalType": field.config["avroLogicalType"]}
+        if "avroLogicalType" in field.config:
+            if field.config["avroLogicalType"] in [
+                "timestamp-millis",
+                "timestamp-micros",
+                "local-timestamp-millis",
+                "local-timestamp-micros",
+                "time-micros",
+            ]:
+                return {"type": "long", "logicalType": field.config["avroLogicalType"]}
+            if field.config["avroLogicalType"] in ["time-millis", "date"]:
+                return {"type": "int", "logicalType": field.config["avroLogicalType"]}
+        if "avroType" in field.config:
+            return field.config["avroLogicalType"]
+
    if field.type is None:
        return "null"
    if field.type in ["string", "varchar", "text"]:
@@ -64,11 +83,11 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
    elif field.type in ["boolean"]:
        return "boolean"
    elif field.type in ["timestamp", "timestamp_tz"]:
-        return "long"
+        return {"type": "long", "logicalType": "timestamp-millis"}
    elif field.type in ["timestamp_ntz"]:
-        return "long"
+        return {"type": "long", "logicalType": "local-timestamp-millis"}
    elif field.type in ["date"]:
-        return "int"
+        return {"type": "int", "logicalType": "date"}
    elif field.type in ["time"]:
        return "long"
    elif field.type in ["object", "record", "struct"]:
@@ -76,20 +95,8 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
    elif field.type in ["binary"]:
        return "bytes"
    elif field.type in ["array"]:
-
-        return "array"
+        return {"type": "array", "items": to_avro_type(field.items, field_name)}
    elif field.type in ["null"]:
        return "null"
    else:
        return "bytes"
-
-
-def to_avro_logical_type(type: str) -> str:
-    if type in ["timestamp", "timestamp_tz"]:
-        return "timestamp-millis"
-    elif type in ["timestamp_ntz"]:
-        return "local-timestamp-millis"
-    elif type in ["date"]:
-        return "date"
-    else:
-        return ""
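Under the new mapping, temporal fields now carry their Avro logicalType inline instead of a separate logicalType attribute. A hedged sketch of the resulting type objects; constructing a Field with only a type is assumed to be sufficient for illustration:

# Hedged sketch: the new to_avro_type() mapping for temporal fields.
from datacontract.export.avro_converter import to_avro_type
from datacontract.model.data_contract_specification import Field

print(to_avro_type(Field(type="timestamp"), "created_at"))
# -> {'type': 'long', 'logicalType': 'timestamp-millis'}
print(to_avro_type(Field(type="date"), "order_date"))
# -> {'type': 'int', 'logicalType': 'date'}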
datacontract/export/avro_idl_converter.py
CHANGED
@@ -7,28 +7,7 @@ from datacontract.lint.resolve import inline_definitions_into_data_contract
 from datacontract.model.data_contract_specification import DataContractSpecification, Field
 from datacontract.model.exceptions import DataContractException
 
-
-def to_avro_idl(contract: DataContractSpecification) -> str:
-    """Serialize the provided data contract specification into an Avro IDL string.
-
-    The data contract will be serialized as a protocol, with one record type
-    for each contained model. Model fields are mapped one-to-one to Avro IDL
-    record fields.
-    """
-    stream = StringIO()
-    to_avro_idl_stream(contract, stream)
-    return stream.getvalue()
-
-
-def to_avro_idl_stream(contract: DataContractSpecification, stream: typing.TextIO):
-    """Serialize the provided data contract specification into Avro IDL."""
-    ir = _contract_to_avro_idl_ir(contract)
-    if ir.description:
-        stream.write(f"/** {contract.info.description} */\n")
-    stream.write(f"protocol {ir.name or 'Unnamed'} {{\n")
-    for model_type in ir.model_types:
-        _write_model_type(model_type, stream)
-    stream.write("}\n")
+from datacontract.export.exporter import Exporter
 
 
 class AvroPrimitiveType(Enum):
@@ -107,6 +86,34 @@ avro_primitive_types = set(
 )
 
 
+class AvroIdlExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        return to_avro_idl(data_contract)
+
+
+def to_avro_idl(contract: DataContractSpecification) -> str:
+    """Serialize the provided data contract specification into an Avro IDL string.
+
+    The data contract will be serialized as a protocol, with one record type
+    for each contained model. Model fields are mapped one-to-one to Avro IDL
+    record fields.
+    """
+    stream = StringIO()
+    to_avro_idl_stream(contract, stream)
+    return stream.getvalue()
+
+
+def to_avro_idl_stream(contract: DataContractSpecification, stream: typing.TextIO):
+    """Serialize the provided data contract specification into Avro IDL."""
+    ir = _contract_to_avro_idl_ir(contract)
+    if ir.description:
+        stream.write(f"/** {contract.info.description} */\n")
+    stream.write(f"protocol {ir.name or 'Unnamed'} {{\n")
+    for model_type in ir.model_types:
+        _write_model_type(model_type, stream)
+    stream.write("}\n")
+
+
 def _to_avro_primitive_logical_type(field_name: str, field: Field) -> AvroPrimitiveField:
    result = AvroPrimitiveField(field_name, field.required, field.description, AvroPrimitiveType.string)
    match field.type:
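The converters in this release expose Exporter subclasses with a uniform export() method. A hedged sketch of a custom exporter following that shape; the class name is hypothetical, and the registration mechanism in exporter_factory.py is not shown in this diff:

# Hedged sketch: a hypothetical exporter mirroring the interface used by
# AvroExporter and AvroIdlExporter above; factory registration is omitted
# because it is not visible in this diff.
from datacontract.export.exporter import Exporter


class ModelListExporter(Exporter):
    def export(self, data_contract, model, server, sql_server_type, export_args) -> str:
        # Return one line per model defined in the contract.
        return "\n".join(data_contract.models.keys())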
datacontract/export/bigquery_converter.py
CHANGED
@@ -5,6 +5,21 @@ from typing import Dict, List
 from datacontract.model.data_contract_specification import Model, Field, Server
 from datacontract.model.exceptions import DataContractException
 
+from datacontract.export.exporter import Exporter, _check_models_for_export
+
+
+class BigQueryExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        self.dict_args = export_args
+        model_name, model_value = _check_models_for_export(data_contract, model, self.export_format)
+        found_server = data_contract.servers.get(server)
+        if found_server is None:
+            raise RuntimeError("Export to bigquery requires selecting a bigquery server from the data contract.")
+        if found_server.type != "bigquery":
+            raise RuntimeError("Export to bigquery requires selecting a bigquery server from the data contract.")
+
+        return to_bigquery_json(model_name, model_value, found_server)
+
 
 def to_bigquery_json(model_name: str, model_value: Model, server: Server) -> str:
    bigquery_table = to_bigquery_schema(model_name, model_value, server)