datacontract-cli 0.10.22__py3-none-any.whl → 0.10.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published.

Potentially problematic release.

Files changed (39)
  1. datacontract/__init__.py +13 -0
  2. datacontract/catalog/catalog.py +2 -2
  3. datacontract/cli.py +20 -72
  4. datacontract/data_contract.py +5 -3
  5. datacontract/engines/data_contract_test.py +32 -7
  6. datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
  7. datacontract/engines/fastjsonschema/s3/s3_read_files.py +3 -2
  8. datacontract/engines/soda/check_soda_execute.py +17 -4
  9. datacontract/engines/soda/connections/{duckdb.py → duckdb_connection.py} +66 -9
  10. datacontract/engines/soda/connections/kafka.py +3 -2
  11. datacontract/export/avro_converter.py +10 -3
  12. datacontract/export/bigquery_converter.py +1 -1
  13. datacontract/export/dbt_converter.py +13 -10
  14. datacontract/export/duckdb_type_converter.py +57 -0
  15. datacontract/export/odcs_v3_exporter.py +27 -7
  16. datacontract/export/protobuf_converter.py +163 -69
  17. datacontract/imports/avro_importer.py +31 -6
  18. datacontract/imports/csv_importer.py +111 -57
  19. datacontract/imports/importer.py +1 -0
  20. datacontract/imports/importer_factory.py +5 -0
  21. datacontract/imports/odcs_v3_importer.py +49 -7
  22. datacontract/imports/protobuf_importer.py +266 -0
  23. datacontract/lint/resolve.py +40 -12
  24. datacontract/model/data_contract_specification.py +2 -2
  25. datacontract/model/run.py +3 -0
  26. datacontract/output/__init__.py +0 -0
  27. datacontract/output/junit_test_results.py +135 -0
  28. datacontract/output/output_format.py +10 -0
  29. datacontract/output/test_results_writer.py +79 -0
  30. datacontract/templates/datacontract.html +2 -1
  31. datacontract/templates/index.html +2 -1
  32. {datacontract_cli-0.10.22.dist-info → datacontract_cli-0.10.24.dist-info}/METADATA +279 -193
  33. {datacontract_cli-0.10.22.dist-info → datacontract_cli-0.10.24.dist-info}/RECORD +37 -33
  34. {datacontract_cli-0.10.22.dist-info → datacontract_cli-0.10.24.dist-info}/WHEEL +1 -1
  35. datacontract/export/csv_type_converter.py +0 -36
  36. datacontract/lint/linters/quality_schema_linter.py +0 -52
  37. {datacontract_cli-0.10.22.dist-info → datacontract_cli-0.10.24.dist-info}/entry_points.txt +0 -0
  38. {datacontract_cli-0.10.22.dist-info → datacontract_cli-0.10.24.dist-info/licenses}/LICENSE +0 -0
  39. {datacontract_cli-0.10.22.dist-info → datacontract_cli-0.10.24.dist-info}/top_level.txt +0 -0
datacontract/__init__.py CHANGED
@@ -0,0 +1,13 @@
+ # Configuration so that yaml.safe_dump dumps strings with line breaks with yaml literal |
+ import yaml
+
+ yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str
+
+
+ def repr_str(dumper, data):
+     if "\n" in data:
+         return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+     return dumper.org_represent_str(data)
+
+
+ yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)
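
The practical effect: once the datacontract package is imported, yaml.safe_dump renders any string that contains a newline in YAML literal block style. A quick illustration:

    import datacontract  # importing the package registers the representer on yaml.SafeDumper
    import yaml

    print(yaml.safe_dump({"description": "line one\nline two"}))
    # description: |-
    #   line one
    #   line two
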
datacontract/catalog/catalog.py CHANGED
@@ -19,7 +19,7 @@ def create_data_contract_html(contracts, file: Path, path: Path, schema: str):
      file_without_suffix = file.with_suffix(".html")
      html_filepath = path / file_without_suffix
      html_filepath.parent.mkdir(parents=True, exist_ok=True)
-     with open(html_filepath, "w") as f:
+     with open(html_filepath, "w", encoding="utf-8") as f:
          f.write(html)
      contracts.append(
          DataContractView(
@@ -42,7 +42,7 @@ class DataContractView:
 
  def create_index_html(contracts, path):
      index_filepath = path / "index.html"
-     with open(index_filepath, "w") as f:
+     with open(index_filepath, "w", encoding="utf-8") as f:
          # Load templates from templates folder
          package_loader = PackageLoader("datacontract", "templates")
          env = Environment(
datacontract/cli.py CHANGED
@@ -5,9 +5,7 @@ from typing import Iterable, List, Optional
 
  import typer
  from click import Context
- from rich import box
  from rich.console import Console
- from rich.table import Table
  from typer.core import TyperGroup
  from typing_extensions import Annotated
 
@@ -19,6 +17,8 @@ from datacontract.integration.datamesh_manager import (
      publish_data_contract_to_datamesh_manager,
  )
  from datacontract.lint.resolve import resolve_data_contract_dict
+ from datacontract.output.output_format import OutputFormat
+ from datacontract.output.test_results_writer import write_test_result
 
  console = Console()
 
@@ -92,12 +92,19 @@ def lint(
          str,
          typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
      ] = None,
+     output: Annotated[
+         Path,
+         typer.Option(
+             help="Specify the file path where the test results should be written to (e.g., './test-results/TEST-datacontract.xml'). If no path is provided, the output will be printed to stdout."
+         ),
+     ] = None,
+     output_format: Annotated[OutputFormat, typer.Option(help="The target format for the test results.")] = None,
  ):
      """
      Validate that the datacontract.yaml is correctly formatted.
      """
      run = DataContract(data_contract_file=location, schema_location=schema).lint()
-     _handle_result(run)
+     write_test_result(run, console, output_format, output)
 
 
  @app.command()
@@ -120,6 +127,13 @@ def test(
          ),
      ] = "all",
      publish: Annotated[str, typer.Option(help="The url to publish the results after the test")] = None,
+     output: Annotated[
+         Path,
+         typer.Option(
+             help="Specify the file path where the test results should be written to (e.g., './test-results/TEST-datacontract.xml')."
+         ),
+     ] = None,
+     output_format: Annotated[OutputFormat, typer.Option(help="The target format for the test results.")] = None,
      logs: Annotated[bool, typer.Option(help="Print logs")] = False,
      ssl_verification: Annotated[
          bool,
@@ -141,7 +155,7 @@ def test(
      ).test()
      if logs:
          _print_logs(run)
-     _handle_result(run)
+     write_test_result(run, console, output_format, output)
 
 
  @app.command()
@@ -214,7 +228,7 @@ def export(
      if output is None:
          console.print(result, markup=False, soft_wrap=True)
      else:
-         with output.open("w") as f:
+         with output.open(mode="w", encoding="utf-8") as f:
              f.write(result)
      console.print(f"Written result to {output}")
 
@@ -306,7 +320,7 @@ def import_(
      if output is None:
          console.print(result.to_yaml(), markup=False, soft_wrap=True)
      else:
-         with output.open("w") as f:
+         with output.open(mode="w", encoding="utf-8") as f:
              f.write(result.to_yaml())
      console.print(f"Written result to {output}")
 
@@ -467,77 +481,11 @@ def api(
      uvicorn.run(app="datacontract.api:app", port=port, host=host, reload=True, log_config=LOGGING_CONFIG)
 
 
- def _handle_result(run):
-     _print_table(run)
-     if run.result == "passed":
-         console.print(
-             f"🟢 data contract is valid. Run {len(run.checks)} checks. Took {(run.timestampEnd - run.timestampStart).total_seconds()} seconds."
-         )
-     elif run.result == "warning":
-         console.print("🟠 data contract has warnings. Found the following warnings:")
-         i = 1
-         for check in run.checks:
-             if check.result != "passed":
-                 field = to_field(run, check)
-                 if field:
-                     field = field + " "
-                 else:
-                     field = ""
-                 console.print(f"{i}) {field}{check.name}: {check.reason}")
-                 i += 1
-     else:
-         console.print("🔴 data contract is invalid, found the following errors:")
-         i = 1
-         for check in run.checks:
-             if check.result != "passed":
-                 field = to_field(run, check)
-                 if field:
-                     field = field + " "
-                 else:
-                     field = ""
-                 console.print(f"{i}) {field}{check.name}: {check.reason}")
-                 i += 1
-         raise typer.Exit(code=1)
-
-
- def _print_table(run):
-     table = Table(box=box.ROUNDED)
-     table.add_column("Result", no_wrap=True)
-     table.add_column("Check", max_width=100)
-     table.add_column("Field", max_width=32)
-     table.add_column("Details", max_width=50)
-     for check in sorted(run.checks, key=lambda c: (c.result or "", c.model or "", c.field or "")):
-         table.add_row(with_markup(check.result), check.name, to_field(run, check), check.reason)
-     console.print(table)
-
-
- def to_field(run, check):
-     models = [c.model for c in run.checks]
-     if len(set(models)) > 1:
-         if check.field is None:
-             return check.model
-         return check.model + "." + check.field
-     else:
-         return check.field
-
-
  def _print_logs(run):
      console.print("\nLogs:")
      for log in run.logs:
          console.print(log.timestamp.strftime("%y-%m-%d %H:%M:%S"), log.level.ljust(5), log.message)
 
 
- def with_markup(result):
-     if result == "passed":
-         return "[green]passed[/green]"
-     if result == "warning":
-         return "[yellow]warning[/yellow]"
-     if result == "failed":
-         return "[red]failed[/red]"
-     if result == "error":
-         return "[red]error[/red]"
-     return result
-
-
  if __name__ == "__main__":
      app()
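
With the new --output and --output-format options, lint and test results can be written as machine-readable reports instead of (or in addition to) the console output. A hypothetical invocation, assuming junit is one of the accepted OutputFormat values, as the new junit_test_results.py module suggests:

    datacontract test datacontract.yaml --output ./test-results/TEST-datacontract.xml --output-format junit
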
datacontract/data_contract.py CHANGED
@@ -4,6 +4,8 @@ import typing
  if typing.TYPE_CHECKING:
      from pyspark.sql import SparkSession
 
+ from duckdb.duckdb import DuckDBPyConnection
+
  from datacontract.breaking.breaking import (
      info_breaking_changes,
      models_breaking_changes,
@@ -22,7 +24,6 @@ from datacontract.lint.linters.description_linter import DescriptionLinter
  from datacontract.lint.linters.field_pattern_linter import FieldPatternLinter
  from datacontract.lint.linters.field_reference_linter import FieldReferenceLinter
  from datacontract.lint.linters.notice_period_linter import NoticePeriodLinter
- from datacontract.lint.linters.quality_schema_linter import QualityUsesSchemaLinter
  from datacontract.lint.linters.valid_constraints_linter import ValidFieldConstraintsLinter
  from datacontract.model.data_contract_specification import DataContractSpecification
  from datacontract.model.exceptions import DataContractException
@@ -39,6 +40,7 @@ class DataContract:
          server: str = None,
          publish_url: str = None,
          spark: "SparkSession" = None,
+         duckdb_connection: DuckDBPyConnection = None,
          inline_definitions: bool = True,
          inline_quality: bool = True,
          ssl_verification: bool = True,
@@ -50,11 +52,11 @@
          self._server = server
          self._publish_url = publish_url
          self._spark = spark
+         self._duckdb_connection = duckdb_connection
          self._inline_definitions = inline_definitions
          self._inline_quality = inline_quality
          self._ssl_verification = ssl_verification
          self.all_linters = {
-             QualityUsesSchemaLinter(),
              FieldPatternLinter(),
              FieldReferenceLinter(),
              NoticePeriodLinter(),
@@ -146,7 +148,7 @@
                  inline_quality=self._inline_quality,
              )
 
-             execute_data_contract_test(data_contract, run, self._server, self._spark)
+             execute_data_contract_test(data_contract, run, self._server, self._spark, self._duckdb_connection)
 
          except DataContractException as e:
              run.checks.append(
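
Callers can now hand in an existing DuckDB connection (for example, one prepared with custom extensions or pre-registered secrets) instead of letting the engine open its own in-memory database. A minimal sketch:

    import duckdb

    from datacontract.data_contract import DataContract

    con = duckdb.connect(database=":memory:")  # any pre-configured connection works here
    run = DataContract(data_contract_file="datacontract.yaml", duckdb_connection=con).test()
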
datacontract/engines/data_contract_test.py CHANGED
@@ -1,5 +1,7 @@
  import typing
 
+ from duckdb.duckdb import DuckDBPyConnection
+
  from datacontract.engines.data_contract_checks import create_checks
 
  if typing.TYPE_CHECKING:
@@ -10,7 +12,7 @@ from datacontract.engines.datacontract.check_that_datacontract_contains_valid_se
  )
  from datacontract.engines.fastjsonschema.check_jsonschema import check_jsonschema
  from datacontract.engines.soda.check_soda_execute import check_soda_execute
- from datacontract.model.data_contract_specification import DataContractSpecification
+ from datacontract.model.data_contract_specification import DataContractSpecification, Server
  from datacontract.model.exceptions import DataContractException
  from datacontract.model.run import ResultEnum, Run
 
@@ -20,6 +22,7 @@ def execute_data_contract_test(
      run: Run,
      server_name: str = None,
      spark: "SparkSession" = None,
+     duckdb_connection: DuckDBPyConnection = None,
  ):
      if data_contract_specification.models is None or len(data_contract_specification.models) == 0:
          raise DataContractException(
@@ -29,12 +32,13 @@ def execute_data_contract_test(
              reason="Models block is missing. Skip executing tests.",
              engine="datacontract",
          )
-     check_that_datacontract_contains_valid_server_configuration(run, data_contract_specification, server_name)
-     if server_name:
-         server = data_contract_specification.servers.get(server_name)
-     else:
+     if (
+         server_name is None
+         and data_contract_specification.servers is not None
+         and len(data_contract_specification.servers) > 0
+     ):
          server_name = list(data_contract_specification.servers.keys())[0]
-         server = data_contract_specification.servers.get(server_name)
+     server = get_server(data_contract_specification, server_name)
      run.log_info(f"Running tests for data contract {data_contract_specification.id} with server {server_name}")
      run.dataContractId = data_contract_specification.id
      run.dataContractVersion = data_contract_specification.info.version
@@ -48,4 +52,25 @@ def execute_data_contract_test(
      # TODO check server credentials are complete for nicer error messages
      if server.format == "json" and server.type != "kafka":
          check_jsonschema(run, data_contract_specification, server)
-     check_soda_execute(run, data_contract_specification, server, spark)
+     check_soda_execute(run, data_contract_specification, server, spark, duckdb_connection)
+
+
+ def get_server(data_contract_specification: DataContractSpecification, server_name: str = None) -> Server | None:
+     """Get the server configuration from the data contract specification.
+
+     Args:
+         data_contract_specification: The data contract specification
+         server_name: Optional name of the server to use. If not provided, uses the first server.
+
+     Returns:
+         The selected server configuration
+     """
+
+     check_that_datacontract_contains_valid_server_configuration(data_contract_specification, server_name)
+
+     if server_name is not None:
+         server = data_contract_specification.servers.get(server_name)
+     else:
+         server_name = list(data_contract_specification.servers.keys())[0]
+         server = data_contract_specification.servers.get(server_name)
+     return server
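
get_server centralizes the selection logic that was previously inlined: it first validates the servers block, then either looks up the named server or falls back to the first configured one. A sketch of the two call modes, where spec stands in for a resolved DataContractSpecification:

    from datacontract.engines.data_contract_test import get_server

    server = get_server(spec, "production")  # explicit selection by name
    server = get_server(spec, None)          # no name given: the first configured server wins
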
datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py CHANGED
@@ -1,12 +1,11 @@
  from datacontract.model.data_contract_specification import DataContractSpecification
  from datacontract.model.exceptions import DataContractException
- from datacontract.model.run import Run
 
 
  def check_that_datacontract_contains_valid_server_configuration(
-     run: Run, data_contract: DataContractSpecification, server_name: str
+     data_contract: DataContractSpecification, server_name: str | None
  ):
-     if data_contract.servers is None:
+     if data_contract.servers is None or len(data_contract.servers) == 0:
          raise DataContractException(
              type="lint",
              name="Check that data contract contains valid server configuration",
datacontract/engines/fastjsonschema/s3/s3_read_files.py CHANGED
@@ -2,6 +2,7 @@ import logging
  import os
 
  from datacontract.model.exceptions import DataContractException
+ from datacontract.model.run import ResultEnum
 
 
  def yield_s3_files(s3_endpoint_url, s3_location):
@@ -19,9 +20,9 @@ def s3_fs(s3_endpoint_url):
      except ImportError as e:
          raise DataContractException(
              type="schema",
-             result="failed",
+             result=ResultEnum.failed,
              name="s3 extra missing",
-             reason="Install the extra datacontract-cli\[s3] to use s3",
+             reason="Install the extra s3 to use s3",
              engine="datacontract",
              original_exception=e,
          )
datacontract/engines/soda/check_soda_execute.py CHANGED
@@ -1,9 +1,15 @@
  import logging
+ import typing
  import uuid
 
+ if typing.TYPE_CHECKING:
+     from pyspark.sql import SparkSession
+
+ from duckdb.duckdb import DuckDBPyConnection
+
  from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
  from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration
- from datacontract.engines.soda.connections.duckdb import get_duckdb_connection
+ from datacontract.engines.soda.connections.duckdb_connection import get_duckdb_connection
  from datacontract.engines.soda.connections.kafka import create_spark_session, read_kafka_topic
  from datacontract.engines.soda.connections.postgres import to_postgres_soda_configuration
  from datacontract.engines.soda.connections.snowflake import to_snowflake_soda_configuration
@@ -14,7 +20,13 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
  from datacontract.model.run import Check, Log, ResultEnum, Run
 
 
- def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark):
+ def check_soda_execute(
+     run: Run,
+     data_contract: DataContractSpecification,
+     server: Server,
+     spark: "SparkSession" = None,
+     duckdb_connection: DuckDBPyConnection = None,
+ ):
      from soda.common.config_helper import ConfigHelper
 
      ConfigHelper.get_instance().upsert_value("send_anonymous_usage_stats", False)
@@ -30,7 +42,7 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
      if server.type in ["s3", "gcs", "azure", "local"]:
          if server.format in ["json", "parquet", "csv", "delta"]:
              run.log_info(f"Configuring engine soda-core to connect to {server.type} {server.format} with duckdb")
-             con = get_duckdb_connection(data_contract, server, run)
+             con = get_duckdb_connection(data_contract, server, run, duckdb_connection)
              scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)
              scan.set_data_source_name(server.type)
          else:
@@ -62,7 +74,8 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
              run.log_info("Connecting to databricks via spark")
              scan.add_spark_session(spark, data_source_name=server.type)
              scan.set_data_source_name(server.type)
-             spark.sql(f"USE {server.catalog}.{server.schema_}")
+             database_name = ".".join(filter(None, [server.catalog, server.schema_]))
+             spark.sql(f"USE {database_name}")
          else:
              run.log_info("Connecting to databricks directly")
              soda_configuration_str = to_databricks_soda_configuration(server)
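
The rewritten USE statement drops empty parts, so a Databricks server entry without a catalog no longer produces a malformed statement. A minimal sketch of the difference:

    catalog = None  # hypothetical server entry without a catalog
    schema = "analytics"
    database_name = ".".join(filter(None, [catalog, schema]))
    print(f"USE {database_name}")  # USE analytics (previously: USE None.analytics)
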
datacontract/engines/soda/connections/{duckdb.py → duckdb_connection.py} CHANGED
@@ -1,13 +1,24 @@
  import os
+ from typing import Any, Dict
 
  import duckdb
 
- from datacontract.export.csv_type_converter import convert_to_duckdb_csv_type
+ from datacontract.export.duckdb_type_converter import convert_to_duckdb_csv_type, convert_to_duckdb_json_type
+ from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server
  from datacontract.model.run import Run
 
 
- def get_duckdb_connection(data_contract, server, run: Run):
-     con = duckdb.connect(database=":memory:")
+ def get_duckdb_connection(
+     data_contract: DataContractSpecification,
+     server: Server,
+     run: Run,
+     duckdb_connection: duckdb.DuckDBPyConnection | None = None,
+ ) -> duckdb.DuckDBPyConnection:
+     if duckdb_connection is None:
+         con = duckdb.connect(database=":memory:")
+     else:
+         con = duckdb_connection
+
      path: str = ""
      if server.type == "local":
          path = server.path
@@ -27,14 +38,21 @@ def get_duckdb_connection(data_contract, server, run: Run):
          run.log_info(f"Creating table {model_name} for {model_path}")
 
          if server.format == "json":
-             format = "auto"
+             json_format = "auto"
              if server.delimiter == "new_line":
-                 format = "newline_delimited"
+                 json_format = "newline_delimited"
              elif server.delimiter == "array":
-                 format = "array"
-             con.sql(f"""
-                 CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{format}', hive_partitioning=1);
+                 json_format = "array"
+             columns = to_json_types(model)
+             if columns is None:
+                 con.sql(f"""
+                     CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', hive_partitioning=1);
              """)
+             else:
+                 con.sql(
+                     f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', columns={columns}, hive_partitioning=1);"""
+                 )
+             add_nested_views(con, model_name, model.fields)
          elif server.format == "parquet":
              con.sql(f"""
                  CREATE VIEW "{model_name}" AS SELECT * FROM read_parquet('{model_path}', hive_partitioning=1);
@@ -56,7 +74,7 @@ def get_duckdb_connection(data_contract, server, run: Run):
      return con
 
 
- def to_csv_types(model) -> dict:
+ def to_csv_types(model) -> dict[Any, str | None] | None:
      if model is None:
          return None
      columns = {}
@@ -66,6 +84,45 @@ def get_duckdb_connection(data_contract, server, run: Run):
      return columns
 
 
+ def to_json_types(model: Model) -> dict[Any, str | None] | None:
+     if model is None:
+         return None
+     columns = {}
+     for field_name, field in model.fields.items():
+         columns[field_name] = convert_to_duckdb_json_type(field)
+     return columns
+
+
+ def add_nested_views(con: duckdb.DuckDBPyConnection, model_name: str, fields: Dict[str, Field] | None):
+     model_name = model_name.strip('"')
+     if fields is None:
+         return
+     for field_name, field in fields.items():
+         if field.type is None or field.type.lower() not in ["array", "object"]:
+             continue
+         field_type = field.type.lower()
+         if field_type == "array" and field.items is None:
+             continue
+         elif field_type == "object" and field.fields is None:
+             continue
+
+         nested_model_name = f"{model_name}__{field_name}"
+         max_depth = 2 if field_type == "array" else 1
+
+         ## if parent field is not required, the nested objects may resolve
+         ## to a row of NULLs -- but if the objects themselves have required
+         ## fields, this will fail the check.
+         where = "" if field.required else f" WHERE {field_name} IS NOT NULL"
+         con.sql(f"""
+             CREATE VIEW IF NOT EXISTS "{nested_model_name}" AS
+             SELECT unnest({field_name}, max_depth := {max_depth}) as {field_name} FROM "{model_name}" {where}
+         """)
+         if field_type == "array":
+             add_nested_views(con, nested_model_name, field.items.fields)
+         elif field_type == "object":
+             add_nested_views(con, nested_model_name, field.fields)
+
+
  def setup_s3_connection(con, server):
      s3_region = os.getenv("DATACONTRACT_S3_REGION")
      s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
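
For a JSON model orders with an array-of-objects field line_items, add_nested_views would create a view orders__line_items that unnests the array so nested fields can be checked like top-level columns. A hypothetical sketch of the effect, with the generated SQL written out by hand:

    import duckdb

    con = duckdb.connect(database=":memory:")
    con.sql("CREATE VIEW orders AS SELECT [{'sku': 'A-1'}, {'sku': 'B-2'}] AS line_items")
    # roughly what add_nested_views emits for a non-required array field:
    con.sql("""
        CREATE VIEW IF NOT EXISTS "orders__line_items" AS
        SELECT unnest(line_items, max_depth := 2) as line_items FROM "orders" WHERE line_items IS NOT NULL
    """)
    print(con.sql('SELECT * FROM "orders__line_items"'))
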
datacontract/engines/soda/connections/kafka.py CHANGED
@@ -6,6 +6,7 @@ import tempfile
  from datacontract.export.avro_converter import to_avro_schema_json
  from datacontract.model.data_contract_specification import DataContractSpecification, Field, Server
  from datacontract.model.exceptions import DataContractException
+ from datacontract.model.run import ResultEnum
 
 
  def create_spark_session():
@@ -16,7 +17,7 @@ def create_spark_session():
      except ImportError as e:
          raise DataContractException(
              type="schema",
-             result="failed",
+             result=ResultEnum.failed,
              name="pyspark is missing",
              reason="Install the extra datacontract-cli[kafka] to use kafka",
              engine="datacontract",
@@ -33,7 +34,7 @@ def create_spark_session():
          .config("spark.ui.enabled", "false")
          .config(
              "spark.jars.packages",
-             "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.2,org.apache.spark:spark-avro_2.12:3.5.2",
+             "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.5,org.apache.spark:spark-avro_2.12:3.5.5",
          )
          .getOrCreate()
      )
datacontract/export/avro_converter.py CHANGED
@@ -91,7 +91,9 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
          if field.precision is not None:
              typeVal["precision"] = field.precision
          return typeVal
-     elif field.type in ["float", "double"]:
+     elif field.type in ["float"]:
+         return "float"
+     elif field.type in ["double"]:
          return "double"
      elif field.type in ["integer", "int"]:
          return "int"
@@ -107,9 +109,14 @@
          return {"type": "int", "logicalType": "date"}
      elif field.type in ["time"]:
          return "long"
+     elif field.type in ["map"]:
+         if field.config is not None and "values" in field.config:
+             return {"type": "map", "values": field.config["values"]}
+         else:
+             return "bytes"
      elif field.type in ["object", "record", "struct"]:
-         if field.config is not None and 'namespace' in field.config:
-             return to_avro_record(field_name ,field.fields ,field.description ,field.config['namespace'])
+         if field.config is not None and "namespace" in field.config:
+             return to_avro_record(field_name, field.fields, field.description, field.config["namespace"])
          return to_avro_record(field_name, field.fields, field.description, None)
      elif field.type in ["binary"]:
          return "bytes"
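
The float/double split and the new map branch change the exported Avro schema. A minimal sketch, assuming the Field model accepts these keyword arguments:

    from datacontract.export.avro_converter import to_avro_type
    from datacontract.model.data_contract_specification import Field

    print(to_avro_type(Field(type="float"), "price"))   # "float" (previously collapsed to "double")
    print(to_avro_type(Field(type="double"), "price"))  # "double"

    # a map field with a declared value type now becomes a proper Avro map
    print(to_avro_type(Field(type="map", config={"values": "string"}), "labels"))
    # {'type': 'map', 'values': 'string'}
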
datacontract/export/bigquery_converter.py CHANGED
@@ -103,7 +103,7 @@ def map_type_to_bigquery(field: Field) -> str:
      elif field_type.lower() == "date":
          return "DATE"
      elif field_type.lower() == "timestamp_ntz":
-         return "TIME"
+         return "DATETIME"
      elif field_type.lower() in ["number", "decimal", "numeric"]:
          return "NUMERIC"
      elif field_type.lower() == "double":
datacontract/export/dbt_converter.py CHANGED
@@ -9,7 +9,7 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
 
  class DbtExporter(Exporter):
      def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
-         return to_dbt_models_yaml(data_contract)
+         return to_dbt_models_yaml(data_contract, server)
 
 
  class DbtSourceExporter(Exporter):
@@ -27,15 +27,16 @@ class DbtStageExporter(Exporter):
      )
 
 
- def to_dbt_models_yaml(data_contract_spec: DataContractSpecification):
+ def to_dbt_models_yaml(data_contract_spec: DataContractSpecification, server: str = None):
      dbt = {
          "version": 2,
          "models": [],
      }
+
      for model_key, model_value in data_contract_spec.models.items():
-         dbt_model = _to_dbt_model(model_key, model_value, data_contract_spec)
+         dbt_model = _to_dbt_model(model_key, model_value, data_contract_spec, adapter_type=server)
          dbt["models"].append(dbt_model)
-     return yaml.dump(dbt, indent=2, sort_keys=False, allow_unicode=True)
+     return yaml.safe_dump(dbt, indent=2, sort_keys=False, allow_unicode=True)
 
 
  def to_dbt_staging_sql(data_contract_spec: DataContractSpecification, model_name: str, model_value: Model) -> str:
@@ -60,7 +61,7 @@ def to_dbt_sources_yaml(data_contract_spec: DataContractSpecification, server: s
      if data_contract_spec.info.owner is not None:
          source["meta"] = {"owner": data_contract_spec.info.owner}
      if data_contract_spec.info.description is not None:
-         source["description"] = data_contract_spec.info.description
+         source["description"] = data_contract_spec.info.description.strip().replace("\n", " ")
      found_server = data_contract_spec.servers.get(server)
      adapter_type = None
      if found_server is not None:
@@ -87,14 +88,16 @@ def _to_dbt_source_table(
      }
 
      if model_value.description is not None:
-         dbt_model["description"] = model_value.description
+         dbt_model["description"] = model_value.description.strip().replace("\n", " ")
      columns = _to_columns(data_contract_spec, model_value.fields, False, adapter_type)
      if columns:
          dbt_model["columns"] = columns
      return dbt_model
 
 
- def _to_dbt_model(model_key, model_value: Model, data_contract_spec: DataContractSpecification) -> dict:
+ def _to_dbt_model(
+     model_key, model_value: Model, data_contract_spec: DataContractSpecification, adapter_type: Optional[str]
+ ) -> dict:
      dbt_model = {
          "name": model_key,
      }
@@ -108,8 +111,8 @@ def _to_dbt_model(model_key, model_value: Model, data_contract_spec: DataContrac
      if _supports_constraints(model_type):
          dbt_model["config"]["contract"] = {"enforced": True}
      if model_value.description is not None:
-         dbt_model["description"] = model_value.description
-     columns = _to_columns(data_contract_spec, model_value.fields, _supports_constraints(model_type), None)
+         dbt_model["description"] = model_value.description.strip().replace("\n", " ")
+     columns = _to_columns(data_contract_spec, model_value.fields, _supports_constraints(model_type), adapter_type)
      if columns:
          dbt_model["columns"] = columns
      return dbt_model
@@ -171,7 +174,7 @@ def _to_column(
          {"dbt_expectations.dbt_expectations.expect_column_values_to_be_of_type": {"column_type": dbt_type}}
      )
      if field.description is not None:
-         column["description"] = field.description
+         column["description"] = field.description.strip().replace("\n", " ")
      if field.required:
          if supports_constraints:
              column.setdefault("constraints", []).append({"type": "not_null"})
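
The dbt exporters now flatten multiline descriptions before dumping, which keeps the generated YAML on a single line (and avoids the literal block style that the new SafeDumper representer from __init__.py would otherwise trigger). A minimal sketch of the normalization:

    description = "Orders placed in the webshop.\nIncludes cancelled orders."
    normalized = description.strip().replace("\n", " ")
    # "Orders placed in the webshop. Includes cancelled orders."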