datacontract-cli 0.10.23__py3-none-any.whl → 0.10.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. datacontract/__init__.py +13 -0
  2. datacontract/api.py +12 -5
  3. datacontract/catalog/catalog.py +5 -3
  4. datacontract/cli.py +119 -13
  5. datacontract/data_contract.py +145 -67
  6. datacontract/engines/data_contract_checks.py +366 -60
  7. datacontract/engines/data_contract_test.py +50 -4
  8. datacontract/engines/fastjsonschema/check_jsonschema.py +37 -19
  9. datacontract/engines/fastjsonschema/s3/s3_read_files.py +3 -2
  10. datacontract/engines/soda/check_soda_execute.py +27 -3
  11. datacontract/engines/soda/connections/athena.py +79 -0
  12. datacontract/engines/soda/connections/duckdb_connection.py +65 -6
  13. datacontract/engines/soda/connections/kafka.py +4 -2
  14. datacontract/engines/soda/connections/oracle.py +50 -0
  15. datacontract/export/avro_converter.py +20 -3
  16. datacontract/export/bigquery_converter.py +1 -1
  17. datacontract/export/dbt_converter.py +36 -7
  18. datacontract/export/dqx_converter.py +126 -0
  19. datacontract/export/duckdb_type_converter.py +57 -0
  20. datacontract/export/excel_exporter.py +923 -0
  21. datacontract/export/exporter.py +3 -0
  22. datacontract/export/exporter_factory.py +17 -1
  23. datacontract/export/great_expectations_converter.py +55 -5
  24. datacontract/export/{html_export.py → html_exporter.py} +31 -20
  25. datacontract/export/markdown_converter.py +134 -5
  26. datacontract/export/mermaid_exporter.py +110 -0
  27. datacontract/export/odcs_v3_exporter.py +193 -149
  28. datacontract/export/protobuf_converter.py +163 -69
  29. datacontract/export/rdf_converter.py +2 -2
  30. datacontract/export/sodacl_converter.py +9 -1
  31. datacontract/export/spark_converter.py +31 -4
  32. datacontract/export/sql_converter.py +6 -2
  33. datacontract/export/sql_type_converter.py +124 -8
  34. datacontract/imports/avro_importer.py +63 -12
  35. datacontract/imports/csv_importer.py +111 -57
  36. datacontract/imports/excel_importer.py +1112 -0
  37. datacontract/imports/importer.py +16 -3
  38. datacontract/imports/importer_factory.py +17 -0
  39. datacontract/imports/json_importer.py +325 -0
  40. datacontract/imports/odcs_importer.py +2 -2
  41. datacontract/imports/odcs_v3_importer.py +367 -151
  42. datacontract/imports/protobuf_importer.py +264 -0
  43. datacontract/imports/spark_importer.py +117 -13
  44. datacontract/imports/sql_importer.py +32 -16
  45. datacontract/imports/unity_importer.py +84 -38
  46. datacontract/init/init_template.py +1 -1
  47. datacontract/integration/entropy_data.py +126 -0
  48. datacontract/lint/resolve.py +112 -23
  49. datacontract/lint/schema.py +24 -15
  50. datacontract/lint/urls.py +17 -3
  51. datacontract/model/data_contract_specification/__init__.py +1 -0
  52. datacontract/model/odcs.py +13 -0
  53. datacontract/model/run.py +3 -0
  54. datacontract/output/junit_test_results.py +3 -3
  55. datacontract/schemas/datacontract-1.1.0.init.yaml +1 -1
  56. datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
  57. datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
  58. datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
  59. datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
  60. datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
  61. datacontract/schemas/odcs-3.1.0.schema.json +2809 -0
  62. datacontract/templates/datacontract.html +54 -3
  63. datacontract/templates/datacontract_odcs.html +685 -0
  64. datacontract/templates/index.html +5 -2
  65. datacontract/templates/partials/server.html +2 -0
  66. datacontract/templates/style/output.css +319 -145
  67. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/METADATA +711 -433
  68. datacontract_cli-0.10.40.dist-info/RECORD +121 -0
  69. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/WHEEL +1 -1
  70. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info/licenses}/LICENSE +1 -1
  71. datacontract/export/csv_type_converter.py +0 -36
  72. datacontract/integration/datamesh_manager.py +0 -72
  73. datacontract/lint/lint.py +0 -142
  74. datacontract/lint/linters/description_linter.py +0 -35
  75. datacontract/lint/linters/field_pattern_linter.py +0 -34
  76. datacontract/lint/linters/field_reference_linter.py +0 -48
  77. datacontract/lint/linters/notice_period_linter.py +0 -55
  78. datacontract/lint/linters/quality_schema_linter.py +0 -52
  79. datacontract/lint/linters/valid_constraints_linter.py +0 -100
  80. datacontract/model/data_contract_specification.py +0 -327
  81. datacontract_cli-0.10.23.dist-info/RECORD +0 -113
  82. /datacontract/{lint/linters → output}/__init__.py +0 -0
  83. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/entry_points.txt +0 -0
  84. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/top_level.txt +0 -0

datacontract/engines/data_contract_test.py

@@ -1,5 +1,11 @@
+import atexit
+import os
+import tempfile
 import typing

+import requests
+from duckdb.duckdb import DuckDBPyConnection
+
 from datacontract.engines.data_contract_checks import create_checks

 if typing.TYPE_CHECKING:
@@ -10,7 +16,7 @@ from datacontract.engines.datacontract.check_that_datacontract_contains_valid_se
 )
 from datacontract.engines.fastjsonschema.check_jsonschema import check_jsonschema
 from datacontract.engines.soda.check_soda_execute import check_soda_execute
-from datacontract.model.data_contract_specification import DataContractSpecification
+from datacontract.model.data_contract_specification import DataContractSpecification, Server
 from datacontract.model.exceptions import DataContractException
 from datacontract.model.run import ResultEnum, Run

@@ -20,6 +26,7 @@ def execute_data_contract_test(
     run: Run,
     server_name: str = None,
     spark: "SparkSession" = None,
+    duckdb_connection: DuckDBPyConnection = None,
 ):
     if data_contract_specification.models is None or len(data_contract_specification.models) == 0:
         raise DataContractException(
@@ -29,6 +36,12 @@ def execute_data_contract_test(
             reason="Models block is missing. Skip executing tests.",
             engine="datacontract",
         )
+    if (
+        server_name is None
+        and data_contract_specification.servers is not None
+        and len(data_contract_specification.servers) > 0
+    ):
+        server_name = list(data_contract_specification.servers.keys())[0]
     server = get_server(data_contract_specification, server_name)
     run.log_info(f"Running tests for data contract {data_contract_specification.id} with server {server_name}")
     run.dataContractId = data_contract_specification.id
@@ -37,16 +50,19 @@ def execute_data_contract_test(
     run.outputPortId = server.outputPortId
     run.server = server_name

+    if server.type == "api":
+        server = process_api_response(run, server)
+
     run.checks.extend(create_checks(data_contract_specification, server))

     # TODO check server is supported type for nicer error messages
     # TODO check server credentials are complete for nicer error messages
     if server.format == "json" and server.type != "kafka":
         check_jsonschema(run, data_contract_specification, server)
-    check_soda_execute(run, data_contract_specification, server, spark)
+    check_soda_execute(run, data_contract_specification, server, spark, duckdb_connection)


-def get_server(data_contract_specification: DataContractSpecification, server_name: str = None):
+def get_server(data_contract_specification: DataContractSpecification, server_name: str = None) -> Server | None:
     """Get the server configuration from the data contract specification.

     Args:
@@ -59,9 +75,39 @@ def get_server(data_contract_specification: DataContractSpecification, server_na

     check_that_datacontract_contains_valid_server_configuration(data_contract_specification, server_name)

-    if server_name:
+    if server_name is not None:
         server = data_contract_specification.servers.get(server_name)
     else:
         server_name = list(data_contract_specification.servers.keys())[0]
         server = data_contract_specification.servers.get(server_name)
     return server
+
+
+def process_api_response(run, server):
+    tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract_cli_api_")
+    atexit.register(tmp_dir.cleanup)
+    headers = {}
+    if os.getenv("DATACONTRACT_API_HEADER_AUTHORIZATION") is not None:
+        headers["Authorization"] = os.getenv("DATACONTRACT_API_HEADER_AUTHORIZATION")
+    try:
+        response = requests.get(server.location, headers=headers)
+        response.raise_for_status()
+    except requests.exceptions.RequestException as e:
+        raise DataContractException(
+            type="connection",
+            name="API server connection error",
+            result=ResultEnum.error,
+            reason=f"Failed to fetch API response from {server.location}: {e}",
+            engine="datacontract",
+        )
+    with open(f"{tmp_dir.name}/api_response.json", "w") as f:
+        f.write(response.text)
+    run.log_info(f"Saved API response to {tmp_dir.name}/api_response.json")
+    server = Server(
+        type="local",
+        format="json",
+        path=f"{tmp_dir.name}/api_response.json",
+        dataProductId=server.dataProductId,
+        outputPortId=server.outputPortId,
+    )
+    return server

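The new "api" server type above is resolved at test time: the endpoint is fetched once (optionally with an Authorization header taken from DATACONTRACT_API_HEADER_AUTHORIZATION), the body is stored in a temporary file, and the server is rewritten to a plain local JSON server so the existing checks run unchanged. A minimal sketch of that flow, with a placeholder endpoint and token, assuming Run.create_run() as the run factory:

import os

from datacontract.engines.data_contract_test import process_api_response
from datacontract.model.data_contract_specification import Server
from datacontract.model.run import Run

os.environ["DATACONTRACT_API_HEADER_AUTHORIZATION"] = "Bearer <token>"  # optional auth header (placeholder)

run = Run.create_run()  # assumed factory; any Run that supports log_info works
api_server = Server(type="api", location="https://example.com/orders.json")  # placeholder endpoint

# The response is written to a temp file and the server becomes a local JSON server,
# so the jsonschema and soda checks need no special handling for APIs.
local_server = process_api_response(run, api_server)
print(local_server.type, local_server.format, local_server.path)
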
datacontract/engines/fastjsonschema/check_jsonschema.py

@@ -1,8 +1,9 @@
+import glob
 import json
 import logging
 import os
 import threading
-from typing import List, Optional
+from typing import Any, Callable, Generator, List, Optional

 import fastjsonschema
 from fastjsonschema import JsonSchemaValueException
@@ -85,7 +86,7 @@ def process_exceptions(run, exceptions: List[DataContractException]):


 def validate_json_stream(
-    schema: dict, model_name: str, validate: callable, json_stream: list[dict]
+    schema: dict, model_name: str, validate: Callable, json_stream: Generator[Any, Any, None]
 ) -> List[DataContractException]:
     logging.info(f"Validating JSON stream for model: '{model_name}'.")
     exceptions: List[DataContractException] = []
@@ -99,7 +100,7 @@ def validate_json_stream(
                 DataContractException(
                     type="schema",
                     name="Check that JSON has valid schema",
-                    result="failed",
+                    result=ResultEnum.failed,
                     reason=f"{f'#{primary_key_value}: ' if primary_key_value is not None else ''}{e.message}",
                     model=model_name,
                     engine="jsonschema",
@@ -159,27 +160,44 @@ def process_json_file(run, schema, model_name, validate, file, delimiter):

 def process_local_file(run, server, schema, model_name, validate):
     path = server.path
+    if not path:
+        raise DataContractException(
+            type="schema",
+            name="Check that JSON has valid schema",
+            result=ResultEnum.warning,
+            reason="For server with type 'local', a 'path' must be defined.",
+            engine="datacontract",
+        )
     if "{model}" in path:
         path = path.format(model=model_name)

+    all_files = []
     if os.path.isdir(path):
-        return process_directory(run, path, server, model_name, validate)
+        # Fetch all JSONs in the directory
+        for root, _, files in os.walk(path):
+            for file in files:
+                if file.endswith(".json"):
+                    all_files.append(os.path.join(root, file))
     else:
-        logging.info(f"Processing file {path}")
-        with open(path, "r") as file:
-            process_json_file(run, schema, model_name, validate, file, server.delimiter)
+        # Use glob to fetch all JSONs
+        for file_path in glob.glob(path, recursive=True):
+            if os.path.isfile(file_path):
+                if file_path.endswith(".json"):
+                    all_files.append(file_path)

+    if not all_files:
+        raise DataContractException(
+            type="schema",
+            name="Check that JSON has valid schema",
+            result=ResultEnum.warning,
+            reason=f"No files found in '{path}'.",
+            engine="datacontract",
+        )

-def process_directory(run, path, server, model_name, validate):
-    success = True
-    for filename in os.listdir(path):
-        if filename.endswith(".json"):  # or make this a parameter
-            file_path = os.path.join(path, filename)
-            with open(file_path, "r") as file:
-                if not process_json_file(run, model_name, validate, file, server.delimiter):
-                    success = False
-                    break
-    return success
+    for file in all_files:
+        logging.info(f"Processing file: {file}")
+        with open(file, "r") as f:
+            process_json_file(run, schema, model_name, validate, f, server.delimiter)


 def process_s3_file(run, server, schema, model_name, validate):
@@ -201,7 +219,7 @@ def process_s3_file(run, server, schema, model_name, validate):
         raise DataContractException(
             type="schema",
             name="Check that JSON has valid schema",
-            result="warning",
+            result=ResultEnum.warning,
             reason=f"Cannot find any file in {s3_location}",
             engine="datacontract",
         )
@@ -222,7 +240,7 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
             Check(
                 type="schema",
                 name="Check that JSON has valid schema",
-                result="warning",
+                result=ResultEnum.warning,
                 reason="Server format is not 'json'. Skip validating jsonschema.",
                 engine="jsonschema",
             )

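The rewritten process_local_file treats a local path either as a directory (walked recursively) or as a glob pattern, keeps only *.json files, and raises a warning-level exception when nothing matches. A standalone sketch of the same discovery rule, using an invented path pattern:

import glob
import os


def discover_json_files(path: str) -> list[str]:
    # Directory: walk it recursively and keep *.json files.
    if os.path.isdir(path):
        return [
            os.path.join(root, name)
            for root, _, names in os.walk(path)
            for name in names
            if name.endswith(".json")
        ]
    # Anything else: treat the path as a (possibly recursive) glob pattern.
    return [p for p in glob.glob(path, recursive=True) if os.path.isfile(p) and p.endswith(".json")]


print(discover_json_files("data/**/orders-*.json"))  # invented example pattern
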
datacontract/engines/fastjsonschema/s3/s3_read_files.py

@@ -2,6 +2,7 @@ import logging
 import os

 from datacontract.model.exceptions import DataContractException
+from datacontract.model.run import ResultEnum


 def yield_s3_files(s3_endpoint_url, s3_location):
@@ -19,9 +20,9 @@ def s3_fs(s3_endpoint_url):
    except ImportError as e:
        raise DataContractException(
            type="schema",
-            result="failed",
+            result=ResultEnum.failed,
            name="s3 extra missing",
-            reason="Install the extra datacontract-cli\[s3] to use s3",
+            reason="Install the extra s3 to use s3",
            engine="datacontract",
            original_exception=e,
        )

datacontract/engines/soda/check_soda_execute.py

@@ -1,6 +1,15 @@
 import logging
+import typing
 import uuid

+from datacontract.engines.soda.connections.athena import to_athena_soda_configuration
+from datacontract.engines.soda.connections.oracle import initialize_client_and_create_soda_configuration
+
+if typing.TYPE_CHECKING:
+    from pyspark.sql import SparkSession
+
+    from duckdb.duckdb import DuckDBPyConnection
+
 from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
 from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration
 from datacontract.engines.soda.connections.duckdb_connection import get_duckdb_connection
@@ -14,7 +23,13 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
 from datacontract.model.run import Check, Log, ResultEnum, Run


-def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark):
+def check_soda_execute(
+    run: Run,
+    data_contract: DataContractSpecification,
+    server: Server,
+    spark: "SparkSession" = None,
+    duckdb_connection: DuckDBPyConnection = None,
+):
     from soda.common.config_helper import ConfigHelper

     ConfigHelper.get_instance().upsert_value("send_anonymous_usage_stats", False)
@@ -30,7 +45,7 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
     if server.type in ["s3", "gcs", "azure", "local"]:
         if server.format in ["json", "parquet", "csv", "delta"]:
             run.log_info(f"Configuring engine soda-core to connect to {server.type} {server.format} with duckdb")
-            con = get_duckdb_connection(data_contract, server, run)
+            con = get_duckdb_connection(data_contract, server, run, duckdb_connection)
             scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)
             scan.set_data_source_name(server.type)
         else:
@@ -62,7 +77,8 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
             run.log_info("Connecting to databricks via spark")
             scan.add_spark_session(spark, data_source_name=server.type)
             scan.set_data_source_name(server.type)
-            spark.sql(f"USE {server.catalog}.{server.schema_}")
+            database_name = ".".join(filter(None, [server.catalog, server.schema_]))
+            spark.sql(f"USE {database_name}")
         else:
             run.log_info("Connecting to databricks directly")
             soda_configuration_str = to_databricks_soda_configuration(server)
@@ -89,10 +105,18 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
         soda_configuration_str = to_sqlserver_soda_configuration(server)
         scan.add_configuration_yaml_str(soda_configuration_str)
         scan.set_data_source_name(server.type)
+    elif server.type == "oracle":
+        soda_configuration_str = initialize_client_and_create_soda_configuration(server)
+        scan.add_configuration_yaml_str(soda_configuration_str)
+        scan.set_data_source_name(server.type)
     elif server.type == "trino":
         soda_configuration_str = to_trino_soda_configuration(server)
         scan.add_configuration_yaml_str(soda_configuration_str)
         scan.set_data_source_name(server.type)
+    elif server.type == "athena":
+        soda_configuration_str = to_athena_soda_configuration(server)
+        scan.add_configuration_yaml_str(soda_configuration_str)
+        scan.set_data_source_name(server.type)

     else:
         run.checks.append(

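check_soda_execute now accepts an optional duckdb_connection and passes it down to get_duckdb_connection, so a caller can reuse a connection it has already configured (extensions, secrets, attached databases). A hedged sketch via the engine entry point; resolve_data_contract and Run.create_run() are assumed from the surrounding codebase, and the contract path is a placeholder:

import duckdb

from datacontract.engines.data_contract_test import execute_data_contract_test
from datacontract.lint.resolve import resolve_data_contract
from datacontract.model.run import Run

con = duckdb.connect(database=":memory:")  # pre-configured connection to reuse

spec = resolve_data_contract(data_contract_location="datacontract.yaml")  # assumed helper
run = Run.create_run()  # assumed factory
execute_data_contract_test(spec, run, server_name="local", duckdb_connection=con)
print(run.result)
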
datacontract/engines/soda/connections/athena.py (new file)

@@ -0,0 +1,79 @@
+import os
+
+import yaml
+
+from datacontract.model.exceptions import DataContractException
+
+
+def to_athena_soda_configuration(server):
+    s3_region = os.getenv("DATACONTRACT_S3_REGION")
+    s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
+    s3_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
+    s3_session_token = os.getenv("DATACONTRACT_S3_SESSION_TOKEN")
+
+    # Validate required parameters
+    if not s3_access_key_id:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_access_key_id",
+            reason="AWS access key ID is required. Set the DATACONTRACT_S3_ACCESS_KEY_ID environment variable.",
+            engine="datacontract",
+        )
+
+    if not s3_secret_access_key:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_secret_access_key",
+            reason="AWS secret access key is required. Set the DATACONTRACT_S3_SECRET_ACCESS_KEY environment variable.",
+            engine="datacontract",
+        )
+
+    if not hasattr(server, "schema_") or not server.schema_:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_schema",
+            reason="Schema is required for Athena connection. Specify the schema where your tables exist in the server configuration.",
+            engine="datacontract",
+        )
+
+    if not hasattr(server, "stagingDir") or not server.stagingDir:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_s3_staging_dir",
+            reason="S3 staging directory is required for Athena connection. This should be the Amazon S3 Query Result Location (e.g., 's3://my-bucket/athena-results/').",
+            engine="datacontract",
+        )
+
+    # Validate S3 staging directory format
+    if not server.stagingDir.startswith("s3://"):
+        raise DataContractException(
+            type="athena-connection",
+            name="invalid_s3_staging_dir",
+            reason=f"S3 staging directory must start with 's3://'. Got: {server.s3_staging_dir}. Example: 's3://my-bucket/athena-results/'",
+            engine="datacontract",
+        )
+
+    data_source = {
+        "type": "athena",
+        "access_key_id": s3_access_key_id,
+        "secret_access_key": s3_secret_access_key,
+        "schema": server.schema_,
+        "staging_dir": server.stagingDir,
+    }
+
+    if s3_region:
+        data_source["region_name"] = s3_region
+    elif server.region_name:
+        data_source["region_name"] = server.region_name
+
+    if server.catalog:
+        # Optional, Identify the name of the Data Source, also referred to as a Catalog. The default value is `awsdatacatalog`.
+        data_source["catalog"] = server.catalog
+
+    if s3_session_token:
+        data_source["session_token"] = s3_session_token
+
+    soda_configuration = {f"data_source {server.type}": data_source}

+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str

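A hedged sketch of what to_athena_soda_configuration emits. A types.SimpleNamespace stands in for the Server model so no field names are assumed beyond the attributes the function reads; all values are placeholders:

import os
from types import SimpleNamespace

from datacontract.engines.soda.connections.athena import to_athena_soda_configuration

os.environ["DATACONTRACT_S3_ACCESS_KEY_ID"] = "AKIA..."        # required (placeholder)
os.environ["DATACONTRACT_S3_SECRET_ACCESS_KEY"] = "secret"     # required (placeholder)
os.environ["DATACONTRACT_S3_REGION"] = "eu-central-1"          # optional

server = SimpleNamespace(
    type="athena",
    schema_="my_schema",                          # schema where the tables live
    stagingDir="s3://my-bucket/athena-results/",  # S3 query result location
    catalog=None,                                 # defaults to awsdatacatalog when omitted
)

# Prints a Soda data_source block with access_key_id, region_name, schema,
# secret_access_key and staging_dir keys.
print(to_athena_soda_configuration(server))
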
datacontract/engines/soda/connections/duckdb_connection.py

@@ -1,14 +1,24 @@
 import os
-from typing import Any
+from typing import Any, Dict

 import duckdb

-from datacontract.export.csv_type_converter import convert_to_duckdb_csv_type
+from datacontract.export.duckdb_type_converter import convert_to_duckdb_csv_type, convert_to_duckdb_json_type
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server
 from datacontract.model.run import Run


-def get_duckdb_connection(data_contract, server, run: Run):
-    con = duckdb.connect(database=":memory:")
+def get_duckdb_connection(
+    data_contract: DataContractSpecification,
+    server: Server,
+    run: Run,
+    duckdb_connection: duckdb.DuckDBPyConnection | None = None,
+) -> duckdb.DuckDBPyConnection:
+    if duckdb_connection is None:
+        con = duckdb.connect(database=":memory:")
+    else:
+        con = duckdb_connection
+
     path: str = ""
     if server.type == "local":
         path = server.path
@@ -33,9 +43,16 @@ def get_duckdb_connection(data_contract, server, run: Run):
                 json_format = "newline_delimited"
             elif server.delimiter == "array":
                 json_format = "array"
-            con.sql(f"""
+            columns = to_json_types(model)
+            if columns is None:
+                con.sql(f"""
                 CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', hive_partitioning=1);
             """)
+            else:
+                con.sql(
+                    f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', columns={columns}, hive_partitioning=1);"""
+                )
+            add_nested_views(con, model_name, model.fields)
         elif server.format == "parquet":
             con.sql(f"""
                 CREATE VIEW "{model_name}" AS SELECT * FROM read_parquet('{model_path}', hive_partitioning=1);
@@ -54,6 +71,9 @@ def get_duckdb_connection(data_contract, server, run: Run):
         elif server.format == "delta":
             con.sql("update extensions;")  # Make sure we have the latest delta extension
             con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM delta_scan('{model_path}');""")
+        table_info = con.sql(f"PRAGMA table_info('{model_name}');").fetchdf()
+        if table_info is not None and not table_info.empty:
+            run.log_info(f"DuckDB Table Info: {table_info.to_string(index=False)}")
     return con


@@ -67,6 +87,45 @@ def to_csv_types(model) -> dict[Any, str | None] | None:
     return columns


+def to_json_types(model: Model) -> dict[Any, str | None] | None:
+    if model is None:
+        return None
+    columns = {}
+    for field_name, field in model.fields.items():
+        columns[field_name] = convert_to_duckdb_json_type(field)
+    return columns
+
+
+def add_nested_views(con: duckdb.DuckDBPyConnection, model_name: str, fields: Dict[str, Field] | None):
+    model_name = model_name.strip('"')
+    if fields is None:
+        return
+    for field_name, field in fields.items():
+        if field.type is None or field.type.lower() not in ["array", "object"]:
+            continue
+        field_type = field.type.lower()
+        if field_type == "array" and field.items is None:
+            continue
+        elif field_type == "object" and field.fields is None:
+            continue
+
+        nested_model_name = f"{model_name}__{field_name}"
+        max_depth = 2 if field_type == "array" else 1
+
+        ## if parent field is not required, the nested objects may respolve
+        ## to a row of NULLs -- but if the objects themselves have required
+        ## fields, this will fail the check.
+        where = "" if field.required else f" WHERE {field_name} IS NOT NULL"
+        con.sql(f"""
+            CREATE VIEW IF NOT EXISTS "{nested_model_name}" AS
+            SELECT unnest({field_name}, max_depth := {max_depth}) as {field_name} FROM "{model_name}" {where}
+        """)
+        if field_type == "array":
+            add_nested_views(con, nested_model_name, field.items.fields)
+        elif field_type == "object":
+            add_nested_views(con, nested_model_name, field.fields)
+
+
 def setup_s3_connection(con, server):
     s3_region = os.getenv("DATACONTRACT_S3_REGION")
     s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
@@ -76,10 +135,10 @@ def setup_s3_connection(con, server):
     use_ssl = "true"
     url_style = "vhost"
     if server.endpointUrl is not None:
+        url_style = "path"
         s3_endpoint = server.endpointUrl.removeprefix("http://").removeprefix("https://")
         if server.endpointUrl.startswith("http://"):
             use_ssl = "false"
-            url_style = "path"

     if s3_access_key_id is not None:
         if s3_session_token is not None:

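add_nested_views above exposes nested array and object fields as extra views named <model>__<field>, built with DuckDB's unnest so that checks can target nested records like flat models. A self-contained illustration of the generated statement against an ad-hoc table (table and column names are invented):

import duckdb

con = duckdb.connect(database=":memory:")
con.sql("""
    CREATE TABLE orders AS
    SELECT 1 AS order_id, [{'sku': 'A', 'qty': 2}, {'sku': 'B', 'qty': 1}] AS line_items
""")

# Mirrors the generated statement: unnest the array field into its own view,
# using max_depth := 2 to also unpack the contained structs.
con.sql("""
    CREATE VIEW IF NOT EXISTS "orders__line_items" AS
    SELECT unnest(line_items, max_depth := 2) AS line_items FROM "orders"
""")
print(con.sql('SELECT * FROM "orders__line_items"').fetchall())
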
datacontract/engines/soda/connections/kafka.py

@@ -6,6 +6,7 @@ import tempfile
 from datacontract.export.avro_converter import to_avro_schema_json
 from datacontract.model.data_contract_specification import DataContractSpecification, Field, Server
 from datacontract.model.exceptions import DataContractException
+from datacontract.model.run import ResultEnum


 def create_spark_session():
@@ -16,7 +17,7 @@ def create_spark_session():
     except ImportError as e:
         raise DataContractException(
             type="schema",
-            result="failed",
+            result=ResultEnum.failed,
             name="pyspark is missing",
             reason="Install the extra datacontract-cli[kafka] to use kafka",
             engine="datacontract",
@@ -26,6 +27,7 @@ def create_spark_session():
     tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract-cli-spark")
     atexit.register(tmp_dir.cleanup)

+    pyspark_version = "3.5.5"  # MUST be the same as in the pyproject.toml
     spark = (
         SparkSession.builder.appName("datacontract")
         .config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse")
@@ -33,7 +35,7 @@ def create_spark_session():
         .config("spark.ui.enabled", "false")
         .config(
             "spark.jars.packages",
-            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.2,org.apache.spark:spark-avro_2.12:3.5.2",
+            f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark_version},org.apache.spark:spark-avro_2.12:{pyspark_version}",
         )
         .getOrCreate()
     )

datacontract/engines/soda/connections/oracle.py (new file)

@@ -0,0 +1,50 @@
+import os
+
+import yaml
+
+from datacontract.model.data_contract_specification import Server
+
+
+def initialize_client_and_create_soda_configuration(server: Server) -> str:
+    import oracledb
+    soda_config = to_oracle_soda_configuration(server)
+
+    oracle_client_dir = os.getenv("DATACONTRACT_ORACLE_CLIENT_DIR")
+    if oracle_client_dir is not None:
+        # Soda Core currently does not support thick mode natively, see https://github.com/sodadata/soda-core/issues/2036
+        # but the oracledb client can be configured accordingly before Soda initializes as a work-around
+        oracledb.init_oracle_client(lib_dir=oracle_client_dir)
+
+    return soda_config
+
+
+def to_oracle_soda_configuration(server: Server) -> str:
+    """Serialize server config to soda configuration.
+
+
+    ### Example:
+    type: oracle
+    host: database-1.us-east-1.rds.amazonaws.com
+    port: '1521'
+    username: simple
+    password: simple_pass
+    connectstring: database-1.us-east-1.rds.amazonaws.com:1521/ORCL (database is equal to service name at oracle)
+    schema: SYSTEM
+    """
+
+    service_name = server.service_name or server.database
+    # with service account key, using an external json file
+    soda_configuration = {
+        f"data_source {server.type}": {
+            "type": "oracle",
+            "host": server.host,
+            "port": str(server.port),
+            "username": os.getenv("DATACONTRACT_ORACLE_USERNAME", ""),
+            "password": os.getenv("DATACONTRACT_ORACLE_PASSWORD", ""),
+            "connectstring": f"{server.host}:{server.port}/{service_name}",
+            "schema": server.schema_,
+        }
+    }
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str

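A hedged sketch of the Oracle configuration produced above; host, service name and schema are placeholders, credentials come from DATACONTRACT_ORACLE_USERNAME / DATACONTRACT_ORACLE_PASSWORD, and a types.SimpleNamespace stands in for the Server model:

import os
from types import SimpleNamespace

from datacontract.engines.soda.connections.oracle import to_oracle_soda_configuration

os.environ["DATACONTRACT_ORACLE_USERNAME"] = "simple"       # placeholder
os.environ["DATACONTRACT_ORACLE_PASSWORD"] = "simple_pass"  # placeholder
# Optionally set DATACONTRACT_ORACLE_CLIENT_DIR to an Oracle Instant Client directory so that
# initialize_client_and_create_soda_configuration switches the oracledb driver to thick mode.

server = SimpleNamespace(
    type="oracle",
    host="database-1.us-east-1.rds.amazonaws.com",
    port="1521",
    service_name="ORCL",   # falls back to server.database when not set
    database=None,
    schema_="SYSTEM",
)

# Prints a Soda data_source block with host, port, username, password,
# connectstring (host:port/service_name) and schema.
print(to_oracle_soda_configuration(server))
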
datacontract/export/avro_converter.py

@@ -44,12 +44,18 @@ def to_avro_field(field, field_name):
     avro_type = to_avro_type(field, field_name)
     avro_field["type"] = avro_type if is_required_avro else ["null", avro_type]

-    if avro_field["type"] == "enum":
-        avro_field["type"] = {
+    # Handle enum types - both required and optional
+    if avro_type == "enum" or (isinstance(avro_field["type"], list) and "enum" in avro_field["type"]):
+        enum_def = {
             "type": "enum",
             "name": field.title,
             "symbols": field.enum,
         }
+        if is_required_avro:
+            avro_field["type"] = enum_def
+        else:
+            # Replace "enum" with the full enum definition in the union
+            avro_field["type"] = ["null", enum_def]

     if field.config:
         if "avroDefault" in field.config:
@@ -77,6 +83,10 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
         if "avroType" in field.config:
             return field.config["avroType"]

+    # Check for enum fields based on presence of enum list and avroType config
+    if field.enum and field.config and field.config.get("avroType") == "enum":
+        return "enum"
+
     if field.type is None:
         return "null"
     if field.type in ["string", "varchar", "text"]:
@@ -91,7 +101,9 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
         if field.precision is not None:
             typeVal["precision"] = field.precision
         return typeVal
-    elif field.type in ["float", "double"]:
+    elif field.type in ["float"]:
+        return "float"
+    elif field.type in ["double"]:
         return "double"
     elif field.type in ["integer", "int"]:
         return "int"
@@ -107,6 +119,11 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
         return {"type": "int", "logicalType": "date"}
     elif field.type in ["time"]:
         return "long"
+    elif field.type in ["map"]:
+        if field.config is not None and "values" in field.config:
+            return {"type": "map", "values": field.config["values"]}
+        else:
+            return "bytes"
     elif field.type in ["object", "record", "struct"]:
         if field.config is not None and "namespace" in field.config:
             return to_avro_record(field_name, field.fields, field.description, field.config["namespace"])

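The enum handling above now also covers optional fields: to_avro_type reports "enum" for a field that carries an enum list plus an avroType config, and to_avro_field expands that marker into a full enum definition, wrapped in a ["null", ...] union when the field is not required. A small sketch with an invented field definition:

from datacontract.export.avro_converter import to_avro_type
from datacontract.model.data_contract_specification import Field

status = Field(
    title="OrderStatus",
    type="string",
    enum=["PLACED", "SHIPPED", "DELIVERED"],
    config={"avroType": "enum"},
)

# Returns "enum"; to_avro_field replaces it with
# {"type": "enum", "name": "OrderStatus", "symbols": [...]} (inside ["null", ...] if optional).
print(to_avro_type(status, "status"))
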
datacontract/export/bigquery_converter.py

@@ -103,7 +103,7 @@ def map_type_to_bigquery(field: Field) -> str:
     elif field_type.lower() == "date":
         return "DATE"
     elif field_type.lower() == "timestamp_ntz":
-        return "TIME"
+        return "DATETIME"
     elif field_type.lower() in ["number", "decimal", "numeric"]:
         return "NUMERIC"
     elif field_type.lower() == "double":