datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (136)
  1. datacontract/__init__.py +13 -0
  2. datacontract/api.py +260 -0
  3. datacontract/breaking/breaking.py +242 -12
  4. datacontract/breaking/breaking_rules.py +37 -1
  5. datacontract/catalog/catalog.py +80 -0
  6. datacontract/cli.py +387 -117
  7. datacontract/data_contract.py +216 -353
  8. datacontract/engines/data_contract_checks.py +1041 -0
  9. datacontract/engines/data_contract_test.py +113 -0
  10. datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
  11. datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
  12. datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
  13. datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
  14. datacontract/engines/soda/check_soda_execute.py +100 -56
  15. datacontract/engines/soda/connections/athena.py +79 -0
  16. datacontract/engines/soda/connections/bigquery.py +8 -1
  17. datacontract/engines/soda/connections/databricks.py +12 -3
  18. datacontract/engines/soda/connections/duckdb_connection.py +241 -0
  19. datacontract/engines/soda/connections/kafka.py +206 -113
  20. datacontract/engines/soda/connections/snowflake.py +8 -5
  21. datacontract/engines/soda/connections/sqlserver.py +43 -0
  22. datacontract/engines/soda/connections/trino.py +26 -0
  23. datacontract/export/avro_converter.py +72 -8
  24. datacontract/export/avro_idl_converter.py +31 -25
  25. datacontract/export/bigquery_converter.py +130 -0
  26. datacontract/export/custom_converter.py +40 -0
  27. datacontract/export/data_caterer_converter.py +161 -0
  28. datacontract/export/dbml_converter.py +148 -0
  29. datacontract/export/dbt_converter.py +141 -54
  30. datacontract/export/dcs_exporter.py +6 -0
  31. datacontract/export/dqx_converter.py +126 -0
  32. datacontract/export/duckdb_type_converter.py +57 -0
  33. datacontract/export/excel_exporter.py +923 -0
  34. datacontract/export/exporter.py +100 -0
  35. datacontract/export/exporter_factory.py +216 -0
  36. datacontract/export/go_converter.py +105 -0
  37. datacontract/export/great_expectations_converter.py +257 -36
  38. datacontract/export/html_exporter.py +86 -0
  39. datacontract/export/iceberg_converter.py +188 -0
  40. datacontract/export/jsonschema_converter.py +71 -16
  41. datacontract/export/markdown_converter.py +337 -0
  42. datacontract/export/mermaid_exporter.py +110 -0
  43. datacontract/export/odcs_v3_exporter.py +375 -0
  44. datacontract/export/pandas_type_converter.py +40 -0
  45. datacontract/export/protobuf_converter.py +168 -68
  46. datacontract/export/pydantic_converter.py +6 -0
  47. datacontract/export/rdf_converter.py +13 -6
  48. datacontract/export/sodacl_converter.py +36 -188
  49. datacontract/export/spark_converter.py +245 -0
  50. datacontract/export/sql_converter.py +37 -3
  51. datacontract/export/sql_type_converter.py +269 -8
  52. datacontract/export/sqlalchemy_converter.py +170 -0
  53. datacontract/export/terraform_converter.py +7 -2
  54. datacontract/imports/avro_importer.py +246 -26
  55. datacontract/imports/bigquery_importer.py +221 -0
  56. datacontract/imports/csv_importer.py +143 -0
  57. datacontract/imports/dbml_importer.py +112 -0
  58. datacontract/imports/dbt_importer.py +240 -0
  59. datacontract/imports/excel_importer.py +1111 -0
  60. datacontract/imports/glue_importer.py +288 -0
  61. datacontract/imports/iceberg_importer.py +172 -0
  62. datacontract/imports/importer.py +51 -0
  63. datacontract/imports/importer_factory.py +128 -0
  64. datacontract/imports/json_importer.py +325 -0
  65. datacontract/imports/jsonschema_importer.py +146 -0
  66. datacontract/imports/odcs_importer.py +60 -0
  67. datacontract/imports/odcs_v3_importer.py +516 -0
  68. datacontract/imports/parquet_importer.py +81 -0
  69. datacontract/imports/protobuf_importer.py +264 -0
  70. datacontract/imports/spark_importer.py +262 -0
  71. datacontract/imports/sql_importer.py +274 -35
  72. datacontract/imports/unity_importer.py +219 -0
  73. datacontract/init/init_template.py +20 -0
  74. datacontract/integration/datamesh_manager.py +86 -0
  75. datacontract/lint/resolve.py +271 -49
  76. datacontract/lint/resources.py +21 -0
  77. datacontract/lint/schema.py +53 -17
  78. datacontract/lint/urls.py +32 -12
  79. datacontract/model/data_contract_specification/__init__.py +1 -0
  80. datacontract/model/exceptions.py +4 -1
  81. datacontract/model/odcs.py +24 -0
  82. datacontract/model/run.py +49 -29
  83. datacontract/output/__init__.py +0 -0
  84. datacontract/output/junit_test_results.py +135 -0
  85. datacontract/output/output_format.py +10 -0
  86. datacontract/output/test_results_writer.py +79 -0
  87. datacontract/py.typed +0 -0
  88. datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
  89. datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
  90. datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
  91. datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
  92. datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
  93. datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
  94. datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
  95. datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
  96. datacontract/templates/datacontract.html +139 -294
  97. datacontract/templates/datacontract_odcs.html +685 -0
  98. datacontract/templates/index.html +236 -0
  99. datacontract/templates/partials/datacontract_information.html +86 -0
  100. datacontract/templates/partials/datacontract_servicelevels.html +253 -0
  101. datacontract/templates/partials/datacontract_terms.html +51 -0
  102. datacontract/templates/partials/definition.html +25 -0
  103. datacontract/templates/partials/example.html +27 -0
  104. datacontract/templates/partials/model_field.html +144 -0
  105. datacontract/templates/partials/quality.html +49 -0
  106. datacontract/templates/partials/server.html +211 -0
  107. datacontract/templates/style/output.css +491 -72
  108. datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
  109. datacontract_cli-0.10.37.dist-info/RECORD +119 -0
  110. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
  111. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
  112. datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
  113. datacontract/engines/soda/connections/dask.py +0 -28
  114. datacontract/engines/soda/connections/duckdb.py +0 -76
  115. datacontract/export/csv_type_converter.py +0 -36
  116. datacontract/export/html_export.py +0 -66
  117. datacontract/export/odcs_converter.py +0 -102
  118. datacontract/init/download_datacontract_file.py +0 -17
  119. datacontract/integration/publish_datamesh_manager.py +0 -33
  120. datacontract/integration/publish_opentelemetry.py +0 -107
  121. datacontract/lint/lint.py +0 -141
  122. datacontract/lint/linters/description_linter.py +0 -34
  123. datacontract/lint/linters/example_model_linter.py +0 -91
  124. datacontract/lint/linters/field_pattern_linter.py +0 -34
  125. datacontract/lint/linters/field_reference_linter.py +0 -38
  126. datacontract/lint/linters/notice_period_linter.py +0 -55
  127. datacontract/lint/linters/quality_schema_linter.py +0 -52
  128. datacontract/lint/linters/valid_constraints_linter.py +0 -99
  129. datacontract/model/data_contract_specification.py +0 -141
  130. datacontract/web.py +0 -14
  131. datacontract_cli-0.10.0.dist-info/METADATA +0 -951
  132. datacontract_cli-0.10.0.dist-info/RECORD +0 -66
  133. /datacontract/{model → breaking}/breaking_change.py +0 -0
  134. /datacontract/{lint/linters → export}/__init__.py +0 -0
  135. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
  136. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
datacontract/engines/soda/check_soda_execute.py
@@ -1,28 +1,39 @@
 import logging
+import typing
+import uuid
 
-from pyspark.sql import SparkSession
-from soda.scan import Scan
-
-from datacontract.engines.soda.connections.bigquery import \
-    to_bigquery_soda_configuration
-from datacontract.engines.soda.connections.databricks import \
-    to_databricks_soda_configuration
-from datacontract.engines.soda.connections.duckdb import get_duckdb_connection
-from datacontract.engines.soda.connections.kafka import create_spark_session, \
-    read_kafka_topic
-from datacontract.engines.soda.connections.postgres import \
-    to_postgres_soda_configuration
-from datacontract.engines.soda.connections.snowflake import \
-    to_snowflake_soda_configuration
+from datacontract.engines.soda.connections.athena import to_athena_soda_configuration
+
+if typing.TYPE_CHECKING:
+    from pyspark.sql import SparkSession
+
+from duckdb.duckdb import DuckDBPyConnection
+
+from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
+from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration
+from datacontract.engines.soda.connections.duckdb_connection import get_duckdb_connection
+from datacontract.engines.soda.connections.kafka import create_spark_session, read_kafka_topic
+from datacontract.engines.soda.connections.postgres import to_postgres_soda_configuration
+from datacontract.engines.soda.connections.snowflake import to_snowflake_soda_configuration
+from datacontract.engines.soda.connections.sqlserver import to_sqlserver_soda_configuration
+from datacontract.engines.soda.connections.trino import to_trino_soda_configuration
 from datacontract.export.sodacl_converter import to_sodacl_yaml
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Server
-from datacontract.model.run import Run, Check, Log
+from datacontract.model.data_contract_specification import DataContractSpecification, Server
+from datacontract.model.run import Check, Log, ResultEnum, Run
 
 
 def check_soda_execute(
-    run: Run, data_contract: DataContractSpecification, server: Server, spark: SparkSession, tmp_dir
+    run: Run,
+    data_contract: DataContractSpecification,
+    server: Server,
+    spark: "SparkSession" = None,
+    duckdb_connection: DuckDBPyConnection = None,
 ):
+    from soda.common.config_helper import ConfigHelper
+
+    ConfigHelper.get_instance().upsert_value("send_anonymous_usage_stats", False)
+    from soda.scan import Scan
+
     if data_contract is None:
         run.log_warn("Cannot run engine soda-core, as data contract is invalid")
         return
@@ -30,9 +41,10 @@ def check_soda_execute(
     run.log_info("Running engine soda-core")
     scan = Scan()
 
-    if server.type == "s3" or server.type == "local":
-        if server.format in ["json", "parquet", "csv"]:
-            con = get_duckdb_connection(data_contract, server)
+    if server.type in ["s3", "gcs", "azure", "local"]:
+        if server.format in ["json", "parquet", "csv", "delta"]:
+            run.log_info(f"Configuring engine soda-core to connect to {server.type} {server.format} with duckdb")
+            con = get_duckdb_connection(data_contract, server, run, duckdb_connection)
             scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)
             scan.set_data_source_name(server.type)
         else:
@@ -40,7 +52,7 @@ def check_soda_execute(
                 Check(
                     type="general",
                     name="Check that format is supported",
-                    result="warning",
+                    result=ResultEnum.warning,
                     reason=f"Format {server.format} not yet supported by datacontract CLI",
                     engine="datacontract",
                 )
@@ -61,27 +73,52 @@ def check_soda_execute(
         scan.set_data_source_name(server.type)
     elif server.type == "databricks":
         if spark is not None:
-            logging.info("Use Spark to connect to data source")
+            run.log_info("Connecting to databricks via spark")
             scan.add_spark_session(spark, data_source_name=server.type)
             scan.set_data_source_name(server.type)
-            spark.sql(f"USE {server.catalog}.{server.schema_}")
+            database_name = ".".join(filter(None, [server.catalog, server.schema_]))
+            spark.sql(f"USE {database_name}")
         else:
+            run.log_info("Connecting to databricks directly")
             soda_configuration_str = to_databricks_soda_configuration(server)
             scan.add_configuration_yaml_str(soda_configuration_str)
             scan.set_data_source_name(server.type)
+    elif server.type == "dataframe":
+        if spark is None:
+            run.log_warn(
+                "Server type dataframe only works with the Python library and requires a Spark session, "
+                "please provide one with the DataContract class"
+            )
+            return
+        else:
+            logging.info("Use Spark to connect to data source")
+            scan.add_spark_session(spark, data_source_name="datacontract-cli")
+            scan.set_data_source_name("datacontract-cli")
     elif server.type == "kafka":
         if spark is None:
-            spark = create_spark_session(tmp_dir)
-        read_kafka_topic(spark, data_contract, server, tmp_dir)
+            spark = create_spark_session()
+        read_kafka_topic(spark, data_contract, server)
         scan.add_spark_session(spark, data_source_name=server.type)
         scan.set_data_source_name(server.type)
+    elif server.type == "sqlserver":
+        soda_configuration_str = to_sqlserver_soda_configuration(server)
+        scan.add_configuration_yaml_str(soda_configuration_str)
+        scan.set_data_source_name(server.type)
+    elif server.type == "trino":
+        soda_configuration_str = to_trino_soda_configuration(server)
+        scan.add_configuration_yaml_str(soda_configuration_str)
+        scan.set_data_source_name(server.type)
+    elif server.type == "athena":
+        soda_configuration_str = to_athena_soda_configuration(server)
+        scan.add_configuration_yaml_str(soda_configuration_str)
+        scan.set_data_source_name(server.type)
 
     else:
         run.checks.append(
             Check(
                 type="general",
                 name="Check that server type is supported",
-                result="warning",
+                result=ResultEnum.warning,
                 reason=f"Server type {server.type} not yet supported by datacontract CLI",
                 engine="datacontract-cli",
             )
@@ -89,37 +126,34 @@ def check_soda_execute(
         run.log_warn(f"Server type {server.type} not yet supported by datacontract CLI")
         return
 
-    # Don't check types for json format, as they are checked with json schema
-    # Don't check types for avro format, as they are checked with avro schema
-    # Don't check types for csv format, as they are hard to detect
-    server_type = server.type
-    check_types = server.format != "json" and server.format != "csv" and server.format != "avro"
-
-    sodacl_yaml_str = to_sodacl_yaml(data_contract, server_type, check_types)
+    sodacl_yaml_str = to_sodacl_yaml(run)
     # print("sodacl_yaml_str:\n" + sodacl_yaml_str)
     scan.add_sodacl_yaml_str(sodacl_yaml_str)
 
     # Execute the scan
-    logging.info("Starting soda scan")
+    logging.info("Starting soda scan with checks:\n" + sodacl_yaml_str)
    scan.execute()
    logging.info("Finished soda scan")
 
    # pprint.PrettyPrinter(indent=2).pprint(scan.build_scan_results())
 
    scan_results = scan.get_scan_results()
-    for c in scan_results.get("checks"):
-        check = Check(
-            type="schema",
-            result=to_result(c),
-            reason=", ".join(c.get("outcomeReasons")),
-            name=c.get("name"),
-            model=c.get("table"),
-            field=c.get("column"),
-            engine="soda-core",
-            diagnostics=c.get("diagnostics"),
-        )
-        update_reason(check, c)
-        run.checks.append(check)
+    for scan_result in scan_results.get("checks"):
+        name = scan_result.get("name")
+        check = get_check(run, scan_result)
+        if check is None:
+            check = Check(
+                id=str(uuid.uuid4()),
+                category="custom",
+                type="custom",
+                name=name,
+                engine="soda-core",
+            )
+            run.checks.append(check)
+        check.result = to_result(scan_result)
+        check.reason = ", ".join(scan_result.get("outcomeReasons"))
+        check.diagnostics = scan_result.get("diagnostics")
+        update_reason(check, scan_result)
 
    for log in scan_results.get("logs"):
        run.logs.append(
@@ -135,8 +169,8 @@ def check_soda_execute(
        run.checks.append(
            Check(
                type="general",
-                name="Execute quality checks",
-                result="warning",
+                name="Data Contract Tests",
+                result=ResultEnum.warning,
                reason="Engine soda-core has errors. See the logs for details.",
                engine="soda-core",
            )
@@ -144,14 +178,22 @@ def check_soda_execute(
        return
 
 
-def to_result(c) -> str:
+def get_check(run, scan_result) -> Check | None:
+    check_by_name = next((c for c in run.checks if c.key == scan_result.get("name")), None)
+    if check_by_name is not None:
+        return check_by_name
+
+    return None
+
+
+def to_result(c) -> ResultEnum:
    soda_outcome = c.get("outcome")
    if soda_outcome == "pass":
-        return "passed"
+        return ResultEnum.passed
    elif soda_outcome == "fail":
-        return "failed"
+        return ResultEnum.failed
    else:
-        return soda_outcome
+        return ResultEnum.unknown
 
 
 def update_reason(check, c):
@@ -164,9 +206,11 @@ def update_reason(check, c):
         if block["title"] == "Diagnostics":
             # Extract and print the 'text' value
             diagnostics_text = block["text"]
-            print(diagnostics_text)
+            # print(diagnostics_text)
             diagnostics_text_split = diagnostics_text.split(":icon-fail: ")
             if len(diagnostics_text_split) > 1:
                 check.reason = diagnostics_text_split[1].strip()
-                print(check.reason)
+                # print(check.reason)
             break  # Exit the loop once the desired block is found
+    if "fail" in c["diagnostics"]:
+        check.reason = f"Value: {c['diagnostics']['value']} Fail: {c['diagnostics']['fail']}"
datacontract/engines/soda/connections/athena.py
@@ -0,0 +1,79 @@
+import os
+
+import yaml
+
+from datacontract.model.exceptions import DataContractException
+
+
+def to_athena_soda_configuration(server):
+    s3_region = os.getenv("DATACONTRACT_S3_REGION")
+    s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
+    s3_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
+    s3_session_token = os.getenv("DATACONTRACT_S3_SESSION_TOKEN")
+
+    # Validate required parameters
+    if not s3_access_key_id:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_access_key_id",
+            reason="AWS access key ID is required. Set the DATACONTRACT_S3_ACCESS_KEY_ID environment variable.",
+            engine="datacontract",
+        )
+
+    if not s3_secret_access_key:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_secret_access_key",
+            reason="AWS secret access key is required. Set the DATACONTRACT_S3_SECRET_ACCESS_KEY environment variable.",
+            engine="datacontract",
+        )
+
+    if not hasattr(server, "schema_") or not server.schema_:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_schema",
+            reason="Schema is required for Athena connection. Specify the schema where your tables exist in the server configuration.",
+            engine="datacontract",
+        )
+
+    if not hasattr(server, "stagingDir") or not server.stagingDir:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_s3_staging_dir",
+            reason="S3 staging directory is required for Athena connection. This should be the Amazon S3 Query Result Location (e.g., 's3://my-bucket/athena-results/').",
+            engine="datacontract",
+        )
+
+    # Validate S3 staging directory format
+    if not server.stagingDir.startswith("s3://"):
+        raise DataContractException(
+            type="athena-connection",
+            name="invalid_s3_staging_dir",
+            reason=f"S3 staging directory must start with 's3://'. Got: {server.s3_staging_dir}. Example: 's3://my-bucket/athena-results/'",
+            engine="datacontract",
+        )
+
+    data_source = {
+        "type": "athena",
+        "access_key_id": s3_access_key_id,
+        "secret_access_key": s3_secret_access_key,
+        "schema": server.schema_,
+        "staging_dir": server.stagingDir,
+    }
+
+    if s3_region:
+        data_source["region_name"] = s3_region
+    elif server.region_name:
+        data_source["region_name"] = server.region_name
+
+    if server.catalog:
+        # Optional, Identify the name of the Data Source, also referred to as a Catalog. The default value is `awsdatacatalog`.
+        data_source["catalog"] = server.catalog
+
+    if s3_session_token:
+        data_source["aws_session_token"] = s3_session_token
+
+    soda_configuration = {f"data_source {server.type}": data_source}
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
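
The new Athena support builds its Soda data source purely from DATACONTRACT_S3_* environment variables plus the server block of the contract. A rough, self-contained sketch of calling it with a stand-in server object (in the CLI the server comes from the contract's servers section; all names below are invented for illustration):

    # Hedged sketch: exercising to_athena_soda_configuration outside the CLI.
    import os
    from types import SimpleNamespace

    from datacontract.engines.soda.connections.athena import to_athena_soda_configuration

    os.environ["DATACONTRACT_S3_ACCESS_KEY_ID"] = "AKIAEXAMPLE"
    os.environ["DATACONTRACT_S3_SECRET_ACCESS_KEY"] = "example-secret"
    os.environ["DATACONTRACT_S3_REGION"] = "eu-central-1"

    server = SimpleNamespace(
        type="athena",
        schema_="my_schema",
        stagingDir="s3://my-bucket/athena-results/",
        catalog=None,
        region_name=None,
    )
    # Prints a "data_source athena" YAML block with schema, staging_dir,
    # region_name and the credentials read from the environment.
    print(to_athena_soda_configuration(server))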
datacontract/engines/soda/connections/bigquery.py
@@ -6,10 +6,17 @@ import yaml
 # https://docs.soda.io/soda/connect-bigquery.html#authentication-methods
 def to_bigquery_soda_configuration(server):
     # with service account key, using an external json file
+
+    # check for our own environment variable first
+    account_info = os.getenv("DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH")
+    if account_info is None:
+        # but as a fallback look for the default google one
+        account_info = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+
     soda_configuration = {
         f"data_source {server.type}": {
             "type": "bigquery",
-            "account_info_json_path": os.getenv("DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH"),
+            "account_info_json_path": account_info,
             "auth_scopes": ["https://www.googleapis.com/auth/bigquery"],
             "project_id": server.project,
             "dataset": server.dataset,
datacontract/engines/soda/connections/databricks.py
@@ -4,15 +4,24 @@ import yaml
 
 
 def to_databricks_soda_configuration(server):
+    token = os.getenv("DATACONTRACT_DATABRICKS_TOKEN")
+    if token is None:
+        raise ValueError("DATACONTRACT_DATABRICKS_TOKEN environment variable is not set")
+    http_path = os.getenv("DATACONTRACT_DATABRICKS_HTTP_PATH")
+    host = server.host
+    if host is None:
+        host = os.getenv("DATACONTRACT_DATABRICKS_SERVER_HOSTNAME")
+    if host is None:
+        raise ValueError("DATACONTRACT_DATABRICKS_SERVER_HOSTNAME environment variable is not set")
     soda_configuration = {
         f"data_source {server.type}": {
             "type": "spark",
             "method": "databricks",
-            "host": server.host,
+            "host": host,
             "catalog": server.catalog,
             "schema": server.schema_,
-            "http_path": os.getenv("DATACONTRACT_DATABRICKS_HTTP_PATH"),
-            "token": os.getenv("DATACONTRACT_DATABRICKS_TOKEN"),
+            "http_path": http_path,
+            "token": token,
         }
     }
 
datacontract/engines/soda/connections/duckdb_connection.py
@@ -0,0 +1,241 @@
+import os
+from typing import Any, Dict
+
+import duckdb
+
+from datacontract.export.duckdb_type_converter import convert_to_duckdb_csv_type, convert_to_duckdb_json_type
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server
+from datacontract.model.run import Run
+
+
+def get_duckdb_connection(
+    data_contract: DataContractSpecification,
+    server: Server,
+    run: Run,
+    duckdb_connection: duckdb.DuckDBPyConnection | None = None,
+) -> duckdb.DuckDBPyConnection:
+    if duckdb_connection is None:
+        con = duckdb.connect(database=":memory:")
+    else:
+        con = duckdb_connection
+
+    path: str = ""
+    if server.type == "local":
+        path = server.path
+    if server.type == "s3":
+        path = server.location
+        setup_s3_connection(con, server)
+    if server.type == "gcs":
+        path = server.location
+        setup_gcs_connection(con, server)
+    if server.type == "azure":
+        path = server.location
+        setup_azure_connection(con, server)
+    for model_name, model in data_contract.models.items():
+        model_path = path
+        if "{model}" in model_path:
+            model_path = model_path.format(model=model_name)
+        run.log_info(f"Creating table {model_name} for {model_path}")
+
+        if server.format == "json":
+            json_format = "auto"
+            if server.delimiter == "new_line":
+                json_format = "newline_delimited"
+            elif server.delimiter == "array":
+                json_format = "array"
+            columns = to_json_types(model)
+            if columns is None:
+                con.sql(f"""
+                CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', hive_partitioning=1);
+                """)
+            else:
+                con.sql(
+                    f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', columns={columns}, hive_partitioning=1);"""
+                )
+            add_nested_views(con, model_name, model.fields)
+        elif server.format == "parquet":
+            con.sql(f"""
+            CREATE VIEW "{model_name}" AS SELECT * FROM read_parquet('{model_path}', hive_partitioning=1);
+            """)
+        elif server.format == "csv":
+            columns = to_csv_types(model)
+            run.log_info("Using columns: " + str(columns))
+            if columns is None:
+                con.sql(
+                    f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1);"""
+                )
+            else:
+                con.sql(
+                    f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1, columns={columns});"""
+                )
+        elif server.format == "delta":
+            con.sql("update extensions;")  # Make sure we have the latest delta extension
+            con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM delta_scan('{model_path}');""")
+        table_info = con.sql(f"PRAGMA table_info('{model_name}');").fetchdf()
+        if table_info is not None and not table_info.empty:
+            run.log_info(f"DuckDB Table Info: {table_info.to_string(index=False)}")
+    return con
+
+
+def to_csv_types(model) -> dict[Any, str | None] | None:
+    if model is None:
+        return None
+    columns = {}
+    # ['SQLNULL', 'BOOLEAN', 'BIGINT', 'DOUBLE', 'TIME', 'DATE', 'TIMESTAMP', 'VARCHAR']
+    for field_name, field in model.fields.items():
+        columns[field_name] = convert_to_duckdb_csv_type(field)
+    return columns
+
+
+def to_json_types(model: Model) -> dict[Any, str | None] | None:
+    if model is None:
+        return None
+    columns = {}
+    for field_name, field in model.fields.items():
+        columns[field_name] = convert_to_duckdb_json_type(field)
+    return columns
+
+
+def add_nested_views(con: duckdb.DuckDBPyConnection, model_name: str, fields: Dict[str, Field] | None):
+    model_name = model_name.strip('"')
+    if fields is None:
+        return
+    for field_name, field in fields.items():
+        if field.type is None or field.type.lower() not in ["array", "object"]:
+            continue
+        field_type = field.type.lower()
+        if field_type == "array" and field.items is None:
+            continue
+        elif field_type == "object" and field.fields is None:
+            continue
+
+        nested_model_name = f"{model_name}__{field_name}"
+        max_depth = 2 if field_type == "array" else 1
+
+        ## if parent field is not required, the nested objects may respolve
+        ## to a row of NULLs -- but if the objects themselves have required
+        ## fields, this will fail the check.
+        where = "" if field.required else f" WHERE {field_name} IS NOT NULL"
+        con.sql(f"""
+            CREATE VIEW IF NOT EXISTS "{nested_model_name}" AS
+            SELECT unnest({field_name}, max_depth := {max_depth}) as {field_name} FROM "{model_name}" {where}
+        """)
+        if field_type == "array":
+            add_nested_views(con, nested_model_name, field.items.fields)
+        elif field_type == "object":
+            add_nested_views(con, nested_model_name, field.fields)
+
+
+def setup_s3_connection(con, server):
+    s3_region = os.getenv("DATACONTRACT_S3_REGION")
+    s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
+    s3_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
+    s3_session_token = os.getenv("DATACONTRACT_S3_SESSION_TOKEN")
+    s3_endpoint = "s3.amazonaws.com"
+    use_ssl = "true"
+    url_style = "vhost"
+    if server.endpointUrl is not None:
+        url_style = "path"
+        s3_endpoint = server.endpointUrl.removeprefix("http://").removeprefix("https://")
+        if server.endpointUrl.startswith("http://"):
+            use_ssl = "false"
+
+    if s3_access_key_id is not None:
+        if s3_session_token is not None:
+            con.sql(f"""
+            CREATE OR REPLACE SECRET s3_secret (
+                TYPE S3,
+                PROVIDER CREDENTIAL_CHAIN,
+                REGION '{s3_region}',
+                KEY_ID '{s3_access_key_id}',
+                SECRET '{s3_secret_access_key}',
+                SESSION_TOKEN '{s3_session_token}',
+                ENDPOINT '{s3_endpoint}',
+                USE_SSL '{use_ssl}',
+                URL_STYLE '{url_style}'
+            );
+            """)
+        else:
+            con.sql(f"""
+            CREATE OR REPLACE SECRET s3_secret (
+                TYPE S3,
+                PROVIDER CREDENTIAL_CHAIN,
+                REGION '{s3_region}',
+                KEY_ID '{s3_access_key_id}',
+                SECRET '{s3_secret_access_key}',
+                ENDPOINT '{s3_endpoint}',
+                USE_SSL '{use_ssl}',
+                URL_STYLE '{url_style}'
+            );
+            """)
+
+        # con.sql(f"""
+        # SET s3_region = '{s3_region}';
+        # SET s3_access_key_id = '{s3_access_key_id}';
+        # SET s3_secret_access_key = '{s3_secret_access_key}';
+        # """)
+    # else:
+    #     con.sql("""
+    #     RESET s3_region;
+    #     RESET s3_access_key_id;
+    #     RESET s3_secret_access_key;
+    #     """)
+    # con.sql("RESET s3_session_token")
+    # print(con.sql("SELECT * FROM duckdb_settings() WHERE name like 's3%'"))
+
+
+def setup_gcs_connection(con, server):
+    key_id = os.getenv("DATACONTRACT_GCS_KEY_ID")
+    secret = os.getenv("DATACONTRACT_GCS_SECRET")
+
+    if key_id is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_GCS_KEY_ID is not set")
+    if secret is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_GCS_SECRET is not set")
+
+    con.sql(f"""
+    CREATE SECRET gcs_secret (
+        TYPE GCS,
+        KEY_ID '{key_id}',
+        SECRET '{secret}'
+    );
+    """)
+
+
+def setup_azure_connection(con, server):
+    tenant_id = os.getenv("DATACONTRACT_AZURE_TENANT_ID")
+    client_id = os.getenv("DATACONTRACT_AZURE_CLIENT_ID")
+    client_secret = os.getenv("DATACONTRACT_AZURE_CLIENT_SECRET")
+    storage_account = server.storageAccount
+
+    if tenant_id is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_AZURE_TENANT_ID is not set")
+    if client_id is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_AZURE_CLIENT_ID is not set")
+    if client_secret is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_AZURE_CLIENT_SECRET is not set")
+
+    con.install_extension("azure")
+    con.load_extension("azure")
+
+    if storage_account is not None:
+        con.sql(f"""
+        CREATE SECRET azure_spn (
+            TYPE AZURE,
+            PROVIDER SERVICE_PRINCIPAL,
+            TENANT_ID '{tenant_id}',
+            CLIENT_ID '{client_id}',
+            CLIENT_SECRET '{client_secret}',
+            ACCOUNT_NAME '{storage_account}'
+        );
+        """)
+    else:
+        con.sql(f"""
+        CREATE SECRET azure_spn (
+            TYPE AZURE,
+            PROVIDER SERVICE_PRINCIPAL,
+            TENANT_ID '{tenant_id}',
+            CLIENT_ID '{client_id}',
+            CLIENT_SECRET '{client_secret}'
+        );
+        """)