datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. datacontract/__init__.py +13 -0
  2. datacontract/api.py +260 -0
  3. datacontract/breaking/breaking.py +242 -12
  4. datacontract/breaking/breaking_rules.py +37 -1
  5. datacontract/catalog/catalog.py +80 -0
  6. datacontract/cli.py +387 -117
  7. datacontract/data_contract.py +216 -353
  8. datacontract/engines/data_contract_checks.py +1041 -0
  9. datacontract/engines/data_contract_test.py +113 -0
  10. datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
  11. datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
  12. datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
  13. datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
  14. datacontract/engines/soda/check_soda_execute.py +100 -56
  15. datacontract/engines/soda/connections/athena.py +79 -0
  16. datacontract/engines/soda/connections/bigquery.py +8 -1
  17. datacontract/engines/soda/connections/databricks.py +12 -3
  18. datacontract/engines/soda/connections/duckdb_connection.py +241 -0
  19. datacontract/engines/soda/connections/kafka.py +206 -113
  20. datacontract/engines/soda/connections/snowflake.py +8 -5
  21. datacontract/engines/soda/connections/sqlserver.py +43 -0
  22. datacontract/engines/soda/connections/trino.py +26 -0
  23. datacontract/export/avro_converter.py +72 -8
  24. datacontract/export/avro_idl_converter.py +31 -25
  25. datacontract/export/bigquery_converter.py +130 -0
  26. datacontract/export/custom_converter.py +40 -0
  27. datacontract/export/data_caterer_converter.py +161 -0
  28. datacontract/export/dbml_converter.py +148 -0
  29. datacontract/export/dbt_converter.py +141 -54
  30. datacontract/export/dcs_exporter.py +6 -0
  31. datacontract/export/dqx_converter.py +126 -0
  32. datacontract/export/duckdb_type_converter.py +57 -0
  33. datacontract/export/excel_exporter.py +923 -0
  34. datacontract/export/exporter.py +100 -0
  35. datacontract/export/exporter_factory.py +216 -0
  36. datacontract/export/go_converter.py +105 -0
  37. datacontract/export/great_expectations_converter.py +257 -36
  38. datacontract/export/html_exporter.py +86 -0
  39. datacontract/export/iceberg_converter.py +188 -0
  40. datacontract/export/jsonschema_converter.py +71 -16
  41. datacontract/export/markdown_converter.py +337 -0
  42. datacontract/export/mermaid_exporter.py +110 -0
  43. datacontract/export/odcs_v3_exporter.py +375 -0
  44. datacontract/export/pandas_type_converter.py +40 -0
  45. datacontract/export/protobuf_converter.py +168 -68
  46. datacontract/export/pydantic_converter.py +6 -0
  47. datacontract/export/rdf_converter.py +13 -6
  48. datacontract/export/sodacl_converter.py +36 -188
  49. datacontract/export/spark_converter.py +245 -0
  50. datacontract/export/sql_converter.py +37 -3
  51. datacontract/export/sql_type_converter.py +269 -8
  52. datacontract/export/sqlalchemy_converter.py +170 -0
  53. datacontract/export/terraform_converter.py +7 -2
  54. datacontract/imports/avro_importer.py +246 -26
  55. datacontract/imports/bigquery_importer.py +221 -0
  56. datacontract/imports/csv_importer.py +143 -0
  57. datacontract/imports/dbml_importer.py +112 -0
  58. datacontract/imports/dbt_importer.py +240 -0
  59. datacontract/imports/excel_importer.py +1111 -0
  60. datacontract/imports/glue_importer.py +288 -0
  61. datacontract/imports/iceberg_importer.py +172 -0
  62. datacontract/imports/importer.py +51 -0
  63. datacontract/imports/importer_factory.py +128 -0
  64. datacontract/imports/json_importer.py +325 -0
  65. datacontract/imports/jsonschema_importer.py +146 -0
  66. datacontract/imports/odcs_importer.py +60 -0
  67. datacontract/imports/odcs_v3_importer.py +516 -0
  68. datacontract/imports/parquet_importer.py +81 -0
  69. datacontract/imports/protobuf_importer.py +264 -0
  70. datacontract/imports/spark_importer.py +262 -0
  71. datacontract/imports/sql_importer.py +274 -35
  72. datacontract/imports/unity_importer.py +219 -0
  73. datacontract/init/init_template.py +20 -0
  74. datacontract/integration/datamesh_manager.py +86 -0
  75. datacontract/lint/resolve.py +271 -49
  76. datacontract/lint/resources.py +21 -0
  77. datacontract/lint/schema.py +53 -17
  78. datacontract/lint/urls.py +32 -12
  79. datacontract/model/data_contract_specification/__init__.py +1 -0
  80. datacontract/model/exceptions.py +4 -1
  81. datacontract/model/odcs.py +24 -0
  82. datacontract/model/run.py +49 -29
  83. datacontract/output/__init__.py +0 -0
  84. datacontract/output/junit_test_results.py +135 -0
  85. datacontract/output/output_format.py +10 -0
  86. datacontract/output/test_results_writer.py +79 -0
  87. datacontract/py.typed +0 -0
  88. datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
  89. datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
  90. datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
  91. datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
  92. datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
  93. datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
  94. datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
  95. datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
  96. datacontract/templates/datacontract.html +139 -294
  97. datacontract/templates/datacontract_odcs.html +685 -0
  98. datacontract/templates/index.html +236 -0
  99. datacontract/templates/partials/datacontract_information.html +86 -0
  100. datacontract/templates/partials/datacontract_servicelevels.html +253 -0
  101. datacontract/templates/partials/datacontract_terms.html +51 -0
  102. datacontract/templates/partials/definition.html +25 -0
  103. datacontract/templates/partials/example.html +27 -0
  104. datacontract/templates/partials/model_field.html +144 -0
  105. datacontract/templates/partials/quality.html +49 -0
  106. datacontract/templates/partials/server.html +211 -0
  107. datacontract/templates/style/output.css +491 -72
  108. datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
  109. datacontract_cli-0.10.37.dist-info/RECORD +119 -0
  110. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
  111. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
  112. datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
  113. datacontract/engines/soda/connections/dask.py +0 -28
  114. datacontract/engines/soda/connections/duckdb.py +0 -76
  115. datacontract/export/csv_type_converter.py +0 -36
  116. datacontract/export/html_export.py +0 -66
  117. datacontract/export/odcs_converter.py +0 -102
  118. datacontract/init/download_datacontract_file.py +0 -17
  119. datacontract/integration/publish_datamesh_manager.py +0 -33
  120. datacontract/integration/publish_opentelemetry.py +0 -107
  121. datacontract/lint/lint.py +0 -141
  122. datacontract/lint/linters/description_linter.py +0 -34
  123. datacontract/lint/linters/example_model_linter.py +0 -91
  124. datacontract/lint/linters/field_pattern_linter.py +0 -34
  125. datacontract/lint/linters/field_reference_linter.py +0 -38
  126. datacontract/lint/linters/notice_period_linter.py +0 -55
  127. datacontract/lint/linters/quality_schema_linter.py +0 -52
  128. datacontract/lint/linters/valid_constraints_linter.py +0 -99
  129. datacontract/model/data_contract_specification.py +0 -141
  130. datacontract/web.py +0 -14
  131. datacontract_cli-0.10.0.dist-info/METADATA +0 -951
  132. datacontract_cli-0.10.0.dist-info/RECORD +0 -66
  133. /datacontract/{model → breaking}/breaking_change.py +0 -0
  134. /datacontract/{lint/linters → export}/__init__.py +0 -0
  135. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
  136. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
@@ -1,43 +1,41 @@
+import atexit
+import logging
 import os
-
-import pyspark.sql.functions as fn
-from pyspark.sql import SparkSession
-from pyspark.sql.avro.functions import from_avro
-from pyspark.sql.functions import from_json, col
-from pyspark.sql.types import (
-    StructType,
-    DataType,
-    NullType,
-    ArrayType,
-    BinaryType,
-    DateType,
-    TimestampNTZType,
-    TimestampType,
-    BooleanType,
-    LongType,
-    IntegerType,
-    DoubleType,
-    DecimalType,
-    StringType,
-    StructField,
-)
+import tempfile
 
 from datacontract.export.avro_converter import to_avro_schema_json
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Server, Field
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Server
 from datacontract.model.exceptions import DataContractException
+from datacontract.model.run import ResultEnum
+
+
+def create_spark_session():
+    """Create and configure a Spark session."""
+
+    try:
+        from pyspark.sql import SparkSession
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result=ResultEnum.failed,
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
 
+    tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract-cli-spark")
+    atexit.register(tmp_dir.cleanup)
 
-def create_spark_session(tmp_dir) -> SparkSession:
-    # TODO: Update dependency versions when updating pyspark
-    # TODO: add protobuf library
+    pyspark_version = "3.5.5"  # MUST be the same as in the pyproject.toml
     spark = (
         SparkSession.builder.appName("datacontract")
-        .config("spark.sql.warehouse.dir", tmp_dir + "/spark-warehouse")
-        .config("spark.streaming.stopGracefullyOnShutdown", True)
+        .config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse")
+        .config("spark.streaming.stopGracefullyOnShutdown", "true")
+        .config("spark.ui.enabled", "false")
         .config(
            "spark.jars.packages",
-            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,org.apache.spark:spark-avro_2.12:3.5.0",
+            f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark_version},org.apache.spark:spark-avro_2.12:{pyspark_version}",
        )
         .getOrCreate()
     )
@@ -46,107 +44,202 @@ def create_spark_session(tmp_dir) -> SparkSession:
     return spark
 
 
-def read_kafka_topic(spark: SparkSession, data_contract: DataContractSpecification, server: Server, tmp_dir):
-    host = server.host
-    topic = server.topic
-    auth_options = get_auth_options()
+def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Server):
+    """Read and process data from a Kafka topic based on the server configuration."""
 
-    # read full kafka topic
+    logging.info("Reading data from Kafka server %s topic %s", server.host, server.topic)
     df = (
         spark.read.format("kafka")
-        .options(**auth_options)
-        .option("kafka.bootstrap.servers", host)
-        .option("subscribe", topic)
+        .options(**get_auth_options())
+        .option("kafka.bootstrap.servers", server.host)
+        .option("subscribe", server.topic)
         .option("startingOffsets", "earliest")
         .load()
     )
-    # TODO a warning if none or multiple models
+
     model_name, model = next(iter(data_contract.models.items()))
-    if server.format == "avro":
-        avro_schema = to_avro_schema_json(model_name, model)
-
-        # Parse out the extra bytes from the Avro data
-        # A Kafka message contains a key and a value. Data going through a Kafka topic in Confluent Cloud has five bytes added to the beginning of every Avro value. If you are using Avro format keys, then five bytes will be added to the beginning of those as well. For this example, we’re assuming string keys. These bytes consist of one magic byte and four bytes representing the schema ID of the schema in the registry that is needed to decode that data. The bytes need to be removed so that the schema ID can be determined and the Avro data can be parsed. To manipulate the data, we need a couple of imports:
-        df2 = df.withColumn("fixedValue", fn.expr("substring(value, 6, length(value)-5)"))
-
-        options = {"mode": "PERMISSIVE"}
-        df3 = df2.select(from_avro(col("fixedValue"), avro_schema, options).alias("avro")).select(col("avro.*"))
-    elif server.format == "json":
-        # TODO A good warning when the conversion to json fails
-        struct_type = to_struct_type(model.fields)
-        df2 = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
-
-        options = {"mode": "PERMISSIVE"}
-        df3 = df2.select(from_json(df2.value, struct_type, options).alias("json")).select(col("json.*"))
-    else:
+
+    match server.format:
+        case "avro":
+            process_avro_format(df, model_name, model)
+        case "json":
+            process_json_format(df, model_name, model)
+        case _:
+            raise DataContractException(
+                type="test",
+                name="Configuring Kafka checks",
+                result="warning",
+                reason=f"Kafka format '{server.format}' is not supported. Skip executing tests.",
+                engine="datacontract",
+            )
+
+
+def process_avro_format(df, model_name, model):
+    try:
+        from pyspark.sql.avro.functions import from_avro
+        from pyspark.sql.functions import col, expr
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
+    avro_schema = to_avro_schema_json(model_name, model)
+    df2 = df.withColumn("fixedValue", expr("substring(value, 6, length(value)-5)"))
+    options = {"mode": "PERMISSIVE"}
+    df2.select(from_avro(col("fixedValue"), avro_schema, options).alias("avro")).select(
+        col("avro.*")
+    ).createOrReplaceTempView(model_name)
+
+
+def process_json_format(df, model_name, model):
+    try:
+        from pyspark.sql.functions import col, from_json
+    except ImportError as e:
         raise DataContractException(
-            type="test",
-            name="Configuring Kafka checks",
-            result="warning",
-            reason=f"Kafka format '{server.format}' is not supported. Skip executing tests.",
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
             engine="datacontract",
+            original_exception=e,
         )
 
-    # df3.writeStream.toTable(model_name, checkpointLocation=tmp_dir + "/checkpoint")
-    df3.createOrReplaceTempView(model_name)
-    # print(spark.sql(f"select * from {model_name}").show())
+    struct_type = to_struct_type(model.fields)
+    df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").select(
+        from_json(col("value"), struct_type, {"mode": "PERMISSIVE"}).alias("json")
+    ).select(col("json.*")).createOrReplaceTempView(model_name)
 
 
 def get_auth_options():
+    """Retrieve Kafka authentication options from environment variables."""
     kafka_sasl_username = os.getenv("DATACONTRACT_KAFKA_SASL_USERNAME")
     kafka_sasl_password = os.getenv("DATACONTRACT_KAFKA_SASL_PASSWORD")
-    if kafka_sasl_username is None:
-        auth_options = {}
-    else:
-        kafka_sasl_jaas_config = f'org.apache.kafka.common.security.plain.PlainLoginModule required username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
-        auth_options = {
-            "kafka.sasl.mechanism": "PLAIN",
-            "kafka.security.protocol": "SASL_SSL",
-            "kafka.sasl.jaas.config": kafka_sasl_jaas_config,
-        }
-    return auth_options
+    kafka_sasl_mechanism = os.getenv("DATACONTRACT_KAFKA_SASL_MECHANISM", "PLAIN").upper()
+
+    # Skip authentication if credentials are not provided
+    if not kafka_sasl_username or not kafka_sasl_password:
+        return {}
+
+    # SASL mechanisms supported by Kafka
+    jaas_config = {
+        "PLAIN": (
+            f"org.apache.kafka.common.security.plain.PlainLoginModule required "
+            f'username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
+        ),
+        "SCRAM-SHA-256": (
+            f"org.apache.kafka.common.security.scram.ScramLoginModule required "
+            f'username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
+        ),
+        "SCRAM-SHA-512": (
+            f"org.apache.kafka.common.security.scram.ScramLoginModule required "
+            f'username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
+        ),
+        # Add more mechanisms as needed
+    }
+
+    # Validate SASL mechanism
+    if kafka_sasl_mechanism not in jaas_config:
+        raise ValueError(f"Unsupported SASL mechanism: {kafka_sasl_mechanism}")
+
+    # Return config
+    return {
+        "kafka.sasl.mechanism": kafka_sasl_mechanism,
+        "kafka.security.protocol": "SASL_SSL",
+        "kafka.sasl.jaas.config": jaas_config[kafka_sasl_mechanism],
+    }
 
 
 def to_struct_type(fields):
-    struct_fields = []
-    for field_name, field in fields.items():
-        struct_fields.append(to_struct_field(field_name, field))
-    return StructType(struct_fields)
-
-
-def to_struct_field(field_name: str, field: Field) -> StructField:
-    if field.type is None:
-        data_type = DataType()
-    if field.type in ["string", "varchar", "text"]:
-        data_type = StringType()
-    elif field.type in ["number", "decimal", "numeric"]:
-        data_type = DecimalType()
-    elif field.type in ["float", "double"]:
-        data_type = DoubleType()
-    elif field.type in ["integer", "int"]:
-        data_type = IntegerType()
-    elif field.type in ["long", "bigint"]:
-        data_type = LongType()
-    elif field.type in ["boolean"]:
-        data_type = BooleanType()
-    elif field.type in ["timestamp", "timestamp_tz"]:
-        data_type = TimestampType()
-    elif field.type in ["timestamp_ntz"]:
-        data_type = TimestampNTZType()
-    elif field.type in ["date"]:
-        data_type = DateType()
-    elif field.type in ["time"]:
-        data_type = DataType()
-    elif field.type in ["object", "record", "struct"]:
-        data_type = to_struct_type(field.fields)
-    elif field.type in ["binary"]:
-        data_type = BinaryType()
-    elif field.type in ["array"]:
-        # TODO support array structs
-        data_type = ArrayType()
-    elif field.type in ["null"]:
-        data_type = NullType()
-    else:
-        data_type = DataType()
+    try:
+        from pyspark.sql.types import StructType
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
+    """Convert field definitions to Spark StructType."""
+    return StructType([to_struct_field(field_name, field) for field_name, field in fields.items()])
+
+
+def to_struct_field(field_name: str, field: Field):
+    try:
+        from pyspark.sql.types import (
+            ArrayType,
+            BinaryType,
+            BooleanType,
+            DataType,
+            DateType,
+            DecimalType,
+            DoubleType,
+            IntegerType,
+            LongType,
+            NullType,
+            StringType,
+            StructField,
+            StructType,
+            TimestampNTZType,
+            TimestampType,
+        )
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
+    """Map field definitions to Spark StructField using match-case."""
+    match field.type:
+        case "string" | "varchar" | "text":
+            data_type = StringType()
+        case "number" | "decimal" | "numeric":
+            data_type = DecimalType()
+        case "float" | "double":
+            data_type = DoubleType()
+        case "integer" | "int":
+            data_type = IntegerType()
+        case "long" | "bigint":
+            data_type = LongType()
+        case "boolean":
+            data_type = BooleanType()
+        case "timestamp" | "timestamp_tz":
+            data_type = TimestampType()
+        case "timestamp_ntz":
+            data_type = TimestampNTZType()
+        case "date":
+            data_type = DateType()
+        case "time":
+            data_type = DataType()  # Specific handling for time type
+        case "object" | "record" | "struct":
+            data_type = StructType(
+                [to_struct_field(sub_field_name, sub_field) for sub_field_name, sub_field in field.fields.items()]
+            )
+        case "binary":
+            data_type = BinaryType()
+        case "array":
+            element_type = (
+                StructType(
+                    [to_struct_field(sub_field_name, sub_field) for sub_field_name, sub_field in field.fields.items()]
+                )
+                if field.fields
+                else DataType()
+            )
+            data_type = ArrayType(element_type)
+        case "null":
+            data_type = NullType()
+        case _:
+            data_type = DataType()  # Fallback generic DataType
 
     return StructField(field_name, data_type, nullable=not field.required)
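
Note: with the reworked `get_auth_options()` above, the SASL mechanism is now chosen via `DATACONTRACT_KAFKA_SASL_MECHANISM` (PLAIN, SCRAM-SHA-256, SCRAM-SHA-512). The following is a minimal sketch of what the resolved options look like, assuming datacontract-cli 0.10.37 is installed; the credential values are placeholders.

```python
import os

from datacontract.engines.soda.connections.kafka import get_auth_options

# Placeholder credentials; in practice these come from the real environment, not code.
os.environ["DATACONTRACT_KAFKA_SASL_USERNAME"] = "my-api-key"
os.environ["DATACONTRACT_KAFKA_SASL_PASSWORD"] = "my-api-secret"
os.environ["DATACONTRACT_KAFKA_SASL_MECHANISM"] = "SCRAM-SHA-512"  # default is PLAIN

options = get_auth_options()
print(options["kafka.sasl.mechanism"])     # SCRAM-SHA-512
print(options["kafka.security.protocol"])  # SASL_SSL
print(options["kafka.sasl.jaas.config"])   # ScramLoginModule JAAS line with the credentials
```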
@@ -4,17 +4,20 @@ import yaml
 
 
 def to_snowflake_soda_configuration(server):
+    prefix = "DATACONTRACT_SNOWFLAKE_"
+    snowflake_soda_params = {k.replace(prefix, "").lower(): v for k, v in os.environ.items() if k.startswith(prefix)}
+
+    # backward compatibility
+    if "connection_timeout" not in snowflake_soda_params:
+        snowflake_soda_params["connection_timeout"] = "5"  # minutes
+
     soda_configuration = {
         f"data_source {server.type}": {
             "type": "snowflake",
-            "username": os.getenv("DATACONTRACT_SNOWFLAKE_USERNAME"),
-            "password": os.getenv("DATACONTRACT_SNOWFLAKE_PASSWORD"),
-            "role": os.getenv("DATACONTRACT_SNOWFLAKE_ROLE"),
             "account": server.account,
             "database": server.database,
             "schema": server.schema_,
-            "warehouse": os.getenv("DATACONTRACT_SNOWFLAKE_WAREHOUSE"),
-            "connection_timeout": 5,  # minutes
+            **snowflake_soda_params,
         }
     }
     soda_configuration_str = yaml.dump(soda_configuration)
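
Note: the Snowflake connection above now forwards every `DATACONTRACT_SNOWFLAKE_*` environment variable into the Soda data source configuration (key lowercased) instead of a fixed set of keys. A minimal sketch of the resulting parameter dict, with placeholder values; whether Soda accepts a particular key is up to its Snowflake data source.

```python
import os

# Placeholder values; set these in the real environment instead of in code.
os.environ["DATACONTRACT_SNOWFLAKE_USERNAME"] = "svc_user"
os.environ["DATACONTRACT_SNOWFLAKE_PASSWORD"] = "***"
os.environ["DATACONTRACT_SNOWFLAKE_WAREHOUSE"] = "COMPUTE_WH"
os.environ["DATACONTRACT_SNOWFLAKE_ROLE"] = "ANALYST"

# Mirrors the comprehension introduced in the diff above.
prefix = "DATACONTRACT_SNOWFLAKE_"
params = {k.replace(prefix, "").lower(): v for k, v in os.environ.items() if k.startswith(prefix)}
print(params)
# e.g. {'username': 'svc_user', 'password': '***', 'warehouse': 'COMPUTE_WH', 'role': 'ANALYST'}
```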
@@ -0,0 +1,43 @@
+import os
+
+import yaml
+
+from datacontract.model.data_contract_specification import Server
+
+
+def to_sqlserver_soda_configuration(server: Server) -> str:
+    """Serialize server config to soda configuration.
+
+
+    ### Example:
+    type: sqlserver
+    host: host
+    port: '1433'
+    username: simple
+    password: simple_pass
+    database: database
+    schema: dbo
+    trusted_connection: false
+    encrypt: false
+    trust_server_certificate: false
+    driver: ODBC Driver 18 for SQL Server
+    """
+    # with service account key, using an external json file
+    soda_configuration = {
+        f"data_source {server.type}": {
+            "type": "sqlserver",
+            "host": server.host,
+            "port": str(server.port),
+            "username": os.getenv("DATACONTRACT_SQLSERVER_USERNAME", ""),
+            "password": os.getenv("DATACONTRACT_SQLSERVER_PASSWORD", ""),
+            "database": server.database,
+            "schema": server.schema_,
+            "trusted_connection": os.getenv("DATACONTRACT_SQLSERVER_TRUSTED_CONNECTION", False),
+            "trust_server_certificate": os.getenv("DATACONTRACT_SQLSERVER_TRUST_SERVER_CERTIFICATE", False),
+            "encrypt": os.getenv("DATACONTRACT_SQLSERVER_ENCRYPTED_CONNECTION", True),
+            "driver": server.driver,
+        }
+    }
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
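
Note: a minimal sketch of rendering the new SQL Server Soda configuration. The `Server` keyword arguments are assumptions about the pydantic model (the attribute names themselves appear in the code above); credentials are placeholders, and remaining attributes such as `schema` come from the contract's `servers` section.

```python
import os

from datacontract.engines.soda.connections.sqlserver import to_sqlserver_soda_configuration
from datacontract.model.data_contract_specification import Server

os.environ["DATACONTRACT_SQLSERVER_USERNAME"] = "sa"         # placeholder
os.environ["DATACONTRACT_SQLSERVER_PASSWORD"] = "change-me"  # placeholder

# Assumed constructor keywords matching the attributes read above (server.host, server.port, ...).
server = Server(
    type="sqlserver",
    host="localhost",
    port=1433,
    database="tempdb",
    driver="ODBC Driver 18 for SQL Server",
)
print(to_sqlserver_soda_configuration(server))  # YAML for the "data_source sqlserver" block
```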
@@ -0,0 +1,26 @@
+import os
+
+import yaml
+
+
+def to_trino_soda_configuration(server):
+    password = os.getenv("DATACONTRACT_TRINO_PASSWORD")
+    username = os.getenv("DATACONTRACT_TRINO_USERNAME")
+
+    data_source = {
+        "type": "trino",
+        "host": server.host,
+        "port": str(server.port),
+        "username": username,
+        "password": password,
+        "catalog": server.catalog,
+        "schema": server.schema_,
+    }
+
+    if password is None or password == "":
+        data_source["auth_type"] = "NoAuthentication"  # default is BasicAuthentication
+
+    soda_configuration = {f"data_source {server.type}": data_source}
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
@@ -1,8 +1,15 @@
 import json
 
+from datacontract.export.exporter import Exporter, _check_models_for_export
 from datacontract.model.data_contract_specification import Field
 
 
+class AvroExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        model_name, model_value = _check_models_for_export(data_contract, model, self.export_format)
+        return to_avro_schema_json(model_name, model_value)
+
+
 def to_avro_schema(model_name, model) -> dict:
     return to_avro_record(model_name, model.fields, model.description, model.namespace)
 
@@ -33,19 +40,70 @@ def to_avro_field(field, field_name):
     avro_field = {"name": field_name}
     if field.description is not None:
         avro_field["doc"] = field.description
-    avro_field["type"] = to_avro_type(field, field_name)
+    is_required_avro = field.required if field.required is not None else True
+    avro_type = to_avro_type(field, field_name)
+    avro_field["type"] = avro_type if is_required_avro else ["null", avro_type]
+
+    # Handle enum types - both required and optional
+    if avro_type == "enum" or (isinstance(avro_field["type"], list) and "enum" in avro_field["type"]):
+        enum_def = {
+            "type": "enum",
+            "name": field.title,
+            "symbols": field.enum,
+        }
+        if is_required_avro:
+            avro_field["type"] = enum_def
+        else:
+            # Replace "enum" with the full enum definition in the union
+            avro_field["type"] = ["null", enum_def]
+
+    if field.config:
+        if "avroDefault" in field.config:
+            if field.config.get("avroType") != "enum":
+                avro_field["default"] = field.config["avroDefault"]
+
     return avro_field
 
 
 def to_avro_type(field: Field, field_name: str) -> str | dict:
+    if field.config:
+        if "avroLogicalType" in field.config and "avroType" in field.config:
+            return {"type": field.config["avroType"], "logicalType": field.config["avroLogicalType"]}
+        if "avroLogicalType" in field.config:
+            if field.config["avroLogicalType"] in [
+                "timestamp-millis",
+                "timestamp-micros",
+                "local-timestamp-millis",
+                "local-timestamp-micros",
+                "time-micros",
+            ]:
+                return {"type": "long", "logicalType": field.config["avroLogicalType"]}
+            if field.config["avroLogicalType"] in ["time-millis", "date"]:
+                return {"type": "int", "logicalType": field.config["avroLogicalType"]}
+        if "avroType" in field.config:
+            return field.config["avroType"]
+
+    # Check for enum fields based on presence of enum list and avroType config
+    if field.enum and field.config and field.config.get("avroType") == "enum":
+        return "enum"
+
     if field.type is None:
         return "null"
     if field.type in ["string", "varchar", "text"]:
         return "string"
-    elif field.type in ["number", "decimal", "numeric"]:
+    elif field.type in ["number", "numeric"]:
         # https://avro.apache.org/docs/1.11.1/specification/#decimal
         return "bytes"
-    elif field.type in ["float", "double"]:
+    elif field.type in ["decimal"]:
+        typeVal = {"type": "bytes", "logicalType": "decimal"}
+        if field.scale is not None:
+            typeVal["scale"] = field.scale
+        if field.precision is not None:
+            typeVal["precision"] = field.precision
+        return typeVal
+    elif field.type in ["float"]:
+        return "float"
+    elif field.type in ["double"]:
         return "double"
     elif field.type in ["integer", "int"]:
         return "int"
@@ -54,20 +112,26 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
     elif field.type in ["boolean"]:
         return "boolean"
     elif field.type in ["timestamp", "timestamp_tz"]:
-        return "string"
+        return {"type": "long", "logicalType": "timestamp-millis"}
     elif field.type in ["timestamp_ntz"]:
-        return "string"
+        return {"type": "long", "logicalType": "local-timestamp-millis"}
     elif field.type in ["date"]:
-        return "int"
+        return {"type": "int", "logicalType": "date"}
     elif field.type in ["time"]:
         return "long"
+    elif field.type in ["map"]:
+        if field.config is not None and "values" in field.config:
+            return {"type": "map", "values": field.config["values"]}
+        else:
+            return "bytes"
     elif field.type in ["object", "record", "struct"]:
+        if field.config is not None and "namespace" in field.config:
+            return to_avro_record(field_name, field.fields, field.description, field.config["namespace"])
         return to_avro_record(field_name, field.fields, field.description, None)
     elif field.type in ["binary"]:
         return "bytes"
     elif field.type in ["array"]:
-        # TODO support array structs
-        return "array"
+        return {"type": "array", "items": to_avro_type(field.items, field_name)}
     elif field.type in ["null"]:
         return "null"
     else:
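
Note: the reworked `to_avro_type` above now emits Avro logical types for decimals, dates, and timestamps, and resolves array item types. A minimal sketch of the mapping, assuming datacontract-cli 0.10.37; the `Field` keyword arguments follow the attributes referenced in the code (type, precision, scale, required) and the values are placeholders.

```python
from datacontract.export.avro_converter import to_avro_type
from datacontract.model.data_contract_specification import Field

# Decimal fields now carry the "decimal" logical type plus scale/precision when set.
price = Field(type="decimal", precision=10, scale=2, required=True)
print(to_avro_type(price, "price"))
# {'type': 'bytes', 'logicalType': 'decimal', 'scale': 2, 'precision': 10}

# Timestamps map to long with timestamp-millis instead of plain strings.
created_at = Field(type="timestamp", required=True)
print(to_avro_type(created_at, "created_at"))
# {'type': 'long', 'logicalType': 'timestamp-millis'}
```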
@@ -3,35 +3,12 @@ from dataclasses import dataclass
 from enum import Enum
 from io import StringIO
 
+from datacontract.export.exporter import Exporter
 from datacontract.lint.resolve import inline_definitions_into_data_contract
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Field
+from datacontract.model.data_contract_specification import DataContractSpecification, Field
 from datacontract.model.exceptions import DataContractException
 
 
-def to_avro_idl(contract: DataContractSpecification) -> str:
-    """Serialize the provided data contract specification into an Avro IDL string.
-
-    The data contract will be serialized as a protocol, with one record type
-    for each contained model. Model fields are mapped one-to-one to Avro IDL
-    record fields.
-    """
-    stream = StringIO()
-    to_avro_idl_stream(contract, stream)
-    return stream.getvalue()
-
-
-def to_avro_idl_stream(contract: DataContractSpecification, stream: typing.TextIO):
-    """Serialize the provided data contract specification into Avro IDL."""
-    ir = _contract_to_avro_idl_ir(contract)
-    if ir.description:
-        stream.write(f"/** {contract.info.description} */\n")
-    stream.write(f"protocol {ir.name or 'Unnamed'} {{\n")
-    for model_type in ir.model_types:
-        _write_model_type(model_type, stream)
-    stream.write("}\n")
-
-
 class AvroPrimitiveType(Enum):
     int = "int"
     long = "long"
@@ -86,6 +63,7 @@ class AvroIDLProtocol:
     model_types: list[AvroModelType]
 
 
+# TODO use DATACONTRACT_TYPES from datacontract/model/data_contract_specification.py
 avro_primitive_types = set(
     [
         "string",
@@ -108,6 +86,34 @@ avro_primitive_types = set(
 )
 
 
+class AvroIdlExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        return to_avro_idl(data_contract)
+
+
+def to_avro_idl(contract: DataContractSpecification) -> str:
+    """Serialize the provided data contract specification into an Avro IDL string.
+
+    The data contract will be serialized as a protocol, with one record type
+    for each contained model. Model fields are mapped one-to-one to Avro IDL
+    record fields.
+    """
+    stream = StringIO()
+    to_avro_idl_stream(contract, stream)
+    return stream.getvalue()
+
+
+def to_avro_idl_stream(contract: DataContractSpecification, stream: typing.TextIO):
+    """Serialize the provided data contract specification into Avro IDL."""
+    ir = _contract_to_avro_idl_ir(contract)
+    if ir.description:
+        stream.write(f"/** {contract.info.description} */\n")
+    stream.write(f"protocol {ir.name or 'Unnamed'} {{\n")
+    for model_type in ir.model_types:
+        _write_model_type(model_type, stream)
+    stream.write("}\n")
+
+
 def _to_avro_primitive_logical_type(field_name: str, field: Field) -> AvroPrimitiveField:
     result = AvroPrimitiveField(field_name, field.required, field.description, AvroPrimitiveType.string)
     match field.type:
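
Note: `to_avro_idl` and `to_avro_idl_stream` were relocated below the new `AvroIdlExporter`, which plugs them into the exporter factory. Calling the converter directly still works; the following is a minimal sketch with a hand-built specification, where the constructor keywords are assumptions about the pydantic models and the model/field values are placeholders.

```python
from datacontract.export.avro_idl_converter import to_avro_idl
from datacontract.model.data_contract_specification import (
    DataContractSpecification,
    Field,
    Info,
    Model,
)

# Tiny placeholder contract with a single model and field.
contract = DataContractSpecification(
    id="orders",
    info=Info(title="Orders", version="1.0.0", description="Example contract"),
    models={"orders": Model(fields={"order_id": Field(type="string", required=True)})},
)

print(to_avro_idl(contract))  # emits a protocol with one record per model
```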