datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl

This diff shows the changes between package versions that have been publicly released to one of the supported registries, as they appear in those registries. The information is provided for informational purposes only.
Files changed (136)
  1. datacontract/__init__.py +13 -0
  2. datacontract/api.py +260 -0
  3. datacontract/breaking/breaking.py +242 -12
  4. datacontract/breaking/breaking_rules.py +37 -1
  5. datacontract/catalog/catalog.py +80 -0
  6. datacontract/cli.py +387 -117
  7. datacontract/data_contract.py +216 -353
  8. datacontract/engines/data_contract_checks.py +1041 -0
  9. datacontract/engines/data_contract_test.py +113 -0
  10. datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
  11. datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
  12. datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
  13. datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
  14. datacontract/engines/soda/check_soda_execute.py +100 -56
  15. datacontract/engines/soda/connections/athena.py +79 -0
  16. datacontract/engines/soda/connections/bigquery.py +8 -1
  17. datacontract/engines/soda/connections/databricks.py +12 -3
  18. datacontract/engines/soda/connections/duckdb_connection.py +241 -0
  19. datacontract/engines/soda/connections/kafka.py +206 -113
  20. datacontract/engines/soda/connections/snowflake.py +8 -5
  21. datacontract/engines/soda/connections/sqlserver.py +43 -0
  22. datacontract/engines/soda/connections/trino.py +26 -0
  23. datacontract/export/avro_converter.py +72 -8
  24. datacontract/export/avro_idl_converter.py +31 -25
  25. datacontract/export/bigquery_converter.py +130 -0
  26. datacontract/export/custom_converter.py +40 -0
  27. datacontract/export/data_caterer_converter.py +161 -0
  28. datacontract/export/dbml_converter.py +148 -0
  29. datacontract/export/dbt_converter.py +141 -54
  30. datacontract/export/dcs_exporter.py +6 -0
  31. datacontract/export/dqx_converter.py +126 -0
  32. datacontract/export/duckdb_type_converter.py +57 -0
  33. datacontract/export/excel_exporter.py +923 -0
  34. datacontract/export/exporter.py +100 -0
  35. datacontract/export/exporter_factory.py +216 -0
  36. datacontract/export/go_converter.py +105 -0
  37. datacontract/export/great_expectations_converter.py +257 -36
  38. datacontract/export/html_exporter.py +86 -0
  39. datacontract/export/iceberg_converter.py +188 -0
  40. datacontract/export/jsonschema_converter.py +71 -16
  41. datacontract/export/markdown_converter.py +337 -0
  42. datacontract/export/mermaid_exporter.py +110 -0
  43. datacontract/export/odcs_v3_exporter.py +375 -0
  44. datacontract/export/pandas_type_converter.py +40 -0
  45. datacontract/export/protobuf_converter.py +168 -68
  46. datacontract/export/pydantic_converter.py +6 -0
  47. datacontract/export/rdf_converter.py +13 -6
  48. datacontract/export/sodacl_converter.py +36 -188
  49. datacontract/export/spark_converter.py +245 -0
  50. datacontract/export/sql_converter.py +37 -3
  51. datacontract/export/sql_type_converter.py +269 -8
  52. datacontract/export/sqlalchemy_converter.py +170 -0
  53. datacontract/export/terraform_converter.py +7 -2
  54. datacontract/imports/avro_importer.py +246 -26
  55. datacontract/imports/bigquery_importer.py +221 -0
  56. datacontract/imports/csv_importer.py +143 -0
  57. datacontract/imports/dbml_importer.py +112 -0
  58. datacontract/imports/dbt_importer.py +240 -0
  59. datacontract/imports/excel_importer.py +1111 -0
  60. datacontract/imports/glue_importer.py +288 -0
  61. datacontract/imports/iceberg_importer.py +172 -0
  62. datacontract/imports/importer.py +51 -0
  63. datacontract/imports/importer_factory.py +128 -0
  64. datacontract/imports/json_importer.py +325 -0
  65. datacontract/imports/jsonschema_importer.py +146 -0
  66. datacontract/imports/odcs_importer.py +60 -0
  67. datacontract/imports/odcs_v3_importer.py +516 -0
  68. datacontract/imports/parquet_importer.py +81 -0
  69. datacontract/imports/protobuf_importer.py +264 -0
  70. datacontract/imports/spark_importer.py +262 -0
  71. datacontract/imports/sql_importer.py +274 -35
  72. datacontract/imports/unity_importer.py +219 -0
  73. datacontract/init/init_template.py +20 -0
  74. datacontract/integration/datamesh_manager.py +86 -0
  75. datacontract/lint/resolve.py +271 -49
  76. datacontract/lint/resources.py +21 -0
  77. datacontract/lint/schema.py +53 -17
  78. datacontract/lint/urls.py +32 -12
  79. datacontract/model/data_contract_specification/__init__.py +1 -0
  80. datacontract/model/exceptions.py +4 -1
  81. datacontract/model/odcs.py +24 -0
  82. datacontract/model/run.py +49 -29
  83. datacontract/output/__init__.py +0 -0
  84. datacontract/output/junit_test_results.py +135 -0
  85. datacontract/output/output_format.py +10 -0
  86. datacontract/output/test_results_writer.py +79 -0
  87. datacontract/py.typed +0 -0
  88. datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
  89. datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
  90. datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
  91. datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
  92. datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
  93. datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
  94. datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
  95. datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
  96. datacontract/templates/datacontract.html +139 -294
  97. datacontract/templates/datacontract_odcs.html +685 -0
  98. datacontract/templates/index.html +236 -0
  99. datacontract/templates/partials/datacontract_information.html +86 -0
  100. datacontract/templates/partials/datacontract_servicelevels.html +253 -0
  101. datacontract/templates/partials/datacontract_terms.html +51 -0
  102. datacontract/templates/partials/definition.html +25 -0
  103. datacontract/templates/partials/example.html +27 -0
  104. datacontract/templates/partials/model_field.html +144 -0
  105. datacontract/templates/partials/quality.html +49 -0
  106. datacontract/templates/partials/server.html +211 -0
  107. datacontract/templates/style/output.css +491 -72
  108. datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
  109. datacontract_cli-0.10.37.dist-info/RECORD +119 -0
  110. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
  111. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
  112. datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
  113. datacontract/engines/soda/connections/dask.py +0 -28
  114. datacontract/engines/soda/connections/duckdb.py +0 -76
  115. datacontract/export/csv_type_converter.py +0 -36
  116. datacontract/export/html_export.py +0 -66
  117. datacontract/export/odcs_converter.py +0 -102
  118. datacontract/init/download_datacontract_file.py +0 -17
  119. datacontract/integration/publish_datamesh_manager.py +0 -33
  120. datacontract/integration/publish_opentelemetry.py +0 -107
  121. datacontract/lint/lint.py +0 -141
  122. datacontract/lint/linters/description_linter.py +0 -34
  123. datacontract/lint/linters/example_model_linter.py +0 -91
  124. datacontract/lint/linters/field_pattern_linter.py +0 -34
  125. datacontract/lint/linters/field_reference_linter.py +0 -38
  126. datacontract/lint/linters/notice_period_linter.py +0 -55
  127. datacontract/lint/linters/quality_schema_linter.py +0 -52
  128. datacontract/lint/linters/valid_constraints_linter.py +0 -99
  129. datacontract/model/data_contract_specification.py +0 -141
  130. datacontract/web.py +0 -14
  131. datacontract_cli-0.10.0.dist-info/METADATA +0 -951
  132. datacontract_cli-0.10.0.dist-info/RECORD +0 -66
  133. /datacontract/{model → breaking}/breaking_change.py +0 -0
  134. /datacontract/{lint/linters → export}/__init__.py +0 -0
  135. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
  136. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,264 @@
+ import os
+ import re
+ import tempfile
+
+ from google.protobuf import descriptor_pb2
+ from grpc_tools import protoc
+
+ from datacontract.imports.importer import Importer
+ from datacontract.model.data_contract_specification import DataContractSpecification
+ from datacontract.model.exceptions import DataContractException
+
+
+ def map_type_from_protobuf(field_type: int):
+     protobuf_type_mapping = {
+         1: "double",
+         2: "float",
+         3: "long",
+         4: "long", # uint64 mapped to long
+         5: "integer", # int32 mapped to integer
+         6: "string", # fixed64 mapped to string
+         7: "string", # fixed32 mapped to string
+         8: "boolean",
+         9: "string",
+         12: "bytes",
+         13: "integer", # uint32 mapped to integer
+         15: "integer", # sfixed32 mapped to integer
+         16: "long", # sfixed64 mapped to long
+         17: "integer", # sint32 mapped to integer
+         18: "long", # sint64 mapped to long
+     }
+     return protobuf_type_mapping.get(field_type, "string")
+
+
+ def parse_imports(proto_file: str) -> list:
+     """
+     Parse import statements from a .proto file and return a list of imported file paths.
+     """
+     try:
+         with open(proto_file, "r") as f:
+             content = f.read()
+     except Exception as e:
+         raise DataContractException(
+             type="file",
+             name="Parse proto imports",
+             reason=f"Failed to read proto file: {proto_file}",
+             engine="datacontract",
+             original_exception=e,
+         )
+     imported_files = re.findall(r'import\s+"(.+?)";', content)
+     proto_dir = os.path.dirname(proto_file)
+     return [os.path.join(proto_dir, imp) for imp in imported_files]
+
+
+ def compile_proto_to_binary(proto_files: list, output_file: str):
+     """
+     Compile the provided proto files into a single descriptor set using grpc_tools.protoc.
+     """
+     proto_dirs = set(os.path.dirname(proto) for proto in proto_files)
+     proto_paths = [f"--proto_path={d}" for d in proto_dirs]
+
+     args = [""] + proto_paths + [f"--descriptor_set_out={output_file}"] + proto_files
+     ret = protoc.main(args)
+     if ret != 0:
+         raise DataContractException(
+             type="schema",
+             name="Compile proto files",
+             reason=f"grpc_tools.protoc failed with exit code {ret}",
+             engine="datacontract",
+             original_exception=None,
+         )
+
+
+ def extract_enum_values_from_fds(fds: descriptor_pb2.FileDescriptorSet, enum_name: str) -> dict:
+     """
+     Search the FileDescriptorSet for an enum definition with the given name
+     and return a dictionary of its values (name to number).
+     """
+     for file_descriptor in fds.file:
+         # Check top-level enums.
+         for enum in file_descriptor.enum_type:
+             if enum.name == enum_name:
+                 return {value.name: value.number for value in enum.value}
+         # Check enums defined inside messages.
+         for message in file_descriptor.message_type:
+             for enum in message.enum_type:
+                 if enum.name == enum_name:
+                     return {value.name: value.number for value in enum.value}
+     return {}
+
+
+ def extract_message_fields_from_fds(fds: descriptor_pb2.FileDescriptorSet, message_name: str) -> dict:
+     """
+     Given a FileDescriptorSet and a message name, return a dict with its field definitions.
+     This function recurses for nested messages and handles enums.
+     """
+     for file_descriptor in fds.file:
+         for msg in file_descriptor.message_type:
+             if msg.name == message_name:
+                 fields = {}
+                 for field in msg.field:
+                     if field.type == 11: # TYPE_MESSAGE
+                         nested_msg_name = field.type_name.split(".")[-1]
+                         nested_fields = extract_message_fields_from_fds(fds, nested_msg_name)
+                         if field.label == 3: # repeated field
+                             field_info = {
+                                 "description": f"List of {nested_msg_name}",
+                                 "type": "array",
+                                 "items": {"type": "object", "fields": nested_fields},
+                             }
+                         else:
+                             field_info = {
+                                 "description": f"Nested object of {nested_msg_name}",
+                                 "type": "object",
+                                 "fields": nested_fields,
+                             }
+                     elif field.type == 14: # TYPE_ENUM
+                         enum_name = field.type_name.split(".")[-1]
+                         enum_values = extract_enum_values_from_fds(fds, enum_name)
+                         field_info = {
+                             "description": f"Enum field {field.name}",
+                             "type": "string",
+                             "values": enum_values,
+                             "required": (field.label == 2),
+                         }
+                     else:
+                         field_info = {
+                             "description": f"Field {field.name}",
+                             "type": map_type_from_protobuf(field.type),
+                             "required": (field.label == 2),
+                         }
+                     fields[field.name] = field_info
+                 return fields
+     return {}
+
+
+ def import_protobuf(
+     data_contract_specification: DataContractSpecification, sources: list, import_args: dict = None
+ ) -> DataContractSpecification:
+     """
+     Gather all proto files (including those imported), compile them into one descriptor,
+     then generate models with nested fields and enums resolved.
+
+     The generated data contract uses generic defaults instead of specific hardcoded ones.
+     """
+
+     # --- Step 1: Gather all proto files (main and imported)
+     proto_files_set = set()
+     queue = list(sources)
+     while queue:
+         proto = queue.pop(0)
+         if proto not in proto_files_set:
+             proto_files_set.add(proto)
+             for imp in parse_imports(proto):
+                 if os.path.exists(imp) and imp not in proto_files_set:
+                     queue.append(imp)
+     all_proto_files = list(proto_files_set)
+
+     # --- Step 2: Compile all proto files into a single descriptor set.
+     temp_descriptor = tempfile.NamedTemporaryFile(suffix=".pb", delete=False)
+     descriptor_file = temp_descriptor.name
+     temp_descriptor.close() # Allow protoc to write to the file
+     try:
+         compile_proto_to_binary(all_proto_files, descriptor_file)
+
+         with open(descriptor_file, "rb") as f:
+             proto_data = f.read()
+         fds = descriptor_pb2.FileDescriptorSet()
+         try:
+             fds.ParseFromString(proto_data)
+         except Exception as e:
+             raise DataContractException(
+                 type="schema",
+                 name="Parse descriptor set",
+                 reason="Failed to parse descriptor set from compiled proto files",
+                 engine="datacontract",
+                 original_exception=e,
+             )
+
+         # --- Step 3: Build models from the descriptor set.
+         all_models = {}
+         # Create a set of the main proto file basenames.
+         source_proto_basenames = {os.path.basename(proto) for proto in sources}
+
+         for file_descriptor in fds.file:
+             # Only process file descriptors that correspond to your main proto files.
+             if os.path.basename(file_descriptor.name) not in source_proto_basenames:
+                 continue
+
+             for message in file_descriptor.message_type:
+                 fields = {}
+                 for field in message.field:
+                     if field.type == 11: # TYPE_MESSAGE
+                         nested_msg_name = field.type_name.split(".")[-1]
+                         nested_fields = extract_message_fields_from_fds(fds, nested_msg_name)
+                         if field.label == 3:
+                             field_info = {
+                                 "description": f"List of {nested_msg_name}",
+                                 "type": "array",
+                                 "items": {"type": "object", "fields": nested_fields},
+                             }
+                         else:
+                             field_info = {
+                                 "description": f"Nested object of {nested_msg_name}",
+                                 "type": "object",
+                                 "fields": nested_fields,
+                             }
+                         fields[field.name] = field_info
+                     elif field.type == 14: # TYPE_ENUM
+                         enum_name = field.type_name.split(".")[-1]
+                         enum_values = extract_enum_values_from_fds(fds, enum_name)
+                         field_info = {
+                             "description": f"Enum field {field.name}",
+                             "type": "string",
+                             "values": enum_values,
+                             "required": (field.label == 2),
+                         }
+                         fields[field.name] = field_info
+                     else:
+                         field_info = {
+                             "description": f"Field {field.name}",
+                             "type": map_type_from_protobuf(field.type),
+                             "required": (field.label == 2),
+                         }
+                         fields[field.name] = field_info
+
+                 all_models[message.name] = {
+                     "description": f"Details of {message.name}.",
+                     "type": "table",
+                     "fields": fields,
+                 }
+
+         data_contract_specification.models = all_models
+
+         return data_contract_specification
+     finally:
+         # Clean up the temporary descriptor file.
+         if os.path.exists(descriptor_file):
+             os.remove(descriptor_file)
+
+
+ class ProtoBufImporter(Importer):
+     def __init__(self, name):
+         # 'name' is passed by the importer factory.
+         self.name = name
+
+     def import_source(
+         self,
+         data_contract_specification: DataContractSpecification,
+         source: str,
+         import_args: dict = None,
+     ) -> DataContractSpecification:
+         """
+         Import a protobuf file (and its imports) into the given DataContractSpecification.
+
+         Parameters:
+         - data_contract_specification: the initial specification to update.
+         - source: the protobuf file path.
+         - import_args: optional dictionary with additional arguments (e.g. 'output_dir').
+
+         Returns:
+         The updated DataContractSpecification.
+         """
+         # Wrap the source in a list because import_protobuf expects a list of sources.
+         return import_protobuf(data_contract_specification, [source], import_args)
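
The hunk above adds datacontract/imports/protobuf_importer.py: it walks a .proto file and its imports, compiles everything into a single descriptor set via grpc_tools.protoc, and emits one "table" model per top-level message with nested messages and enum values resolved. A minimal usage sketch (not part of the diff) follows; the file name orders.proto and the importer name string "protobuf" are assumptions, and it presumes that DataContractSpecification can be instantiated with its defaults and that grpcio-tools is installed.

# Sketch only (not from the package): "orders.proto" and the name "protobuf" are assumptions.
from datacontract.imports.protobuf_importer import ProtoBufImporter
from datacontract.model.data_contract_specification import DataContractSpecification

importer = ProtoBufImporter("protobuf")  # the importer factory normally supplies this name
spec = importer.import_source(
    DataContractSpecification(),  # assumed constructible with defaults; acts as the empty contract
    "orders.proto",               # hypothetical main .proto file; its imports are resolved recursively
)
print(spec.models)  # one "table" model per top-level message, with nested fields and enum values
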
@@ -0,0 +1,262 @@
+ import atexit
+ import logging
+ import tempfile
+
+ from databricks.sdk import WorkspaceClient
+ from pyspark.sql import DataFrame, SparkSession, types
+
+ from datacontract.imports.importer import Importer
+ from datacontract.model.data_contract_specification import (
+     DataContractSpecification,
+     Field,
+     Model,
+     Server,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class SparkImporter(Importer):
+     def import_source(
+         self,
+         data_contract_specification: DataContractSpecification,
+         source: str,
+         import_args: dict,
+     ) -> DataContractSpecification:
+         """
+         Imports data from a Spark source into the data contract specification.
+
+         Args:
+             data_contract_specification: The data contract specification object.
+             source: The source string indicating the Spark tables to read.
+             import_args: Additional arguments for the import process.
+         Returns:
+             dict: The updated data contract specification.
+         """
+         dataframe = import_args.get("dataframe", None)
+         description = import_args.get("description", None)
+         return import_spark(data_contract_specification, source, dataframe, description)
+
+
+ def import_spark(
+     data_contract_specification: DataContractSpecification,
+     source: str,
+     dataframe: DataFrame | None = None,
+     description: str | None = None,
+ ) -> DataContractSpecification:
+     """
+     Imports schema(s) from Spark into a Data Contract Specification.
+
+     Args:
+         data_contract_specification (DataContractSpecification): The contract spec to update.
+         source (str): Comma-separated Spark table/view names.
+         dataframe (DataFrame | None): Optional Spark DataFrame to import.
+         description (str | None): Optional table-level description.
+
+     Returns:
+         DataContractSpecification: The updated contract spec with imported models.
+     """
+
+     tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract-cli-spark")
+     atexit.register(tmp_dir.cleanup)
+
+     spark = (
+         SparkSession.builder.config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse")
+         .config("spark.streaming.stopGracefullyOnShutdown", "true")
+         .config("spark.ui.enabled", "false")
+         .getOrCreate()
+     )
+     data_contract_specification.servers["local"] = Server(type="dataframe")
+
+     if dataframe is not None:
+         if not isinstance(dataframe, DataFrame):
+             raise TypeError("Expected 'dataframe' to be a pyspark.sql.DataFrame")
+         data_contract_specification.models[source] = import_from_spark_df(spark, source, dataframe, description)
+         return data_contract_specification
+
+     if not source:
+         raise ValueError("Either 'dataframe' or a valid 'source' must be provided")
+
+     for table_name in map(str.strip, source.split(",")):
+         df = spark.read.table(table_name)
+         data_contract_specification.models[table_name] = import_from_spark_df(spark, table_name, df, description)
+
+     return data_contract_specification
+
+
+ def import_from_spark_df(spark: SparkSession, source: str, df: DataFrame, description: str) -> Model:
+     """
+     Converts a Spark DataFrame into a Model.
+
+     Args:
+         spark: SparkSession
+         source: A comma-separated string of Spark temporary views to read.
+         df: The Spark DataFrame to convert.
+         description: Table level comment
+
+     Returns:
+         Model: The generated data contract model.
+     """
+     model = Model()
+     schema = df.schema
+
+     if description is None:
+         model.description = _table_comment_from_spark(spark, source)
+     else:
+         model.description = description
+
+     for field in schema:
+         model.fields[field.name] = _field_from_struct_type(field)
+
+     return model
+
+
+ def _field_from_struct_type(spark_field: types.StructField) -> Field:
+     """
+     Converts a Spark StructField into a Field object for the data contract.
+
+     Args:
+         spark_field: The Spark StructField to convert.
+
+     Returns:
+         Field: The generated Field object.
+     """
+     field = Field()
+     field.required = not spark_field.nullable
+     field.description = spark_field.metadata.get("comment")
+
+     return _type_from_data_type(field, spark_field.dataType)
+
+
+ def _type_from_data_type(field: Field, spark_type: types.DataType) -> Field:
+     """
+     Maps Spark data types to the Data Contract type system and updates the field.
+
+     Args:
+         field: The Field object to update.
+         spark_type: The Spark data type to map.
+
+     Returns:
+         Field: The updated Field object.
+     """
+     field.type = _data_type_from_spark(spark_type)
+
+     if field.type == "array":
+         field.items = _type_from_data_type(Field(required=not spark_type.containsNull), spark_type.elementType)
+
+     elif field.type == "map":
+         field.keys = _type_from_data_type(Field(required=True), spark_type.keyType)
+         field.values = _type_from_data_type(Field(required=not spark_type.valueContainsNull), spark_type.valueType)
+
+     elif field.type == "struct":
+         field.fields = {sf.name: _field_from_struct_type(sf) for sf in spark_type.fields}
+
+     return field
+
+
+ def _data_type_from_spark(spark_type: types.DataType) -> str:
+     """
+     Maps Spark data types to the Data Contract type system.
+
+     Args:
+         spark_type: The Spark data type to map.
+
+     Returns:
+         str: The corresponding Data Contract type.
+     """
+     if isinstance(spark_type, types.StringType):
+         return "string"
+     elif isinstance(spark_type, (types.IntegerType, types.ShortType)):
+         return "integer"
+     elif isinstance(spark_type, types.LongType):
+         return "long"
+     elif isinstance(spark_type, types.FloatType):
+         return "float"
+     elif isinstance(spark_type, types.DoubleType):
+         return "double"
+     elif isinstance(spark_type, types.StructType):
+         return "struct"
+     elif isinstance(spark_type, types.ArrayType):
+         return "array"
+     elif isinstance(spark_type, types.MapType):
+         return "map"
+     elif isinstance(spark_type, types.TimestampType):
+         return "timestamp"
+     elif isinstance(spark_type, types.TimestampNTZType):
+         return "timestamp_ntz"
+     elif isinstance(spark_type, types.DateType):
+         return "date"
+     elif isinstance(spark_type, types.BooleanType):
+         return "boolean"
+     elif isinstance(spark_type, types.BinaryType):
+         return "bytes"
+     elif isinstance(spark_type, types.DecimalType):
+         return "decimal"
+     elif isinstance(spark_type, types.NullType):
+         return "null"
+     elif isinstance(spark_type, types.VarcharType):
+         return "varchar"
+     elif isinstance(spark_type, types.VariantType):
+         return "variant"
+     else:
+         raise ValueError(f"Unsupported Spark type: {spark_type}")
+
+
+ def _table_comment_from_spark(spark: SparkSession, source: str):
+     """
+     Attempts to retrieve the table-level comment from a Spark table using multiple fallback methods.
+
+     Args:
+         spark (SparkSession): The active Spark session.
+         source (str): The name of the table (without catalog or schema).
+
+     Returns:
+         str or None: The table-level comment, if found.
+     """
+
+     # Get Current Catalog and Schema from Spark Session
+     try:
+         current_catalog = spark.sql("SELECT current_catalog()").collect()[0][0]
+     except Exception:
+         current_catalog = "hive_metastore" # Fallback for non-Unity Catalog clusters
+     try:
+         current_schema = spark.catalog.currentDatabase()
+     except Exception:
+         current_schema = spark.sql("SELECT current_database()").collect()[0][0]
+
+     # Get table comment if it exists
+     table_comment = ""
+     source = f"{current_catalog}.{current_schema}.{source}"
+     try:
+         # Initialize WorkspaceClient for Unity Catalog API calls
+         workspace_client = WorkspaceClient()
+         created_table = workspace_client.tables.get(full_name=f"{source}")
+         table_comment = created_table.comment
+         logger.info(f"'{source}' table comment retrieved using 'WorkspaceClient.tables.get({source})'")
+         return table_comment
+     except Exception:
+         pass
+
+     # Fallback to Spark Catalog API for Hive Metastore or Non-UC Tables
+     try:
+         table_comment = spark.catalog.getTable(f"{source}").description
+         logger.info(f"'{source}' table comment retrieved using 'spark.catalog.getTable({source}).description'")
+         return table_comment
+     except Exception:
+         pass
+
+     # Final Fallback Using DESCRIBE TABLE EXTENDED
+     try:
+         rows = spark.sql(f"DESCRIBE TABLE EXTENDED {source}").collect()
+         for row in rows:
+             if row.col_name.strip().lower() == "comment":
+                 table_comment = row.data_type
+                 break
+         logger.info(f"'{source}' table comment retrieved using 'DESCRIBE TABLE EXTENDED {source}'")
+         return table_comment
+     except Exception:
+         pass
+
+     logger.info(f"{source} table comment could not be retrieved")
+
+     return None
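
The second hunk adds datacontract/imports/spark_importer.py, which maps a Spark schema (read from registered tables or from an in-memory DataFrame) onto data contract models, including a table comment lookup that falls back from the Unity Catalog API to the Spark catalog and finally to DESCRIBE TABLE EXTENDED. Below is a rough sketch (not part of the diff) of the DataFrame path; the model name "orders", the sample rows, and constructing DataContractSpecification with defaults are assumptions, and importing the module requires pyspark and databricks-sdk to be installed.

# Sketch only (not from the package): "orders" and the sample data are assumptions.
from pyspark.sql import SparkSession
from datacontract.imports.spark_importer import import_spark
from datacontract.model.data_contract_specification import DataContractSpecification

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1, "open")], ["order_id", "status"])

# Passing a DataFrame skips the table read; the explicit description skips the comment lookup.
spec = import_spark(DataContractSpecification(), "orders", dataframe=df, description="Orders exported from Spark")
# order_id is mapped to "long", status to "string"; a "local" server of type "dataframe" is also registered.
print(spec.models["orders"].fields)
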