datacontract-cli 0.10.34__py3-none-any.whl → 0.10.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39)
  1. datacontract/api.py +10 -3
  2. datacontract/cli.py +5 -3
  3. datacontract/data_contract.py +18 -51
  4. datacontract/engines/data_contract_checks.py +280 -19
  5. datacontract/engines/fastjsonschema/check_jsonschema.py +29 -19
  6. datacontract/export/dbt_converter.py +30 -4
  7. datacontract/export/dqx_converter.py +126 -0
  8. datacontract/export/excel_exporter.py +3 -3
  9. datacontract/export/exporter.py +1 -0
  10. datacontract/export/exporter_factory.py +6 -0
  11. datacontract/export/markdown_converter.py +35 -16
  12. datacontract/export/mermaid_exporter.py +24 -11
  13. datacontract/export/rdf_converter.py +2 -2
  14. datacontract/export/spark_converter.py +28 -3
  15. datacontract/export/sql_type_converter.py +6 -4
  16. datacontract/imports/odcs_v3_importer.py +100 -19
  17. datacontract/imports/unity_importer.py +16 -11
  18. datacontract/init/init_template.py +1 -1
  19. datacontract/lint/resolve.py +1 -1
  20. datacontract/lint/schema.py +1 -1
  21. datacontract/schemas/datacontract-1.1.0.init.yaml +1 -1
  22. datacontract/schemas/datacontract-1.2.0.init.yaml +1 -1
  23. datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
  24. datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
  25. datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
  26. datacontract/templates/datacontract_odcs.html +60 -41
  27. {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/METADATA +68 -56
  28. {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/RECORD +32 -35
  29. datacontract/lint/lint.py +0 -142
  30. datacontract/lint/linters/__init__.py +0 -0
  31. datacontract/lint/linters/description_linter.py +0 -33
  32. datacontract/lint/linters/field_pattern_linter.py +0 -34
  33. datacontract/lint/linters/field_reference_linter.py +0 -47
  34. datacontract/lint/linters/notice_period_linter.py +0 -55
  35. datacontract/lint/linters/valid_constraints_linter.py +0 -100
  36. {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/WHEEL +0 -0
  37. {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/entry_points.txt +0 -0
  38. {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/licenses/LICENSE +0 -0
  39. {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/top_level.txt +0 -0

datacontract/engines/fastjsonschema/check_jsonschema.py
@@ -1,8 +1,9 @@
+import glob
 import json
 import logging
 import os
 import threading
-from typing import List, Optional
+from typing import Any, Callable, Generator, List, Optional
 
 import fastjsonschema
 from fastjsonschema import JsonSchemaValueException
@@ -85,7 +86,7 @@ def process_exceptions(run, exceptions: List[DataContractException]):
 
 
 def validate_json_stream(
-    schema: dict, model_name: str, validate: callable, json_stream: list[dict]
+    schema: dict, model_name: str, validate: Callable, json_stream: Generator[Any, Any, None]
 ) -> List[DataContractException]:
     logging.info(f"Validating JSON stream for model: '{model_name}'.")
     exceptions: List[DataContractException] = []
@@ -99,7 +100,7 @@ def validate_json_stream(
                 DataContractException(
                     type="schema",
                     name="Check that JSON has valid schema",
-                    result="failed",
+                    result=ResultEnum.failed,
                     reason=f"{f'#{primary_key_value}: ' if primary_key_value is not None else ''}{e.message}",
                     model=model_name,
                     engine="jsonschema",
@@ -170,24 +171,33 @@ def process_local_file(run, server, schema, model_name, validate):
     if "{model}" in path:
         path = path.format(model=model_name)
 
+    all_files = []
     if os.path.isdir(path):
-        return process_directory(run, path, server, model_name, validate)
+        # Fetch all JSONs in the directory
+        for root, _, files in os.walk(path):
+            for file in files:
+                if file.endswith(".json"):
+                    all_files.append(os.path.join(root, file))
     else:
-        logging.info(f"Processing file {path}")
-        with open(path, "r") as file:
-            process_json_file(run, schema, model_name, validate, file, server.delimiter)
+        # Use glob to fetch all JSONs
+        for file_path in glob.glob(path, recursive=True):
+            if os.path.isfile(file_path):
+                if file_path.endswith(".json"):
+                    all_files.append(file_path)
 
+    if not all_files:
+        raise DataContractException(
+            type="schema",
+            name="Check that JSON has valid schema",
+            result=ResultEnum.warning,
+            reason=f"No files found in '{path}'.",
+            engine="datacontract",
+        )
 
-def process_directory(run, path, server, model_name, validate):
-    success = True
-    for filename in os.listdir(path):
-        if filename.endswith(".json"):  # or make this a parameter
-            file_path = os.path.join(path, filename)
-            with open(file_path, "r") as file:
-                if not process_json_file(run, model_name, validate, file, server.delimiter):
-                    success = False
-                    break
-    return success
+    for file in all_files:
+        logging.info(f"Processing file: {file}")
+        with open(file, "r") as f:
+            process_json_file(run, schema, model_name, validate, f, server.delimiter)
 
 
 def process_s3_file(run, server, schema, model_name, validate):
@@ -209,7 +219,7 @@ def process_s3_file(run, server, schema, model_name, validate):
         raise DataContractException(
             type="schema",
            name="Check that JSON has valid schema",
-            result="warning",
+            result=ResultEnum.warning,
            reason=f"Cannot find any file in {s3_location}",
            engine="datacontract",
        )
@@ -230,7 +240,7 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
             Check(
                 type="schema",
                 name="Check that JSON has valid schema",
-                result="warning",
+                result=ResultEnum.warning,
                 reason="Server format is not 'json'. Skip validating jsonschema.",
                 engine="jsonschema",
             )
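
Note (not part of the diff): both branches above feed the same list of candidate files. A minimal standalone sketch of the new selection behaviour, with illustrative paths:

import glob
import os

def collect_json_files(path: str) -> list[str]:
    # Directories are walked recursively; any other path is treated as a glob pattern.
    if os.path.isdir(path):
        return [
            os.path.join(root, name)
            for root, _, files in os.walk(path)
            for name in files
            if name.endswith(".json")
        ]
    return [p for p in glob.glob(path, recursive=True) if os.path.isfile(p) and p.endswith(".json")]

# collect_json_files("data/")           -> every *.json below data/, recursively
# collect_json_files("data/**/*.json")  -> the same selection expressed as a glob pattern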

datacontract/export/dbt_converter.py
@@ -115,9 +115,28 @@ def _to_dbt_model(
         dbt_model["config"]["contract"] = {"enforced": True}
     if model_value.description is not None:
         dbt_model["description"] = model_value.description.strip().replace("\n", " ")
-    columns = _to_columns(data_contract_spec, model_value.fields, _supports_constraints(model_type), adapter_type)
+
+    # Handle model-level primaryKey (before columns for better YAML ordering)
+    primary_key_columns = []
+    if hasattr(model_value, "primaryKey") and model_value.primaryKey:
+        if isinstance(model_value.primaryKey, list) and len(model_value.primaryKey) > 1:
+            # Multiple columns: use dbt_utils.unique_combination_of_columns
+            dbt_model["data_tests"] = [
+                {"dbt_utils.unique_combination_of_columns": {"combination_of_columns": model_value.primaryKey}}
+            ]
+        elif isinstance(model_value.primaryKey, list) and len(model_value.primaryKey) == 1:
+            # Single column: handle at column level (pass to _to_columns)
+            primary_key_columns = model_value.primaryKey
+        elif isinstance(model_value.primaryKey, str):
+            # Single column as string: handle at column level
+            primary_key_columns = [model_value.primaryKey]
+
+    columns = _to_columns(
+        data_contract_spec, model_value.fields, _supports_constraints(model_type), adapter_type, primary_key_columns
+    )
     if columns:
         dbt_model["columns"] = columns
+
     return dbt_model
 
 
@@ -143,10 +162,13 @@ def _to_columns(
     fields: Dict[str, Field],
     supports_constraints: bool,
     adapter_type: Optional[str],
+    primary_key_columns: Optional[list] = None,
 ) -> list:
     columns = []
+    primary_key_columns = primary_key_columns or []
     for field_name, field in fields.items():
-        column = _to_column(data_contract_spec, field_name, field, supports_constraints, adapter_type)
+        is_primary_key = field_name in primary_key_columns
+        column = _to_column(data_contract_spec, field_name, field, supports_constraints, adapter_type, is_primary_key)
         columns.append(column)
     return columns
 
@@ -164,6 +186,7 @@ def _to_column(
     field: Field,
     supports_constraints: bool,
     adapter_type: Optional[str],
+    is_primary_key: bool = False,
 ) -> dict:
     column = {"name": field_name}
     adapter_type = adapter_type or "snowflake"
@@ -178,12 +201,15 @@ def _to_column(
         )
     if field.description is not None:
         column["description"] = field.description.strip().replace("\n", " ")
-    if field.required:
+    # Handle required/not_null constraint
+    if field.required or is_primary_key:
         if supports_constraints:
             column.setdefault("constraints", []).append({"type": "not_null"})
         else:
             column["data_tests"].append("not_null")
-    if field.unique:
+
+    # Handle unique constraint
+    if field.unique or is_primary_key:
         if supports_constraints:
             column.setdefault("constraints", []).append({"type": "unique"})
         else:
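
Note (not part of the diff): a hedged sketch of the dict shapes this change produces, with invented model and column names. A multi-column primaryKey becomes a model-level dbt_utils.unique_combination_of_columns test, while a single primary-key column gains not_null and unique at the column level (as constraints when the adapter supports them, otherwise as data_tests):

# Composite primary key -> model-level test
composite_pk_model = {
    "name": "orders",
    "data_tests": [
        {
            "dbt_utils.unique_combination_of_columns": {
                "combination_of_columns": ["order_id", "line_number"]
            }
        }
    ],
}

# Single primary-key column -> column-level constraints on an adapter with constraint support
single_pk_column = {
    "name": "order_id",
    "constraints": [{"type": "not_null"}, {"type": "unique"}],
}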

datacontract/export/dqx_converter.py
@@ -0,0 +1,126 @@
+from typing import Any, Dict, List, Union
+
+import yaml
+
+from datacontract.export.exporter import Exporter, _check_models_for_export
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Quality
+
+
+class DqxKeys:
+    CHECK = "check"
+    ARGUMENTS = "arguments"
+    SPECIFICATION = "specification"
+    COL_NAME = "column"
+    COL_NAMES = "for_each_column"
+    COLUMNS = "columns"
+    FUNCTION = "function"
+
+
+class DqxExporter(Exporter):
+    """Exporter implementation for converting data contracts to DQX YAML file."""
+
+    def export(
+        self,
+        data_contract: DataContractSpecification,
+        model: Model,
+        server: str,
+        sql_server_type: str,
+        export_args: Dict[str, Any],
+    ) -> str:
+        """Exports a data contract to DQX format."""
+        model_name, model_value = _check_models_for_export(data_contract, model, self.export_format)
+        return to_dqx_yaml(model_value)
+
+
+def to_dqx_yaml(model_value: Model) -> str:
+    """
+    Converts the data contract's quality checks to DQX YAML format.
+
+    Args:
+        model_value (Model): The data contract to convert.
+
+    Returns:
+        str: YAML representation of the data contract's quality checks.
+    """
+    extracted_rules = extract_quality_rules(model_value)
+    return yaml.dump(extracted_rules, sort_keys=False, allow_unicode=True, default_flow_style=False)
+
+
+def process_quality_rule(rule: Quality, column_name: str) -> Dict[str, Any]:
+    """
+    Processes a single quality rule by injecting the column path into its arguments if absent.
+
+    Args:
+        rule (Quality): The quality rule to process.
+        column_name (str): The full path to the current column.
+
+    Returns:
+        dict: The processed quality rule specification.
+    """
+    rule_data = rule.model_extra
+    specification = rule_data[DqxKeys.SPECIFICATION]
+    check = specification[DqxKeys.CHECK]
+
+    if column_name:
+        arguments = check.setdefault(DqxKeys.ARGUMENTS, {})
+
+        if (
+            DqxKeys.COL_NAME not in arguments
+            and DqxKeys.COL_NAMES not in arguments
+            and DqxKeys.COLUMNS not in arguments
+        ):
+            if check[DqxKeys.FUNCTION] not in ("is_unique", "foreign_key"):
+                arguments[DqxKeys.COL_NAME] = column_name
+            else:
+                arguments[DqxKeys.COLUMNS] = [column_name]
+
+    return specification
+
+
+def extract_quality_rules(data: Union[Model, Field, Quality], column_path: str = "") -> List[Dict[str, Any]]:
+    """
+    Recursively extracts all quality rules from a data contract structure.
+
+    Args:
+        data (Union[Model, Field, Quality]): The data contract model, field, or quality rule.
+        column_path (str, optional): The current path in the schema hierarchy. Defaults to "".
+
+    Returns:
+        List[Dict[str, Any]]: A list of quality rule specifications.
+    """
+    quality_rules = []
+
+    if isinstance(data, Quality):
+        return [process_quality_rule(data, column_path)]
+
+    if isinstance(data, (Model, Field)):
+        for key, field in data.fields.items():
+            current_path = build_column_path(column_path, key)
+
+            if field.fields:
+                # Field is a struct-like object, recurse deeper
+                quality_rules.extend(extract_quality_rules(field, current_path))
+            else:
+                # Process quality rules at leaf fields
+                for rule in field.quality:
+                    quality_rules.append(process_quality_rule(rule, current_path))
+
+    # Process any quality rules attached directly to this level
+    for rule in data.quality:
+        quality_rules.append(process_quality_rule(rule, column_path))
+
+    return quality_rules
+
+
+def build_column_path(current_path: str, key: str) -> str:
+    """
+    Builds the full column path by concatenating parent path with current key.
+
+    Args:
+        current_path (str): The current path prefix.
+        key (str): The current field's key.
+
+    Returns:
+        str: The full path.
+    """
+    return f"{current_path}.{key}" if current_path else key
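
Note (not part of the diff): a rough usage sketch for the new exporter. The model and rule below are invented, and the exact Quality constructor may differ; the DQX specification block is read from the rule's extra attributes, flattened into a list of checks, and the column argument is filled in when missing:

from datacontract.export.dqx_converter import to_dqx_yaml
from datacontract.model.data_contract_specification import Field, Model, Quality

# Hypothetical model with a single field carrying one DQX rule.
model = Model(
    fields={
        "order_id": Field(
            type="string",
            quality=[Quality(type="custom", specification={"check": {"function": "is_not_null"}})],
        )
    }
)

print(to_dqx_yaml(model))
# Expected shape of the output (approximate):
# - check:
#     function: is_not_null
#     arguments:
#       column: order_id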

datacontract/export/excel_exporter.py
@@ -283,7 +283,7 @@ def fill_single_property_template(
     sheet: Worksheet, row_index: int, prefix: str, property: SchemaProperty, header_map: dict
 ) -> int:
     """Fill a single property row using the template's column structure"""
-    property_name = f"{prefix}.{property.name}" if prefix else property.name
+    property_name = f"{prefix}{'.' + property.name if property.name else ''}" if prefix else property.name
 
     # Helper function to set cell value by header name
     def set_by_header(header_name: str, value: Any):
@@ -307,7 +307,7 @@ def fill_single_property_template(
     set_by_header("Classification", property.classification)
     set_by_header("Tags", ",".join(property.tags) if property.tags else "")
     set_by_header(
-        "Example(s)", ",".join(property.examples) if property.examples else ""
+        "Example(s)", ",".join(map(str, property.examples)) if property.examples else ""
     )  # Note: using "Example(s)" as in template
     set_by_header("Encrypted Name", property.encryptedName)
     set_by_header(
@@ -404,7 +404,7 @@ def fill_properties_quality(
         if not property.name:
             continue
 
-        full_property_name = f"{prefix}.{property.name}" if prefix else property.name
+        full_property_name = f"{prefix}{'.' + property.name if property.name else ''}" if prefix else property.name
 
        # Add quality attributes for this property
        if property.quality:

datacontract/export/exporter.py
@@ -46,6 +46,7 @@ class ExportFormat(str, Enum):
     iceberg = "iceberg"
     custom = "custom"
     excel = "excel"
+    dqx = "dqx"
 
     @classmethod
     def get_supported_formats(cls):

datacontract/export/exporter_factory.py
@@ -197,6 +197,12 @@ exporter_factory.register_lazy_exporter(
     class_name="MarkdownExporter",
 )
 
+exporter_factory.register_lazy_exporter(
+    name=ExportFormat.dqx,
+    module_path="datacontract.export.dqx_converter",
+    class_name="DqxExporter",
+)
+
 exporter_factory.register_lazy_exporter(
     name=ExportFormat.iceberg, module_path="datacontract.export.iceberg_converter", class_name="IcebergExporter"
 )

datacontract/export/markdown_converter.py
@@ -82,7 +82,7 @@ def obj_attributes_to_markdown(obj: BaseModel, excluded_fields: set = set(), is_
         if value
     ]
     description = f"*{description_to_markdown(description_value)}*"
-    extra = [extra_to_markdown(obj)] if obj.model_extra else []
+    extra = [extra_to_markdown(obj, is_in_table_cell)] if obj.model_extra else []
     return newline_char.join([description] + attributes + extra)
 
 
@@ -293,26 +293,45 @@ def dict_to_markdown(dictionary: Dict[str, str]) -> str:
     return "\n".join(markdown_parts) + "\n"
 
 
-def extra_to_markdown(obj: BaseModel) -> str:
+def extra_to_markdown(obj: BaseModel, is_in_table_cell: bool = False) -> str:
     """
     Convert the extra attributes of a data contract to Markdown format.
     Args:
         obj (BaseModel): The data contract object containing extra attributes.
+        is_in_table_cell (bool): Whether the extra attributes are in a table cell.
     Returns:
         str: A Markdown formatted string representing the extra attributes of the data contract.
     """
-    markdown_part = ""
     extra = obj.model_extra
-    if extra:
-        for key_extra, value_extra in extra.items():
-            markdown_part += f"\n### {key_extra.capitalize()}\n"
-            if isinstance(value_extra, list) and len(value_extra):
-                if isinstance(value_extra[0], dict):
-                    markdown_part += array_of_dict_to_markdown(value_extra)
-                elif isinstance(value_extra[0], str):
-                    markdown_part += array_to_markdown(value_extra)
-            elif isinstance(value_extra, dict):
-                markdown_part += dict_to_markdown(value_extra)
-            else:
-                markdown_part += f"{str(value_extra)}\n"
-    return markdown_part
+
+    if not extra:
+        return ""
+
+    bullet_char = "•"
+    value_line_ending = "" if is_in_table_cell else "\n"
+    row_suffix = "<br>" if is_in_table_cell else ""
+
+    def render_header(key: str) -> str:
+        return f"{bullet_char} **{key}:** " if is_in_table_cell else f"\n### {key.capitalize()}\n"
+
+    parts: list[str] = []
+    for key_extra, value_extra in extra.items():
+        if not value_extra:
+            continue
+
+        parts.append(render_header(key_extra))
+
+        if isinstance(value_extra, list) and len(value_extra):
+            if isinstance(value_extra[0], dict):
+                parts.append(array_of_dict_to_markdown(value_extra))
+            elif isinstance(value_extra[0], str):
+                parts.append(array_to_markdown(value_extra))
+        elif isinstance(value_extra, dict):
+            parts.append(dict_to_markdown(value_extra))
+        else:
+            parts.append(f"{str(value_extra)}{value_line_ending}")
+
+        if row_suffix:
+            parts.append(row_suffix)
+
+    return "".join(parts)

datacontract/export/mermaid_exporter.py
@@ -27,31 +27,33 @@ def dcs_to_mermaid(data_contract_spec: DataContractSpecification) -> str | None:
         mmd_references = []
 
         for model_name, model in data_contract_spec.models.items():
+            clean_model = _sanitize_name(model_name)
             entity_block = ""
 
             for field_name, field in model.fields.items():
                 clean_name = _sanitize_name(field_name)
-                indicators = ""
+                field_type = field.type or "unknown"
 
-                if field.primaryKey or (field.unique and field.required):
-                    indicators += "🔑"
-                if field.references:
-                    indicators += "⌘"
+                is_pk = bool(field.primaryKey or (field.unique and field.required))
+                is_fk = bool(field.references)
 
-                field_type = field.type or "unknown"
-                entity_block += f"\t{clean_name}{indicators} {field_type}\n"
+                entity_block += _field_line(clean_name, field_type, pk=is_pk, uk=bool(field.unique), fk=is_fk)
 
                 if field.references:
-                    referenced_model = field.references.split(".")[0] if "." in field.references else ""
+                    references = field.references.replace(".", "·")
+                    parts = references.split("·")
+                    referenced_model = _sanitize_name(parts[0]) if len(parts) > 0 else ""
+                    referenced_field = _sanitize_name(parts[1]) if len(parts) > 1 else ""
                     if referenced_model:
-                        mmd_references.append(f'"📑{referenced_model}"' + "}o--{ ||" + f'"📑{model_name}"')
+                        label = referenced_field or clean_name
+                        mmd_references.append(f'"**{referenced_model}**" ||--o{{ "**{clean_model}**" : {label}')
 
-            mmd_entity += f'\t"**{model_name}**"' + "{\n" + entity_block + "}\n"
+            mmd_entity += f'\t"**{clean_model}**" {{\n{entity_block}}}\n'
 
         if mmd_references:
             mmd_entity += "\n" + "\n".join(mmd_references)
 
-        return f"{mmd_entity}\n"
+        return mmd_entity + "\n"
 
     except Exception as e:
         print(f"Error generating DCS mermaid diagram: {e}")
@@ -95,3 +97,14 @@ def odcs_to_mermaid(data_contract_spec: OpenDataContractStandard) -> str | None:
 
 def _sanitize_name(name: str) -> str:
     return name.replace("#", "Nb").replace(" ", "_").replace("/", "by")
+
+
+def _field_line(name: str, field_type: str, pk: bool = False, uk: bool = False, fk: bool = False) -> str:
+    indicators = ""
+    if pk:
+        indicators += "🔑"
+    if uk:
+        indicators += "🔒"
+    if fk:
+        indicators += "⌘"
+    return f"\t{name}{indicators} {field_type}\n"

datacontract/export/rdf_converter.py
@@ -57,8 +57,8 @@ def to_rdf(data_contract_spec: DataContractSpecification, base) -> Graph:
     else:
         g = Graph(base=Namespace(""))
 
-    dc = Namespace("https://datacontract.com/DataContractSpecification/1.2.0/")
-    dcx = Namespace("https://datacontract.com/DataContractSpecification/1.2.0/Extension/")
+    dc = Namespace("https://datacontract.com/DataContractSpecification/1.2.1/")
+    dcx = Namespace("https://datacontract.com/DataContractSpecification/1.2.1/Extension/")
 
     g.bind("dc", dc)
     g.bind("dcx", dcx)

datacontract/export/spark_converter.py
@@ -1,3 +1,5 @@
+import json
+
 from pyspark.sql import types
 
 from datacontract.export.exporter import Exporter
@@ -104,7 +106,8 @@ def to_struct_field(field: Field, field_name: str) -> types.StructField:
         types.StructField: The corresponding Spark StructField.
     """
     data_type = to_spark_data_type(field)
-    return types.StructField(name=field_name, dataType=data_type, nullable=not field.required)
+    metadata = to_spark_metadata(field)
+    return types.StructField(name=field_name, dataType=data_type, nullable=not field.required, metadata=metadata)
 
 
 def to_spark_data_type(field: Field) -> types.DataType:
@@ -152,7 +155,25 @@ def to_spark_data_type(field: Field) -> types.DataType:
         return types.DateType()
     if field_type == "bytes":
         return types.BinaryType()
-    return types.StringType() # default if no condition is met
+    return types.StringType()  # default if no condition is met
+
+
+def to_spark_metadata(field: Field) -> dict[str, str]:
+    """
+    Convert a field to a Spark metadata dictonary.
+
+    Args:
+        field (Field): The field to convert.
+
+    Returns:
+        dict: dictionary that can be supplied to Spark as metadata for a StructField
+    """
+
+    metadata = {}
+    if field.description:
+        metadata["comment"] = field.description
+
+    return metadata
 
 
 def print_schema(dtype: types.DataType) -> str:
@@ -192,7 +213,11 @@ def print_schema(dtype: types.DataType) -> str:
         name = f'"{column.name}"'
         data_type = indent(print_schema(column.dataType), 1)
         nullable = indent(f"{column.nullable}", 1)
-        return f"StructField({name},\n{data_type},\n{nullable}\n)"
+        if column.metadata:
+            metadata = indent(f"{json.dumps(column.metadata)}", 1)
+            return f"StructField({name},\n{data_type},\n{nullable},\n{metadata}\n)"
+        else:
+            return f"StructField({name},\n{data_type},\n{nullable}\n)"
 
     def format_struct_type(struct_type: types.StructType) -> str:
         """

datacontract/export/sql_type_converter.py
@@ -133,8 +133,9 @@ def convert_to_dataframe(field: Field) -> None | str:
     if type.lower() in ["time"]:
         return "STRING"
     if type.lower() in ["number", "decimal", "numeric"]:
-        # precision and scale not supported by data contract
-        return "DECIMAL"
+        precision = field.precision if field.precision is not None else 38
+        scale = field.scale if field.scale is not None else 0
+        return f"DECIMAL({precision},{scale})"
     if type.lower() in ["float"]:
         return "FLOAT"
     if type.lower() in ["double"]:
@@ -182,8 +183,9 @@ def convert_to_databricks(field: Field) -> None | str:
     if type.lower() in ["time"]:
         return "STRING"
     if type.lower() in ["number", "decimal", "numeric"]:
-        # precision and scale not supported by data contract
-        return "DECIMAL"
+        precision = field.precision if field.precision is not None else 38
+        scale = field.scale if field.scale is not None else 0
+        return f"DECIMAL({precision},{scale})"
     if type.lower() in ["float"]:
         return "FLOAT"
     if type.lower() in ["double"]:
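
Note (not part of the diff): a minimal sketch of the new decimal mapping; the contract's precision and scale are used when present, otherwise the result falls back to DECIMAL(38,0):

def decimal_type(precision: int | None, scale: int | None) -> str:
    # Mirrors the new mapping: explicit precision/scale when set, DECIMAL(38,0) otherwise.
    return f"DECIMAL({precision if precision is not None else 38},{scale if scale is not None else 0})"

assert decimal_type(10, 2) == "DECIMAL(10,2)"
assert decimal_type(None, None) == "DECIMAL(38,0)"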