datacontract-cli 0.10.34__py3-none-any.whl → 0.10.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: The registry flagged this version of datacontract-cli as potentially problematic.
- datacontract/api.py +10 -3
- datacontract/cli.py +5 -3
- datacontract/data_contract.py +18 -51
- datacontract/engines/data_contract_checks.py +280 -19
- datacontract/engines/fastjsonschema/check_jsonschema.py +29 -19
- datacontract/export/dbt_converter.py +30 -4
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/excel_exporter.py +3 -3
- datacontract/export/exporter.py +1 -0
- datacontract/export/exporter_factory.py +6 -0
- datacontract/export/markdown_converter.py +35 -16
- datacontract/export/mermaid_exporter.py +24 -11
- datacontract/export/rdf_converter.py +2 -2
- datacontract/export/spark_converter.py +28 -3
- datacontract/export/sql_type_converter.py +6 -4
- datacontract/imports/odcs_v3_importer.py +100 -19
- datacontract/imports/unity_importer.py +16 -11
- datacontract/init/init_template.py +1 -1
- datacontract/lint/resolve.py +1 -1
- datacontract/lint/schema.py +1 -1
- datacontract/schemas/datacontract-1.1.0.init.yaml +1 -1
- datacontract/schemas/datacontract-1.2.0.init.yaml +1 -1
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/templates/datacontract_odcs.html +60 -41
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/METADATA +68 -56
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/RECORD +32 -35
- datacontract/lint/lint.py +0 -142
- datacontract/lint/linters/__init__.py +0 -0
- datacontract/lint/linters/description_linter.py +0 -33
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -47
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/valid_constraints_linter.py +0 -100
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/WHEEL +0 -0
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/licenses/LICENSE +0 -0
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/top_level.txt +0 -0
datacontract/engines/fastjsonschema/check_jsonschema.py CHANGED

```diff
@@ -1,8 +1,9 @@
+import glob
 import json
 import logging
 import os
 import threading
-from typing import List, Optional
+from typing import Any, Callable, Generator, List, Optional
 
 import fastjsonschema
 from fastjsonschema import JsonSchemaValueException
@@ -85,7 +86,7 @@ def process_exceptions(run, exceptions: List[DataContractException]):
 
 
 def validate_json_stream(
-    schema: dict, model_name: str, validate:
+    schema: dict, model_name: str, validate: Callable, json_stream: Generator[Any, Any, None]
 ) -> List[DataContractException]:
     logging.info(f"Validating JSON stream for model: '{model_name}'.")
     exceptions: List[DataContractException] = []
@@ -99,7 +100,7 @@ def validate_json_stream(
             DataContractException(
                 type="schema",
                 name="Check that JSON has valid schema",
-                result=
+                result=ResultEnum.failed,
                 reason=f"{f'#{primary_key_value}: ' if primary_key_value is not None else ''}{e.message}",
                 model=model_name,
                 engine="jsonschema",
@@ -170,24 +171,33 @@ def process_local_file(run, server, schema, model_name, validate):
     if "{model}" in path:
         path = path.format(model=model_name)
 
+    all_files = []
     if os.path.isdir(path):
-
+        # Fetch all JSONs in the directory
+        for root, _, files in os.walk(path):
+            for file in files:
+                if file.endswith(".json"):
+                    all_files.append(os.path.join(root, file))
     else:
-
-
-
+        # Use glob to fetch all JSONs
+        for file_path in glob.glob(path, recursive=True):
+            if os.path.isfile(file_path):
+                if file_path.endswith(".json"):
+                    all_files.append(file_path)
 
+    if not all_files:
+        raise DataContractException(
+            type="schema",
+            name="Check that JSON has valid schema",
+            result=ResultEnum.warning,
+            reason=f"No files found in '{path}'.",
+            engine="datacontract",
+        )
 
-
-
-
-
-        file_path = os.path.join(path, filename)
-        with open(file_path, "r") as file:
-            if not process_json_file(run, model_name, validate, file, server.delimiter):
-                success = False
-                break
-    return success
+    for file in all_files:
+        logging.info(f"Processing file: {file}")
+        with open(file, "r") as f:
+            process_json_file(run, schema, model_name, validate, f, server.delimiter)
 
 
 def process_s3_file(run, server, schema, model_name, validate):
@@ -209,7 +219,7 @@ def process_s3_file(run, server, schema, model_name, validate):
         raise DataContractException(
             type="schema",
            name="Check that JSON has valid schema",
-            result=
+            result=ResultEnum.warning,
            reason=f"Cannot find any file in {s3_location}",
            engine="datacontract",
        )
@@ -230,7 +240,7 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
            Check(
                type="schema",
                name="Check that JSON has valid schema",
-                result=
+                result=ResultEnum.warning,
                reason="Server format is not 'json'. Skip validating jsonschema.",
                engine="jsonschema",
            )
```
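With this change, a local server `path` can point either to a directory (walked recursively for `*.json` files) or to a glob pattern. The following is a minimal, self-contained sketch that mirrors the collection logic added above; the `data/` paths are made up for illustration:

```python
import glob
import os

def collect_json_files(path: str) -> list[str]:
    all_files = []
    if os.path.isdir(path):
        # Directory: walk it recursively and keep every .json file
        for root, _, files in os.walk(path):
            all_files += [os.path.join(root, f) for f in files if f.endswith(".json")]
    else:
        # Anything else is treated as a glob pattern
        all_files += [
            p for p in glob.glob(path, recursive=True)
            if os.path.isfile(p) and p.endswith(".json")
        ]
    return all_files

# Hypothetical inputs; the real value comes from the server definition in the data contract.
print(collect_json_files("data/orders"))       # directory
print(collect_json_files("data/**/*.json"))    # glob pattern
```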
datacontract/export/dbt_converter.py CHANGED

```diff
@@ -115,9 +115,28 @@ def _to_dbt_model(
         dbt_model["config"]["contract"] = {"enforced": True}
     if model_value.description is not None:
         dbt_model["description"] = model_value.description.strip().replace("\n", " ")
-
+
+    # Handle model-level primaryKey (before columns for better YAML ordering)
+    primary_key_columns = []
+    if hasattr(model_value, "primaryKey") and model_value.primaryKey:
+        if isinstance(model_value.primaryKey, list) and len(model_value.primaryKey) > 1:
+            # Multiple columns: use dbt_utils.unique_combination_of_columns
+            dbt_model["data_tests"] = [
+                {"dbt_utils.unique_combination_of_columns": {"combination_of_columns": model_value.primaryKey}}
+            ]
+        elif isinstance(model_value.primaryKey, list) and len(model_value.primaryKey) == 1:
+            # Single column: handle at column level (pass to _to_columns)
+            primary_key_columns = model_value.primaryKey
+        elif isinstance(model_value.primaryKey, str):
+            # Single column as string: handle at column level
+            primary_key_columns = [model_value.primaryKey]
+
+    columns = _to_columns(
+        data_contract_spec, model_value.fields, _supports_constraints(model_type), adapter_type, primary_key_columns
+    )
     if columns:
         dbt_model["columns"] = columns
+
     return dbt_model
 
 
@@ -143,10 +162,13 @@ def _to_columns(
     fields: Dict[str, Field],
     supports_constraints: bool,
     adapter_type: Optional[str],
+    primary_key_columns: Optional[list] = None,
 ) -> list:
     columns = []
+    primary_key_columns = primary_key_columns or []
     for field_name, field in fields.items():
-
+        is_primary_key = field_name in primary_key_columns
+        column = _to_column(data_contract_spec, field_name, field, supports_constraints, adapter_type, is_primary_key)
         columns.append(column)
     return columns
 
@@ -164,6 +186,7 @@ def _to_column(
     field: Field,
     supports_constraints: bool,
     adapter_type: Optional[str],
+    is_primary_key: bool = False,
 ) -> dict:
     column = {"name": field_name}
     adapter_type = adapter_type or "snowflake"
@@ -178,12 +201,15 @@ def _to_column(
     )
     if field.description is not None:
         column["description"] = field.description.strip().replace("\n", " ")
-
+    # Handle required/not_null constraint
+    if field.required or is_primary_key:
         if supports_constraints:
             column.setdefault("constraints", []).append({"type": "not_null"})
         else:
             column["data_tests"].append("not_null")
-
+
+    # Handle unique constraint
+    if field.unique or is_primary_key:
         if supports_constraints:
             column.setdefault("constraints", []).append({"type": "unique"})
         else:
```
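In practice this means a model with a composite primaryKey gets a model-level `dbt_utils.unique_combination_of_columns` test, while a single-column key is pushed down to the column as not_null/unique. A sketch of the resulting structures (the model and column names are invented):

```python
# Hypothetical model "orders" with primaryKey: [order_id, line_no]
dbt_model = {
    "name": "orders",
    "data_tests": [
        {"dbt_utils.unique_combination_of_columns": {"combination_of_columns": ["order_id", "line_no"]}}
    ],
}

# Hypothetical single-column primaryKey "order_id" on an adapter that supports constraints
column = {
    "name": "order_id",
    "constraints": [{"type": "not_null"}, {"type": "unique"}],
}

print(dbt_model)
print(column)
```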
datacontract/export/dqx_converter.py ADDED

```diff
@@ -0,0 +1,126 @@
+from typing import Any, Dict, List, Union
+
+import yaml
+
+from datacontract.export.exporter import Exporter, _check_models_for_export
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Quality
+
+
+class DqxKeys:
+    CHECK = "check"
+    ARGUMENTS = "arguments"
+    SPECIFICATION = "specification"
+    COL_NAME = "column"
+    COL_NAMES = "for_each_column"
+    COLUMNS = "columns"
+    FUNCTION = "function"
+
+
+class DqxExporter(Exporter):
+    """Exporter implementation for converting data contracts to DQX YAML file."""
+
+    def export(
+        self,
+        data_contract: DataContractSpecification,
+        model: Model,
+        server: str,
+        sql_server_type: str,
+        export_args: Dict[str, Any],
+    ) -> str:
+        """Exports a data contract to DQX format."""
+        model_name, model_value = _check_models_for_export(data_contract, model, self.export_format)
+        return to_dqx_yaml(model_value)
+
+
+def to_dqx_yaml(model_value: Model) -> str:
+    """
+    Converts the data contract's quality checks to DQX YAML format.
+
+    Args:
+        model_value (Model): The data contract to convert.
+
+    Returns:
+        str: YAML representation of the data contract's quality checks.
+    """
+    extracted_rules = extract_quality_rules(model_value)
+    return yaml.dump(extracted_rules, sort_keys=False, allow_unicode=True, default_flow_style=False)
+
+
+def process_quality_rule(rule: Quality, column_name: str) -> Dict[str, Any]:
+    """
+    Processes a single quality rule by injecting the column path into its arguments if absent.
+
+    Args:
+        rule (Quality): The quality rule to process.
+        column_name (str): The full path to the current column.
+
+    Returns:
+        dict: The processed quality rule specification.
+    """
+    rule_data = rule.model_extra
+    specification = rule_data[DqxKeys.SPECIFICATION]
+    check = specification[DqxKeys.CHECK]
+
+    if column_name:
+        arguments = check.setdefault(DqxKeys.ARGUMENTS, {})
+
+        if (
+            DqxKeys.COL_NAME not in arguments
+            and DqxKeys.COL_NAMES not in arguments
+            and DqxKeys.COLUMNS not in arguments
+        ):
+            if check[DqxKeys.FUNCTION] not in ("is_unique", "foreign_key"):
+                arguments[DqxKeys.COL_NAME] = column_name
+            else:
+                arguments[DqxKeys.COLUMNS] = [column_name]
+
+    return specification
+
+
+def extract_quality_rules(data: Union[Model, Field, Quality], column_path: str = "") -> List[Dict[str, Any]]:
+    """
+    Recursively extracts all quality rules from a data contract structure.
+
+    Args:
+        data (Union[Model, Field, Quality]): The data contract model, field, or quality rule.
+        column_path (str, optional): The current path in the schema hierarchy. Defaults to "".
+
+    Returns:
+        List[Dict[str, Any]]: A list of quality rule specifications.
+    """
+    quality_rules = []
+
+    if isinstance(data, Quality):
+        return [process_quality_rule(data, column_path)]
+
+    if isinstance(data, (Model, Field)):
+        for key, field in data.fields.items():
+            current_path = build_column_path(column_path, key)
+
+            if field.fields:
+                # Field is a struct-like object, recurse deeper
+                quality_rules.extend(extract_quality_rules(field, current_path))
+            else:
+                # Process quality rules at leaf fields
+                for rule in field.quality:
+                    quality_rules.append(process_quality_rule(rule, current_path))
+
+        # Process any quality rules attached directly to this level
+        for rule in data.quality:
+            quality_rules.append(process_quality_rule(rule, column_path))
+
+    return quality_rules
+
+
+def build_column_path(current_path: str, key: str) -> str:
+    """
+    Builds the full column path by concatenating parent path with current key.
+
+    Args:
+        current_path (str): The current path prefix.
+        key (str): The current field's key.
+
+    Returns:
+        str: The full path.
+    """
+    return f"{current_path}.{key}" if current_path else key
```
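The converter walks every field, takes each quality rule's `specification` block, and injects the field's path when the check has no explicit column argument (`columns` for `is_unique`/`foreign_key`, `column` otherwise). Below is a standalone sketch of that injection on a plain dict; the rule content and the `order_id` path are invented for illustration and mirror what `process_quality_rule` does:

```python
# Hypothetical DQX quality rule as it might appear under a field's `quality` list.
specification = {
    "check": {
        "function": "is_not_null",
        # no "column"/"columns"/"for_each_column" argument yet
    },
    "name": "order_id_not_null",
    "criticality": "error",
}

check = specification["check"]
arguments = check.setdefault("arguments", {})
if not ({"column", "columns", "for_each_column"} & arguments.keys()):
    if check["function"] not in ("is_unique", "foreign_key"):
        arguments["column"] = "order_id"      # leaf field path
    else:
        arguments["columns"] = ["order_id"]

print(specification)
# {'check': {'function': 'is_not_null', 'arguments': {'column': 'order_id'}},
#  'name': 'order_id_not_null', 'criticality': 'error'}
```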
datacontract/export/excel_exporter.py CHANGED

```diff
@@ -283,7 +283,7 @@ def fill_single_property_template(
     sheet: Worksheet, row_index: int, prefix: str, property: SchemaProperty, header_map: dict
 ) -> int:
     """Fill a single property row using the template's column structure"""
-    property_name = f"{prefix}.
+    property_name = f"{prefix}{'.' + property.name if property.name else ''}" if prefix else property.name
 
     # Helper function to set cell value by header name
     def set_by_header(header_name: str, value: Any):
@@ -307,7 +307,7 @@ def fill_single_property_template(
     set_by_header("Classification", property.classification)
     set_by_header("Tags", ",".join(property.tags) if property.tags else "")
     set_by_header(
-        "Example(s)", ",".join(property.examples) if property.examples else ""
+        "Example(s)", ",".join(map(str, property.examples)) if property.examples else ""
     )  # Note: using "Example(s)" as in template
     set_by_header("Encrypted Name", property.encryptedName)
     set_by_header(
@@ -404,7 +404,7 @@ def fill_properties_quality(
         if not property.name:
             continue
 
-        full_property_name = f"{prefix}.
+        full_property_name = f"{prefix}{'.' + property.name if property.name else ''}" if prefix else property.name
 
         # Add quality attributes for this property
         if property.quality:
```
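The Example(s) cell previously failed when a property's examples contained non-string values; mapping them through `str()` makes the join robust. A one-line illustration with made-up example values:

```python
examples = [42, 3.14, "draft", True]  # hypothetical mixed-type examples
print(",".join(map(str, examples)))   # "42,3.14,draft,True" — plain str.join would raise TypeError
```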
datacontract/export/exporter.py CHANGED
datacontract/export/exporter_factory.py CHANGED

```diff
@@ -197,6 +197,12 @@ exporter_factory.register_lazy_exporter(
     class_name="MarkdownExporter",
 )
 
+exporter_factory.register_lazy_exporter(
+    name=ExportFormat.dqx,
+    module_path="datacontract.export.dqx_converter",
+    class_name="DqxExporter",
+)
+
 exporter_factory.register_lazy_exporter(
     name=ExportFormat.iceberg, module_path="datacontract.export.iceberg_converter", class_name="IcebergExporter"
 )
```
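The DQX exporter is registered lazily, so the converter module is imported only when that format is actually requested. The following is a generic sketch of this lazy-registration pattern, not the package's own factory code; the registry helpers below are invented for illustration:

```python
import importlib
from typing import Callable, Dict

_registry: Dict[str, Callable[[], type]] = {}

def register_lazy(name: str, module_path: str, class_name: str) -> None:
    # Store a factory that imports the module only on first use.
    _registry[name] = lambda: getattr(importlib.import_module(module_path), class_name)

def lookup(name: str) -> type:
    # Triggers the deferred import and returns the exporter class.
    return _registry[name]()

register_lazy("dqx", "datacontract.export.dqx_converter", "DqxExporter")
# lookup("dqx") would import dqx_converter and return the DqxExporter class.
```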
datacontract/export/markdown_converter.py CHANGED

```diff
@@ -82,7 +82,7 @@ def obj_attributes_to_markdown(obj: BaseModel, excluded_fields: set = set(), is_
         if value
     ]
     description = f"*{description_to_markdown(description_value)}*"
-    extra = [extra_to_markdown(obj)] if obj.model_extra else []
+    extra = [extra_to_markdown(obj, is_in_table_cell)] if obj.model_extra else []
     return newline_char.join([description] + attributes + extra)
 
 
@@ -293,26 +293,45 @@ def dict_to_markdown(dictionary: Dict[str, str]) -> str:
     return "\n".join(markdown_parts) + "\n"
 
 
-def extra_to_markdown(obj: BaseModel) -> str:
+def extra_to_markdown(obj: BaseModel, is_in_table_cell: bool = False) -> str:
     """
     Convert the extra attributes of a data contract to Markdown format.
     Args:
         obj (BaseModel): The data contract object containing extra attributes.
+        is_in_table_cell (bool): Whether the extra attributes are in a table cell.
     Returns:
         str: A Markdown formatted string representing the extra attributes of the data contract.
     """
-    markdown_part = ""
     extra = obj.model_extra
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    if not extra:
+        return ""
+
+    bullet_char = "•"
+    value_line_ending = "" if is_in_table_cell else "\n"
+    row_suffix = "<br>" if is_in_table_cell else ""
+
+    def render_header(key: str) -> str:
+        return f"{bullet_char} **{key}:** " if is_in_table_cell else f"\n### {key.capitalize()}\n"
+
+    parts: list[str] = []
+    for key_extra, value_extra in extra.items():
+        if not value_extra:
+            continue
+
+        parts.append(render_header(key_extra))
+
+        if isinstance(value_extra, list) and len(value_extra):
+            if isinstance(value_extra[0], dict):
+                parts.append(array_of_dict_to_markdown(value_extra))
+            elif isinstance(value_extra[0], str):
+                parts.append(array_to_markdown(value_extra))
+        elif isinstance(value_extra, dict):
+            parts.append(dict_to_markdown(value_extra))
+        else:
+            parts.append(f"{str(value_extra)}{value_line_ending}")
+
+        if row_suffix:
+            parts.append(row_suffix)
+
+    return "".join(parts)
```
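In table-cell mode the extras are rendered as inline bullets terminated with `<br>`, while the standalone rendering keeps a `###` heading per key. A small sketch of the two outputs for one made-up extra attribute, mirroring the header and terminator choices above:

```python
def render_extra(key: str, value: str, is_in_table_cell: bool) -> str:
    # Mirrors the choices made in extra_to_markdown for a simple scalar value
    header = f"• **{key}:** " if is_in_table_cell else f"\n### {key.capitalize()}\n"
    line_ending = "" if is_in_table_cell else "\n"
    row_suffix = "<br>" if is_in_table_cell else ""
    return f"{header}{value}{line_ending}{row_suffix}"

print(render_extra("owner", "checkout-team", is_in_table_cell=True))
# • **owner:** checkout-team<br>
print(render_extra("owner", "checkout-team", is_in_table_cell=False))
# ### Owner
# checkout-team
```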
datacontract/export/mermaid_exporter.py CHANGED

```diff
@@ -27,31 +27,33 @@ def dcs_to_mermaid(data_contract_spec: DataContractSpecification) -> str | None:
         mmd_references = []
 
         for model_name, model in data_contract_spec.models.items():
+            clean_model = _sanitize_name(model_name)
             entity_block = ""
 
             for field_name, field in model.fields.items():
                 clean_name = _sanitize_name(field_name)
-
+                field_type = field.type or "unknown"
 
-
-
-                if field.references:
-                    indicators += "⌘"
+                is_pk = bool(field.primaryKey or (field.unique and field.required))
+                is_fk = bool(field.references)
 
-                field_type = field.
-                entity_block += f"\t{clean_name}{indicators} {field_type}\n"
+                entity_block += _field_line(clean_name, field_type, pk=is_pk, uk=bool(field.unique), fk=is_fk)
 
                 if field.references:
-
+                    references = field.references.replace(".", "·")
+                    parts = references.split("·")
+                    referenced_model = _sanitize_name(parts[0]) if len(parts) > 0 else ""
+                    referenced_field = _sanitize_name(parts[1]) if len(parts) > 1 else ""
                     if referenced_model:
-
+                        label = referenced_field or clean_name
+                        mmd_references.append(f'"**{referenced_model}**" ||--o{{ "**{clean_model}**" : {label}')
 
-            mmd_entity += f'\t"**{
+            mmd_entity += f'\t"**{clean_model}**" {{\n{entity_block}}}\n'
 
         if mmd_references:
             mmd_entity += "\n" + "\n".join(mmd_references)
 
-        return
+        return mmd_entity + "\n"
 
     except Exception as e:
         print(f"Error generating DCS mermaid diagram: {e}")
@@ -95,3 +97,14 @@ def odcs_to_mermaid(data_contract_spec: OpenDataContractStandard) -> str | None:
 
 def _sanitize_name(name: str) -> str:
     return name.replace("#", "Nb").replace(" ", "_").replace("/", "by")
+
+
+def _field_line(name: str, field_type: str, pk: bool = False, uk: bool = False, fk: bool = False) -> str:
+    indicators = ""
+    if pk:
+        indicators += "🔑"
+    if uk:
+        indicators += "🔒"
+    if fk:
+        indicators += "⌘"
+    return f"\t{name}{indicators} {field_type}\n"
```
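With the new `_field_line` helper, primary keys, unique fields, and references get 🔑/🔒/⌘ markers, and a reference also produces an erDiagram relationship line. A small sketch of the output for a made-up `orders.customer_id` field referencing `customers.id`:

```python
def field_line(name: str, field_type: str, pk: bool = False, uk: bool = False, fk: bool = False) -> str:
    # Same indicator logic as the new _field_line helper
    indicators = ("🔑" if pk else "") + ("🔒" if uk else "") + ("⌘" if fk else "")
    return f"\t{name}{indicators} {field_type}\n"

print(field_line("customer_id", "text", fk=True), end="")
#   customer_id⌘ text
print('"**customers**" ||--o{ "**orders**" : id')
# relationship line appended to the diagram for the reference customers.id
```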
datacontract/export/rdf_converter.py CHANGED

```diff
@@ -57,8 +57,8 @@ def to_rdf(data_contract_spec: DataContractSpecification, base) -> Graph:
     else:
         g = Graph(base=Namespace(""))
 
-    dc = Namespace("https://datacontract.com/DataContractSpecification/1.2.
-    dcx = Namespace("https://datacontract.com/DataContractSpecification/1.2.
+    dc = Namespace("https://datacontract.com/DataContractSpecification/1.2.1/")
+    dcx = Namespace("https://datacontract.com/DataContractSpecification/1.2.1/Extension/")
 
     g.bind("dc", dc)
     g.bind("dcx", dcx)
```
datacontract/export/spark_converter.py CHANGED

```diff
@@ -1,3 +1,5 @@
+import json
+
 from pyspark.sql import types
 
 from datacontract.export.exporter import Exporter
@@ -104,7 +106,8 @@ def to_struct_field(field: Field, field_name: str) -> types.StructField:
         types.StructField: The corresponding Spark StructField.
     """
     data_type = to_spark_data_type(field)
-
+    metadata = to_spark_metadata(field)
+    return types.StructField(name=field_name, dataType=data_type, nullable=not field.required, metadata=metadata)
 
 
 def to_spark_data_type(field: Field) -> types.DataType:
@@ -152,7 +155,25 @@ def to_spark_data_type(field: Field) -> types.DataType:
         return types.DateType()
     if field_type == "bytes":
         return types.BinaryType()
-    return types.StringType()
+    return types.StringType()  # default if no condition is met
+
+
+def to_spark_metadata(field: Field) -> dict[str, str]:
+    """
+    Convert a field to a Spark metadata dictonary.
+
+    Args:
+        field (Field): The field to convert.
+
+    Returns:
+        dict: dictionary that can be supplied to Spark as metadata for a StructField
+    """
+
+    metadata = {}
+    if field.description:
+        metadata["comment"] = field.description
+
+    return metadata
 
 
 def print_schema(dtype: types.DataType) -> str:
@@ -192,7 +213,11 @@ def print_schema(dtype: types.DataType) -> str:
         name = f'"{column.name}"'
         data_type = indent(print_schema(column.dataType), 1)
         nullable = indent(f"{column.nullable}", 1)
-
+        if column.metadata:
+            metadata = indent(f"{json.dumps(column.metadata)}", 1)
+            return f"StructField({name},\n{data_type},\n{nullable},\n{metadata}\n)"
+        else:
+            return f"StructField({name},\n{data_type},\n{nullable}\n)"
 
     def format_struct_type(struct_type: types.StructType) -> str:
         """
```
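Field descriptions are now carried into the Spark schema as the `comment` metadata key on each StructField, and the printed schema renders that metadata as JSON. A minimal sketch of what gets attached; the field description below is invented:

```python
import json

description = "Natural key of the order, assigned by the shop system."  # hypothetical field description
metadata = {"comment": description} if description else {}

# This dict is what the exporter passes to pyspark's StructField(..., metadata=metadata);
# the printed schema then renders it via json.dumps:
print(json.dumps(metadata))
# {"comment": "Natural key of the order, assigned by the shop system."}
```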
datacontract/export/sql_type_converter.py CHANGED

```diff
@@ -133,8 +133,9 @@ def convert_to_dataframe(field: Field) -> None | str:
     if type.lower() in ["time"]:
         return "STRING"
     if type.lower() in ["number", "decimal", "numeric"]:
-
-
+        precision = field.precision if field.precision is not None else 38
+        scale = field.scale if field.scale is not None else 0
+        return f"DECIMAL({precision},{scale})"
     if type.lower() in ["float"]:
         return "FLOAT"
     if type.lower() in ["double"]:
@@ -182,8 +183,9 @@ def convert_to_databricks(field: Field) -> None | str:
     if type.lower() in ["time"]:
         return "STRING"
     if type.lower() in ["number", "decimal", "numeric"]:
-
-
+        precision = field.precision if field.precision is not None else 38
+        scale = field.scale if field.scale is not None else 0
+        return f"DECIMAL({precision},{scale})"
     if type.lower() in ["float"]:
         return "FLOAT"
     if type.lower() in ["double"]:
```