datacontract-cli 0.10.23__py3-none-any.whl → 0.10.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacontract/__init__.py +13 -0
- datacontract/api.py +12 -5
- datacontract/catalog/catalog.py +5 -3
- datacontract/cli.py +119 -13
- datacontract/data_contract.py +145 -67
- datacontract/engines/data_contract_checks.py +366 -60
- datacontract/engines/data_contract_test.py +50 -4
- datacontract/engines/fastjsonschema/check_jsonschema.py +37 -19
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +3 -2
- datacontract/engines/soda/check_soda_execute.py +27 -3
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/duckdb_connection.py +65 -6
- datacontract/engines/soda/connections/kafka.py +4 -2
- datacontract/engines/soda/connections/oracle.py +50 -0
- datacontract/export/avro_converter.py +20 -3
- datacontract/export/bigquery_converter.py +1 -1
- datacontract/export/dbt_converter.py +36 -7
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/excel_exporter.py +923 -0
- datacontract/export/exporter.py +3 -0
- datacontract/export/exporter_factory.py +17 -1
- datacontract/export/great_expectations_converter.py +55 -5
- datacontract/export/{html_export.py → html_exporter.py} +31 -20
- datacontract/export/markdown_converter.py +134 -5
- datacontract/export/mermaid_exporter.py +110 -0
- datacontract/export/odcs_v3_exporter.py +193 -149
- datacontract/export/protobuf_converter.py +163 -69
- datacontract/export/rdf_converter.py +2 -2
- datacontract/export/sodacl_converter.py +9 -1
- datacontract/export/spark_converter.py +31 -4
- datacontract/export/sql_converter.py +6 -2
- datacontract/export/sql_type_converter.py +124 -8
- datacontract/imports/avro_importer.py +63 -12
- datacontract/imports/csv_importer.py +111 -57
- datacontract/imports/excel_importer.py +1112 -0
- datacontract/imports/importer.py +16 -3
- datacontract/imports/importer_factory.py +17 -0
- datacontract/imports/json_importer.py +325 -0
- datacontract/imports/odcs_importer.py +2 -2
- datacontract/imports/odcs_v3_importer.py +367 -151
- datacontract/imports/protobuf_importer.py +264 -0
- datacontract/imports/spark_importer.py +117 -13
- datacontract/imports/sql_importer.py +32 -16
- datacontract/imports/unity_importer.py +84 -38
- datacontract/init/init_template.py +1 -1
- datacontract/integration/entropy_data.py +126 -0
- datacontract/lint/resolve.py +112 -23
- datacontract/lint/schema.py +24 -15
- datacontract/lint/urls.py +17 -3
- datacontract/model/data_contract_specification/__init__.py +1 -0
- datacontract/model/odcs.py +13 -0
- datacontract/model/run.py +3 -0
- datacontract/output/junit_test_results.py +3 -3
- datacontract/schemas/datacontract-1.1.0.init.yaml +1 -1
- datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/schemas/odcs-3.1.0.schema.json +2809 -0
- datacontract/templates/datacontract.html +54 -3
- datacontract/templates/datacontract_odcs.html +685 -0
- datacontract/templates/index.html +5 -2
- datacontract/templates/partials/server.html +2 -0
- datacontract/templates/style/output.css +319 -145
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/METADATA +711 -433
- datacontract_cli-0.10.40.dist-info/RECORD +121 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info/licenses}/LICENSE +1 -1
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/integration/datamesh_manager.py +0 -72
- datacontract/lint/lint.py +0 -142
- datacontract/lint/linters/description_linter.py +0 -35
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -48
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- datacontract/lint/linters/valid_constraints_linter.py +0 -100
- datacontract/model/data_contract_specification.py +0 -327
- datacontract_cli-0.10.23.dist-info/RECORD +0 -113
- /datacontract/{lint/linters → output}/__init__.py +0 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/top_level.txt +0 -0
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import os
|
|
3
|
-
from typing import List
|
|
3
|
+
from typing import List
|
|
4
4
|
|
|
5
5
|
from databricks.sdk import WorkspaceClient
|
|
6
6
|
from databricks.sdk.service.catalog import ColumnInfo, TableInfo
|
|
7
|
-
from
|
|
7
|
+
from open_data_contract_standard.model import OpenDataContractStandard
|
|
8
8
|
|
|
9
9
|
from datacontract.imports.importer import Importer
|
|
10
|
-
from datacontract.imports.
|
|
11
|
-
from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
|
|
10
|
+
from datacontract.imports.sql_importer import map_type_from_sql, to_physical_type_key
|
|
11
|
+
from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server
|
|
12
12
|
from datacontract.model.exceptions import DataContractException
|
|
13
13
|
|
|
14
14
|
|
|
@@ -18,8 +18,11 @@ class UnityImporter(Importer):
|
|
|
18
18
|
"""
|
|
19
19
|
|
|
20
20
|
def import_source(
|
|
21
|
-
self,
|
|
22
|
-
|
|
21
|
+
self,
|
|
22
|
+
data_contract_specification: DataContractSpecification | OpenDataContractStandard,
|
|
23
|
+
source: str,
|
|
24
|
+
import_args: dict,
|
|
25
|
+
) -> DataContractSpecification | OpenDataContractStandard:
|
|
23
26
|
"""
|
|
24
27
|
Import data contract specification from a source.
|
|
25
28
|
|
|
@@ -35,15 +38,14 @@ class UnityImporter(Importer):
|
|
|
35
38
|
if source is not None:
|
|
36
39
|
data_contract_specification = import_unity_from_json(data_contract_specification, source)
|
|
37
40
|
else:
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
)
|
|
41
|
+
unity_table_full_name_list = import_args.get("unity_table_full_name")
|
|
42
|
+
data_contract_specification = import_unity_from_api(data_contract_specification, unity_table_full_name_list)
|
|
41
43
|
return data_contract_specification
|
|
42
44
|
|
|
43
45
|
|
|
44
46
|
def import_unity_from_json(
|
|
45
|
-
data_contract_specification: DataContractSpecification, source: str
|
|
46
|
-
) -> DataContractSpecification:
|
|
47
|
+
data_contract_specification: DataContractSpecification | OpenDataContractStandard, source: str
|
|
48
|
+
) -> DataContractSpecification | OpenDataContractStandard:
|
|
47
49
|
"""
|
|
48
50
|
Import data contract specification from a JSON file.
|
|
49
51
|
|
|
@@ -71,39 +73,71 @@ def import_unity_from_json(
|
|
|
71
73
|
|
|
72
74
|
|
|
73
75
|
def import_unity_from_api(
    data_contract_specification: DataContractSpecification, unity_table_full_name_list: List[str] = None
) -> DataContractSpecification:
    """
    Import data contract specification from Unity Catalog API.

    The Databricks connection is configured through environment variables:
    either DATACONTRACT_DATABRICKS_PROFILE, or both
    DATACONTRACT_DATABRICKS_SERVER_HOSTNAME and DATACONTRACT_DATABRICKS_TOKEN.

    :param data_contract_specification: The data contract specification to be imported.
    :type data_contract_specification: DataContractSpecification
    :param unity_table_full_name_list: The full names of the Unity tables to import.
        ``None`` is treated as an empty list and a single string as a one-element list.
    :type unity_table_full_name_list: list[str]
    :return: The imported data contract specification.
    :rtype: DataContractSpecification
    :raises DataContractException: If the Databricks configuration is incomplete, the
        workspace client cannot be created, or a table cannot be retrieved.
    """
    profile = os.getenv("DATACONTRACT_DATABRICKS_PROFILE")
    host = os.getenv("DATACONTRACT_DATABRICKS_SERVER_HOSTNAME")
    token = os.getenv("DATACONTRACT_DATABRICKS_TOKEN")

    # Validate the configuration *outside* the try block below. Raising it inside
    # would let the generic handler re-wrap it and hide the actionable reason.
    reason = None
    if not profile and not host and not token:
        reason = "Either DATACONTRACT_DATABRICKS_PROFILE or both DATACONTRACT_DATABRICKS_SERVER_HOSTNAME and DATACONTRACT_DATABRICKS_TOKEN environment variables must be set"
    elif token and not host:
        reason = "DATACONTRACT_DATABRICKS_SERVER_HOSTNAME environment variable is not set"
    elif host and not token:
        reason = "DATACONTRACT_DATABRICKS_TOKEN environment variable is not set"
    if reason is not None:
        raise DataContractException(
            type="configuration",
            name="Databricks configuration",
            reason=reason,
            engine="datacontract",
        )

    try:
        workspace_client = WorkspaceClient(profile=profile) if profile else WorkspaceClient(host=host, token=token)
    except Exception as e:
        raise DataContractException(
            type="schema",
            name="Retrieve unity catalog schema",
            reason="Failed to connect to unity catalog schema",
            engine="datacontract",
            original_exception=e,
        ) from e

    # Normalise the argument: the default is None, and a bare string would
    # otherwise be iterated character by character.
    if unity_table_full_name_list is None:
        table_names = []
    elif isinstance(unity_table_full_name_list, str):
        table_names = [unity_table_full_name_list]
    else:
        table_names = unity_table_full_name_list

    for unity_table_full_name in table_names:
        try:
            unity_schema: TableInfo = workspace_client.tables.get(unity_table_full_name)
        except Exception as e:
            raise DataContractException(
                type="schema",
                name="Retrieve unity catalog schema",
                reason=f"Unity table {unity_table_full_name} not found",
                engine="datacontract",
                original_exception=e,
            ) from e
        data_contract_specification = convert_unity_schema(data_contract_specification, unity_schema)

    return data_contract_specification
|
|
102
136
|
|
|
103
137
|
|
|
104
138
|
def convert_unity_schema(
|
|
105
|
-
data_contract_specification: DataContractSpecification, unity_schema: TableInfo
|
|
106
|
-
) -> DataContractSpecification:
|
|
139
|
+
data_contract_specification: DataContractSpecification | OpenDataContractStandard, unity_schema: TableInfo
|
|
140
|
+
) -> DataContractSpecification | OpenDataContractStandard:
|
|
107
141
|
"""
|
|
108
142
|
Convert Unity schema to data contract specification.
|
|
109
143
|
|
|
@@ -117,6 +151,21 @@ def convert_unity_schema(
|
|
|
117
151
|
if data_contract_specification.models is None:
|
|
118
152
|
data_contract_specification.models = {}
|
|
119
153
|
|
|
154
|
+
if data_contract_specification.servers is None:
|
|
155
|
+
data_contract_specification.servers = {}
|
|
156
|
+
|
|
157
|
+
# Configure databricks server with catalog and schema from Unity table info
|
|
158
|
+
schema_name = unity_schema.schema_name
|
|
159
|
+
catalog_name = unity_schema.catalog_name
|
|
160
|
+
if catalog_name and schema_name:
|
|
161
|
+
server_name = "myserver" # Default server name
|
|
162
|
+
|
|
163
|
+
data_contract_specification.servers[server_name] = Server(
|
|
164
|
+
type="databricks",
|
|
165
|
+
catalog=catalog_name,
|
|
166
|
+
schema=schema_name,
|
|
167
|
+
)
|
|
168
|
+
|
|
120
169
|
fields = import_table_fields(unity_schema.columns)
|
|
121
170
|
|
|
122
171
|
table_id = unity_schema.name or unity_schema.table_id
|
|
@@ -149,25 +198,22 @@ def import_table_fields(columns: List[ColumnInfo]) -> dict[str, Field]:
|
|
|
149
198
|
imported_fields = {}
|
|
150
199
|
|
|
151
200
|
for column in columns:
|
|
152
|
-
|
|
153
|
-
imported_fields[column.name] = _field_from_struct_type(struct_field)
|
|
201
|
+
imported_fields[column.name] = _to_field(column)
|
|
154
202
|
|
|
155
203
|
return imported_fields
|
|
156
204
|
|
|
157
205
|
|
|
158
|
-
def
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
type_dict = json.loads(type_json)
|
|
173
|
-
return types.StructField.fromJson(type_dict)
|
|
206
|
+
def _to_field(column: ColumnInfo) -> Field:
    """
    Convert a Unity Catalog column description into a data contract field.

    The field type is derived from the column's SQL type text, which is also
    stored in the field config under the databricks physical-type key.

    :param column: The Unity Catalog column to convert.
    :return: The populated field.
    """
    field = Field()

    # Complex types (e.g. variant) can arrive with type_text but no type_name,
    # so type_text is the primary source of the SQL type.
    sql_type = column.type_text
    if sql_type is None and column.type_name is not None:
        # Fall back to the type name instead of mapping the literal string "None".
        # NOTE(review): type_name is presumably an enum; its .value is used when present.
        sql_type = getattr(column.type_name, "value", column.type_name)

    if sql_type is not None:
        sql_type = str(sql_type)
        field.type = map_type_from_sql(sql_type)
        field.config = {
            to_physical_type_key("databricks"): sql_type,
        }

    # A column with unknown nullability (None) is treated as required.
    field.required = column.nullable is None or not column.nullable
    field.description = column.comment if column.comment else None

    return field
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from urllib.parse import urlparse
|
|
3
|
+
|
|
4
|
+
import requests
|
|
5
|
+
|
|
6
|
+
from datacontract.model.run import Run
|
|
7
|
+
|
|
8
|
+
# used to retrieve the HTML location of the published data contract or test results
|
|
9
|
+
RESPONSE_HEADER_LOCATION_HTML = "location-html"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def publish_test_results_to_entropy_data(run: Run, publish_url: str, ssl_verification: bool):
    """
    POST the serialized test run to the publish endpoint.

    When ``publish_url`` is None the target is ``<host>/api/test-results`` with the
    host taken from ``_get_host()``; otherwise ``publish_url`` is used as given.
    Failures are never raised: they are recorded on ``run`` via ``run.log_error``.

    :param run: The test run to publish; its ``dataContractId`` must be set.
    :param publish_url: Explicit endpoint URL, or None to use the configured host.
    :param ssl_verification: Passed to ``requests`` as the ``verify`` argument.
    """
    try:
        if publish_url is None:
            # this url supports Data Mesh Manager and Data Contract Manager
            host = _get_host()
            url = f"{host}/api/test-results"
        else:
            host = url = publish_url

        api_key = _get_api_key()

        if run.dataContractId is None:
            raise Exception("Cannot publish run results for unknown data contract ID")

        response = requests.post(
            url,
            data=run.model_dump_json(),
            headers={"Content-Type": "application/json", "x-api-key": api_key},
            verify=ssl_verification,
        )

        if response.status_code != 200:
            # Only the hostname is shown, not the full URL.
            run.log_error(f"Error publishing test results to {_extract_hostname(host)}: {response.text}")
            return

        run.log_info("Published test results successfully")

        # The server may point to an HTML view of the published results.
        location_html = response.headers.get(RESPONSE_HEADER_LOCATION_HTML)
        if location_html:
            print(f"🚀 Open {location_html}")

    except Exception as e:
        run.log_error(f"Failed publishing test results. Error: {str(e)}")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def publish_data_contract_to_entropy_data(data_contract_dict: dict, ssl_verification: bool):
    """
    PUT the data contract to ``<host>/api/datacontracts/<id>``.

    The process exits with status 1 when the server answers with a status other
    than 200. Any other failure (missing API key, missing ``id`` key, network
    error) is printed and swallowed.

    :param data_contract_dict: The data contract as a dict; must contain the key ``"id"``.
    :param ssl_verification: Passed to ``requests`` as the ``verify`` argument.
    """
    # Local import: sys is not among this module's file-level imports.
    import sys

    try:
        api_key = _get_api_key()
        host = _get_host()
        headers = {"Content-Type": "application/json", "x-api-key": api_key}
        contract_id = data_contract_dict["id"]
        url = f"{host}/api/datacontracts/{contract_id}"
        response = requests.put(
            url=url,
            json=data_contract_dict,
            headers=headers,
            verify=ssl_verification,
        )
        if response.status_code != 200:
            display_host = _extract_hostname(host)
            print(f"Error publishing data contract to {display_host}: {response.text}")
            # sys.exit instead of the site-provided exit(), which is meant for the
            # interactive shell. SystemExit is not an Exception subclass, so it
            # passes the handler below untouched.
            sys.exit(1)

        print("✅ Published data contract successfully")

        # The server may point to an HTML view of the published contract.
        location_html = response.headers.get(RESPONSE_HEADER_LOCATION_HTML)
        if location_html is not None and len(location_html) > 0:
            print(f"🚀 Open {location_html}")

    except Exception as e:
        print(f"Failed publishing data contract. Error: {str(e)}")
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _get_api_key() -> str:
    """
    Get API key from environment variables with fallback priority:
    1. ENTROPY_DATA_API_KEY
    2. DATAMESH_MANAGER_API_KEY
    3. DATACONTRACT_MANAGER_API_KEY

    A variable that is set to the empty string is treated as not set, so it
    does not block the fallback to the next variable.

    :return: The first non-empty API key found.
    :raises Exception: If none of the variables holds a non-empty value.
    """
    for env_name in ("ENTROPY_DATA_API_KEY", "DATAMESH_MANAGER_API_KEY", "DATACONTRACT_MANAGER_API_KEY"):
        api_key = os.getenv(env_name)
        if api_key:
            return api_key
    raise Exception(
        "Cannot publish, as neither ENTROPY_DATA_API_KEY, DATAMESH_MANAGER_API_KEY, nor DATACONTRACT_MANAGER_API_KEY is set"
    )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _get_host() -> str:
    """
    Get host from environment variables with fallback priority:
    1. ENTROPY_DATA_HOST
    2. DATAMESH_MANAGER_HOST
    3. DATACONTRACT_MANAGER_HOST
    4. Default: https://api.entropy-data.com

    A variable that is set to the empty string is treated as not set; an empty
    host would otherwise produce a URL without scheme or hostname.

    :return: The first non-empty host found, or the default host.
    """
    for env_name in ("ENTROPY_DATA_HOST", "DATAMESH_MANAGER_HOST", "DATACONTRACT_MANAGER_HOST"):
        host = os.getenv(env_name)
        if host:
            return host
    return "https://api.entropy-data.com"
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _extract_hostname(url: str) -> str:
    """
    Extract the hostname (including subdomains and top-level domain) from a URL.

    Port and credentials (``user:password@``) are dropped, and IPv6 literals are
    returned without brackets. The hostname is lower-cased, as done by
    ``urllib.parse``. If no hostname can be found (e.g. the value has no
    ``scheme://`` prefix), the input is returned unchanged.

    Examples:
    - https://app.entropy-data.com/path -> app.entropy-data.com
    - http://api.example.com:8080/api -> api.example.com
    - https://user:secret@host.example.com/x -> host.example.com

    :param url: The URL to extract the hostname from.
    :return: The hostname, or ``url`` itself when it has none.
    """
    # .hostname handles userinfo, ports and bracketed IPv6 addresses, which a
    # plain netloc.split(":") gets wrong.
    hostname = urlparse(url).hostname
    return hostname if hostname else url
|
datacontract/lint/resolve.py
CHANGED
|
@@ -1,11 +1,15 @@
|
|
|
1
|
+
import importlib.resources as resources
|
|
1
2
|
import logging
|
|
2
3
|
import os
|
|
4
|
+
import warnings
|
|
5
|
+
from pathlib import Path
|
|
3
6
|
|
|
4
7
|
import fastjsonschema
|
|
5
8
|
import yaml
|
|
6
9
|
from fastjsonschema import JsonSchemaValueException
|
|
10
|
+
from open_data_contract_standard.model import OpenDataContractStandard
|
|
7
11
|
|
|
8
|
-
from datacontract.imports.odcs_v3_importer import
|
|
12
|
+
from datacontract.imports.odcs_v3_importer import import_from_odcs, parse_odcs_v3_from_str
|
|
9
13
|
from datacontract.lint.resources import read_resource
|
|
10
14
|
from datacontract.lint.schema import fetch_schema
|
|
11
15
|
from datacontract.lint.urls import fetch_resource
|
|
@@ -15,7 +19,8 @@ from datacontract.model.data_contract_specification import (
|
|
|
15
19
|
DeprecatedQuality,
|
|
16
20
|
)
|
|
17
21
|
from datacontract.model.exceptions import DataContractException
|
|
18
|
-
from datacontract.model.odcs import is_open_data_contract_standard
|
|
22
|
+
from datacontract.model.odcs import is_open_data_contract_standard, is_open_data_product_standard
|
|
23
|
+
from datacontract.model.run import ResultEnum
|
|
19
24
|
|
|
20
25
|
|
|
21
26
|
def resolve_data_contract(
|
|
@@ -37,7 +42,35 @@ def resolve_data_contract(
|
|
|
37
42
|
else:
|
|
38
43
|
raise DataContractException(
|
|
39
44
|
type="lint",
|
|
40
|
-
result=
|
|
45
|
+
result=ResultEnum.failed,
|
|
46
|
+
name="Check that data contract YAML is valid",
|
|
47
|
+
reason="Data contract needs to be provided",
|
|
48
|
+
engine="datacontract",
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def resolve_data_contract_v2(
|
|
53
|
+
data_contract_location: str = None,
|
|
54
|
+
data_contract_str: str = None,
|
|
55
|
+
data_contract: DataContractSpecification | OpenDataContractStandard = None,
|
|
56
|
+
schema_location: str = None,
|
|
57
|
+
inline_definitions: bool = False,
|
|
58
|
+
inline_quality: bool = False,
|
|
59
|
+
) -> DataContractSpecification | OpenDataContractStandard:
|
|
60
|
+
if data_contract_location is not None:
|
|
61
|
+
return resolve_data_contract_from_location_v2(
|
|
62
|
+
data_contract_location, schema_location, inline_definitions, inline_quality
|
|
63
|
+
)
|
|
64
|
+
elif data_contract_str is not None:
|
|
65
|
+
return _resolve_data_contract_from_str_v2(
|
|
66
|
+
data_contract_str, schema_location, inline_definitions, inline_quality
|
|
67
|
+
)
|
|
68
|
+
elif data_contract is not None:
|
|
69
|
+
return data_contract
|
|
70
|
+
else:
|
|
71
|
+
raise DataContractException(
|
|
72
|
+
type="lint",
|
|
73
|
+
result=ResultEnum.failed,
|
|
41
74
|
name="Check that data contract YAML is valid",
|
|
42
75
|
reason="Data contract needs to be provided",
|
|
43
76
|
engine="datacontract",
|
|
@@ -58,13 +91,20 @@ def resolve_data_contract_dict(
|
|
|
58
91
|
else:
|
|
59
92
|
raise DataContractException(
|
|
60
93
|
type="lint",
|
|
61
|
-
result=
|
|
94
|
+
result=ResultEnum.failed,
|
|
62
95
|
name="Check that data contract YAML is valid",
|
|
63
96
|
reason="Data contract needs to be provided",
|
|
64
97
|
engine="datacontract",
|
|
65
98
|
)
|
|
66
99
|
|
|
67
100
|
|
|
101
|
+
def resolve_data_contract_from_location_v2(
    location, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False
) -> DataContractSpecification | OpenDataContractStandard:
    """
    Read the data contract at ``location`` and resolve it from its text.

    The content is read with ``read_resource`` and handed, together with the
    remaining arguments, to ``_resolve_data_contract_from_str_v2``.
    """
    return _resolve_data_contract_from_str_v2(
        read_resource(location),
        schema_location,
        inline_definitions,
        inline_quality,
    )
|
|
106
|
+
|
|
107
|
+
|
|
68
108
|
def resolve_data_contract_from_location(
|
|
69
109
|
location, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False
|
|
70
110
|
) -> DataContractSpecification:
|
|
@@ -152,7 +192,7 @@ def _resolve_definition_ref(ref, spec) -> Definition:
|
|
|
152
192
|
else:
|
|
153
193
|
raise DataContractException(
|
|
154
194
|
type="lint",
|
|
155
|
-
result=
|
|
195
|
+
result=ResultEnum.failed,
|
|
156
196
|
name="Check that data contract YAML is valid",
|
|
157
197
|
reason=f"Cannot resolve reference {ref}",
|
|
158
198
|
engine="datacontract",
|
|
@@ -165,7 +205,7 @@ def _find_by_path_in_spec(definition_path: str, spec: DataContractSpecification)
|
|
|
165
205
|
if definition_key not in spec.definitions:
|
|
166
206
|
raise DataContractException(
|
|
167
207
|
type="lint",
|
|
168
|
-
result=
|
|
208
|
+
result=ResultEnum.failed,
|
|
169
209
|
name="Check that data contract YAML is valid",
|
|
170
210
|
reason=f"Cannot resolve definition {definition_key}",
|
|
171
211
|
engine="datacontract",
|
|
@@ -195,7 +235,7 @@ def _fetch_file(path) -> str:
|
|
|
195
235
|
if not os.path.exists(path):
|
|
196
236
|
raise DataContractException(
|
|
197
237
|
type="export",
|
|
198
|
-
result=
|
|
238
|
+
result=ResultEnum.failed,
|
|
199
239
|
name="Check that data contract definition is valid",
|
|
200
240
|
reason=f"Cannot resolve reference {path}",
|
|
201
241
|
engine="datacontract",
|
|
@@ -230,7 +270,7 @@ def _get_quality_ref_file(quality_spec: str | object) -> str | object:
|
|
|
230
270
|
if not os.path.exists(ref):
|
|
231
271
|
raise DataContractException(
|
|
232
272
|
type="export",
|
|
233
|
-
result=
|
|
273
|
+
result=ResultEnum.failed,
|
|
234
274
|
name="Check that data contract quality is valid",
|
|
235
275
|
reason=f"Cannot resolve reference {ref}",
|
|
236
276
|
engine="datacontract",
|
|
@@ -240,35 +280,83 @@ def _get_quality_ref_file(quality_spec: str | object) -> str | object:
|
|
|
240
280
|
return quality_spec
|
|
241
281
|
|
|
242
282
|
|
|
283
|
+
def _resolve_data_contract_from_str_v2(
    data_contract_str, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False
) -> DataContractSpecification | OpenDataContractStandard:
    """
    Resolve a data contract from its YAML text, keeping its native format.

    - An ODPS document is rejected with a DataContractException.
    - An ODCS document is parsed and returned as an OpenDataContractStandard.
    - Anything else is resolved as a DataContractSpecification.
    """
    parsed = _to_yaml(data_contract_str)

    if is_open_data_product_standard(parsed):
        logging.info("Cannot import ODPS, as not supported")
        raise DataContractException(
            type="schema",
            result=ResultEnum.failed,
            name="Parse ODCS contract",
            reason="Cannot parse ODPS product",
            engine="datacontract",
        )

    if not is_open_data_contract_standard(parsed):
        logging.info("Importing DCS")
        return _resolve_dcs_from_yaml_dict(inline_definitions, inline_quality, schema_location, parsed)

    # ODCS is parsed from the original text and returned as-is, without conversion.
    logging.info("Importing ODCS v3")
    return parse_odcs_v3_from_str(data_contract_str)
|
|
306
|
+
|
|
307
|
+
|
|
243
308
|
def _resolve_data_contract_from_str(
    data_contract_str, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False
) -> DataContractSpecification:
    """
    Resolve a data contract from its YAML text into a DataContractSpecification.

    The parsed document is validated against ``schema_location``. When no schema
    is given and the document is ODCS, the bundled ODCS 3.1.0 schema is used.
    An ODCS document is then parsed and converted into a DataContractSpecification;
    anything else is resolved as a DataContractSpecification directly.
    """
    contract_dict = _to_yaml(data_contract_str)

    if schema_location is None and is_open_data_contract_standard(contract_dict):
        logging.info("Using ODCS 3.1.0 schema to validate data contract")
        # TODO refactor this to a specific function
        schema_location = resources.files("datacontract").joinpath("schemas", "odcs-3.1.0.schema.json")

    _validate_json_schema(contract_dict, schema_location)

    if not is_open_data_contract_standard(contract_dict):
        logging.info("Importing DCS")
        return _resolve_dcs_from_yaml_dict(inline_definitions, inline_quality, schema_location, contract_dict)

    logging.info("Importing ODCS v3")
    # ODCS: parse the original text, then convert into a fresh DataContractSpecification
    odcs = parse_odcs_v3_from_str(data_contract_str)
    target = DataContractSpecification(dataContractSpecification="1.2.1")
    return import_from_odcs(target, odcs)
|
|
255
331
|
|
|
256
|
-
|
|
332
|
+
|
|
333
|
+
def _resolve_dcs_from_yaml_dict(inline_definitions, inline_quality, schema_location, yaml_dict):
    """
    Validate ``yaml_dict`` and build a DataContractSpecification from it.

    The dict is validated against the schema at ``schema_location`` first.
    Definitions are inlined when ``inline_definitions`` is set, and the quality
    reference is resolved when ``inline_quality`` is set and a quality section exists.
    """
    _validate_json_schema(yaml_dict, schema_location)
    spec = DataContractSpecification(**yaml_dict)

    if inline_definitions:
        inline_definitions_into_data_contract(spec)

    # Suppress the DeprecationWarning raised when accessing spec.quality,
    # iff it is in fact *not* used.
    with warnings.catch_warnings(record=True) as recorded_warnings:
        spec_quality = spec.quality

    # Re-emit what was captured, except deprecation notices for an unused quality section.
    for caught in recorded_warnings:
        if issubclass(caught.category, DeprecationWarning) and spec_quality is None:
            continue
        warnings.warn_explicit(
            message=caught.message,
            category=caught.category,
            filename=caught.filename,
            lineno=caught.lineno,
            source=caught.source,
        )

    if spec_quality and inline_quality:
        _resolve_quality_ref(spec_quality)

    return spec
|
|
266
355
|
|
|
267
356
|
|
|
268
357
|
def _to_yaml(data_contract_str) -> dict:
|
|
269
358
|
try:
|
|
270
|
-
|
|
271
|
-
return yaml_dict
|
|
359
|
+
return yaml.safe_load(data_contract_str)
|
|
272
360
|
except Exception as e:
|
|
273
361
|
logging.warning(f"Cannot parse YAML. Error: {str(e)}")
|
|
274
362
|
raise DataContractException(
|
|
@@ -280,16 +368,17 @@ def _to_yaml(data_contract_str) -> dict:
|
|
|
280
368
|
)
|
|
281
369
|
|
|
282
370
|
|
|
283
|
-
def
|
|
371
|
+
def _validate_json_schema(yaml_str, schema_location: str | Path = None):
|
|
372
|
+
logging.debug(f"Linting data contract with schema at {schema_location}")
|
|
284
373
|
schema = fetch_schema(schema_location)
|
|
285
374
|
try:
|
|
286
|
-
fastjsonschema.validate(schema,
|
|
375
|
+
fastjsonschema.validate(schema, yaml_str, use_default=False)
|
|
287
376
|
logging.debug("YAML data is valid.")
|
|
288
377
|
except JsonSchemaValueException as e:
|
|
289
378
|
logging.warning(f"Data Contract YAML is invalid. Validation error: {e.message}")
|
|
290
379
|
raise DataContractException(
|
|
291
380
|
type="lint",
|
|
292
|
-
result=
|
|
381
|
+
result=ResultEnum.failed,
|
|
293
382
|
name="Check that data contract YAML is valid",
|
|
294
383
|
reason=e.message,
|
|
295
384
|
engine="datacontract",
|
|
@@ -298,7 +387,7 @@ def _validate_data_contract_specification_schema(data_contract_yaml, schema_loca
|
|
|
298
387
|
logging.warning(f"Data Contract YAML is invalid. Validation error: {str(e)}")
|
|
299
388
|
raise DataContractException(
|
|
300
389
|
type="lint",
|
|
301
|
-
result=
|
|
390
|
+
result=ResultEnum.failed,
|
|
302
391
|
name="Check that data contract YAML is valid",
|
|
303
392
|
reason=str(e),
|
|
304
393
|
engine="datacontract",
|
datacontract/lint/schema.py
CHANGED
|
@@ -2,16 +2,18 @@ import importlib.resources as resources
|
|
|
2
2
|
import json
|
|
3
3
|
import logging
|
|
4
4
|
import os
|
|
5
|
+
from pathlib import Path
|
|
5
6
|
from typing import Any, Dict
|
|
6
7
|
|
|
7
8
|
import requests
|
|
8
9
|
|
|
9
10
|
from datacontract.model.exceptions import DataContractException
|
|
11
|
+
from datacontract.model.run import ResultEnum
|
|
10
12
|
|
|
11
|
-
DEFAULT_DATA_CONTRACT_SCHEMA = "datacontract-1.1.
|
|
13
|
+
DEFAULT_DATA_CONTRACT_SCHEMA = "datacontract-1.2.1.schema.json"
|
|
12
14
|
|
|
13
15
|
|
|
14
|
-
def fetch_schema(location: str = None) -> Dict[str, Any]:
|
|
16
|
+
def fetch_schema(location: str | Path = None) -> Dict[str, Any]:
|
|
15
17
|
"""
|
|
16
18
|
Fetch and return a JSON schema from a given location.
|
|
17
19
|
|
|
@@ -36,19 +38,26 @@ def fetch_schema(location: str = None) -> Dict[str, Any]:
|
|
|
36
38
|
schema_file = schemas.joinpath("schemas", DEFAULT_DATA_CONTRACT_SCHEMA)
|
|
37
39
|
with schema_file.open("r") as file:
|
|
38
40
|
schema = json.load(file)
|
|
39
|
-
elif location.startswith("http://") or location.startswith("https://"):
|
|
40
|
-
response = requests.get(location)
|
|
41
|
-
schema = response.json()
|
|
42
41
|
else:
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
42
|
+
# Convert Path objects to strings for string operations
|
|
43
|
+
location_str = str(location)
|
|
44
|
+
|
|
45
|
+
if location_str.startswith("http://") or location_str.startswith("https://"):
|
|
46
|
+
logging.debug(f"Downloading schema from {location_str}")
|
|
47
|
+
response = requests.get(location_str)
|
|
48
|
+
schema = response.json()
|
|
49
|
+
else:
|
|
50
|
+
if not os.path.exists(location):
|
|
51
|
+
raise DataContractException(
|
|
52
|
+
type="lint",
|
|
53
|
+
name=f"Reading schema from {location}",
|
|
54
|
+
reason=f"The file '{location}' does not exist.",
|
|
55
|
+
engine="datacontract",
|
|
56
|
+
result=ResultEnum.error,
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
logging.debug(f"Loading JSON schema locally at {location}")
|
|
60
|
+
with open(location, "r") as file:
|
|
61
|
+
schema = json.load(file)
|
|
53
62
|
|
|
54
63
|
return schema
|
datacontract/lint/urls.py
CHANGED
|
@@ -28,10 +28,22 @@ def fetch_resource(url: str):
|
|
|
28
28
|
def _set_api_key(headers, url):
|
|
29
29
|
hostname = urlparse(url).hostname
|
|
30
30
|
|
|
31
|
+
entropy_data_api_key = os.getenv("ENTROPY_DATA_API_KEY")
|
|
31
32
|
datamesh_manager_api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
|
|
32
33
|
datacontract_manager_api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
|
|
33
34
|
|
|
34
|
-
if hostname == "
|
|
35
|
+
if hostname == "entropy-data.com" or hostname.endswith(".entropy-data.com"):
|
|
36
|
+
if entropy_data_api_key is None or entropy_data_api_key == "":
|
|
37
|
+
print("Error: Entropy Data API key is not set. Set env variable ENTROPY_DATA_API_KEY.")
|
|
38
|
+
raise DataContractException(
|
|
39
|
+
type="lint",
|
|
40
|
+
name=f"Reading data contract from {url}",
|
|
41
|
+
reason="Error: Entropy Data API key is not set. Set env variable ENTROPY_DATA_API_KEY.",
|
|
42
|
+
engine="datacontract",
|
|
43
|
+
result="error",
|
|
44
|
+
)
|
|
45
|
+
headers["x-api-key"] = entropy_data_api_key
|
|
46
|
+
elif hostname == "datamesh-manager.com" or hostname.endswith(".datamesh-manager.com"):
|
|
35
47
|
if datamesh_manager_api_key is None or datamesh_manager_api_key == "":
|
|
36
48
|
print("Error: Data Mesh Manager API key is not set. Set env variable DATAMESH_MANAGER_API_KEY.")
|
|
37
49
|
raise DataContractException(
|
|
@@ -54,7 +66,9 @@ def _set_api_key(headers, url):
|
|
|
54
66
|
)
|
|
55
67
|
headers["x-api-key"] = datacontract_manager_api_key
|
|
56
68
|
|
|
57
|
-
if datamesh_manager_api_key is not None and datamesh_manager_api_key != "":
|
|
58
|
-
headers["x-api-key"] = datamesh_manager_api_key
|
|
59
69
|
if datacontract_manager_api_key is not None and datacontract_manager_api_key != "":
|
|
60
70
|
headers["x-api-key"] = datacontract_manager_api_key
|
|
71
|
+
if datamesh_manager_api_key is not None and datamesh_manager_api_key != "":
|
|
72
|
+
headers["x-api-key"] = datamesh_manager_api_key
|
|
73
|
+
if entropy_data_api_key is not None and entropy_data_api_key != "":
|
|
74
|
+
headers["x-api-key"] = entropy_data_api_key
|