datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacontract/__init__.py +13 -0
- datacontract/api.py +260 -0
- datacontract/breaking/breaking.py +242 -12
- datacontract/breaking/breaking_rules.py +37 -1
- datacontract/catalog/catalog.py +80 -0
- datacontract/cli.py +387 -117
- datacontract/data_contract.py +216 -353
- datacontract/engines/data_contract_checks.py +1041 -0
- datacontract/engines/data_contract_test.py +113 -0
- datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
- datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
- datacontract/engines/soda/check_soda_execute.py +100 -56
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/bigquery.py +8 -1
- datacontract/engines/soda/connections/databricks.py +12 -3
- datacontract/engines/soda/connections/duckdb_connection.py +241 -0
- datacontract/engines/soda/connections/kafka.py +206 -113
- datacontract/engines/soda/connections/snowflake.py +8 -5
- datacontract/engines/soda/connections/sqlserver.py +43 -0
- datacontract/engines/soda/connections/trino.py +26 -0
- datacontract/export/avro_converter.py +72 -8
- datacontract/export/avro_idl_converter.py +31 -25
- datacontract/export/bigquery_converter.py +130 -0
- datacontract/export/custom_converter.py +40 -0
- datacontract/export/data_caterer_converter.py +161 -0
- datacontract/export/dbml_converter.py +148 -0
- datacontract/export/dbt_converter.py +141 -54
- datacontract/export/dcs_exporter.py +6 -0
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/excel_exporter.py +923 -0
- datacontract/export/exporter.py +100 -0
- datacontract/export/exporter_factory.py +216 -0
- datacontract/export/go_converter.py +105 -0
- datacontract/export/great_expectations_converter.py +257 -36
- datacontract/export/html_exporter.py +86 -0
- datacontract/export/iceberg_converter.py +188 -0
- datacontract/export/jsonschema_converter.py +71 -16
- datacontract/export/markdown_converter.py +337 -0
- datacontract/export/mermaid_exporter.py +110 -0
- datacontract/export/odcs_v3_exporter.py +375 -0
- datacontract/export/pandas_type_converter.py +40 -0
- datacontract/export/protobuf_converter.py +168 -68
- datacontract/export/pydantic_converter.py +6 -0
- datacontract/export/rdf_converter.py +13 -6
- datacontract/export/sodacl_converter.py +36 -188
- datacontract/export/spark_converter.py +245 -0
- datacontract/export/sql_converter.py +37 -3
- datacontract/export/sql_type_converter.py +269 -8
- datacontract/export/sqlalchemy_converter.py +170 -0
- datacontract/export/terraform_converter.py +7 -2
- datacontract/imports/avro_importer.py +246 -26
- datacontract/imports/bigquery_importer.py +221 -0
- datacontract/imports/csv_importer.py +143 -0
- datacontract/imports/dbml_importer.py +112 -0
- datacontract/imports/dbt_importer.py +240 -0
- datacontract/imports/excel_importer.py +1111 -0
- datacontract/imports/glue_importer.py +288 -0
- datacontract/imports/iceberg_importer.py +172 -0
- datacontract/imports/importer.py +51 -0
- datacontract/imports/importer_factory.py +128 -0
- datacontract/imports/json_importer.py +325 -0
- datacontract/imports/jsonschema_importer.py +146 -0
- datacontract/imports/odcs_importer.py +60 -0
- datacontract/imports/odcs_v3_importer.py +516 -0
- datacontract/imports/parquet_importer.py +81 -0
- datacontract/imports/protobuf_importer.py +264 -0
- datacontract/imports/spark_importer.py +262 -0
- datacontract/imports/sql_importer.py +274 -35
- datacontract/imports/unity_importer.py +219 -0
- datacontract/init/init_template.py +20 -0
- datacontract/integration/datamesh_manager.py +86 -0
- datacontract/lint/resolve.py +271 -49
- datacontract/lint/resources.py +21 -0
- datacontract/lint/schema.py +53 -17
- datacontract/lint/urls.py +32 -12
- datacontract/model/data_contract_specification/__init__.py +1 -0
- datacontract/model/exceptions.py +4 -1
- datacontract/model/odcs.py +24 -0
- datacontract/model/run.py +49 -29
- datacontract/output/__init__.py +0 -0
- datacontract/output/junit_test_results.py +135 -0
- datacontract/output/output_format.py +10 -0
- datacontract/output/test_results_writer.py +79 -0
- datacontract/py.typed +0 -0
- datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
- datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/templates/datacontract.html +139 -294
- datacontract/templates/datacontract_odcs.html +685 -0
- datacontract/templates/index.html +236 -0
- datacontract/templates/partials/datacontract_information.html +86 -0
- datacontract/templates/partials/datacontract_servicelevels.html +253 -0
- datacontract/templates/partials/datacontract_terms.html +51 -0
- datacontract/templates/partials/definition.html +25 -0
- datacontract/templates/partials/example.html +27 -0
- datacontract/templates/partials/model_field.html +144 -0
- datacontract/templates/partials/quality.html +49 -0
- datacontract/templates/partials/server.html +211 -0
- datacontract/templates/style/output.css +491 -72
- datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
- datacontract_cli-0.10.37.dist-info/RECORD +119 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
- datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
- datacontract/engines/soda/connections/dask.py +0 -28
- datacontract/engines/soda/connections/duckdb.py +0 -76
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/export/html_export.py +0 -66
- datacontract/export/odcs_converter.py +0 -102
- datacontract/init/download_datacontract_file.py +0 -17
- datacontract/integration/publish_datamesh_manager.py +0 -33
- datacontract/integration/publish_opentelemetry.py +0 -107
- datacontract/lint/lint.py +0 -141
- datacontract/lint/linters/description_linter.py +0 -34
- datacontract/lint/linters/example_model_linter.py +0 -91
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -38
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- datacontract/lint/linters/valid_constraints_linter.py +0 -99
- datacontract/model/data_contract_specification.py +0 -141
- datacontract/web.py +0 -14
- datacontract_cli-0.10.0.dist-info/METADATA +0 -951
- datacontract_cli-0.10.0.dist-info/RECORD +0 -66
- /datacontract/{model → breaking}/breaking_change.py +0 -0
- /datacontract/{lint/linters → export}/__init__.py +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
datacontract/engines/data_contract_test.py ADDED
@@ -0,0 +1,113 @@
+import atexit
+import os
+import tempfile
+import typing
+
+import requests
+from duckdb.duckdb import DuckDBPyConnection
+
+from datacontract.engines.data_contract_checks import create_checks
+
+if typing.TYPE_CHECKING:
+    from pyspark.sql import SparkSession
+
+from datacontract.engines.datacontract.check_that_datacontract_contains_valid_servers_configuration import (
+    check_that_datacontract_contains_valid_server_configuration,
+)
+from datacontract.engines.fastjsonschema.check_jsonschema import check_jsonschema
+from datacontract.engines.soda.check_soda_execute import check_soda_execute
+from datacontract.model.data_contract_specification import DataContractSpecification, Server
+from datacontract.model.exceptions import DataContractException
+from datacontract.model.run import ResultEnum, Run
+
+
+def execute_data_contract_test(
+    data_contract_specification: DataContractSpecification,
+    run: Run,
+    server_name: str = None,
+    spark: "SparkSession" = None,
+    duckdb_connection: DuckDBPyConnection = None,
+):
+    if data_contract_specification.models is None or len(data_contract_specification.models) == 0:
+        raise DataContractException(
+            type="lint",
+            name="Check that data contract contains models",
+            result=ResultEnum.warning,
+            reason="Models block is missing. Skip executing tests.",
+            engine="datacontract",
+        )
+    if (
+        server_name is None
+        and data_contract_specification.servers is not None
+        and len(data_contract_specification.servers) > 0
+    ):
+        server_name = list(data_contract_specification.servers.keys())[0]
+    server = get_server(data_contract_specification, server_name)
+    run.log_info(f"Running tests for data contract {data_contract_specification.id} with server {server_name}")
+    run.dataContractId = data_contract_specification.id
+    run.dataContractVersion = data_contract_specification.info.version
+    run.dataProductId = server.dataProductId
+    run.outputPortId = server.outputPortId
+    run.server = server_name
+
+    if server.type == "api":
+        server = process_api_response(run, server)
+
+    run.checks.extend(create_checks(data_contract_specification, server))
+
+    # TODO check server is supported type for nicer error messages
+    # TODO check server credentials are complete for nicer error messages
+    if server.format == "json" and server.type != "kafka":
+        check_jsonschema(run, data_contract_specification, server)
+    check_soda_execute(run, data_contract_specification, server, spark, duckdb_connection)
+
+
+def get_server(data_contract_specification: DataContractSpecification, server_name: str = None) -> Server | None:
+    """Get the server configuration from the data contract specification.
+
+    Args:
+        data_contract_specification: The data contract specification
+        server_name: Optional name of the server to use. If not provided, uses the first server.
+
+    Returns:
+        The selected server configuration
+    """
+
+    check_that_datacontract_contains_valid_server_configuration(data_contract_specification, server_name)
+
+    if server_name is not None:
+        server = data_contract_specification.servers.get(server_name)
+    else:
+        server_name = list(data_contract_specification.servers.keys())[0]
+        server = data_contract_specification.servers.get(server_name)
+    return server
+
+
+def process_api_response(run, server):
+    tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract_cli_api_")
+    atexit.register(tmp_dir.cleanup)
+    headers = {}
+    if os.getenv("DATACONTRACT_API_HEADER_AUTHORIZATION") is not None:
+        headers["Authorization"] = os.getenv("DATACONTRACT_API_HEADER_AUTHORIZATION")
+    try:
+        response = requests.get(server.location, headers=headers)
+        response.raise_for_status()
+    except requests.exceptions.RequestException as e:
+        raise DataContractException(
+            type="connection",
+            name="API server connection error",
+            result=ResultEnum.error,
+            reason=f"Failed to fetch API response from {server.location}: {e}",
+            engine="datacontract",
+        )
+    with open(f"{tmp_dir.name}/api_response.json", "w") as f:
+        f.write(response.text)
+    run.log_info(f"Saved API response to {tmp_dir.name}/api_response.json")
+    server = Server(
+        type="local",
+        format="json",
+        path=f"{tmp_dir.name}/api_response.json",
+        dataProductId=server.dataProductId,
+        outputPortId=server.outputPortId,
+    )
+    return server
datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py CHANGED
@@ -1,12 +1,11 @@
 from datacontract.model.data_contract_specification import DataContractSpecification
 from datacontract.model.exceptions import DataContractException
-from datacontract.model.run import Run
 
 
 def check_that_datacontract_contains_valid_server_configuration(
-
+    data_contract: DataContractSpecification, server_name: str | None
 ):
-    if data_contract.servers is None:
+    if data_contract.servers is None or len(data_contract.servers) == 0:
         raise DataContractException(
             type="lint",
             name="Check that data contract contains valid server configuration",
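The guard is stricter now: an empty servers mapping fails the same lint check as a missing one. A quick sketch of the behavior, constructing the specification directly (field names follow the pydantic model shipped in this package; exact constructor defaults are an assumption):

from datacontract.engines.datacontract.check_that_datacontract_contains_valid_servers_configuration import (
    check_that_datacontract_contains_valid_server_configuration,
)
from datacontract.model.data_contract_specification import DataContractSpecification
from datacontract.model.exceptions import DataContractException

contract = DataContractSpecification(id="orders", servers={})  # configured, but empty

try:
    check_that_datacontract_contains_valid_server_configuration(contract, None)
except DataContractException as e:
    print(e.reason)  # now raised for an empty mapping as well, not only for None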
datacontract/engines/fastjsonschema/check_jsonschema.py CHANGED
@@ -1,32 +1,115 @@
+import glob
 import json
 import logging
 import os
+import threading
+from typing import Any, Callable, Generator, List, Optional
 
 import fastjsonschema
+from fastjsonschema import JsonSchemaValueException
 
 from datacontract.engines.fastjsonschema.s3.s3_read_files import yield_s3_files
 from datacontract.export.jsonschema_converter import to_jsonschema
-from datacontract.model.data_contract_specification import
-    DataContractSpecification, Server
+from datacontract.model.data_contract_specification import DataContractSpecification, Server
 from datacontract.model.exceptions import DataContractException
-from datacontract.model.run import
+from datacontract.model.run import Check, ResultEnum, Run
 
+# Thread-safe cache for primaryKey fields.
+_primary_key_cache = {}
+_cache_lock = threading.Lock()
 
-
+
+def get_primary_key_field(schema: dict, model_name: str) -> Optional[str]:
+    # Check cache first.
+    with _cache_lock:
+        cached_value = _primary_key_cache.get(model_name)
+        if cached_value is not None:
+            return cached_value
+
+    # Find primaryKey field.
+    fields = schema.get("properties", {})
+    for field_name, attributes in fields.items():
+        if attributes.get("primaryKey", False):
+            # Cache the result before returning.
+            with _cache_lock:
+                _primary_key_cache[model_name] = field_name
+            return field_name
+
+    # Return None if no primary key was found.
+    return None
+
+
+def get_primary_key_value(schema: dict, model_name: str, json_object: dict) -> Optional[str]:
+    # Get the `primaryKey` field.
+    primary_key_field = get_primary_key_field(schema, model_name)
+    if not primary_key_field:
+        return None
+
+    # Return the value of the `primaryKey` field in the JSON object.
+    return json_object.get(primary_key_field)
+
+
+def process_exceptions(run, exceptions: List[DataContractException]):
+    if not exceptions:
+        return
+
+    # Define the maximum number of errors to process (can be adjusted by defining an ENV variable).
     try:
-
-
+        error_limit = int(os.getenv("DATACONTRACT_MAX_ERRORS", 500))
+    except ValueError:
+        # Fallback to default if environment variable is invalid.
+        error_limit = 500
+
+    # Calculate the effective limit to avoid index out of range
+    limit = min(len(exceptions), error_limit)
+
+    # Add all exceptions up to the limit - 1 to `run.checks`.
+    DEFAULT_ERROR_MESSAGE = "An error occurred during validation phase. See the logs for more details."
+    run.checks.extend(
+        [
+            Check(
+                type=exception.type,
+                name=exception.name,
+                result=exception.result,
+                reason=exception.reason,
+                model=exception.model,
+                engine=exception.engine,
+                message=exception.message or DEFAULT_ERROR_MESSAGE,
+            )
+            for exception in exceptions[: limit - 1]
+        ]
+    )
+
+    # Raise the last exception within the limit.
+    last_exception = exceptions[limit - 1]
+    raise last_exception
+
+
+def validate_json_stream(
+    schema: dict, model_name: str, validate: Callable, json_stream: Generator[Any, Any, None]
+) -> List[DataContractException]:
+    logging.info(f"Validating JSON stream for model: '{model_name}'.")
+    exceptions: List[DataContractException] = []
+    for json_obj in json_stream:
+        try:
             validate(json_obj)
-
-
-
-
-
-
-
-
-
-
+        except JsonSchemaValueException as e:
+            logging.warning(f"Validation failed for JSON object with type: '{model_name}'.")
+            primary_key_value = get_primary_key_value(schema, model_name, json_obj)
+            exceptions.append(
+                DataContractException(
+                    type="schema",
+                    name="Check that JSON has valid schema",
+                    result=ResultEnum.failed,
+                    reason=f"{f'#{primary_key_value}: ' if primary_key_value is not None else ''}{e.message}",
+                    model=model_name,
+                    engine="jsonschema",
+                    message=e.message,
+                )
+            )
+    if not exceptions:
+        logging.info(f"All JSON objects in the stream passed validation for model: '{model_name}'.")
+    return exceptions
 
 
 def read_json_lines(file):
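Two behavioral changes stand out in this hunk: schema violations are now collected per record instead of aborting on the first failure, capped by DATACONTRACT_MAX_ERRORS (default 500), and each failure reason is prefixed with the record's primary key when the model declares one. A small sketch of the primary-key helpers in isolation (the model name and schema are illustrative; the primaryKey marker in properties mirrors what the to_jsonschema output apparently carries, judging from this diff):

from datacontract.engines.fastjsonschema.check_jsonschema import (
    get_primary_key_field,
    get_primary_key_value,
)

schema = {
    "type": "object",
    "properties": {
        "order_id": {"type": "string", "primaryKey": True},
        "amount": {"type": "number"},
    },
}
record = {"order_id": "1001", "amount": 42.0}

# The first call scans the schema's properties; later calls for the same
# model name hit the thread-safe cache.
print(get_primary_key_field(schema, "orders"))          # order_id
print(get_primary_key_value(schema, "orders", record))  # 1001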
@@ -60,42 +143,64 @@ def read_json_file_content(file_content: str):
     yield json.loads(file_content)
 
 
-def process_json_file(run, model_name, validate, file, delimiter):
+def process_json_file(run, schema, model_name, validate, file, delimiter):
     if delimiter == "new_line":
         json_stream = read_json_lines(file)
     elif delimiter == "array":
         json_stream = read_json_array(file)
     else:
         json_stream = read_json_file(file)
-    validate_json_stream(model_name, validate, json_stream)
 
+    # Validate the JSON stream and collect exceptions.
+    exceptions = validate_json_stream(schema, model_name, validate, json_stream)
+
+    # Handle all errors from schema validation.
+    process_exceptions(run, exceptions)
 
-
+
+def process_local_file(run, server, schema, model_name, validate):
     path = server.path
+    if not path:
+        raise DataContractException(
+            type="schema",
+            name="Check that JSON has valid schema",
+            result=ResultEnum.warning,
+            reason="For server with type 'local', a 'path' must be defined.",
+            engine="datacontract",
+        )
     if "{model}" in path:
         path = path.format(model=model_name)
 
+    all_files = []
     if os.path.isdir(path):
-
+        # Fetch all JSONs in the directory
+        for root, _, files in os.walk(path):
+            for file in files:
+                if file.endswith(".json"):
+                    all_files.append(os.path.join(root, file))
     else:
-
-
-
+        # Use glob to fetch all JSONs
+        for file_path in glob.glob(path, recursive=True):
+            if os.path.isfile(file_path):
+                if file_path.endswith(".json"):
+                    all_files.append(file_path)
 
+    if not all_files:
+        raise DataContractException(
+            type="schema",
+            name="Check that JSON has valid schema",
+            result=ResultEnum.warning,
+            reason=f"No files found in '{path}'.",
+            engine="datacontract",
+        )
 
-
-
-
-
-        file_path = os.path.join(path, filename)
-        with open(file_path, "r") as file:
-            if not process_json_file(run, model_name, validate, file, server.delimiter):
-                success = False
-                break
-    return success
+    for file in all_files:
+        logging.info(f"Processing file: {file}")
+        with open(file, "r") as f:
+            process_json_file(run, schema, model_name, validate, f, server.delimiter)
 
 
-def process_s3_file(server, model_name, validate):
+def process_s3_file(run, server, schema, model_name, validate):
     s3_endpoint_url = server.endpointUrl
     s3_location = server.location
     if "{model}" in s3_location:
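process_local_file now resolves the server path in three steps: a {model} placeholder is substituted, a directory is walked recursively for .json files, and any other path is treated as a glob pattern; an empty result raises a warning instead of silently passing. The discovery logic in isolation (the paths are illustrative):

import glob
import os

def discover_json_files(path: str) -> list:
    all_files = []
    if os.path.isdir(path):
        # Directory: walk recursively and keep .json files.
        for root, _, files in os.walk(path):
            for file in files:
                if file.endswith(".json"):
                    all_files.append(os.path.join(root, file))
    else:
        # Anything else: treat the path as a glob pattern.
        for file_path in glob.glob(path, recursive=True):
            if os.path.isfile(file_path) and file_path.endswith(".json"):
                all_files.append(file_path)
    return all_files

print(discover_json_files("data/orders"))     # a directory
print(discover_json_files("data/**/*.json"))  # a glob pattern with **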
@@ -114,12 +219,16 @@ def process_s3_file(server, model_name, validate):
         raise DataContractException(
             type="schema",
             name="Check that JSON has valid schema",
-            result=
+            result=ResultEnum.warning,
             reason=f"Cannot find any file in {s3_location}",
             engine="datacontract",
         )
 
-
+    # Validate the JSON stream and collect exceptions.
+    exceptions = validate_json_stream(schema, model_name, validate, json_stream)
+
+    # Handle all errors from schema validation.
+    process_exceptions(run, exceptions)
 
 
 def check_jsonschema(run: Run, data_contract: DataContractSpecification, server: Server):
@@ -131,7 +240,7 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
             Check(
                 type="schema",
                 name="Check that JSON has valid schema",
-                result=
+                result=ResultEnum.warning,
                 reason="Server format is not 'json'. Skip validating jsonschema.",
                 engine="jsonschema",
             )
@@ -149,20 +258,45 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
         schema = to_jsonschema(model_name, model)
         run.log_info(f"jsonschema: {schema}")
 
-        validate = fastjsonschema.compile(
+        validate = fastjsonschema.compile(
+            schema,
+            formats={"uuid": r"^[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12}$"},
+        )
 
         # Process files based on server type
         if server.type == "local":
-            process_local_file(run, server, model_name, validate)
+            process_local_file(run, server, schema, model_name, validate)
         elif server.type == "s3":
-            process_s3_file(server, model_name, validate)
+            process_s3_file(run, server, schema, model_name, validate)
+        elif server.type == "gcs":
+            run.checks.append(
+                Check(
+                    type="schema",
+                    name="Check that JSON has valid schema",
+                    model=model_name,
+                    result=ResultEnum.info,
+                    reason="JSON Schema check skipped for GCS, as GCS is currently not supported",
+                    engine="jsonschema",
+                )
+            )
+        elif server.type == "azure":
+            run.checks.append(
+                Check(
+                    type="schema",
+                    name="Check that JSON has valid schema",
+                    model=model_name,
+                    result=ResultEnum.info,
+                    reason="JSON Schema check skipped for azure, as azure is currently not supported",
+                    engine="jsonschema",
+                )
+            )
         else:
             run.checks.append(
                 Check(
                     type="schema",
                     name="Check that JSON has valid schema",
                    model=model_name,
-                    result=
+                    result=ResultEnum.warning,
                     reason=f"Server type {server.type} not supported",
                     engine="jsonschema",
                 )
@@ -174,7 +308,7 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
                 type="schema",
                 name="Check that JSON has valid schema",
                 model=model_name,
-                result=
+                result=ResultEnum.passed,
                 reason="All JSON entries are valid.",
                 engine="jsonschema",
             )
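The compile call above now passes an explicit regex for the uuid format rather than relying on a built-in, since fastjsonschema only validates formats it knows about. The effect, standalone (the sample schema and values are illustrative):

import fastjsonschema

validate = fastjsonschema.compile(
    {"type": "string", "format": "uuid"},
    formats={"uuid": r"^[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12}$"},
)

validate("123e4567-e89b-12d3-a456-426614174000")  # passes
try:
    validate("not-a-uuid")
except fastjsonschema.JsonSchemaValueException as e:
    print(e.message)  # reports the failed uuid format check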
datacontract/engines/fastjsonschema/s3/s3_read_files.py CHANGED
@@ -1,7 +1,8 @@
 import logging
 import os
 
-import
+from datacontract.model.exceptions import DataContractException
+from datacontract.model.run import ResultEnum
 
 
 def yield_s3_files(s3_endpoint_url, s3_location):
@@ -14,11 +15,25 @@ def yield_s3_files(s3_endpoint_url, s3_location):
 
 
 def s3_fs(s3_endpoint_url):
+    try:
+        import s3fs
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result=ResultEnum.failed,
+            name="s3 extra missing",
+            reason="Install the extra s3 to use s3",
+            engine="datacontract",
+            original_exception=e,
+        )
+
     aws_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
     aws_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
+    aws_session_token = os.getenv("DATACONTRACT_S3_SESSION_TOKEN")
     return s3fs.S3FileSystem(
         key=aws_access_key_id,
         secret=aws_secret_access_key,
+        token=aws_session_token,
         anon=aws_access_key_id is None,
         client_kwargs={"endpoint_url": s3_endpoint_url},
     )
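s3_fs now imports s3fs lazily, failing with a clear message when the s3 extra is missing, and forwards an optional session token so temporary STS credentials work. A sketch of the same wiring standalone (the endpoint and bucket are placeholders; the environment variable names come from this diff):

import os
import s3fs  # requires the s3 extra, e.g. pip install 'datacontract-cli[s3]'

# These environment variables are the ones read by the connection code above.
key = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
secret = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
token = os.getenv("DATACONTRACT_S3_SESSION_TOKEN")  # optional, for STS sessions

fs = s3fs.S3FileSystem(
    key=key,
    secret=secret,
    token=token,
    anon=key is None,  # fall back to anonymous access without credentials
    client_kwargs={"endpoint_url": "http://localhost:9000"},  # e.g. a local MinIO
)
print(fs.ls("my-bucket"))  # assumes the bucket exists on that endpoint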