datacontract-cli 0.10.23__py3-none-any.whl → 0.10.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacontract-cli might be problematic. Click here for more details.
- datacontract/__init__.py +13 -0
- datacontract/catalog/catalog.py +2 -2
- datacontract/data_contract.py +5 -3
- datacontract/engines/data_contract_test.py +13 -4
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +3 -2
- datacontract/engines/soda/check_soda_execute.py +16 -3
- datacontract/engines/soda/connections/duckdb_connection.py +61 -5
- datacontract/engines/soda/connections/kafka.py +3 -2
- datacontract/export/avro_converter.py +8 -1
- datacontract/export/bigquery_converter.py +1 -1
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/odcs_v3_exporter.py +6 -5
- datacontract/export/protobuf_converter.py +163 -69
- datacontract/imports/avro_importer.py +31 -6
- datacontract/imports/csv_importer.py +111 -57
- datacontract/imports/importer.py +1 -0
- datacontract/imports/importer_factory.py +5 -0
- datacontract/imports/odcs_v3_importer.py +48 -6
- datacontract/imports/protobuf_importer.py +266 -0
- datacontract/lint/resolve.py +23 -8
- datacontract/model/data_contract_specification.py +2 -2
- datacontract/model/run.py +3 -0
- datacontract/output/__init__.py +0 -0
- datacontract/templates/datacontract.html +2 -1
- datacontract/templates/index.html +2 -1
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.24.dist-info}/METADATA +276 -194
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.24.dist-info}/RECORD +31 -30
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.24.dist-info}/WHEEL +1 -1
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.24.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.24.dist-info/licenses}/LICENSE +0 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.24.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
import tempfile
|
|
4
|
+
|
|
5
|
+
from google.protobuf import descriptor_pb2
|
|
6
|
+
from grpc_tools import protoc
|
|
7
|
+
|
|
8
|
+
from datacontract.imports.importer import Importer
|
|
9
|
+
from datacontract.model.data_contract_specification import DataContractSpecification
|
|
10
|
+
from datacontract.model.exceptions import DataContractException
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def map_type_from_protobuf(field_type: int):
    """Map a protobuf FieldDescriptorProto numeric type to a data contract type.

    Any type number without an explicit mapping (e.g. group or message types)
    falls back to "string".
    """
    # Keys are the numeric TYPE_* constants from descriptor.proto, grouped by
    # the data contract type they map to.
    mapping = {}
    for proto_type_numbers, contract_type in (
        ((1,), "double"),
        ((2,), "float"),
        ((3, 4, 16, 18), "long"),  # int64, uint64, sfixed64, sint64
        ((5, 13, 15, 17), "integer"),  # int32, uint32, sfixed32, sint32
        ((6, 7, 9), "string"),  # fixed64, fixed32, string
        ((8,), "boolean"),
        ((12,), "bytes"),
    ):
        for number in proto_type_numbers:
            mapping[number] = contract_type
    return mapping.get(field_type, "string")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def parse_imports(proto_file: str) -> list:
    """
    Parse import statements from a .proto file and return a list of imported file paths.

    Handles the plain ``import "x.proto";`` form as well as the
    ``import public`` and ``import weak`` variants. Returned paths are joined
    onto the directory of *proto_file*.

    Raises:
        DataContractException: if the proto file cannot be read.
    """
    try:
        # Proto files are specified to be UTF-8; read them as such explicitly.
        with open(proto_file, "r", encoding="utf-8") as f:
            content = f.read()
    except Exception as e:
        raise DataContractException(
            type="file",
            name="Parse proto imports",
            reason=f"Failed to read proto file: {proto_file}",
            engine="datacontract",
            original_exception=e,
        )
    # Match `import "a.proto";` and the `import public` / `import weak` forms,
    # tolerating whitespace before the terminating semicolon.
    imported_files = re.findall(r'import\s+(?:public\s+|weak\s+)?"(.+?)"\s*;', content)
    proto_dir = os.path.dirname(proto_file)
    return [os.path.join(proto_dir, imp) for imp in imported_files]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def compile_proto_to_binary(proto_files: list, output_file: str):
    """
    Compile the provided proto files into a single descriptor set using grpc_tools.protoc.

    Args:
        proto_files: paths of all .proto files (main files plus their imports).
        output_file: path where the binary FileDescriptorSet is written.

    Raises:
        DataContractException: if protoc exits with a non-zero status.
    """
    # Sort the include directories so the protoc invocation is deterministic
    # (set iteration order is arbitrary between runs).
    proto_dirs = sorted({os.path.dirname(proto) for proto in proto_files})
    proto_paths = [f"--proto_path={d}" for d in proto_dirs]

    # The leading "" is the conventional argv[0] placeholder expected by protoc.main.
    args = [""] + proto_paths + [f"--descriptor_set_out={output_file}"] + proto_files
    ret = protoc.main(args)
    if ret != 0:
        raise DataContractException(
            type="schema",
            name="Compile proto files",
            reason=f"grpc_tools.protoc failed with exit code {ret}",
            engine="datacontract",
            original_exception=None,
        )
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def extract_enum_values_from_fds(fds: descriptor_pb2.FileDescriptorSet, enum_name: str) -> dict:
    """
    Look up an enum named *enum_name* anywhere in the FileDescriptorSet and
    return its values as a name -> number dict.

    Top-level enums of each file are checked before enums nested inside that
    file's messages; the first match wins. Returns an empty dict when no
    matching enum exists.
    """
    for file_descriptor in fds.file:
        # Collect candidates in search order: top-level enums first, then
        # enums declared inside this file's messages.
        candidates = list(file_descriptor.enum_type)
        for message in file_descriptor.message_type:
            candidates.extend(message.enum_type)
        for candidate in candidates:
            if candidate.name == enum_name:
                return {entry.name: entry.number for entry in candidate.value}
    return {}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def extract_message_fields_from_fds(fds: descriptor_pb2.FileDescriptorSet, message_name: str) -> dict:
    """
    Given a FileDescriptorSet and a message name, return a dict with its field definitions.
    This function recurses for nested messages and handles enums.

    Only top-level messages of each file are searched; the first message whose
    name matches wins. Returns an empty dict when no message matches.
    """
    for file_descriptor in fds.file:
        for msg in file_descriptor.message_type:
            if msg.name == message_name:
                fields = {}
                for field in msg.field:
                    if field.type == 11:  # TYPE_MESSAGE
                        # type_name is fully qualified (e.g. ".pkg.Msg"); keep
                        # only the last component for the recursive lookup.
                        nested_msg_name = field.type_name.split(".")[-1]
                        nested_fields = extract_message_fields_from_fds(fds, nested_msg_name)
                        if field.label == 3:  # repeated field
                            field_info = {
                                "description": f"List of {nested_msg_name}",
                                "type": "array",
                                "items": {"type": "object", "fields": nested_fields},
                            }
                        else:
                            field_info = {
                                "description": f"Nested object of {nested_msg_name}",
                                "type": "object",
                                "fields": nested_fields,
                            }
                    elif field.type == 14:  # TYPE_ENUM
                        enum_name = field.type_name.split(".")[-1]
                        enum_values = extract_enum_values_from_fds(fds, enum_name)
                        field_info = {
                            "description": f"Enum field {field.name}",
                            "type": "string",
                            "values": enum_values,
                            # label == 2 is LABEL_REQUIRED (proto2 syntax only).
                            "required": (field.label == 2),
                        }
                    else:
                        # Scalar types are mapped via map_type_from_protobuf.
                        field_info = {
                            "description": f"Field {field.name}",
                            "type": map_type_from_protobuf(field.type),
                            "required": (field.label == 2),
                        }
                    fields[field.name] = field_info
                return fields
    return {}
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _field_to_contract_field(fds, field) -> dict:
    """Translate one FieldDescriptorProto into a data contract field definition.

    Nested messages become objects (or arrays of objects when repeated),
    enums become strings carrying their allowed values, and scalar types are
    mapped via map_type_from_protobuf.
    """
    if field.type == 11:  # TYPE_MESSAGE
        # type_name is fully qualified (e.g. ".pkg.Msg"); keep the last component.
        nested_msg_name = field.type_name.split(".")[-1]
        nested_fields = extract_message_fields_from_fds(fds, nested_msg_name)
        if field.label == 3:  # LABEL_REPEATED
            return {
                "description": f"List of {nested_msg_name}",
                "type": "array",
                "items": {"type": "object", "fields": nested_fields},
            }
        return {
            "description": f"Nested object of {nested_msg_name}",
            "type": "object",
            "fields": nested_fields,
        }
    if field.type == 14:  # TYPE_ENUM
        enum_name = field.type_name.split(".")[-1]
        return {
            "description": f"Enum field {field.name}",
            "type": "string",
            "values": extract_enum_values_from_fds(fds, enum_name),
            # label == 2 is LABEL_REQUIRED (proto2 syntax only).
            "required": (field.label == 2),
        }
    return {
        "description": f"Field {field.name}",
        "type": map_type_from_protobuf(field.type),
        "required": (field.label == 2),
    }


def import_protobuf(
    data_contract_specification: DataContractSpecification, sources: list, import_args: dict = None
) -> DataContractSpecification:
    """
    Gather all proto files (including those imported, transitively), compile
    them into one descriptor set, then generate models with nested fields and
    enums resolved.

    Args:
        data_contract_specification: specification whose ``models`` are replaced.
        sources: paths of the main .proto files.
        import_args: optional extra arguments; currently unused, kept for the
            importer interface.

    Returns:
        The updated DataContractSpecification.

    Raises:
        DataContractException: if reading, compiling, or parsing the protos fails.
    """

    # --- Step 1: Gather all proto files (main and imported), breadth-first.
    # Duplicates are skipped and imports that do not exist on disk are ignored.
    proto_files_set = set()
    queue = list(sources)
    while queue:
        proto = queue.pop(0)
        if proto not in proto_files_set:
            proto_files_set.add(proto)
            for imp in parse_imports(proto):
                if os.path.exists(imp) and imp not in proto_files_set:
                    queue.append(imp)
    all_proto_files = list(proto_files_set)

    # --- Step 2: Compile all proto files into a single descriptor set.
    temp_descriptor = tempfile.NamedTemporaryFile(suffix=".pb", delete=False)
    descriptor_file = temp_descriptor.name
    temp_descriptor.close()  # Allow protoc to (re)open and write the file.
    try:
        compile_proto_to_binary(all_proto_files, descriptor_file)

        with open(descriptor_file, "rb") as f:
            proto_data = f.read()
        fds = descriptor_pb2.FileDescriptorSet()
        try:
            fds.ParseFromString(proto_data)
        except Exception as e:
            raise DataContractException(
                type="schema",
                name="Parse descriptor set",
                reason="Failed to parse descriptor set from compiled proto files",
                engine="datacontract",
                original_exception=e,
            )

        # --- Step 3: Build one model per top-level message of the *main*
        # protos. Imported files only contribute nested message and enum
        # definitions; they do not produce models themselves.
        all_models = {}
        source_proto_basenames = {os.path.basename(proto) for proto in sources}

        for file_descriptor in fds.file:
            if os.path.basename(file_descriptor.name) not in source_proto_basenames:
                continue

            for message in file_descriptor.message_type:
                fields = {field.name: _field_to_contract_field(fds, field) for field in message.field}
                all_models[message.name] = {
                    "description": f"Details of {message.name}.",
                    "type": "table",
                    "fields": fields,
                }

        data_contract_specification.models = all_models

        return data_contract_specification
    finally:
        # Clean up the temporary descriptor file.
        if os.path.exists(descriptor_file):
            os.remove(descriptor_file)
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
class ProtoBufImporter(Importer):
    """Importer that turns protobuf schema files into data contract models."""

    def __init__(self, name):
        # The importer factory supplies 'name' when registering this importer.
        self.name = name

    def import_source(
        self,
        data_contract_specification: DataContractSpecification,
        source: str,
        import_args: dict = None,
    ) -> DataContractSpecification:
        """
        Import a protobuf file (and everything it imports) into the given
        DataContractSpecification.

        Parameters:
        - data_contract_specification: the specification to update in place.
        - source: path to the protobuf file.
        - import_args: optional dictionary of additional arguments.

        Returns:
        The updated DataContractSpecification.
        """
        # import_protobuf operates on a list of sources, so wrap the single path.
        return import_protobuf(data_contract_specification, [source], import_args)
|
|
266
|
+
|
datacontract/lint/resolve.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import os
|
|
3
|
+
import warnings
|
|
3
4
|
|
|
4
5
|
import fastjsonschema
|
|
5
6
|
import yaml
|
|
@@ -16,6 +17,7 @@ from datacontract.model.data_contract_specification import (
|
|
|
16
17
|
)
|
|
17
18
|
from datacontract.model.exceptions import DataContractException
|
|
18
19
|
from datacontract.model.odcs import is_open_data_contract_standard
|
|
20
|
+
from datacontract.model.run import ResultEnum
|
|
19
21
|
|
|
20
22
|
|
|
21
23
|
def resolve_data_contract(
|
|
@@ -37,7 +39,7 @@ def resolve_data_contract(
|
|
|
37
39
|
else:
|
|
38
40
|
raise DataContractException(
|
|
39
41
|
type="lint",
|
|
40
|
-
result=
|
|
42
|
+
result=ResultEnum.failed,
|
|
41
43
|
name="Check that data contract YAML is valid",
|
|
42
44
|
reason="Data contract needs to be provided",
|
|
43
45
|
engine="datacontract",
|
|
@@ -58,7 +60,7 @@ def resolve_data_contract_dict(
|
|
|
58
60
|
else:
|
|
59
61
|
raise DataContractException(
|
|
60
62
|
type="lint",
|
|
61
|
-
result=
|
|
63
|
+
result=ResultEnum.failed,
|
|
62
64
|
name="Check that data contract YAML is valid",
|
|
63
65
|
reason="Data contract needs to be provided",
|
|
64
66
|
engine="datacontract",
|
|
@@ -152,7 +154,7 @@ def _resolve_definition_ref(ref, spec) -> Definition:
|
|
|
152
154
|
else:
|
|
153
155
|
raise DataContractException(
|
|
154
156
|
type="lint",
|
|
155
|
-
result=
|
|
157
|
+
result=ResultEnum.failed,
|
|
156
158
|
name="Check that data contract YAML is valid",
|
|
157
159
|
reason=f"Cannot resolve reference {ref}",
|
|
158
160
|
engine="datacontract",
|
|
@@ -165,7 +167,7 @@ def _find_by_path_in_spec(definition_path: str, spec: DataContractSpecification)
|
|
|
165
167
|
if definition_key not in spec.definitions:
|
|
166
168
|
raise DataContractException(
|
|
167
169
|
type="lint",
|
|
168
|
-
result=
|
|
170
|
+
result=ResultEnum.failed,
|
|
169
171
|
name="Check that data contract YAML is valid",
|
|
170
172
|
reason=f"Cannot resolve definition {definition_key}",
|
|
171
173
|
engine="datacontract",
|
|
@@ -195,7 +197,7 @@ def _fetch_file(path) -> str:
|
|
|
195
197
|
if not os.path.exists(path):
|
|
196
198
|
raise DataContractException(
|
|
197
199
|
type="export",
|
|
198
|
-
result=
|
|
200
|
+
result=ResultEnum.failed,
|
|
199
201
|
name="Check that data contract definition is valid",
|
|
200
202
|
reason=f"Cannot resolve reference {path}",
|
|
201
203
|
engine="datacontract",
|
|
@@ -230,7 +232,7 @@ def _get_quality_ref_file(quality_spec: str | object) -> str | object:
|
|
|
230
232
|
if not os.path.exists(ref):
|
|
231
233
|
raise DataContractException(
|
|
232
234
|
type="export",
|
|
233
|
-
result=
|
|
235
|
+
result=ResultEnum.failed,
|
|
234
236
|
name="Check that data contract quality is valid",
|
|
235
237
|
reason=f"Cannot resolve reference {ref}",
|
|
236
238
|
engine="datacontract",
|
|
@@ -259,8 +261,21 @@ def _resolve_data_contract_from_str(
|
|
|
259
261
|
|
|
260
262
|
if inline_definitions:
|
|
261
263
|
inline_definitions_into_data_contract(spec)
|
|
262
|
-
|
|
263
|
-
|
|
264
|
+
## Suppress DeprecationWarning when accessing spec.quality,
|
|
265
|
+
## if it is in fact *not* used.
|
|
266
|
+
with warnings.catch_warnings(record=True) as recorded_warnings:
|
|
267
|
+
spec_quality = spec.quality
|
|
268
|
+
for w in recorded_warnings:
|
|
269
|
+
if not issubclass(w.category, DeprecationWarning) or spec_quality is not None:
|
|
270
|
+
warnings.warn_explicit(
|
|
271
|
+
message=w.message,
|
|
272
|
+
category=w.category,
|
|
273
|
+
filename=w.filename,
|
|
274
|
+
lineno=w.lineno,
|
|
275
|
+
source=w.source,
|
|
276
|
+
)
|
|
277
|
+
if spec_quality and inline_quality:
|
|
278
|
+
_resolve_quality_ref(spec_quality)
|
|
264
279
|
|
|
265
280
|
return spec
|
|
266
281
|
|
|
@@ -320,8 +320,8 @@ class DataContractSpecification(pyd.BaseModel):
|
|
|
320
320
|
return DataContractSpecification(**data)
|
|
321
321
|
|
|
322
322
|
def to_yaml(self):
|
|
323
|
-
return yaml.
|
|
324
|
-
self.model_dump(exclude_defaults=True, exclude_none=True, by_alias=True),
|
|
323
|
+
return yaml.safe_dump(
|
|
324
|
+
self.model_dump(mode="json", exclude_defaults=True, exclude_none=True, by_alias=True),
|
|
325
325
|
sort_keys=False,
|
|
326
326
|
allow_unicode=True,
|
|
327
327
|
)
|
datacontract/model/run.py
CHANGED
|
@@ -89,6 +89,9 @@ class Run(BaseModel):
|
|
|
89
89
|
def pretty(self):
|
|
90
90
|
return self.model_dump_json(indent=2)
|
|
91
91
|
|
|
92
|
+
def pretty_logs(self) -> str:
|
|
93
|
+
return "\n".join(f"[{log.timestamp.isoformat()}] {log.level}: {log.message}" for log in self.logs)
|
|
94
|
+
|
|
92
95
|
@staticmethod
|
|
93
96
|
def create_run():
|
|
94
97
|
"""
|
|
File without changes
|
|
@@ -283,7 +283,8 @@
|
|
|
283
283
|
</div>
|
|
284
284
|
<div class="mt-8 md:order-1 md:mt-0">
|
|
285
285
|
<p class="text-center leading-5 text-gray-400">
|
|
286
|
-
Supported
|
|
286
|
+
Supported by <a href="https://datacontract-manager.com"
|
|
287
|
+
class="text-gray-400 hover:text-gray-500">Data Contract Manager</a>
|
|
287
288
|
</p>
|
|
288
289
|
</div>
|
|
289
290
|
</div>
|
|
@@ -190,7 +190,8 @@
|
|
|
190
190
|
</div>
|
|
191
191
|
<div class="mt-8 md:order-1 md:mt-0">
|
|
192
192
|
<p class="text-center leading-5 text-gray-400">
|
|
193
|
-
Supported
|
|
193
|
+
Supported by <a href="https://datacontract-manager.com"
|
|
194
|
+
class="text-gray-400 hover:text-gray-500">Data Contract Manager</a>
|
|
194
195
|
</p>
|
|
195
196
|
</div>
|
|
196
197
|
</div>
|