datacontract-cli 0.10.3__py3-none-any.whl → 0.10.5__py3-none-any.whl

This diff shows the changes between these publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Note: this version of datacontract-cli has been flagged as potentially problematic.

Files changed (41)
  1. datacontract/breaking/breaking.py +12 -0
  2. datacontract/breaking/breaking_rules.py +4 -0
  3. datacontract/catalog/catalog.py +2 -2
  4. datacontract/cli.py +42 -8
  5. datacontract/data_contract.py +84 -134
  6. datacontract/engines/soda/check_soda_execute.py +5 -0
  7. datacontract/engines/soda/connections/duckdb.py +1 -2
  8. datacontract/engines/soda/connections/sqlserver.py +43 -0
  9. datacontract/export/avro_converter.py +23 -2
  10. datacontract/export/bigquery_converter.py +107 -0
  11. datacontract/export/dbml_converter.py +118 -0
  12. datacontract/export/go_converter.py +98 -0
  13. datacontract/export/html_export.py +4 -2
  14. datacontract/export/jsonschema_converter.py +41 -2
  15. datacontract/export/rdf_converter.py +1 -2
  16. datacontract/export/sql_converter.py +1 -0
  17. datacontract/export/sql_type_converter.py +125 -4
  18. datacontract/imports/avro_importer.py +41 -14
  19. datacontract/imports/bigquery_importer.py +178 -0
  20. datacontract/imports/jsonschema_importer.py +148 -0
  21. datacontract/imports/sql_importer.py +2 -2
  22. datacontract/lint/resolve.py +1 -2
  23. datacontract/model/data_contract_specification.py +65 -1
  24. datacontract/publish/publish.py +32 -0
  25. datacontract/py.typed +0 -0
  26. datacontract/templates/datacontract.html +37 -346
  27. datacontract/templates/index.html +70 -5
  28. datacontract/templates/partials/datacontract_information.html +66 -0
  29. datacontract/templates/partials/datacontract_servicelevels.html +253 -0
  30. datacontract/templates/partials/datacontract_terms.html +44 -0
  31. datacontract/templates/partials/definition.html +99 -0
  32. datacontract/templates/partials/example.html +27 -0
  33. datacontract/templates/partials/model_field.html +97 -0
  34. datacontract/templates/partials/server.html +144 -0
  35. datacontract/templates/style/output.css +99 -13
  36. {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/METADATA +276 -139
  37. {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/RECORD +41 -26
  38. {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/LICENSE +0 -0
  39. {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/WHEEL +0 -0
  40. {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/entry_points.txt +0 -0
  41. {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,178 @@
+import json
+from typing import List
+
+from google.cloud import bigquery
+
+from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
+from datacontract.model.exceptions import DataContractException
+
+
+def import_bigquery_from_json(
+    data_contract_specification: DataContractSpecification, source: str
+) -> DataContractSpecification:
+    try:
+        with open(source, "r") as file:
+            bigquery_schema = json.loads(file.read())
+    except json.JSONDecodeError as e:
+        raise DataContractException(
+            type="schema",
+            name="Parse bigquery schema",
+            reason=f"Failed to parse bigquery schema from {source}",
+            engine="datacontract",
+            original_exception=e,
+        )
+    return convert_bigquery_schema(data_contract_specification, bigquery_schema)
+
+
+def import_bigquery_from_api(
+    data_contract_specification: DataContractSpecification,
+    bigquery_tables: List[str],
+    bigquery_project: str,
+    bigquery_dataset: str,
+) -> DataContractSpecification:
+    client = bigquery.Client(project=bigquery_project)
+
+    if bigquery_tables is None:
+        bigquery_tables = fetch_table_names(client, bigquery_dataset)
+
+    for table in bigquery_tables:
+        try:
+            api_table = client.get_table("{}.{}.{}".format(bigquery_project, bigquery_dataset, table))
+
+        except ValueError as e:
+            raise DataContractException(
+                type="schema",
+                result="failed",
+                name="Invalid table name for bigquery API",
+                reason=f"Tablename {table} is invalid for the bigquery API",
+                original_exception=e,
+                engine="datacontract",
+            )
+
+        if api_table is None:
+            raise DataContractException(
+                type="request",
+                result="failed",
+                name="Query BigQuery schema from API",
+                reason=f"Table {table} not found in BigQuery project {bigquery_project}, dataset {bigquery_dataset}.",
+                engine="datacontract",
+            )
+
+        convert_bigquery_schema(data_contract_specification, api_table.to_api_repr())
+
+    return data_contract_specification
+
+
+def fetch_table_names(client: bigquery.Client, dataset: str) -> List[str]:
+    table_names = []
+    api_tables = client.list_tables(dataset)
+    for api_table in api_tables:
+        table_names.append(api_table.table_id)
+
+    return table_names
+
+
+def convert_bigquery_schema(
+    data_contract_specification: DataContractSpecification, bigquery_schema: dict
+) -> DataContractSpecification:
+    if data_contract_specification.models is None:
+        data_contract_specification.models = {}
+
+    fields = import_table_fields(bigquery_schema.get("schema").get("fields"))
+
+    # Looking at actual export data, tableId seems to always be set while friendlyName
+    # often isn't, though it is unclear what exactly leads to friendlyName being set.
+    table_id = bigquery_schema.get("tableReference").get("tableId")
+
+    data_contract_specification.models[table_id] = Model(fields=fields, type="table")
+
+    # Copy the description, if it exists
+    if bigquery_schema.get("description") is not None:
+        data_contract_specification.models[table_id].description = bigquery_schema.get("description")
+
+    # Set the title from friendlyName if it exists
+    if bigquery_schema.get("friendlyName") is not None:
+        data_contract_specification.models[table_id].title = bigquery_schema.get("friendlyName")
+
+    return data_contract_specification
+
+
+def import_table_fields(table_fields):
+    imported_fields = {}
+    for field in table_fields:
+        field_name = field.get("name")
+        imported_fields[field_name] = Field()
+        imported_fields[field_name].required = field.get("mode") == "REQUIRED"
+        imported_fields[field_name].description = field.get("description")
+
+        if field.get("type") == "RECORD":
+            imported_fields[field_name].type = "object"
+            imported_fields[field_name].fields = import_table_fields(field.get("fields"))
+        elif field.get("type") == "STRUCT":
+            imported_fields[field_name].type = "struct"
+            imported_fields[field_name].fields = import_table_fields(field.get("fields"))
+        elif field.get("type") == "RANGE":
+            # This is a range of date/datetime/timestamp but multiple values
+            # So we map it to an array
+            imported_fields[field_name].type = "array"
+            imported_fields[field_name].items = Field(
+                type=map_type_from_bigquery(field["rangeElementType"].get("type"))
+            )
+        else:  # primitive type
+            imported_fields[field_name].type = map_type_from_bigquery(field.get("type"))
+
+        if field.get("type") == "STRING":
+            # in bigquery both string and bytes have maxLength but in the datacontracts
+            # spec it is only valid for strings
+            if field.get("maxLength") is not None:
+                imported_fields[field_name].maxLength = int(field.get("maxLength"))
+
+        if field.get("type") == "NUMERIC" or field.get("type") == "BIGNUMERIC":
+            if field.get("precision") is not None:
+                imported_fields[field_name].precision = int(field.get("precision"))
+
+            if field.get("scale") is not None:
+                imported_fields[field_name].scale = int(field.get("scale"))
+
+    return imported_fields
+
+
+def map_type_from_bigquery(bigquery_type_str: str):
+    if bigquery_type_str == "STRING":
+        return "string"
+    elif bigquery_type_str == "BYTES":
+        return "bytes"
+    elif bigquery_type_str == "INTEGER":
+        return "int"
+    elif bigquery_type_str == "INT64":
+        return "bigint"
+    elif bigquery_type_str == "FLOAT":
+        return "float"
+    elif bigquery_type_str == "FLOAT64":
+        return "double"
+    elif bigquery_type_str == "BOOLEAN" or bigquery_type_str == "BOOL":
+        return "boolean"
+    elif bigquery_type_str == "TIMESTAMP":
+        return "timestamp"
+    elif bigquery_type_str == "DATE":
+        return "date"
+    elif bigquery_type_str == "TIME":
+        return "timestamp_ntz"
+    elif bigquery_type_str == "DATETIME":
+        return "timestamp"
+    elif bigquery_type_str == "NUMERIC":
+        return "numeric"
+    elif bigquery_type_str == "BIGNUMERIC":
+        return "double"
+    elif bigquery_type_str == "GEOGRAPHY":
+        return "object"
+    elif bigquery_type_str == "JSON":
+        return "object"
+    else:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="Map bigquery type to data contract type",
+            reason=f"Unsupported type {bigquery_type_str} in bigquery json definition.",
+            engine="datacontract",
+        )
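
The new importer can also be exercised directly from Python. A minimal sketch, assuming a hand-written table resource in the shape convert_bigquery_schema expects; the sample dict and its values are hypothetical:

    from datacontract.imports.bigquery_importer import convert_bigquery_schema
    from datacontract.model.data_contract_specification import DataContractSpecification

    # Hypothetical table resource, mirroring what to_api_repr() returns.
    bigquery_schema = {
        "tableReference": {"tableId": "orders"},
        "description": "Order events",
        "schema": {
            "fields": [
                {"name": "order_id", "type": "STRING", "mode": "REQUIRED", "maxLength": "36"},
                {"name": "amount", "type": "NUMERIC", "precision": "10", "scale": "2"},
            ]
        },
    }

    spec = convert_bigquery_schema(DataContractSpecification(), bigquery_schema)
    print(spec.models["orders"].fields["order_id"].type)  # -> "string"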
@@ -0,0 +1,148 @@
+import json
+
+import fastjsonschema
+
+from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field, Definition
+from datacontract.model.exceptions import DataContractException
+
+
+def convert_json_schema_properties(properties, is_definition=False):
+    fields = {}
+    for field_name, field_schema in properties.items():
+        field_kwargs = {}
+        field_type = field_schema.get("type")
+
+        # Determine if the field is required and set the type to the non-null option if applicable
+        if isinstance(field_type, list) and "null" in field_type:
+            field_kwargs["required"] = False
+            non_null_types = [t for t in field_type if t != "null"]
+            if non_null_types:
+                field_type = non_null_types[0]
+            else:
+                field_type = None
+        else:
+            field_kwargs["required"] = True
+
+        # Set the non-null type
+        if field_type:
+            field_kwargs["type"] = field_type
+
+        for key, value in field_schema.items():
+            match key:
+                case "title":
+                    field_kwargs["title"] = value
+                case "type":
+                    pass  # type is already handled above
+                case "format":
+                    field_kwargs["format"] = value
+                case "description":
+                    field_kwargs["description"] = value
+                case "pattern":
+                    field_kwargs["pattern"] = value
+                case "minLength":
+                    field_kwargs["minLength"] = value
+                case "maxLength":
+                    field_kwargs["maxLength"] = value
+                case "minimum":
+                    field_kwargs["minimum"] = value
+                case "exclusiveMinimum":
+                    field_kwargs["exclusiveMinimum"] = value
+                case "maximum":
+                    field_kwargs["maximum"] = value
+                case "exclusiveMaximum":
+                    field_kwargs["exclusiveMaximum"] = value
+                case "enum":
+                    field_kwargs["enum"] = value
+                case "tags":
+                    field_kwargs["tags"] = value
+                case "properties":
+                    field_kwargs["fields"] = convert_json_schema_properties(value)
+                case "items":
+                    field_kwargs["items"] = convert_json_schema_properties(value)
+
+        field = Field(**field_kwargs)
+        fields[field_name] = field
+
+    return fields
+
+
+def import_jsonschema(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
+    if data_contract_specification.models is None:
+        data_contract_specification.models = {}
+
+    try:
+        with open(source, "r") as file:
+            json_schema = json.loads(file.read())
+            validator = fastjsonschema.compile({})
+            validator(json_schema)
+
+            model = Model(
+                description=json_schema.get("description"),
+                type=json_schema.get("type"),
+                title=json_schema.get("title"),
+                fields=convert_json_schema_properties(json_schema.get("properties", {})),
+            )
+            data_contract_specification.models[json_schema.get("title", "default_model")] = model
+
+            if "definitions" in json_schema:
+                for def_name, def_schema in json_schema["definitions"].items():
+                    definition_kwargs = {}
+
+                    for key, value in def_schema.items():
+                        match key:
+                            case "domain":
+                                definition_kwargs["domain"] = value
+                            case "title":
+                                definition_kwargs["title"] = value
+                            case "description":
+                                definition_kwargs["description"] = value
+                            case "type":
+                                definition_kwargs["type"] = value
+                            case "enum":
+                                definition_kwargs["enum"] = value
+                            case "format":
+                                definition_kwargs["format"] = value
+                            case "minLength":
+                                definition_kwargs["minLength"] = value
+                            case "maxLength":
+                                definition_kwargs["maxLength"] = value
+                            case "pattern":
+                                definition_kwargs["pattern"] = value
+                            case "minimum":
+                                definition_kwargs["minimum"] = value
+                            case "exclusiveMinimum":
+                                definition_kwargs["exclusiveMinimum"] = value
+                            case "maximum":
+                                definition_kwargs["maximum"] = value
+                            case "exclusiveMaximum":
+                                definition_kwargs["exclusiveMaximum"] = value
+                            case "pii":
+                                definition_kwargs["pii"] = value
+                            case "classification":
+                                definition_kwargs["classification"] = value
+                            case "tags":
+                                definition_kwargs["tags"] = value
+                            case "properties":
+                                definition_kwargs["fields"] = convert_json_schema_properties(value, is_definition=True)
+
+                    definition = Definition(name=def_name, **definition_kwargs)
+                    data_contract_specification.definitions[def_name] = definition
+
+    except fastjsonschema.JsonSchemaException as e:
+        raise DataContractException(
+            type="schema",
+            name="Parse json schema",
+            reason=f"Failed to parse json schema from {source}: {e}",
+            engine="datacontract",
+        )
+
+    except Exception as e:
+        raise DataContractException(
+            type="schema",
+            name="Parse json schema",
+            reason=f"Failed to parse json schema from {source}",
+            engine="datacontract",
+            original_exception=e,
+        )
+
+    return data_contract_specification
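
Note that the importer compiles an empty (accept-all) fastjsonschema validator before walking properties, so any well-formed JSON document passes validation. A minimal sketch of calling it, with a hypothetical JSON Schema written to a temporary file:

    import json
    import tempfile

    from datacontract.imports.jsonschema_importer import import_jsonschema
    from datacontract.model.data_contract_specification import DataContractSpecification

    # Hypothetical schema using only keywords the importer handles.
    schema = {
        "title": "Order",
        "type": "object",
        "properties": {
            "order_id": {"type": "string", "maxLength": 36},
            "amount": {"type": ["number", "null"], "minimum": 0},
        },
    }

    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump(schema, f)

    spec = import_jsonschema(DataContractSpecification(), f.name)
    print(spec.models["Order"].fields["amount"].required)  # -> False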
@@ -45,7 +45,7 @@ def map_type_from_sql(sql_type: str):
         return None
 
     sql_type_normed = sql_type.lower().strip()
-
+
     if sql_type_normed.startswith("varchar"):
         return "varchar"
     elif sql_type_normed.startswith("string"):
@@ -69,6 +69,6 @@ def map_type_from_sql(sql_type: str):
     elif sql_type_normed == "datetime2":
         return "timestamp_ntz"
     elif sql_type_normed == "datetimeoffset":
-        return "timestamp_tz"
+        return "timestamp_tz"
     else:
         return "variant"
@@ -8,8 +8,7 @@ from fastjsonschema import JsonSchemaValueException
 from datacontract.lint.files import read_file
 from datacontract.lint.schema import fetch_schema
 from datacontract.lint.urls import fetch_resource
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Definition, Quality
+from datacontract.model.data_contract_specification import DataContractSpecification, Definition, Quality
 from datacontract.model.exceptions import DataContractException
 
 
@@ -1,5 +1,5 @@
 import os
-from typing import List, Dict
+from typing import List, Dict, Optional, Any
 
 import pydantic as pyd
 import yaml
@@ -31,6 +31,7 @@ class Server(pyd.BaseModel):
     token: str = None  # Use ENV variable
     dataProductId: str = None
     outputPortId: str = None
+    driver: str = None
 
 
 class Terms(pyd.BaseModel):
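
The new driver field lines up with the SQL Server connection added in this release (datacontract/engines/soda/connections/sqlserver.py). A minimal sketch; the Server fields other than driver are assumed from the wider specification, and the driver string is a typical ODBC name rather than one confirmed by this diff:

    from datacontract.model.data_contract_specification import Server

    server = Server(
        type="sqlserver",
        host="localhost",
        port=1433,
        database="orders_db",
        driver="ODBC Driver 18 for SQL Server",  # assumed driver name
    )
    print(server.driver)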
@@ -58,6 +59,7 @@ class Definition(pyd.BaseModel):
     pii: bool = None
     classification: str = None
     tags: List[str] = []
+    example: str = None
 
 
 class Field(pyd.BaseModel):
@@ -84,12 +86,17 @@ class Field(pyd.BaseModel):
     tags: List[str] = []
     fields: Dict[str, "Field"] = {}
     items: "Field" = None
+    precision: int = None
+    scale: int = None
+    example: str = None
+    config: Dict[str, Any] = None
 
 
 class Model(pyd.BaseModel):
     description: str = None
     type: str = None
     namespace: str = None
+    title: str = None
     fields: Dict[str, Field] = {}
 
 
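A quick sketch of the extended Field and Model in use; the values and the bigqueryType config key are illustrative, not taken from this diff:

    from datacontract.model.data_contract_specification import Field, Model

    amount = Field(
        type="decimal",
        precision=10,
        scale=2,
        example="42.99",
        config={"bigqueryType": "NUMERIC"},  # hypothetical config key
    )
    model = Model(title="Orders", type="table", fields={"amount": amount})
    print(model.fields["amount"].precision)  # -> 10
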
@@ -114,6 +121,62 @@ class Quality(pyd.BaseModel):
     specification: str | object = None
 
 
+class Availability(pyd.BaseModel):
+    description: Optional[str] = None
+    percentage: Optional[str] = None
+
+
+class Retention(pyd.BaseModel):
+    description: Optional[str] = None
+    period: Optional[str] = None
+    unlimited: Optional[bool] = None
+    timestampField: Optional[str] = None
+
+
+class Latency(pyd.BaseModel):
+    description: Optional[str] = None
+    threshold: Optional[str] = None
+    sourceTimestampField: Optional[str] = None
+    processedTimestampField: Optional[str] = None
+
+
+class Freshness(pyd.BaseModel):
+    description: Optional[str] = None
+    threshold: Optional[str] = None
+    timestampField: Optional[str] = None
+
+
+class Frequency(pyd.BaseModel):
+    description: Optional[str] = None
+    type: Optional[str] = None
+    interval: Optional[str] = None
+    cron: Optional[str] = None
+
+
+class Support(pyd.BaseModel):
+    description: Optional[str] = None
+    time: Optional[str] = None
+    responseTime: Optional[str] = None
+
+
+class Backup(pyd.BaseModel):
+    description: Optional[str] = None
+    interval: Optional[str] = None
+    cron: Optional[str] = None
+    recoveryTime: Optional[str] = None
+    recoveryPoint: Optional[str] = None
+
+
+class ServiceLevel(pyd.BaseModel):
+    availability: Optional[Availability] = None
+    retention: Optional[Retention] = None
+    latency: Optional[Latency] = None
+    freshness: Optional[Freshness] = None
+    frequency: Optional[Frequency] = None
+    support: Optional[Support] = None
+    backup: Optional[Backup] = None
+
+
 class DataContractSpecification(pyd.BaseModel):
     dataContractSpecification: str = None
     id: str = None
@@ -125,6 +188,7 @@ class DataContractSpecification(pyd.BaseModel):
     # schema: Dict[str, str]
     examples: List[Example] = []
     quality: Quality = None
+    servicelevels: Optional[ServiceLevel] = None
 
     @classmethod
     def from_file(cls, file):
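
A minimal sketch of populating the new servicelevels block from code; all values are illustrative:

    from datacontract.model.data_contract_specification import (
        DataContractSpecification,
        Freshness,
        Retention,
        ServiceLevel,
    )

    spec = DataContractSpecification(
        id="orders-v1",
        servicelevels=ServiceLevel(
            retention=Retention(period="P1Y", timestampField="orders.created_at"),
            freshness=Freshness(threshold="25h", timestampField="orders.created_at"),
        ),
    )
    print(spec.servicelevels.retention.period)  # -> "P1Y"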
@@ -0,0 +1,32 @@
+import os
+
+import requests
+
+from datacontract.data_contract import DataContract
+
+
+def publish_to_datamesh_manager(data_contract: DataContract):
+    try:
+        headers = {"Content-Type": "application/json", "x-api-key": _require_datamesh_manager_api_key()}
+        spec = data_contract.get_data_contract_specification()
+        id = spec.id
+        url = "https://api.datamesh-manager.com/api/datacontracts/{0}".format(id)
+        request_body = spec.model_dump_json().encode("utf-8")
+        response = requests.put(
+            url=url,
+            data=request_body,
+            headers=headers,
+        )
+        if response.status_code != 200:
+            print(f"Error publishing data contract to Data Mesh Manager: {response.text}")
+            exit(1)
+        print(f"Published data contract to {url}")
+    except Exception as e:
+        print(f"Failed publishing data contract. Error: {str(e)}")
+
+
+def _require_datamesh_manager_api_key():
+    datamesh_manager_api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
+    if datamesh_manager_api_key is None:
+        raise Exception("Cannot publish data contract, as DATAMESH_MANAGER_API_KEY is not set")
+    return datamesh_manager_api_key
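
A minimal sketch of calling the new publish helper; it assumes DataContract accepts a data_contract_file argument as elsewhere in this package, and the API key and file path are placeholders:

    import os

    from datacontract.data_contract import DataContract
    from datacontract.publish.publish import publish_to_datamesh_manager

    os.environ["DATAMESH_MANAGER_API_KEY"] = "<your-api-key>"  # placeholder
    data_contract = DataContract(data_contract_file="datacontract.yaml")
    publish_to_datamesh_manager(data_contract)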
datacontract/py.typed ADDED
Empty file (a PEP 561 marker indicating the package now ships inline type hints).