datacontract-cli 0.10.23__py3-none-any.whl → 0.10.40__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (84)
  1. datacontract/__init__.py +13 -0
  2. datacontract/api.py +12 -5
  3. datacontract/catalog/catalog.py +5 -3
  4. datacontract/cli.py +119 -13
  5. datacontract/data_contract.py +145 -67
  6. datacontract/engines/data_contract_checks.py +366 -60
  7. datacontract/engines/data_contract_test.py +50 -4
  8. datacontract/engines/fastjsonschema/check_jsonschema.py +37 -19
  9. datacontract/engines/fastjsonschema/s3/s3_read_files.py +3 -2
  10. datacontract/engines/soda/check_soda_execute.py +27 -3
  11. datacontract/engines/soda/connections/athena.py +79 -0
  12. datacontract/engines/soda/connections/duckdb_connection.py +65 -6
  13. datacontract/engines/soda/connections/kafka.py +4 -2
  14. datacontract/engines/soda/connections/oracle.py +50 -0
  15. datacontract/export/avro_converter.py +20 -3
  16. datacontract/export/bigquery_converter.py +1 -1
  17. datacontract/export/dbt_converter.py +36 -7
  18. datacontract/export/dqx_converter.py +126 -0
  19. datacontract/export/duckdb_type_converter.py +57 -0
  20. datacontract/export/excel_exporter.py +923 -0
  21. datacontract/export/exporter.py +3 -0
  22. datacontract/export/exporter_factory.py +17 -1
  23. datacontract/export/great_expectations_converter.py +55 -5
  24. datacontract/export/{html_export.py → html_exporter.py} +31 -20
  25. datacontract/export/markdown_converter.py +134 -5
  26. datacontract/export/mermaid_exporter.py +110 -0
  27. datacontract/export/odcs_v3_exporter.py +193 -149
  28. datacontract/export/protobuf_converter.py +163 -69
  29. datacontract/export/rdf_converter.py +2 -2
  30. datacontract/export/sodacl_converter.py +9 -1
  31. datacontract/export/spark_converter.py +31 -4
  32. datacontract/export/sql_converter.py +6 -2
  33. datacontract/export/sql_type_converter.py +124 -8
  34. datacontract/imports/avro_importer.py +63 -12
  35. datacontract/imports/csv_importer.py +111 -57
  36. datacontract/imports/excel_importer.py +1112 -0
  37. datacontract/imports/importer.py +16 -3
  38. datacontract/imports/importer_factory.py +17 -0
  39. datacontract/imports/json_importer.py +325 -0
  40. datacontract/imports/odcs_importer.py +2 -2
  41. datacontract/imports/odcs_v3_importer.py +367 -151
  42. datacontract/imports/protobuf_importer.py +264 -0
  43. datacontract/imports/spark_importer.py +117 -13
  44. datacontract/imports/sql_importer.py +32 -16
  45. datacontract/imports/unity_importer.py +84 -38
  46. datacontract/init/init_template.py +1 -1
  47. datacontract/integration/entropy_data.py +126 -0
  48. datacontract/lint/resolve.py +112 -23
  49. datacontract/lint/schema.py +24 -15
  50. datacontract/lint/urls.py +17 -3
  51. datacontract/model/data_contract_specification/__init__.py +1 -0
  52. datacontract/model/odcs.py +13 -0
  53. datacontract/model/run.py +3 -0
  54. datacontract/output/junit_test_results.py +3 -3
  55. datacontract/schemas/datacontract-1.1.0.init.yaml +1 -1
  56. datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
  57. datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
  58. datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
  59. datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
  60. datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
  61. datacontract/schemas/odcs-3.1.0.schema.json +2809 -0
  62. datacontract/templates/datacontract.html +54 -3
  63. datacontract/templates/datacontract_odcs.html +685 -0
  64. datacontract/templates/index.html +5 -2
  65. datacontract/templates/partials/server.html +2 -0
  66. datacontract/templates/style/output.css +319 -145
  67. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/METADATA +711 -433
  68. datacontract_cli-0.10.40.dist-info/RECORD +121 -0
  69. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/WHEEL +1 -1
  70. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info/licenses}/LICENSE +1 -1
  71. datacontract/export/csv_type_converter.py +0 -36
  72. datacontract/integration/datamesh_manager.py +0 -72
  73. datacontract/lint/lint.py +0 -142
  74. datacontract/lint/linters/description_linter.py +0 -35
  75. datacontract/lint/linters/field_pattern_linter.py +0 -34
  76. datacontract/lint/linters/field_reference_linter.py +0 -48
  77. datacontract/lint/linters/notice_period_linter.py +0 -55
  78. datacontract/lint/linters/quality_schema_linter.py +0 -52
  79. datacontract/lint/linters/valid_constraints_linter.py +0 -100
  80. datacontract/model/data_contract_specification.py +0 -327
  81. datacontract_cli-0.10.23.dist-info/RECORD +0 -113
  82. /datacontract/{lint/linters → output}/__init__.py +0 -0
  83. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/entry_points.txt +0 -0
  84. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/top_level.txt +0 -0
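
The list alone shows the scope of this release: new importers (Excel, JSON, protobuf), new exporters (Excel, Mermaid, DQX), Athena and Oracle connections, ODCS 3.x schemas, and the data_contract_specification module promoted to a package. As a quick orientation before the per-file diffs, here is a minimal sketch of driving the new export formats programmatically; it assumes `DataContract` and its `export()` keep the signatures documented in the project README, and that `"mermaid"` is the registry key added in exporter_factory.py:

```python
from datacontract.data_contract import DataContract

# Load a contract and run its schema and quality checks (documented API).
data_contract = DataContract(data_contract_file="datacontract.yaml")
run = data_contract.test()
print(run.result)  # e.g. "passed" or "failed"

# Export to one of the formats registered in this release, e.g. the new
# Mermaid diagram exporter (key assumed to be "mermaid").
print(data_contract.export(export_format="mermaid"))
```

The per-file diffs below begin with datacontract/imports/avro_importer.py.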
datacontract/imports/avro_importer.py
@@ -55,7 +55,6 @@ def import_avro(data_contract_specification: DataContractSpecification, source:
             engine="datacontract",
             original_exception=e,
         )
-
     # type record is being used for both the table and the object types in data contract
     # -> CONSTRAINT: one table per .avsc input, all nested records are interpreted as objects
     fields = import_record_fields(avro_schema.fields)
@@ -92,6 +91,20 @@ def handle_config_avro_custom_properties(field: avro.schema.Field, imported_fiel
         imported_field.config["avroDefault"] = field.default
 
 
+LOGICAL_TYPE_MAPPING = {
+    "decimal": "decimal",
+    "date": "date",
+    "time-millis": "time",
+    "time-micros": "time",
+    "timestamp-millis": "timestamp_tz",
+    "timestamp-micros": "timestamp_tz",
+    "local-timestamp-micros": "timestamp_ntz",
+    "local-timestamp-millis": "timestamp_ntz",
+    "duration": "string",
+    "uuid": "string",
+}
+
+
 def import_record_fields(record_fields: List[avro.schema.Field]) -> Dict[str, Field]:
     """
     Import Avro record fields and convert them to data contract fields.
@@ -117,13 +130,23 @@ def import_record_fields(record_fields: List[avro.schema.Field]) -> Dict[str, Fi
             imported_field.fields = import_record_fields(field.type.fields)
         elif field.type.type == "union":
             imported_field.required = False
-            type = import_type_of_optional_field(field)
-            imported_field.type = type
-            if type == "record":
-                imported_field.fields = import_record_fields(get_record_from_union_field(field).fields)
-            elif type == "array":
-                imported_field.type = "array"
-                imported_field.items = import_avro_array_items(get_array_from_union_field(field))
+            # Check for enum in union first, since it needs special handling
+            enum_schema = get_enum_from_union_field(field)
+            if enum_schema:
+                imported_field.type = "string"
+                imported_field.enum = enum_schema.symbols
+                imported_field.title = enum_schema.name
+                if not imported_field.config:
+                    imported_field.config = {}
+                imported_field.config["avroType"] = "enum"
+            else:
+                type = import_type_of_optional_field(field)
+                imported_field.type = type
+                if type == "record":
+                    imported_field.fields = import_record_fields(get_record_from_union_field(field).fields)
+                elif type == "array":
+                    imported_field.type = "array"
+                    imported_field.items = import_avro_array_items(get_array_from_union_field(field))
         elif field.type.type == "array":
             imported_field.type = "array"
             imported_field.items = import_avro_array_items(field.type)
@@ -137,9 +160,15 @@ def import_record_fields(record_fields: List[avro.schema.Field]) -> Dict[str, Fi
             if not imported_field.config:
                 imported_field.config = {}
             imported_field.config["avroType"] = "enum"
-        else:  # primitive type
-            imported_field.type = map_type_from_avro(field.type.type)
-
+        else:
+            logical_type = field.type.get_prop("logicalType")
+            if logical_type in LOGICAL_TYPE_MAPPING:
+                imported_field.type = LOGICAL_TYPE_MAPPING[logical_type]
+                if logical_type == "decimal":
+                    imported_field.precision = field.type.precision
+                    imported_field.scale = field.type.scale
+            else:
+                imported_field.type = map_type_from_avro(field.type.type)
         imported_fields[field.name] = imported_field
 
     return imported_fields
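
To make the new branch concrete: with the avro library this importer uses, a logical type is exposed via `get_prop("logicalType")`, and decimal schemas carry `precision` and `scale` attributes, which the importer now copies onto the field. A minimal sketch (the `Order` schema is invented for illustration):

```python
import json

import avro.schema

# Hypothetical schema: a bytes field annotated with the "decimal" logical type.
schema = avro.schema.parse(
    json.dumps(
        {
            "type": "record",
            "name": "Order",
            "fields": [
                {
                    "name": "amount",
                    "type": {"type": "bytes", "logicalType": "decimal", "precision": 10, "scale": 2},
                }
            ],
        }
    )
)

amount = schema.fields[0]
print(amount.type.get_prop("logicalType"))  # decimal -> "decimal" via LOGICAL_TYPE_MAPPING
print(amount.type.precision, amount.type.scale)  # 10 2 -> copied onto the imported field
```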
@@ -212,7 +241,11 @@ def import_type_of_optional_field(field: avro.schema.Field) -> str:
     """
     for field_type in field.type.schemas:
         if field_type.type != "null":
-            return map_type_from_avro(field_type.type)
+            logical_type = field_type.get_prop("logicalType")
+            if logical_type and logical_type in LOGICAL_TYPE_MAPPING:
+                return LOGICAL_TYPE_MAPPING[logical_type]
+            else:
+                return map_type_from_avro(field_type.type)
     raise DataContractException(
         type="schema",
         result="failed",
@@ -254,6 +287,22 @@ def get_array_from_union_field(field: avro.schema.Field) -> avro.schema.ArraySch
     return None
 
 
+def get_enum_from_union_field(field: avro.schema.Field) -> avro.schema.EnumSchema | None:
+    """
+    Get the enum schema from a union field.
+
+    Args:
+        field: The Avro field with a union type.
+
+    Returns:
+        The enum schema if found, None otherwise.
+    """
+    for field_type in field.type.schemas:
+        if field_type.type == "enum":
+            return field_type
+    return None
+
+
 def map_type_from_avro(avro_type_str: str) -> str:
     """
     Map Avro type strings to data contract type strings.
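
This helper is what lets nullable enums survive the import. A sketch of the union shape it scans (the `User` schema is invented for illustration):

```python
import json

import avro.schema

# Hypothetical nullable enum: a union of "null" and an enum schema.
schema = avro.schema.parse(
    json.dumps(
        {
            "type": "record",
            "name": "User",
            "fields": [
                {
                    "name": "status",
                    "type": ["null", {"type": "enum", "name": "Status", "symbols": ["ACTIVE", "INACTIVE"]}],
                }
            ],
        }
    )
)

status = schema.fields[0]
# status.type is a UnionSchema; get_enum_from_union_field() walks .schemas
# just like this and returns the first enum member it finds.
for member in status.type.schemas:
    if member.type == "enum":
        print(member.name, member.symbols)  # Status ['ACTIVE', 'INACTIVE']
```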
@@ -276,6 +325,8 @@ def map_type_from_avro(avro_type_str: str) -> str:
         return "binary"
     elif avro_type_str == "double":
         return "double"
+    elif avro_type_str == "float":
+        return "float"
     elif avro_type_str == "int":
         return "int"
     elif avro_type_str == "long":
datacontract/imports/csv_importer.py
@@ -1,89 +1,143 @@
 import os
+from typing import Any, Dict, List
 
-import clevercsv
+import duckdb
 
 from datacontract.imports.importer import Importer
-from datacontract.model.data_contract_specification import DataContractSpecification, Example, Field, Model, Server
+from datacontract.model.data_contract_specification import DataContractSpecification, Model, Server
 
 
 class CsvImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
     ) -> DataContractSpecification:
-        return import_csv(data_contract_specification, self.import_format, source)
+        return import_csv(data_contract_specification, source)
 
 
-def import_csv(data_contract_specification: DataContractSpecification, format: str, source: str):
-    include_example = False
-
-    # detect encoding and dialect
-    encoding = clevercsv.encoding.get_encoding(source)
-    with open(source, "r", newline="") as fp:
-        dialect = clevercsv.Sniffer().sniff(fp.read(10000))
-
-    # using auto detecting of the format and encoding
-    df = clevercsv.read_dataframe(source)
-
-    if data_contract_specification.models is None:
-        data_contract_specification.models = {}
-
+def import_csv(
+    data_contract_specification: DataContractSpecification, source: str, include_examples: bool = False
+) -> DataContractSpecification:
     # use the file name as table name
     table_name = os.path.splitext(os.path.basename(source))[0]
 
+    # use duckdb to auto detect format, columns, etc.
+    con = duckdb.connect(database=":memory:")
+    con.sql(
+        f"""CREATE VIEW "{table_name}" AS SELECT * FROM read_csv_auto('{source}', hive_partitioning=1, auto_type_candidates = ['BOOLEAN', 'INTEGER', 'BIGINT', 'DOUBLE', 'VARCHAR']);"""
+    )
+    dialect = con.sql(f"SELECT * FROM sniff_csv('{source}', sample_size = 1000);").fetchnumpy()
+    tbl = con.table(table_name)
+
     if data_contract_specification.servers is None:
         data_contract_specification.servers = {}
 
+    delimiter = None if dialect is None else dialect["Delimiter"][0]
+
+    if dialect is not None:
+        dc_types = [map_type_from_duckdb(x["type"]) for x in dialect["Columns"][0]]
+    else:
+        dc_types = [map_type_from_duckdb(str(x)) for x in tbl.dtypes]
+
     data_contract_specification.servers["production"] = Server(
-        type="local", path=source, format="csv", delimiter=dialect.delimiter
+        type="local", path=source, format="csv", delimiter=delimiter
    )
 
+    rowcount = tbl.shape[0]
+
+    tallies = dict()
+    for row in tbl.describe().fetchall():
+        if row[0] not in ["count", "max", "min"]:
+            continue
+        for i in range(tbl.shape[1]):
+            tallies[(row[0], tbl.columns[i])] = row[i + 1] if row[0] != "count" else int(row[i + 1])
+
+    samples: Dict[str, List] = dict()
+    for i in range(tbl.shape[1]):
+        field_name = tbl.columns[i]
+        if tallies[("count", field_name)] > 0 and tbl.dtypes[i] not in ["BOOLEAN", "BLOB"]:
+            sql = f"""SELECT DISTINCT "{field_name}" FROM "{table_name}" WHERE "{field_name}" IS NOT NULL USING SAMPLE 5 ROWS;"""
+            samples[field_name] = [x[0] for x in con.sql(sql).fetchall()]
+
+    formats: Dict[str, str] = dict()
+    for i in range(tbl.shape[1]):
+        field_name = tbl.columns[i]
+        if tallies[("count", field_name)] > 0 and tbl.dtypes[i] == "VARCHAR":
+            sql = f"""SELECT
+                count_if("{field_name}" IS NOT NULL) as count,
+                count_if(regexp_matches("{field_name}", '^[\\w-\\.]+@([\\w-]+\\.)+[\\w-]{{2,4}}$')) as email,
+                count_if(regexp_matches("{field_name}", '^[[a-z0-9]{{8}}-?[a-z0-9]{{4}}-?[a-z0-9]{{4}}-?[a-z0-9]{{4}}-?[a-z0-9]{{12}}]')) as uuid
+                FROM "{table_name}";
+            """
+            res = con.sql(sql).fetchone()
+            if res[1] == res[0]:
+                formats[field_name] = "email"
+            elif res[2] == res[0]:
+                formats[field_name] = "uuid"
+
     fields = {}
-    for column, dtype in df.dtypes.items():
-        field = Field()
-        field.type = map_type_from_pandas(dtype.name)
-        fields[column] = field
+    for i in range(tbl.shape[1]):
+        field_name = tbl.columns[i]
+        dc_type = dc_types[i]
+
+        ## specifying "integer" rather than "bigint" looks nicer
+        if (
+            dc_type == "bigint"
+            and tallies[("max", field_name)] <= 2147483647
+            and tallies[("min", field_name)] >= -2147483648
+        ):
+            dc_type = "integer"
+
+        field: Dict[str, Any] = {"type": dc_type, "format": formats.get(field_name, None)}
+
+        if tallies[("count", field_name)] == rowcount:
+            field["required"] = True
+        if dc_type not in ["boolean", "bytes"]:
+            distinct_values = tbl.count(f'DISTINCT "{field_name}"').fetchone()[0]  # type: ignore
+            if distinct_values > 0 and distinct_values == tallies[("count", field_name)]:
+                field["unique"] = True
+        s = samples.get(field_name, None)
+        if s is not None:
+            field["examples"] = s
+        if dc_type in ["integer", "bigint", "float", "double"]:
+            field["minimum"] = tallies[("min", field_name)]
+            field["maximum"] = tallies[("max", field_name)]
+
+        fields[field_name] = field
+
+    model_examples = None
+    if include_examples:
+        model_examples = con.sql(f"""SELECT DISTINCT * FROM "{table_name}" USING SAMPLE 5 ROWS;""").fetchall()
 
     data_contract_specification.models[table_name] = Model(
-        type="table",
-        description=f"Csv file with encoding {encoding}",
-        fields=fields,
+        type="table", description="Generated model of " + source, fields=fields, examples=model_examples
     )
 
-    # multiline data is not correctly handled by yaml dump
-    if include_example:
-        if data_contract_specification.examples is None:
-            data_contract_specification.examples = []
-
-        # read first 10 lines with the detected encoding
-        with open(source, "r", encoding=encoding) as csvfile:
-            lines = csvfile.readlines()[:10]
-
-        data_contract_specification.examples.append(Example(type="csv", model=table_name, data="".join(lines)))
-
     return data_contract_specification
 
 
-def map_type_from_pandas(sql_type: str):
+_duck_db_types = {
+    "BOOLEAN": "boolean",
+    "BLOB": "bytes",
+    "TINYINT": "integer",
+    "SMALLINT": "integer",
+    "INTEGER": "integer",
+    "BIGINT": "bigint",
+    "UTINYINT": "integer",
+    "USMALLINT": "integer",
+    "UINTEGER": "integer",
+    "UBIGINT": "bigint",
+    "FLOAT": "float",
+    "DOUBLE": "double",
+    "VARCHAR": "string",
+    "TIMESTAMP": "timestamp",
+    "DATE": "date",
+    # TODO: Add support for NULL
+}
+
+
+def map_type_from_duckdb(sql_type: None | str):
     if sql_type is None:
         return None
 
-    sql_type_normed = sql_type.lower().strip()
-
-    if sql_type_normed == "object":
-        return "string"
-    elif sql_type_normed.startswith("str"):
-        return "string"
-    elif sql_type_normed.startswith("int"):
-        return "integer"
-    elif sql_type_normed.startswith("float"):
-        return "float"
-    elif sql_type_normed.startswith("bool"):
-        return "boolean"
-    elif sql_type_normed.startswith("timestamp"):
-        return "timestamp"
-    elif sql_type_normed == "datetime64":
-        return "date"
-    elif sql_type_normed == "timedelta[ns]":
-        return "timestamp_ntz"
-    else:
-        return "variant"
+    sql_type_normed = sql_type.upper().strip()
+    return _duck_db_types.get(sql_type_normed, "string")
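
The importer now delegates dialect and type detection to DuckDB (read_csv_auto, sniff_csv) and profiles the data (null counts, min/max, distinct counts, sampled example values, email/UUID format detection) instead of mapping pandas dtypes. A usage sketch; "orders.csv" is a placeholder, and this assumes a default-constructed DataContractSpecification initializes models to an empty dict, as the removal of the None check implies:

```python
from datacontract.imports.csv_importer import import_csv
from datacontract.model.data_contract_specification import DataContractSpecification

# "orders.csv" is a placeholder file; the model name is derived from it.
spec = import_csv(DataContractSpecification(), "orders.csv", include_examples=True)

model = spec.models["orders"]
for name, field in model.fields.items():
    print(name, field.type, field.required, field.unique)  # profiled per column
print(spec.servers["production"].delimiter)  # delimiter detected by sniff_csv
```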