datacontract-cli 0.10.23__py3-none-any.whl → 0.10.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. datacontract/__init__.py +13 -0
  2. datacontract/api.py +12 -5
  3. datacontract/catalog/catalog.py +5 -3
  4. datacontract/cli.py +116 -10
  5. datacontract/data_contract.py +143 -65
  6. datacontract/engines/data_contract_checks.py +366 -60
  7. datacontract/engines/data_contract_test.py +50 -4
  8. datacontract/engines/fastjsonschema/check_jsonschema.py +37 -19
  9. datacontract/engines/fastjsonschema/s3/s3_read_files.py +3 -2
  10. datacontract/engines/soda/check_soda_execute.py +22 -3
  11. datacontract/engines/soda/connections/athena.py +79 -0
  12. datacontract/engines/soda/connections/duckdb_connection.py +65 -6
  13. datacontract/engines/soda/connections/kafka.py +4 -2
  14. datacontract/export/avro_converter.py +20 -3
  15. datacontract/export/bigquery_converter.py +1 -1
  16. datacontract/export/dbt_converter.py +36 -7
  17. datacontract/export/dqx_converter.py +126 -0
  18. datacontract/export/duckdb_type_converter.py +57 -0
  19. datacontract/export/excel_exporter.py +923 -0
  20. datacontract/export/exporter.py +3 -0
  21. datacontract/export/exporter_factory.py +17 -1
  22. datacontract/export/great_expectations_converter.py +55 -5
  23. datacontract/export/{html_export.py → html_exporter.py} +31 -20
  24. datacontract/export/markdown_converter.py +134 -5
  25. datacontract/export/mermaid_exporter.py +110 -0
  26. datacontract/export/odcs_v3_exporter.py +187 -145
  27. datacontract/export/protobuf_converter.py +163 -69
  28. datacontract/export/rdf_converter.py +2 -2
  29. datacontract/export/sodacl_converter.py +9 -1
  30. datacontract/export/spark_converter.py +31 -4
  31. datacontract/export/sql_converter.py +6 -2
  32. datacontract/export/sql_type_converter.py +20 -8
  33. datacontract/imports/avro_importer.py +63 -12
  34. datacontract/imports/csv_importer.py +111 -57
  35. datacontract/imports/excel_importer.py +1111 -0
  36. datacontract/imports/importer.py +16 -3
  37. datacontract/imports/importer_factory.py +17 -0
  38. datacontract/imports/json_importer.py +325 -0
  39. datacontract/imports/odcs_importer.py +2 -2
  40. datacontract/imports/odcs_v3_importer.py +351 -151
  41. datacontract/imports/protobuf_importer.py +264 -0
  42. datacontract/imports/spark_importer.py +117 -13
  43. datacontract/imports/sql_importer.py +32 -16
  44. datacontract/imports/unity_importer.py +84 -38
  45. datacontract/init/init_template.py +1 -1
  46. datacontract/integration/datamesh_manager.py +16 -2
  47. datacontract/lint/resolve.py +112 -23
  48. datacontract/lint/schema.py +24 -15
  49. datacontract/model/data_contract_specification/__init__.py +1 -0
  50. datacontract/model/odcs.py +13 -0
  51. datacontract/model/run.py +3 -0
  52. datacontract/output/junit_test_results.py +3 -3
  53. datacontract/schemas/datacontract-1.1.0.init.yaml +1 -1
  54. datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
  55. datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
  56. datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
  57. datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
  58. datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
  59. datacontract/templates/datacontract.html +54 -3
  60. datacontract/templates/datacontract_odcs.html +685 -0
  61. datacontract/templates/index.html +5 -2
  62. datacontract/templates/partials/server.html +2 -0
  63. datacontract/templates/style/output.css +319 -145
  64. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.37.dist-info}/METADATA +656 -431
  65. datacontract_cli-0.10.37.dist-info/RECORD +119 -0
  66. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
  67. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
  68. datacontract/export/csv_type_converter.py +0 -36
  69. datacontract/lint/lint.py +0 -142
  70. datacontract/lint/linters/description_linter.py +0 -35
  71. datacontract/lint/linters/field_pattern_linter.py +0 -34
  72. datacontract/lint/linters/field_reference_linter.py +0 -48
  73. datacontract/lint/linters/notice_period_linter.py +0 -55
  74. datacontract/lint/linters/quality_schema_linter.py +0 -52
  75. datacontract/lint/linters/valid_constraints_linter.py +0 -100
  76. datacontract/model/data_contract_specification.py +0 -327
  77. datacontract_cli-0.10.23.dist-info/RECORD +0 -113
  78. /datacontract/{lint/linters → output}/__init__.py +0 -0
  79. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
  80. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
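Item 34 in this list, datacontract/imports/csv_importer.py, is shown in full below. The rewrite replaces clevercsv-based encoding and dialect sniffing with DuckDB's read_csv_auto and sniff_csv, and additionally profiles the data to infer per-field metadata (format, required, unique, minimum/maximum, examples). As a usage sketch, the importer is typically reached through the CLI's import command; the flags follow the project's documented pattern but should be treated as an assumption, and orders.csv is a hypothetical input file:

    datacontract import --format csv --source orders.csv > datacontract.yaml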
@@ -1,89 +1,143 @@
 import os
+from typing import Any, Dict, List
 
-import clevercsv
+import duckdb
 
 from datacontract.imports.importer import Importer
-from datacontract.model.data_contract_specification import DataContractSpecification, Example, Field, Model, Server
+from datacontract.model.data_contract_specification import DataContractSpecification, Model, Server
 
 
 class CsvImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
     ) -> DataContractSpecification:
-        return import_csv(data_contract_specification, self.import_format, source)
+        return import_csv(data_contract_specification, source)
 
 
-def import_csv(data_contract_specification: DataContractSpecification, format: str, source: str):
-    include_example = False
-
-    # detect encoding and dialect
-    encoding = clevercsv.encoding.get_encoding(source)
-    with open(source, "r", newline="") as fp:
-        dialect = clevercsv.Sniffer().sniff(fp.read(10000))
-
-    # using auto detecting of the format and encoding
-    df = clevercsv.read_dataframe(source)
-
-    if data_contract_specification.models is None:
-        data_contract_specification.models = {}
-
+def import_csv(
+    data_contract_specification: DataContractSpecification, source: str, include_examples: bool = False
+) -> DataContractSpecification:
     # use the file name as table name
     table_name = os.path.splitext(os.path.basename(source))[0]
 
+    # use duckdb to auto detect format, columns, etc.
+    con = duckdb.connect(database=":memory:")
+    con.sql(
+        f"""CREATE VIEW "{table_name}" AS SELECT * FROM read_csv_auto('{source}', hive_partitioning=1, auto_type_candidates = ['BOOLEAN', 'INTEGER', 'BIGINT', 'DOUBLE', 'VARCHAR']);"""
+    )
+    dialect = con.sql(f"SELECT * FROM sniff_csv('{source}', sample_size = 1000);").fetchnumpy()
+    tbl = con.table(table_name)
+
     if data_contract_specification.servers is None:
         data_contract_specification.servers = {}
 
+    delimiter = None if dialect is None else dialect["Delimiter"][0]
+
+    if dialect is not None:
+        dc_types = [map_type_from_duckdb(x["type"]) for x in dialect["Columns"][0]]
+    else:
+        dc_types = [map_type_from_duckdb(str(x)) for x in tbl.dtypes]
+
     data_contract_specification.servers["production"] = Server(
-        type="local", path=source, format="csv", delimiter=dialect.delimiter
+        type="local", path=source, format="csv", delimiter=delimiter
     )
 
+    rowcount = tbl.shape[0]
+
+    tallies = dict()
+    for row in tbl.describe().fetchall():
+        if row[0] not in ["count", "max", "min"]:
+            continue
+        for i in range(tbl.shape[1]):
+            tallies[(row[0], tbl.columns[i])] = row[i + 1] if row[0] != "count" else int(row[i + 1])
+
+    samples: Dict[str, List] = dict()
+    for i in range(tbl.shape[1]):
+        field_name = tbl.columns[i]
+        if tallies[("count", field_name)] > 0 and tbl.dtypes[i] not in ["BOOLEAN", "BLOB"]:
+            sql = f"""SELECT DISTINCT "{field_name}" FROM "{table_name}" WHERE "{field_name}" IS NOT NULL USING SAMPLE 5 ROWS;"""
+            samples[field_name] = [x[0] for x in con.sql(sql).fetchall()]
+
+    formats: Dict[str, str] = dict()
+    for i in range(tbl.shape[1]):
+        field_name = tbl.columns[i]
+        if tallies[("count", field_name)] > 0 and tbl.dtypes[i] == "VARCHAR":
+            sql = f"""SELECT
+                count_if("{field_name}" IS NOT NULL) as count,
+                count_if(regexp_matches("{field_name}", '^[\\w-\\.]+@([\\w-]+\\.)+[\\w-]{{2,4}}$')) as email,
+                count_if(regexp_matches("{field_name}", '^[[a-z0-9]{{8}}-?[a-z0-9]{{4}}-?[a-z0-9]{{4}}-?[a-z0-9]{{4}}-?[a-z0-9]{{12}}]')) as uuid
+                FROM "{table_name}";
+            """
+            res = con.sql(sql).fetchone()
+            if res[1] == res[0]:
+                formats[field_name] = "email"
+            elif res[2] == res[0]:
+                formats[field_name] = "uuid"
+
     fields = {}
-    for column, dtype in df.dtypes.items():
-        field = Field()
-        field.type = map_type_from_pandas(dtype.name)
-        fields[column] = field
+    for i in range(tbl.shape[1]):
+        field_name = tbl.columns[i]
+        dc_type = dc_types[i]
+
+        ## specifying "integer" rather than "bigint" looks nicer
+        if (
+            dc_type == "bigint"
+            and tallies[("max", field_name)] <= 2147483647
+            and tallies[("min", field_name)] >= -2147483648
+        ):
+            dc_type = "integer"
+
+        field: Dict[str, Any] = {"type": dc_type, "format": formats.get(field_name, None)}
+
+        if tallies[("count", field_name)] == rowcount:
+            field["required"] = True
+        if dc_type not in ["boolean", "bytes"]:
+            distinct_values = tbl.count(f'DISTINCT "{field_name}"').fetchone()[0]  # type: ignore
+            if distinct_values > 0 and distinct_values == tallies[("count", field_name)]:
+                field["unique"] = True
+        s = samples.get(field_name, None)
+        if s is not None:
+            field["examples"] = s
+        if dc_type in ["integer", "bigint", "float", "double"]:
+            field["minimum"] = tallies[("min", field_name)]
+            field["maximum"] = tallies[("max", field_name)]
+
+        fields[field_name] = field
+
+    model_examples = None
+    if include_examples:
+        model_examples = con.sql(f"""SELECT DISTINCT * FROM "{table_name}" USING SAMPLE 5 ROWS;""").fetchall()
 
     data_contract_specification.models[table_name] = Model(
-        type="table",
-        description=f"Csv file with encoding {encoding}",
-        fields=fields,
+        type="table", description="Generated model of " + source, fields=fields, examples=model_examples
     )
 
-    # multiline data is not correctly handled by yaml dump
-    if include_example:
-        if data_contract_specification.examples is None:
-            data_contract_specification.examples = []
-
-        # read first 10 lines with the detected encoding
-        with open(source, "r", encoding=encoding) as csvfile:
-            lines = csvfile.readlines()[:10]
-
-        data_contract_specification.examples.append(Example(type="csv", model=table_name, data="".join(lines)))
-
     return data_contract_specification
 
 
-def map_type_from_pandas(sql_type: str):
+_duck_db_types = {
+    "BOOLEAN": "boolean",
+    "BLOB": "bytes",
+    "TINYINT": "integer",
+    "SMALLINT": "integer",
+    "INTEGER": "integer",
+    "BIGINT": "bigint",
+    "UTINYINT": "integer",
+    "USMALLINT": "integer",
+    "UINTEGER": "integer",
+    "UBIGINT": "bigint",
+    "FLOAT": "float",
+    "DOUBLE": "double",
+    "VARCHAR": "string",
+    "TIMESTAMP": "timestamp",
+    "DATE": "date",
+    # TODO: Add support for NULL
+}
+
+
+def map_type_from_duckdb(sql_type: None | str):
     if sql_type is None:
         return None
 
-    sql_type_normed = sql_type.lower().strip()
-
-    if sql_type_normed == "object":
-        return "string"
-    elif sql_type_normed.startswith("str"):
-        return "string"
-    elif sql_type_normed.startswith("int"):
-        return "integer"
-    elif sql_type_normed.startswith("float"):
-        return "float"
-    elif sql_type_normed.startswith("bool"):
-        return "boolean"
-    elif sql_type_normed.startswith("timestamp"):
-        return "timestamp"
-    elif sql_type_normed == "datetime64":
-        return "date"
-    elif sql_type_normed == "timedelta[ns]":
-        return "timestamp_ntz"
-    else:
-        return "variant"
+    sql_type_normed = sql_type.upper().strip()
+    return _duck_db_types.get(sql_type_normed, "string")
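
For orientation, here is a minimal sketch of driving the rewritten importer directly from Python. The no-argument DataContractSpecification() constructor is assumed to produce an empty spec (its fields appear to have defaults, since the new code no longer checks models for None), and orders.csv is a hypothetical input file:

    # "orders.csv" is a hypothetical input; import_csv derives the model name
    # from the file name, so the resulting model key is "orders".
    from datacontract.imports.csv_importer import import_csv, map_type_from_duckdb
    from datacontract.model.data_contract_specification import DataContractSpecification

    spec = import_csv(DataContractSpecification(), "orders.csv", include_examples=True)
    print(spec.models["orders"].fields)  # inferred types plus format/required/unique/min/max

    # The DuckDB type mapping normalizes case and falls back to "string"
    # for types missing from _duck_db_types (e.g. JSON, NULL).
    assert map_type_from_duckdb("uinteger") == "integer"
    assert map_type_from_duckdb("JSON") == "string"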