dcs-sdk 1.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,222 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import base64
16
+ import logging
17
+ from typing import Any, ClassVar, List, Optional, Type, Union
18
+
19
+ import attrs
20
+
21
+ from data_diff.abcs.database_types import (
22
+ Boolean,
23
+ Date,
24
+ DbPath,
25
+ Decimal,
26
+ Float,
27
+ FractionalType,
28
+ TemporalType,
29
+ Text,
30
+ Time,
31
+ Timestamp,
32
+ TimestampTZ,
33
+ )
34
+ from data_diff.databases.base import (
35
+ CHECKSUM_MASK,
36
+ CHECKSUM_OFFSET,
37
+ BaseDialect,
38
+ ConnectError,
39
+ Database,
40
+ ThreadLocalInterpreter,
41
+ import_helper,
42
+ )
43
+
44
+
45
@import_helper("snowflake")
def import_snowflake():
    """Lazily import the Snowflake connector and the key-loading helpers.

    The imports live inside the function so that the optional dependencies
    are only required when a Snowflake connection is actually requested.

    Returns:
        A 3-tuple ``(snowflake, serialization, default_backend)``: the
        top-level ``snowflake`` package (with ``snowflake.connector`` loaded),
        the ``cryptography`` serialization module, and the ``cryptography``
        ``default_backend`` callable.
    """
    import snowflake.connector
    from cryptography.hazmat.backends import default_backend
    from cryptography.hazmat.primitives import serialization

    loaded = (snowflake, serialization, default_backend)
    return loaded
52
+
53
+
54
class Dialect(BaseDialect):
    """Snowflake SQL dialect: quoting, casting, hashing and value normalization."""

    name = "Snowflake"
    ROUNDS_ON_PREC_LOSS = False
    # Maps Snowflake type names to the column type classes used for them.
    TYPE_CLASSES = {
        # Timestamps
        "TIMESTAMP_NTZ": Timestamp,
        "TIMESTAMP_LTZ": Timestamp,
        "TIMESTAMP_TZ": TimestampTZ,
        "DATE": Date,
        "TIME": Time,
        # Numbers
        "NUMBER": Decimal,
        "FLOAT": Float,
        # Text
        "TEXT": Text,
        # Boolean
        "BOOLEAN": Boolean,
    }

    def explain_as_text(self, query: str) -> str:
        """Return an ``EXPLAIN USING TEXT`` statement wrapping *query*."""
        return f"EXPLAIN USING TEXT {query}"

    def quote(self, s: str, is_table: bool = False):
        """Wrap the identifier *s* in double quotes (*is_table* is not used here)."""
        return f'"{s}"'

    def to_string(self, s: str):
        """Return SQL casting the expression *s* to ``string``."""
        return f"cast({s} as string)"

    def set_timezone_to_utc(self) -> str:
        """Return the statement that switches the session time zone to UTC."""
        return "ALTER SESSION SET TIMEZONE = 'UTC'"

    def optimizer_hints(self, hints: str) -> str:
        """Optimizer hints are not supported by this dialect.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Optimizer hints not yet implemented in snowflake")

    def type_repr(self, t) -> str:
        """Return the SQL type name for *t*, special-casing ``TimestampTZ``."""
        if isinstance(t, TimestampTZ):
            return f"timestamp_tz({t.precision})"
        return super().type_repr(t)

    def md5_as_int(self, s: str) -> str:
        """Return SQL computing an integer checksum of *s*.

        The lower 64 bits of the MD5 are masked with ``CHECKSUM_MASK`` and then
        shifted down by ``CHECKSUM_OFFSET``.
        """
        return f"BITAND(md5_number_lower64({s}), {CHECKSUM_MASK}) - {CHECKSUM_OFFSET}"

    def md5_as_hex(self, s: str) -> str:
        """Return SQL computing the MD5 of *s* via ``md5()``."""
        return f"md5({s})"

    def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
        """Return SQL rendering the temporal expression *value* as text.

        * Date columns are cast straight to ``varchar``.
        * Time columns are rounded at microsecond granularity according to
          ``coltype.precision`` and formatted as ``HH24:MI:SS.FF6``.
        * Everything else is converted to UTC, rounded (when ``coltype.rounds``
          is set) or cast to ``coltype.precision``, and formatted as
          ``YYYY-MM-DD HH24:MI:SS.FF6``.
        """
        # The is_date / is_time flags are probed defensively; a column type
        # lacking either one is treated as neither. Only AttributeError is
        # caught so that interrupts and unrelated failures still propagate.
        try:
            is_date = coltype.is_date
            is_time = coltype.is_time
        except AttributeError:
            is_date = False
            is_time = False

        if isinstance(coltype, Date) or is_date:
            return f"({value}::varchar)"
        elif isinstance(coltype, Time) or is_time:
            # Round the microseconds elapsed since midnight, then turn them back into a time.
            microseconds = f"TIMEDIFF(microsecond, cast('00:00:00' as time), {value})"
            rounded = f"round({microseconds}, -6 + {coltype.precision})"
            time_value = f"TIMEADD(microsecond, {rounded}, cast('00:00:00' as time))"
            converted = f"TO_VARCHAR({time_value}, 'HH24:MI:SS.FF6')"
            return converted

        if coltype.rounds:
            timestamp = f"to_timestamp(round(date_part(epoch_nanosecond, convert_timezone('UTC', {value})::timestamp(9))/1000000000, {coltype.precision}))"
        else:
            timestamp = f"cast(convert_timezone('UTC', {value}) as timestamp({coltype.precision}))"

        return f"to_char({timestamp}, 'YYYY-MM-DD HH24:MI:SS.FF6')"

    def normalize_number(self, value: str, coltype: FractionalType) -> str:
        """Return SQL casting *value* to ``decimal(38, precision)`` and then to string."""
        return self.to_string(f"cast({value} as decimal(38, {coltype.precision}))")

    def normalize_boolean(self, value: str, _coltype: Boolean) -> str:
        """Return SQL casting the boolean *value* to ``int`` and then to string."""
        return self.to_string(f"{value}::int")
127
+
128
+
129
@attrs.define(frozen=False, init=False, kw_only=True)
class Snowflake(Database):
    """Database driver that talks to Snowflake through ``snowflake.connector``."""

    DIALECT_CLASS: ClassVar[Type[BaseDialect]] = Dialect
    CONNECT_URI_HELP = "snowflake://<user>:<password>@<account>/<database>/<SCHEMA>?warehouse=<WAREHOUSE>"
    CONNECT_URI_PARAMS = ["database", "schema"]
    CONNECT_URI_KWPARAMS = ["warehouse"]

    # Connection object returned by snowflake.connector.connect().
    _conn: Any

    def __init__(self, *, schema: str, key: Optional[str] = None, key_content: Optional[str] = None, **kw) -> None:
        """Open a Snowflake connection.

        Args:
            schema: Default schema name; must not contain a double quote
                because it is passed to the connector wrapped in quotes.
            key: Path to a PEM private-key file (mutually exclusive with
                *key_content*).
            key_content: Base64-encoded PEM private key (mutually exclusive
                with *key*).
            **kw: Forwarded to ``snowflake.connector.connect``. When a key is
                supplied, ``private_key_passphrase`` (if present and truthy)
                is used to decrypt it and ``password`` must be absent.

        Raises:
            ConnectError: if the schema contains a double quote, if both
                *key* and *key_content* are given, or if a key is combined
                with a password.
        """
        super().__init__()
        snowflake, serialization, default_backend = import_snowflake()
        logging.getLogger("snowflake.connector").setLevel(logging.WARNING)

        # Ignore the error: snowflake.connector.network.RetryRequest: could not find io module state
        # It's a known issue: https://github.com/snowflakedb/snowflake-connector-python/issues/145
        logging.getLogger("snowflake.connector.network").disabled = True

        # Explicit raise rather than `assert`: asserts are stripped under
        # `python -O`, and the schema is interpolated between double quotes below.
        if '"' in schema:
            raise ConnectError("Schema name should not contain quotes!")
        if key_content and key:
            raise ConnectError("Only key value or key file path can be specified, not both")

        # At most one of the two key sources is set at this point.
        key_bytes = None
        if key:
            with open(key, "rb") as f:
                key_bytes = f.read()
        if key_content:
            key_bytes = base64.b64decode(key_content)

        # If a private key is used, load it and pass it as "private_key" to the connector.
        if key_bytes:
            if "password" in kw:
                raise ConnectError("Cannot use password and key at the same time")
            passphrase = kw.get("private_key_passphrase")
            encoded_passphrase = passphrase.encode() if passphrase else None
            p_key = serialization.load_pem_private_key(
                key_bytes,
                password=encoded_passphrase,
                backend=default_backend(),
            )

            # Re-serialize the key as unencrypted DER / PKCS8 bytes for the connector.
            kw["private_key"] = p_key.private_bytes(
                encoding=serialization.Encoding.DER,
                format=serialization.PrivateFormat.PKCS8,
                encryption_algorithm=serialization.NoEncryption(),
            )

        self._conn = snowflake.connector.connect(schema=f'"{schema}"', **kw)

        self.default_schema = schema

    def close(self):
        """Run the base-class close logic, then close the Snowflake connection."""
        super().close()
        self._conn.close()

    def _query(self, sql_code: Union[str, ThreadLocalInterpreter]):
        """Uses the standard SQL cursor interface"""
        return self._query_conn(self._conn, sql_code)

    def select_table_schema(self, path: DbPath) -> str:
        """Provide SQL for selecting the table schema.

        The query reads ``information_schema.columns`` (prefixed with the
        database when the path names one) and selects column_name, data_type,
        datetime_precision, numeric_precision, numeric_scale, collation_name
        (defaulting to 'utf8') and character_maximum_length.
        """
        database, schema, name = self._normalize_table_path(path)
        info_schema_path = ["information_schema", "columns"]
        if database:
            info_schema_path.insert(0, database)

        return (
            "SELECT column_name, data_type, datetime_precision, numeric_precision, numeric_scale, "
            " coalesce(collation_name, 'utf8') as collation_name, "
            " coalesce(character_maximum_length, NULL) as character_maximum_length "
            f"FROM {'.'.join(info_schema_path)} "
            f"WHERE table_name = '{name}' AND table_schema = '{schema}'"
        )

    def _normalize_table_path(self, path: DbPath) -> DbPath:
        """Expand *path* to a ``(database, schema, table)`` triple.

        One-part paths use ``self.default_schema``; one- and two-part paths
        get ``None`` as the database; three-part paths are returned unchanged.

        Raises:
            ValueError: if *path* does not have one, two or three parts.
        """
        if len(path) == 1:
            return None, self.default_schema, path[0]
        elif len(path) == 2:
            return None, path[0], path[1]
        elif len(path) == 3:
            return path

        raise ValueError(
            f"{self.name}: Bad table path for {self}: '{'.'.join(path)}'. Expected format: table, schema.table, or database.schema.table"
        )

    @property
    def is_autocommit(self) -> bool:
        """Always ``True`` for this driver."""
        return True

    def query_table_unique_columns(self, path: DbPath) -> List[str]:
        """Return an empty list; no unique columns are reported by this driver."""
        return []