dcs-sdk 1.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
data_diff/format.py ADDED
@@ -0,0 +1,369 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import collections
16
+ from enum import Enum
17
+ from typing import Any, Dict, List, Optional, Tuple, Type
18
+
19
+ import attrs
20
+
21
+ from data_diff.abcs.database_types import (
22
+ JSON,
23
+ Array,
24
+ Boolean,
25
+ ColType,
26
+ ColType_Alphanum,
27
+ ColType_UUID,
28
+ Date,
29
+ FractionalType,
30
+ NumericType,
31
+ String_Alphanum,
32
+ Struct,
33
+ TemporalType,
34
+ )
35
+ from data_diff.diff_tables import DiffResultWrapper
36
+
37
+
38
def jsonify_error(table1: List[str], table2: List[str], error: str) -> Dict[str, Any]:
    """
    Builds the JSON-serializable payload that reports a failed diff.

    :param table1: identifier of the first dataset, stored as-is under ``dataset1``
    :param table2: identifier of the second dataset, stored as-is under ``dataset2``
    :param error: error message to report
    :return: the ``FailedDiff`` record (status ``"failed"``) converted to a plain
        dict by ``attrs.asdict`` -- note that a dict is returned, not a
        ``FailedDiff`` instance
    """
    failure = FailedDiff(
        status="failed",
        dataset1=table1,
        dataset2=table2,
        error=error,
    )
    return attrs.asdict(failure)
47
+
48
+
49
# Column descriptions handed to jsonify(): each entry is unpacked by
# _jsonify_columns_diff as (name, type_, kind), where `kind` is the ColType
# object that _map_kind translates into a ColumnKind.
Columns = List[Tuple[str, str, ColType]]
50
+
51
+
52
def jsonify(
    diff: DiffResultWrapper,
    dataset1_columns: Columns,
    dataset2_columns: Columns,
    columns_diff: Dict[str, List[str]],
    with_summary: bool = False,
    stats_only: bool = False,
) -> Dict[str, Any]:
    """
    Converts the diff result into a JSON-serializable format.
    Optionally add stats summary and schema diff.

    :param diff: diff result; its ``info_tree.info`` supplies the two tables,
        the diff schema and the diffed rows
    :param dataset1_columns: column descriptions of the first dataset
    :param dataset2_columns: column descriptions of the second dataset
    :param columns_diff: schema changes keyed by ``"added"``, ``"removed"`` and
        ``"changed"``; a missing key is treated as "no such change"
    :param with_summary: when true, include the statistics from ``diff.get_stats_dict()``
    :param stats_only: when true, omit the per-row details (``rows`` is ``None``)
    :return: the ``JsonDiff`` record converted to a plain dict by ``attrs.asdict``
    """
    diff_info = diff.info_tree.info
    table1 = diff_info.tables[0]
    table2 = diff_info.tables[1]
    key_columns = table1.key_columns
    schema = [field for field, _ in diff_info.diff_schema]

    t1_exclusive_rows, t2_exclusive_rows, diff_rows = _group_rows(diff_info, schema)

    rows = None
    if not stats_only:
        rows = _make_rows_diff(t1_exclusive_rows, t2_exclusive_rows, diff_rows, key_columns)

    summary = None
    if with_summary:
        summary = _jsonify_diff_summary(diff.get_stats_dict())

    columns = _jsonify_columns_diff(dataset1_columns, dataset2_columns, columns_diff, list(key_columns))

    # Read the schema changes with .get(), exactly like _jsonify_columns_diff does,
    # so a columns_diff lacking one of the keys does not raise KeyError here.
    schema_changed = any(columns_diff.get(change) for change in ("added", "removed", "changed"))
    is_different = bool(t1_exclusive_rows or t2_exclusive_rows or diff_rows or schema_changed)

    return attrs.asdict(
        JsonDiff(
            status="success",
            result="different" if is_different else "identical",
            dataset1=list(table1.table_path),
            dataset2=list(table2.table_path),
            rows=rows,
            summary=summary,
            columns=columns,
        )
    )
103
+
104
+
105
@attrs.frozen
class JsonExclusiveRowValue:
    """
    One column's value in a row that is exclusive to a single dataset.
    """

    # True when the column is one of the key columns.
    isPK: bool
    # The column's value in that row.
    value: Any
113
+
114
+
115
@attrs.frozen
class JsonDiffRowValue:
    """
    The two values of one column for a pair of rows sharing the same PK.
    """

    # Value taken from the row's "<column>_a" field.
    dataset1: Any
    # Value taken from the row's "<column>_b" field.
    dataset2: Any
    # Boolean form of the row's "is_diff_<column>" field.
    isDiff: bool
    # True when the column is one of the key columns.
    isPK: bool
125
+
126
+
127
@attrs.frozen
class Total:
    """
    Per-dataset totals; _jsonify_diff_summary fills them from the
    "rows_A" and "rows_B" statistics.
    """

    dataset1: int
    dataset2: int
131
+
132
+
133
@attrs.frozen
class ExclusiveRows:
    """
    Per-dataset exclusive counts; _jsonify_diff_summary fills them from the
    "exclusive_A" and "exclusive_B" statistics.
    """

    dataset1: int
    dataset2: int
137
+
138
+
139
@attrs.frozen
class Rows:
    """
    Row-level section of the diff summary.
    """

    total: Total
    exclusive: ExclusiveRows
    # Filled from the "updated" statistic.
    updated: int
    # Filled from the "unchanged" statistic.
    unchanged: int
145
+
146
+
147
@attrs.frozen
class Stats:
    """
    Statistics section of the diff summary.
    """

    # Filled by _jsonify_diff_summary from the "values" statistic.
    diffCounts: Dict[str, int]
150
+
151
+
152
@attrs.frozen
class JsonDiffSummary:
    """
    Summary attached to a JSON diff: row counts plus value statistics.
    """

    rows: Rows
    stats: Stats
156
+
157
+
158
@attrs.frozen
class ExclusiveColumns:
    """
    Column names reported for one dataset only.
    """

    # Filled by _jsonify_columns_diff from the "removed" schema changes.
    dataset1: List[str]
    # Filled by _jsonify_columns_diff from the "added" schema changes.
    dataset2: List[str]
162
+
163
+
164
class ColumnKind(Enum):
    """
    Column categories used in the JSON output; the member value is the
    string stored in ``Column.kind``.
    """

    INTEGER = "integer"
    FLOAT = "float"
    STRING = "string"
    DATE = "date"
    TIME = "time"
    DATETIME = "datetime"
    BOOL = "boolean"
    # Also the fallback returned by _map_kind when nothing else matches.
    UNSUPPORTED = "unsupported"
173
+
174
+
175
# Lookup table for _map_kind, which returns the kind of the FIRST entry whose
# type matches via isinstance() -- so the order of the entries is significant.
# NOTE(review): presumably the more specific types (e.g. Date, FractionalType)
# are listed ahead of broader ones (TemporalType, NumericType) for that reason;
# confirm against the class hierarchy in data_diff.abcs.database_types.
KIND_MAPPING: List[Tuple[Type[ColType], ColumnKind]] = [
    (Boolean, ColumnKind.BOOL),
    (Date, ColumnKind.DATE),
    (TemporalType, ColumnKind.DATETIME),
    (FractionalType, ColumnKind.FLOAT),
    (NumericType, ColumnKind.INTEGER),
    (ColType_UUID, ColumnKind.STRING),
    (ColType_Alphanum, ColumnKind.STRING),
    (String_Alphanum, ColumnKind.STRING),
    (JSON, ColumnKind.STRING),
    (Array, ColumnKind.STRING),
    (Struct, ColumnKind.STRING),
    # Last entry: any remaining ColType instance is reported as unsupported.
    (ColType, ColumnKind.UNSUPPORTED),
]
189
+
190
+
191
@attrs.frozen
class Column:
    """
    JSON description of a single dataset column.
    """

    name: str
    type: str
    # String value of the ColumnKind chosen by _map_kind.
    kind: str
196
+
197
+
198
@attrs.frozen
class JsonColumnsSummary:
    """
    Schema section of a JSON diff.
    """

    # Column descriptions of each dataset.
    dataset1: List[Column]
    dataset2: List[Column]
    # Names of the key columns.
    primaryKey: List[str]
    # Columns reported for one dataset only.
    exclusive: ExclusiveColumns
    # Filled by _jsonify_columns_diff from the "changed" schema changes.
    typeChanged: List[str]
205
+
206
+
207
@attrs.frozen
class ExclusiveDiff:
    """
    Rows exclusive to each dataset; every row maps a column name to its value.
    """

    dataset1: List[Dict[str, JsonExclusiveRowValue]]
    dataset2: List[Dict[str, JsonExclusiveRowValue]]
211
+
212
+
213
@attrs.frozen
class RowsDiff:
    """
    Row-level details of a JSON diff.
    """

    # Rows exclusive to one of the datasets.
    exclusive: ExclusiveDiff
    # Remaining diff rows, each mapping a column name to its pair of values.
    diff: List[Dict[str, JsonDiffRowValue]]
217
+
218
+
219
@attrs.frozen
class FailedDiff:
    """
    Payload describing a diff that could not be produced.
    """

    status: str  # Literal ["failed"]
    dataset1: List[str]
    dataset2: List[str]
    error: str

    # Version string carried in the payload.
    version: str = "1.0.0"
227
+
228
+
229
@attrs.frozen
class JsonDiff:
    """
    Payload describing a completed diff.
    """

    status: str  # Literal ["success"]
    result: str  # Literal ["different", "identical"]
    dataset1: List[str]
    dataset2: List[str]
    # None when jsonify() is called with stats_only=True.
    rows: Optional[RowsDiff]
    # None unless jsonify() is called with with_summary=True.
    summary: Optional[JsonDiffSummary]
    columns: Optional[JsonColumnsSummary]

    # Version string carried in the payload.
    version: str = "1.1.0"
240
+
241
+
242
def _group_rows(
    diff_info: DiffResultWrapper, schema: List[str]
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Splits the diffed rows into three buckets.

    Every row of ``diff_info.diff`` is paired with ``schema`` to form a dict;
    that dict goes to the first bucket when its "is_exclusive_a" field is
    truthy, to the second when "is_exclusive_b" is truthy, and to the third
    otherwise.

    :return: (rows exclusive to table 1, rows exclusive to table 2, other rows)
    """
    only_in_t1: List[Dict[str, Any]] = []
    only_in_t2: List[Dict[str, Any]] = []
    in_both: List[Dict[str, Any]] = []

    for raw_row in diff_info.diff:
        record = dict(zip(schema, raw_row))
        # Both flags are read up front, so a row lacking either one fails fast.
        exclusive_to_t1 = record["is_exclusive_a"]
        exclusive_to_t2 = record["is_exclusive_b"]

        if exclusive_to_t1:
            bucket = only_in_t1
        elif exclusive_to_t2:
            bucket = only_in_t2
        else:
            bucket = in_both
        bucket.append(record)

    return only_in_t1, only_in_t2, in_both
264
+
265
+
266
def _make_rows_diff(
    t1_exclusive_rows: List[Dict[str, Any]],
    t2_exclusive_rows: List[Dict[str, Any]],
    diff_rows: List[Dict[str, Any]],
    key_columns: List[str],
) -> RowsDiff:
    """
    Converts the grouped rows into their JSON records.

    :param t1_exclusive_rows: rows exclusive to the first dataset
    :param t2_exclusive_rows: rows exclusive to the second dataset
    :param diff_rows: the remaining diff rows
    :param key_columns: names of the key columns
    :return: a ``RowsDiff`` holding the converted rows
    """
    changed = [_jsonify_diff(entry, key_columns) for entry in diff_rows]
    only_in_t1 = [_jsonify_exclusive(entry, key_columns) for entry in t1_exclusive_rows]
    only_in_t2 = [_jsonify_exclusive(entry, key_columns) for entry in t2_exclusive_rows]

    exclusive = ExclusiveDiff(dataset1=only_in_t1, dataset2=only_in_t2)
    return RowsDiff(exclusive=exclusive, diff=changed)
288
+
289
+
290
def _jsonify_diff(row: Dict[str, Any], key_columns: List[str]) -> Dict[str, JsonDiffRowValue]:
    """
    Folds the flat fields of one diff row into per-column value pairs.

    "is_diff_<column>" supplies ``isDiff``, "<column>_a" supplies ``dataset1``
    and "<column>_b" supplies ``dataset2``; the two exclusivity flags and any
    field matching none of these patterns are ignored.

    :return: mapping of column name to its ``JsonDiffRowValue``
    """
    flag_prefix = "is_diff_"
    sides = (("_a", "dataset1"), ("_b", "dataset2"))

    per_column = collections.defaultdict(dict)
    for field, value in row.items():
        if field in ("is_exclusive_a", "is_exclusive_b"):
            continue

        if field.startswith(flag_prefix):
            per_column[field[len(flag_prefix) :]]["isDiff"] = bool(value)
            continue

        for suffix, side in sides:
            if field.endswith(suffix):
                name = field[: -len(suffix)]
                per_column[name][side] = value
                per_column[name]["isPK"] = name in key_columns
                break

    return {name: JsonDiffRowValue(**parts) for name, parts in per_column.items()}
311
+
312
+
313
def _jsonify_exclusive(row: Dict[str, Any], key_columns: List[str]) -> Dict[str, JsonExclusiveRowValue]:
    """
    Extracts the column values of a row that exists in one dataset only.

    "<column>_b" fields are used when the row's "is_exclusive_b" flag is
    truthy and "<column>_a" fields when "is_exclusive_a" is truthy; the flags
    themselves and the "is_diff_*" fields are skipped.

    :return: mapping of column name to its ``JsonExclusiveRowValue``
    """
    sides = (("_b", "is_exclusive_b"), ("_a", "is_exclusive_a"))

    per_column = collections.defaultdict(dict)
    for field, value in row.items():
        if field in ("is_exclusive_a", "is_exclusive_b") or field.startswith("is_diff_"):
            continue
        for suffix, flag_field in sides:
            # The flag is looked up only once the suffix matches.
            if field.endswith(suffix) and row[flag_field]:
                name = field[: -len(suffix)]
                per_column[name]["isPK"] = name in key_columns
                per_column[name]["value"] = value
                break
    return {name: JsonExclusiveRowValue(**parts) for name, parts in per_column.items()}
329
+
330
+
331
def _jsonify_diff_summary(stats_dict: dict) -> JsonDiffSummary:
    """
    Builds the summary record from a statistics dict.

    :param stats_dict: must provide the keys "rows_A", "rows_B", "exclusive_A",
        "exclusive_B", "updated", "unchanged" and "values"
    :return: the populated ``JsonDiffSummary``
    """
    totals = Total(dataset1=stats_dict["rows_A"], dataset2=stats_dict["rows_B"])
    exclusive = ExclusiveRows(
        dataset1=stats_dict["exclusive_A"],
        dataset2=stats_dict["exclusive_B"],
    )
    row_counts = Rows(
        total=totals,
        exclusive=exclusive,
        updated=stats_dict["updated"],
        unchanged=stats_dict["unchanged"],
    )
    value_stats = Stats(diffCounts=stats_dict["values"])
    return JsonDiffSummary(rows=row_counts, stats=value_stats)
344
+
345
+
346
def _jsonify_columns_diff(
    dataset1_columns: Columns, dataset2_columns: Columns, columns_diff: Dict[str, List[str]], key_columns: List[str]
) -> JsonColumnsSummary:
    """
    Builds the schema section of the JSON diff.

    :param dataset1_columns: (name, type, kind) entries of the first dataset
    :param dataset2_columns: (name, type, kind) entries of the second dataset
    :param columns_diff: schema changes; the "added", "removed" and "changed"
        keys are read, each defaulting to an empty list
    :param key_columns: names of the key columns, stored as ``primaryKey``
    :return: the populated ``JsonColumnsSummary``
    """

    def describe(columns: Columns) -> List[Column]:
        # One Column record per (name, type, kind) entry, in input order.
        described = []
        for name, type_, kind in columns:
            described.append(Column(name=name, type=type_, kind=_map_kind(kind).value))
        return described

    first = describe(dataset1_columns)
    second = describe(dataset2_columns)
    # "added" columns are reported for dataset2, "removed" ones for dataset1.
    exclusive = ExclusiveColumns(
        dataset2=list(columns_diff.get("added", [])),
        dataset1=list(columns_diff.get("removed", [])),
    )
    type_changed = list(columns_diff.get("changed", []))

    return JsonColumnsSummary(
        dataset1=first,
        dataset2=second,
        primaryKey=key_columns,
        exclusive=exclusive,
        typeChanged=type_changed,
    )
363
+
364
+
365
def _map_kind(kind: ColType) -> ColumnKind:
    """
    Returns the ColumnKind of the first KIND_MAPPING entry whose type ``kind``
    is an instance of, or ``ColumnKind.UNSUPPORTED`` when no entry matches.
    """
    matching_kinds = (json_kind for col_type, json_kind in KIND_MAPPING if isinstance(kind, col_type))
    return next(matching_kinds, ColumnKind.UNSUPPORTED)