dcs-sdk 1.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
data_diff/__init__.py ADDED
@@ -0,0 +1,221 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Iterator, Optional, Sequence, Tuple, Union
16
+
17
+ from data_diff.abcs.database_types import DbPath, DbTime
18
+ from data_diff.databases import Database
19
+ from data_diff.databases._connect import connect
20
+ from data_diff.diff_tables import Algorithm
21
+ from data_diff.hashdiff_tables import (
22
+ DEAFULT_TIMEOUT,
23
+ DEFAULT_BISECTION_FACTOR,
24
+ DEFAULT_BISECTION_THRESHOLD,
25
+ DEFAULT_ENGRESS_LIMIT,
26
+ DEFAULT_PER_COLUMN_DIFF_LIMIT,
27
+ HashDiffer,
28
+ )
29
+ from data_diff.joindiff_tables import TABLE_WRITE_LIMIT, JoinDiffer
30
+ from data_diff.table_segment import TableSegment
31
+ from data_diff.utils import Vector, eval_name_template
32
+
33
+
34
def connect_to_table(
    db_info: Union[str, dict],
    table_name: Union[DbPath, str],
    key_columns: Union[str, Sequence[str]] = ("id",),
    thread_count: Optional[int] = 1,
    **kwargs,
) -> TableSegment:
    """Connect to the given database and create a TableSegment instance.

    Parameters:
        db_info: Either a URI string, or a dict of connection options.
            When a dict is given, entries whose value is ``None`` are dropped
            before connecting. The caller's dict is not modified.
        table_name: Name of the table as a string, or a tuple that signifies the path.
            A string is parsed with the connected database's dialect.
        key_columns: Names of the key columns. A single string is wrapped
            into a one-element tuple.
        thread_count: Number of threads for this connection (only if using a threadpooled db implementation)
        **kwargs: Forwarded unchanged to the :class:`TableSegment` constructor.

    Returns:
        The :class:`TableSegment` built from the connected database, the
        (parsed) table path and the key columns.

    See Also:
        :meth:`connect`
    """
    if isinstance(db_info, dict):
        # Build a filtered copy rather than popping keys in place, so that
        # connecting does not mutate the caller's options dict.
        db_info = {k: v for k, v in db_info.items() if v is not None}
    if isinstance(key_columns, str):
        key_columns = (key_columns,)

    db: Database = connect(db_info, thread_count=thread_count)

    if isinstance(table_name, str):
        table_name = db.dialect.parse_table_name(table_name)

    return TableSegment(db, table_name, key_columns, **kwargs)
65
+
66
+
67
def diff_tables(
    table1: TableSegment,
    table2: TableSegment,
    *,
    # Name of the key column, which uniquely identifies each row (usually id)
    key_columns: Optional[Union[str, Sequence[str]]] = None,
    # Name of updated column, which signals that rows changed (usually updated_at or last_update)
    update_column: Optional[str] = None,
    # Extra columns to compare
    extra_columns: Optional[Tuple[str, ...]] = None,
    # Start/end key_column values, used to restrict the segment
    min_key: Optional[Vector] = None,
    max_key: Optional[Vector] = None,
    # Start/end update_column values, used to restrict the segment
    min_update: Optional[DbTime] = None,
    max_update: Optional[DbTime] = None,
    # Enable/disable threaded diffing. Needed to take advantage of database threads.
    threaded: bool = True,
    # Maximum size of each threadpool. None = auto. Only relevant when threaded is True.
    # There may be many pools, so number of actual threads can be a lot higher.
    max_threadpool_size: Optional[int] = 1,
    # Algorithm
    algorithm: Algorithm = Algorithm.AUTO,
    # An additional 'where' expression to restrict the search space.
    where: Optional[str] = None,
    # Into how many segments to bisect per iteration (hashdiff only)
    bisection_factor: int = DEFAULT_BISECTION_FACTOR,
    # When should we stop bisecting and compare locally (in row count; hashdiff only)
    bisection_threshold: int = DEFAULT_BISECTION_THRESHOLD,
    # Enable/disable validating that the key columns are unique. (joindiff only)
    validate_unique_key: bool = True,
    # Enable/disable sampling of exclusive rows. Creates a temporary table. (joindiff only)
    sample_exclusive_rows: bool = False,
    # Path of new table to write diff results to. Disabled if not provided. (joindiff only)
    materialize_to_table: Optional[Union[str, DbPath]] = None,
    # Materialize every row, not just those that are different. (joindiff only)
    materialize_all_rows: bool = False,
    # Maximum number of rows to write when materializing, per thread. (joindiff only)
    table_write_limit: int = TABLE_WRITE_LIMIT,
    # Skips diffing any rows with null keys. (joindiff only)
    skip_null_keys: bool = False,
    # Type check (hashdiff only)
    strict: bool = True,
    # Maximum number of diffs per column (hashdiff only)
    per_column_diff_limit: int = DEFAULT_PER_COLUMN_DIFF_LIMIT,
    # Maximum number of rows to download (hashdiff only)
    egress_limit: int = DEFAULT_ENGRESS_LIMIT,
    # Timeout limit in minutes (hashdiff only)
    # (used for diffing large tables, to avoid long-running queries)
    timeout_limit: int = DEAFULT_TIMEOUT,
    # Forwarded to HashDiffer as-is (hashdiff only)
    in_memory_diff: bool = False,
) -> Iterator:
    """Finds the diff between table1 and table2.

    Parameters:
        key_columns (Tuple[str, ...]): Name of the key column, which uniquely identifies each row (usually id).
            A single string is wrapped into a one-element tuple.
        update_column (str, optional): Name of updated column, which signals that rows changed.
                                       Usually updated_at or last_update. Used by `min_update` and `max_update`.
        extra_columns (Tuple[str, ...], optional): Extra columns to compare
        min_key (:data:`Vector`, optional): Lowest key value, used to restrict the segment
        max_key (:data:`Vector`, optional): Highest key value, used to restrict the segment
        min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment
        max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment
        threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads.
        max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto.
                                   Only relevant when `threaded` is ``True``.
                                   There may be many pools, so number of actual threads can be a lot higher.
        where (str, optional): An additional 'where' expression to restrict the search space.
        algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`. Default=`AUTO`).
            With `AUTO`, `JOINDIFF` is chosen when both tables share the same database object,
            otherwise `HASHDIFF`.
        bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`)
        bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download
                                      and compare locally. (Used when algorithm is `HASHDIFF`).
        validate_unique_key (bool): Enable/disable validating that the key columns are unique. (used for `JOINDIFF`. default: True)
                                    Single query, and can't be threaded, so it's very slow on non-cloud dbs.
                                    Future versions will detect UNIQUE constraints in the schema.
        sample_exclusive_rows (bool): Enable/disable sampling of exclusive rows. Creates a temporary table. (used for `JOINDIFF`. default: False)
        materialize_to_table (Union[str, DbPath], optional): Path of new table to write diff results to. Disabled if not provided. Used for `JOINDIFF`.
            A string is expanded with ``eval_name_template`` and parsed with the dialect of ``table1``'s database.
        materialize_all_rows (bool): Materialize every row, not just those that are different. (used for `JOINDIFF`. default: False)
        table_write_limit (int): Maximum number of rows to write when materializing, per thread.
        skip_null_keys (bool): Skips diffing any rows with null PKs (displays a warning if any are null) (used for `JOINDIFF`. default: False)
        strict (bool): Type check. (Passed to :class:`HashDiffer`; used for `HASHDIFF`. default: True)
        per_column_diff_limit (int): Maximum number of diffs per column. (Passed to :class:`HashDiffer`; used for `HASHDIFF`)
        egress_limit (int): Maximum number of rows to download. (Passed to :class:`HashDiffer`; used for `HASHDIFF`)
        timeout_limit (int): Timeout limit in minutes, used for diffing large tables to avoid
                             long-running queries. (Passed to :class:`HashDiffer`; used for `HASHDIFF`)
        in_memory_diff (bool): Passed to :class:`HashDiffer` unchanged. (used for `HASHDIFF`. default: False)

    Returns:
        Whatever the selected differ's ``diff_tables`` returns for the two segments.

    Raises:
        ValueError: If the resolved algorithm is neither `HASHDIFF` nor `JOINDIFF`.

    Note:
        The following parameters are used to override the corresponding attributes of the given :class:`TableSegment` instances:
        `key_columns`, `update_column`, `extra_columns`, `min_key`, `max_key`, `min_update`, `max_update`, `where`.
        Only values that are not ``None`` are applied.
        If different values are needed per table, it's possible to omit them here, and instead set
        them directly when creating each :class:`TableSegment`.

    Example:
        >>> table1 = connect_to_table('postgresql:///', 'Rating', 'id')
        >>> list(diff_tables(table1, table1))
        []

    See Also:
        :class:`TableSegment`
        :class:`HashDiffer`
        :class:`JoinDiffer`

    """
    if isinstance(key_columns, str):
        key_columns = (key_columns,)

    tables = [table1, table2]
    # Only explicitly provided (non-None) values override the segments' own attributes.
    override_attrs = {
        k: v
        for k, v in dict(
            key_columns=key_columns,
            update_column=update_column,
            extra_columns=extra_columns,
            min_key=min_key,
            max_key=max_key,
            min_update=min_update,
            max_update=max_update,
            where=where,
        ).items()
        if v is not None
    }

    # Without overrides the original segments are diffed as-is (no copies made).
    segments = [t.new(**override_attrs) for t in tables] if override_attrs else tables

    algorithm = Algorithm(algorithm)
    if algorithm == Algorithm.AUTO:
        # Same database object on both sides -> JOINDIFF, otherwise HASHDIFF.
        algorithm = Algorithm.JOINDIFF if table1.database is table2.database else Algorithm.HASHDIFF

    if algorithm == Algorithm.HASHDIFF:
        # NOTE(review): the row counts are taken from table1/table2 as passed in,
        # not from `segments`, so overrides given here (e.g. `where`, `min_key`)
        # are not reflected in them -- confirm this is intended.
        differ = HashDiffer(
            bisection_factor=bisection_factor,
            bisection_threshold=bisection_threshold,
            threaded=threaded,
            max_threadpool_size=max_threadpool_size,
            strict=strict,
            t1_row_count=table1.count(),
            t2_row_count=table2.count(),
            per_column_diff_limit=per_column_diff_limit,
            egress_limit=egress_limit,
            timeout_limit=timeout_limit,
            in_memory_diff=in_memory_diff,
        )
    elif algorithm == Algorithm.JOINDIFF:
        if isinstance(materialize_to_table, str):
            # Expand the name template first, then turn the name into a table path.
            table_name = eval_name_template(materialize_to_table)
            materialize_to_table = table1.database.dialect.parse_table_name(table_name)
        differ = JoinDiffer(
            threaded=threaded,
            max_threadpool_size=max_threadpool_size,
            validate_unique_key=validate_unique_key,
            sample_exclusive_rows=sample_exclusive_rows,
            materialize_to_table=materialize_to_table,
            materialize_all_rows=materialize_all_rows,
            table_write_limit=table_write_limit,
            skip_null_keys=skip_null_keys,
        )
    else:
        raise ValueError(f"Unknown algorithm: {algorithm}")

    return differ.diff_tables(*segments)