dcs-sdk 1.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_diff/__init__.py +221 -0
- data_diff/__main__.py +517 -0
- data_diff/abcs/__init__.py +13 -0
- data_diff/abcs/compiler.py +27 -0
- data_diff/abcs/database_types.py +402 -0
- data_diff/config.py +141 -0
- data_diff/databases/__init__.py +38 -0
- data_diff/databases/_connect.py +323 -0
- data_diff/databases/base.py +1417 -0
- data_diff/databases/bigquery.py +376 -0
- data_diff/databases/clickhouse.py +217 -0
- data_diff/databases/databricks.py +262 -0
- data_diff/databases/duckdb.py +207 -0
- data_diff/databases/mssql.py +343 -0
- data_diff/databases/mysql.py +189 -0
- data_diff/databases/oracle.py +238 -0
- data_diff/databases/postgresql.py +293 -0
- data_diff/databases/presto.py +222 -0
- data_diff/databases/redis.py +93 -0
- data_diff/databases/redshift.py +233 -0
- data_diff/databases/snowflake.py +222 -0
- data_diff/databases/sybase.py +720 -0
- data_diff/databases/trino.py +73 -0
- data_diff/databases/vertica.py +174 -0
- data_diff/diff_tables.py +489 -0
- data_diff/errors.py +17 -0
- data_diff/format.py +369 -0
- data_diff/hashdiff_tables.py +1026 -0
- data_diff/info_tree.py +76 -0
- data_diff/joindiff_tables.py +434 -0
- data_diff/lexicographic_space.py +253 -0
- data_diff/parse_time.py +88 -0
- data_diff/py.typed +0 -0
- data_diff/queries/__init__.py +13 -0
- data_diff/queries/api.py +213 -0
- data_diff/queries/ast_classes.py +811 -0
- data_diff/queries/base.py +38 -0
- data_diff/queries/extras.py +43 -0
- data_diff/query_utils.py +70 -0
- data_diff/schema.py +67 -0
- data_diff/table_segment.py +583 -0
- data_diff/thread_utils.py +112 -0
- data_diff/utils.py +1022 -0
- data_diff/version.py +15 -0
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +829 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +482 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__init__.py +13 -0
- dcs_sdk/__main__.py +18 -0
- dcs_sdk/__version__.py +15 -0
- dcs_sdk/cli/__init__.py +13 -0
- dcs_sdk/cli/cli.py +163 -0
- dcs_sdk/sdk/__init__.py +58 -0
- dcs_sdk/sdk/config/__init__.py +13 -0
- dcs_sdk/sdk/config/config_loader.py +491 -0
- dcs_sdk/sdk/data_diff/__init__.py +13 -0
- dcs_sdk/sdk/data_diff/data_differ.py +821 -0
- dcs_sdk/sdk/rules/__init__.py +15 -0
- dcs_sdk/sdk/rules/rules_mappping.py +31 -0
- dcs_sdk/sdk/rules/rules_repository.py +214 -0
- dcs_sdk/sdk/rules/schema_rules.py +65 -0
- dcs_sdk/sdk/utils/__init__.py +13 -0
- dcs_sdk/sdk/utils/serializer.py +25 -0
- dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
- dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
- dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
- dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
- dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
- dcs_sdk/sdk/utils/table.py +475 -0
- dcs_sdk/sdk/utils/themes.py +40 -0
- dcs_sdk/sdk/utils/utils.py +349 -0
- dcs_sdk-1.6.5.dist-info/METADATA +150 -0
- dcs_sdk-1.6.5.dist-info/RECORD +159 -0
- dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
- dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
data_diff/__init__.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from typing import Iterator, Optional, Sequence, Tuple, Union
|
|
16
|
+
|
|
17
|
+
from data_diff.abcs.database_types import DbPath, DbTime
|
|
18
|
+
from data_diff.databases import Database
|
|
19
|
+
from data_diff.databases._connect import connect
|
|
20
|
+
from data_diff.diff_tables import Algorithm
|
|
21
|
+
from data_diff.hashdiff_tables import (
|
|
22
|
+
DEAFULT_TIMEOUT,
|
|
23
|
+
DEFAULT_BISECTION_FACTOR,
|
|
24
|
+
DEFAULT_BISECTION_THRESHOLD,
|
|
25
|
+
DEFAULT_ENGRESS_LIMIT,
|
|
26
|
+
DEFAULT_PER_COLUMN_DIFF_LIMIT,
|
|
27
|
+
HashDiffer,
|
|
28
|
+
)
|
|
29
|
+
from data_diff.joindiff_tables import TABLE_WRITE_LIMIT, JoinDiffer
|
|
30
|
+
from data_diff.table_segment import TableSegment
|
|
31
|
+
from data_diff.utils import Vector, eval_name_template
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def connect_to_table(
    db_info: Union[str, dict],
    table_name: Union[DbPath, str],
    key_columns: Union[str, Sequence[str]] = ("id",),
    thread_count: Optional[int] = 1,
    **kwargs,
) -> TableSegment:
    """Connects to the given database, and creates a TableSegment instance

    Parameters:
        db_info: Either a URI string, or a dict of connection options.
            Dict entries whose value is ``None`` are dropped before connecting.
            The caller's dict is not modified.
        table_name: Name of the table as a string, or a tuple that signifies the path.
            A string is parsed with the connected database's dialect.
        key_columns: Names of the key columns. A single string is treated as
            a one-element tuple.
        thread_count: Number of threads for this connection (only if using a threadpooled db implementation)
        **kwargs: Forwarded unchanged to the :class:`TableSegment` constructor.

    Returns:
        A :class:`TableSegment` bound to the new connection, the parsed table
        path and the given key columns.

    See Also:
        :meth:`connect`
    """
    if isinstance(db_info, dict):
        # Filter into a new dict rather than popping keys in place, so the
        # dict owned by the caller keeps its original contents.
        db_info = {k: v for k, v in db_info.items() if v is not None}
    if isinstance(key_columns, str):
        key_columns = (key_columns,)

    db: Database = connect(db_info, thread_count=thread_count)

    if isinstance(table_name, str):
        table_name = db.dialect.parse_table_name(table_name)

    return TableSegment(db, table_name, key_columns, **kwargs)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def diff_tables(
    table1: TableSegment,
    table2: TableSegment,
    *,
    # Key column name(s) that uniquely identify a row (usually id)
    key_columns: Sequence[str] = None,
    # Column signalling that a row changed (usually updated_at or last_update)
    update_column: str = None,
    # Additional columns to compare
    extra_columns: Tuple[str, ...] = None,
    # Lower/upper bounds on the key columns, restricting the segment
    min_key: Vector = None,
    max_key: Vector = None,
    # Lower/upper bounds on update_column, restricting the segment
    min_update: DbTime = None,
    max_update: DbTime = None,
    # Toggle threaded diffing. Needed to take advantage of database threads.
    threaded: bool = True,
    # Upper bound for each threadpool. None = auto. Only relevant when threaded is True.
    # Several pools may exist, so the real thread count can be much higher.
    max_threadpool_size: Optional[int] = 1,
    # Diffing algorithm
    algorithm: Algorithm = Algorithm.AUTO,
    # Extra 'where' expression that narrows the search space.
    where: str = None,
    # Number of segments to split into per iteration (hashdiff only)
    bisection_factor: int = DEFAULT_BISECTION_FACTOR,
    # Row count below which bisection stops and rows are compared locally (hashdiff only)
    bisection_threshold: int = DEFAULT_BISECTION_THRESHOLD,
    # Toggle validation that the key columns are unique. (joindiff only)
    validate_unique_key: bool = True,
    # Toggle sampling of exclusive rows. Creates a temporary table. (joindiff only)
    sample_exclusive_rows: bool = False,
    # Path of a new table receiving the diff results. Disabled if not provided. (joindiff only)
    materialize_to_table: Union[str, DbPath] = None,
    # Materialize all rows rather than only the differing ones. (joindiff only)
    materialize_all_rows: bool = False,
    # Per-thread cap on rows written when materializing. (joindiff only)
    table_write_limit: int = TABLE_WRITE_LIMIT,
    # Skip rows whose keys are null. (joindiff only)
    skip_null_keys: bool = False,
    # Type check
    strict: bool = True,
    # Cap on the number of diffs per column
    per_column_diff_limit: int = DEFAULT_PER_COLUMN_DIFF_LIMIT,
    # Cap on the number of rows to download
    egress_limit: int = DEFAULT_ENGRESS_LIMIT,
    # Timeout limit in minutes
    # (for diffing large tables, to avoid long-running queries)
    timeout_limit: int = DEAFULT_TIMEOUT,
    in_memory_diff: bool = False,
) -> Iterator:
    """Compute the differences between ``table1`` and ``table2``.

    Any of the segment-level arguments that is not ``None`` is applied to both
    tables via ``TableSegment.new``; a differ matching ``algorithm`` is then
    built and its ``diff_tables`` result is returned.

    Parameters:
        table1 (:class:`TableSegment`): First table to compare.
        table2 (:class:`TableSegment`): Second table to compare.
        key_columns (Tuple[str, ...]): Name of the key column, which uniquely identifies each row (usually id).
                                       A single string is wrapped into a one-element tuple.
        update_column (str, optional): Name of updated column, which signals that rows changed.
                                       Usually updated_at or last_update. Used by `min_update` and `max_update`.
        extra_columns (Tuple[str, ...], optional): Extra columns to compare
        min_key (:data:`Vector`, optional): Lowest key value, used to restrict the segment
        max_key (:data:`Vector`, optional): Highest key value, used to restrict the segment
        min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment
        max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment
        threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads.
        max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto.
                                   Only relevant when `threaded` is ``True``.
                                   There may be many pools, so number of actual threads can be a lot higher.
        algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`. Default=`AUTO`).
                                        `AUTO` resolves to `JOINDIFF` when both tables share the same database
                                        object, and to `HASHDIFF` otherwise.
        where (str, optional): An additional 'where' expression to restrict the search space.
        bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`)
        bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download
                                      and compare locally. (Used when algorithm is `HASHDIFF`).
        validate_unique_key (bool): Enable/disable validating that the key columns are unique. (used for `JOINDIFF`. default: True)
                                    Single query, and can't be threaded, so it's very slow on non-cloud dbs.
                                    Future versions will detect UNIQUE constraints in the schema.
        sample_exclusive_rows (bool): Enable/disable sampling of exclusive rows. Creates a temporary table. (used for `JOINDIFF`. default: False)
        materialize_to_table (Union[str, DbPath], optional): Path of new table to write diff results to. Disabled if not provided. Used for `JOINDIFF`.
                                                             A string is expanded with ``eval_name_template`` and parsed
                                                             with the dialect of ``table1``'s database.
        materialize_all_rows (bool): Materialize every row, not just those that are different. (used for `JOINDIFF`. default: False)
        table_write_limit (int): Maximum number of rows to write when materializing, per thread.
        skip_null_keys (bool): Skips diffing any rows with null PKs (displays a warning if any are null) (used for `JOINDIFF`. default: False)
        strict (bool): Type check. (Passed to the `HASHDIFF` differ only.)
        per_column_diff_limit (int): Maximum number diff per column. (Passed to the `HASHDIFF` differ only.)
        egress_limit (int): Maximum number of rows to download. (Passed to the `HASHDIFF` differ only.)
        timeout_limit (int): Timeout limit in minutes, used for diffing large tables to avoid
                             long-running queries. (Passed to the `HASHDIFF` differ only.)
        in_memory_diff (bool): Passed through to the `HASHDIFF` differ unchanged.

    Returns:
        The result of the selected differ's ``diff_tables`` call on the two segments.

    Raises:
        ValueError: If the resolved algorithm is neither `HASHDIFF` nor `JOINDIFF`.

    Note:
        The following parameters are used to override the corresponding attributes of the given :class:`TableSegment` instances:
        `key_columns`, `update_column`, `extra_columns`, `min_key`, `max_key`, `min_update`, `max_update`, `where`.
        If different values are needed per table, it's possible to omit them here, and instead set
        them directly when creating each :class:`TableSegment`.

    Example:
        >>> table1 = connect_to_table('postgresql:///', 'Rating', 'id')
        >>> list(diff_tables(table1, table1))
        []

    See Also:
        :class:`TableSegment`
        :class:`HashDiffer`
        :class:`JoinDiffer`

    """
    if isinstance(key_columns, str):
        key_columns = (key_columns,)

    # Candidate overrides; only the ones actually supplied (non-None) are applied.
    requested = (
        ("key_columns", key_columns),
        ("update_column", update_column),
        ("extra_columns", extra_columns),
        ("min_key", min_key),
        ("max_key", max_key),
        ("min_update", min_update),
        ("max_update", max_update),
        ("where", where),
    )
    override_attrs = {name: value for name, value in requested if value is not None}

    if override_attrs:
        segments = [table1.new(**override_attrs), table2.new(**override_attrs)]
    else:
        segments = [table1, table2]

    algorithm = Algorithm(algorithm)
    if algorithm == Algorithm.AUTO:
        # Same database object on both sides -> join-based diff; otherwise hash-based diff.
        if table1.database is table2.database:
            algorithm = Algorithm.JOINDIFF
        else:
            algorithm = Algorithm.HASHDIFF

    if algorithm == Algorithm.HASHDIFF:
        # NOTE(review): row counts are taken from table1/table2 as passed in,
        # not from the overridden segments -- confirm this is intended.
        hash_options = dict(
            bisection_factor=bisection_factor,
            bisection_threshold=bisection_threshold,
            threaded=threaded,
            max_threadpool_size=max_threadpool_size,
            strict=strict,
            t1_row_count=table1.count(),
            t2_row_count=table2.count(),
            per_column_diff_limit=per_column_diff_limit,
            egress_limit=egress_limit,
            timeout_limit=timeout_limit,
            in_memory_diff=in_memory_diff,
        )
        return HashDiffer(**hash_options).diff_tables(*segments)

    if algorithm == Algorithm.JOINDIFF:
        if isinstance(materialize_to_table, str):
            # Expand the name template, then turn the string into a table path.
            expanded_name = eval_name_template(materialize_to_table)
            materialize_to_table = table1.database.dialect.parse_table_name(expanded_name)
        join_options = dict(
            threaded=threaded,
            max_threadpool_size=max_threadpool_size,
            validate_unique_key=validate_unique_key,
            sample_exclusive_rows=sample_exclusive_rows,
            materialize_to_table=materialize_to_table,
            materialize_all_rows=materialize_all_rows,
            table_write_limit=table_write_limit,
            skip_null_keys=skip_null_keys,
        )
        return JoinDiffer(**join_options).diff_tables(*segments)

    raise ValueError(f"Unknown algorithm: {algorithm}")
|