dcs-sdk 1.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_diff/__init__.py +221 -0
- data_diff/__main__.py +517 -0
- data_diff/abcs/__init__.py +13 -0
- data_diff/abcs/compiler.py +27 -0
- data_diff/abcs/database_types.py +402 -0
- data_diff/config.py +141 -0
- data_diff/databases/__init__.py +38 -0
- data_diff/databases/_connect.py +323 -0
- data_diff/databases/base.py +1417 -0
- data_diff/databases/bigquery.py +376 -0
- data_diff/databases/clickhouse.py +217 -0
- data_diff/databases/databricks.py +262 -0
- data_diff/databases/duckdb.py +207 -0
- data_diff/databases/mssql.py +343 -0
- data_diff/databases/mysql.py +189 -0
- data_diff/databases/oracle.py +238 -0
- data_diff/databases/postgresql.py +293 -0
- data_diff/databases/presto.py +222 -0
- data_diff/databases/redis.py +93 -0
- data_diff/databases/redshift.py +233 -0
- data_diff/databases/snowflake.py +222 -0
- data_diff/databases/sybase.py +720 -0
- data_diff/databases/trino.py +73 -0
- data_diff/databases/vertica.py +174 -0
- data_diff/diff_tables.py +489 -0
- data_diff/errors.py +17 -0
- data_diff/format.py +369 -0
- data_diff/hashdiff_tables.py +1026 -0
- data_diff/info_tree.py +76 -0
- data_diff/joindiff_tables.py +434 -0
- data_diff/lexicographic_space.py +253 -0
- data_diff/parse_time.py +88 -0
- data_diff/py.typed +0 -0
- data_diff/queries/__init__.py +13 -0
- data_diff/queries/api.py +213 -0
- data_diff/queries/ast_classes.py +811 -0
- data_diff/queries/base.py +38 -0
- data_diff/queries/extras.py +43 -0
- data_diff/query_utils.py +70 -0
- data_diff/schema.py +67 -0
- data_diff/table_segment.py +583 -0
- data_diff/thread_utils.py +112 -0
- data_diff/utils.py +1022 -0
- data_diff/version.py +15 -0
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +829 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +482 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__init__.py +13 -0
- dcs_sdk/__main__.py +18 -0
- dcs_sdk/__version__.py +15 -0
- dcs_sdk/cli/__init__.py +13 -0
- dcs_sdk/cli/cli.py +163 -0
- dcs_sdk/sdk/__init__.py +58 -0
- dcs_sdk/sdk/config/__init__.py +13 -0
- dcs_sdk/sdk/config/config_loader.py +491 -0
- dcs_sdk/sdk/data_diff/__init__.py +13 -0
- dcs_sdk/sdk/data_diff/data_differ.py +821 -0
- dcs_sdk/sdk/rules/__init__.py +15 -0
- dcs_sdk/sdk/rules/rules_mappping.py +31 -0
- dcs_sdk/sdk/rules/rules_repository.py +214 -0
- dcs_sdk/sdk/rules/schema_rules.py +65 -0
- dcs_sdk/sdk/utils/__init__.py +13 -0
- dcs_sdk/sdk/utils/serializer.py +25 -0
- dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
- dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
- dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
- dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
- dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
- dcs_sdk/sdk/utils/table.py +475 -0
- dcs_sdk/sdk/utils/themes.py +40 -0
- dcs_sdk/sdk/utils/utils.py +349 -0
- dcs_sdk-1.6.5.dist-info/METADATA +150 -0
- dcs_sdk-1.6.5.dist-info/RECORD +159 -0
- dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
- dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""Contains the implementation of two classes:
|
|
16
|
+
|
|
17
|
+
- LexicographicSpace - a lexicographic space of arbitrary dimensions.
|
|
18
|
+
- BoundedLexicographicSpace - a lexicographic space, where the lowest point may be non-zero.
|
|
19
|
+
|
|
20
|
+
A lexicographic space is a space of increasing natural values, ordered by lexicographic order.
|
|
21
|
+
Read more: https://mathworld.wolfram.com/LexicographicOrder.html
|
|
22
|
+
|
|
23
|
+
These abstractions were written to support compound keys in the hashdiff algorithm.
|
|
24
|
+
In the hashdiff algorithm, we rely on the order of the column keys, to segment the table correctly.
|
|
25
|
+
SQL orders the columns of tables based on lexicographic ordering.
|
|
26
|
+
Since we need an evenly spaced "range" function over the space, which has arbitrary dimensions, we have
|
|
27
|
+
to implement it ourselves.
|
|
28
|
+
|
|
29
|
+
As a further optimization, since we each time operate on segments of the ordered table, we add support
|
|
30
|
+
for working with a restricted space, which will reduce the likelihood of gaps in our "select", when the
|
|
31
|
+
keys are not evenly distributed.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from random import randint, randrange
|
|
35
|
+
from typing import Tuple
|
|
36
|
+
|
|
37
|
+
import attrs
|
|
38
|
+
|
|
39
|
+
from data_diff.utils import safezip
|
|
40
|
+
|
|
41
|
+
# A point in the space: one integer coordinate per dimension, for any number of dimensions.
# ``Tuple[int]`` would describe a tuple of exactly one element, hence the ellipsis form.
Vector = Tuple[int, ...]
# A difference between two points, laid out per dimension exactly like a Vector.
Interval = Tuple[int, ...]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class Overflow(ValueError):
    """Raised when lexicographic arithmetic is left with a carry past the first dimension."""
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def neg_interval(interval):
    """Return a tuple holding the negation of every component of ``interval``."""
    negated = [-component for component in interval]
    return tuple(negated)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def neg_v(v: Vector):
    """Negate a vector component-wise and return the result as a new tuple."""
    flipped = [-coordinate for coordinate in v]
    return tuple(flipped)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def sub_v(v1: Vector, v2: Vector):
    """Subtract ``v2`` from ``v1`` component by component, without carrying between dimensions."""
    pairs = safezip(v1, v2)
    return tuple(left - right for left, right in pairs)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def add_v(v1: Vector, v2: Vector):
    """Add ``v1`` and ``v2`` component by component, without carrying between dimensions."""
    pairs = safezip(v1, v2)
    return tuple(left + right for left, right in pairs)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def rand_v_in_range(v1: Vector, v2: Vector):
    """Build a random vector, drawing each component with ``irandrange`` from the matching pair of ``v1``/``v2``."""
    components = []
    for low, high in safezip(v1, v2):
        components.append(irandrange(low, high))
    return tuple(components)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def irandrange(start, stop):
    """Like ``randrange(start, stop)``, except that an empty range with ``start == stop`` returns ``start``."""
    # randrange() raises on an empty range, so the degenerate case is answered directly.
    return start if start == stop else randrange(start, stop)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@attrs.define(frozen=True)
class LexicographicSpace:
    """Lexicographic space of arbitrary dimensions.

    All elements must be of the same length as the number of dimensions. (no rpadding)

    Attributes:
        dims: size of each dimension, most significant first. A vector ``v`` is
            inside the space when ``0 <= v[i] < dims[i]`` for every position.
    """

    # Declared as an attrs field so the generated ``__init__(dims)`` populates it.
    # A hand-written ``__init__`` doing ``self.dims = dims`` cannot work here: the
    # class is frozen (plain attribute assignment raises FrozenInstanceError) and
    # slotted (an undeclared attribute has no slot to live in).
    dims: Vector

    def __contains__(self, v: Vector) -> bool:
        """Return True when every component of ``v`` lies in ``[0, dim)`` for its dimension."""
        return all(0 <= i < d for i, d in safezip(v, self.dims))

    def add(self, v1: Vector, v2: Vector) -> Vector:
        """Add two vectors digit by digit, carrying from the last dimension towards the first.

        Raises:
            Overflow: if a carry (or borrow) is left over after the first dimension.
        """
        # assert v1 in self and v2 in self, (v1, v2)

        carry = 0
        res = []
        for i1, i2, d in reversed(list(safezip(v1, v2, self.dims))):
            n = i1 + i2 + carry
            # Floor division gives a negative carry for a negative sum, i.e. a borrow;
            # this is what lets sub() be implemented as add() of a negated vector.
            carry = n // d
            assert carry <= 1
            n %= d
            res.append(n)

        if carry:
            raise Overflow("Overflow")

        # Digits were collected least-significant first; restore the original order.
        new_v = tuple(reversed(res))
        assert new_v in self
        return new_v

    def sub(self, v1: Vector, v2: Vector):
        """Return ``v1 - v2`` in this space, computed as ``add(v1, -v2)``."""
        return self.add(v1, neg_v(v2))

    def _divide(self, v: Vector, count: int):
        """Yield the components of ``v`` divided by ``count``, long-division style."""
        n = 0
        # ``dims[1:] + (1,)`` pairs each component with the size of the NEXT dimension:
        # that is the weight needed to push this component's remainder down one position.
        for x, d in zip(v, self.dims[1:] + (1,), strict=True):
            x += n
            rem = x % count
            n = rem * d
            yield x // count

    def divide(self, v: Vector, count: int) -> Vector:
        """Return ``v`` divided by the integer ``count`` as a tuple (see ``_divide``)."""
        return tuple(self._divide(v, count))

    def range(self, min_value: Vector, max_value: Vector, count: int):
        """Yield ``count`` vectors starting at ``min_value`` and advancing by a constant step.

        The step is ``(max_value - min_value)`` divided by ``count - 1``, so ``count``
        must be at least 2 (``count - 1`` is used as the divisor).
        """
        assert min_value in self and max_value in self
        count -= 1
        size = self.sub(max_value, min_value)
        interval = self.divide(size, count)
        n = min_value
        for i in range(count):
            yield n
            n = self.add(n, interval)
        yield n
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class BoundedLexicographicSpace:
    """Lexicographic space of arbitrary dimensions whose lowest point need not be the origin.

    i.e. a space restricted by a "bounding-box" between two arbitrary points.
    Points are translated into a zero-based ``LexicographicSpace`` (``uspace``) for
    arithmetic and translated back afterwards.
    """

    def __init__(self, min_bound: Vector, max_bound: Vector) -> None:
        super().__init__()

        # Per-dimension extent of the bounding box.
        extents = tuple(upper - lower for lower, upper in safezip(min_bound, max_bound))
        if any(extent < 0 for extent in extents):
            raise ValueError("Error: Negative dimension!")
        if extents[0] <= 0:
            raise ValueError("First dimension must be non-zero!")

        self.min_bound = min_bound
        self.max_bound = max_bound

        self.uspace = LexicographicSpace(extents)

    def __contains__(self, p: Vector) -> bool:
        """Return True when each component of ``p`` lies in ``[min_bound, max_bound)``."""
        for coord, lower, upper in safezip(p, self.min_bound, self.max_bound):
            if not (lower <= coord < upper):
                return False
        return True

    def to_uspace(self, v: Vector) -> Vector:
        """Translate a point of this space into zero-based ``uspace`` coordinates."""
        assert v in self
        shifted = sub_v(v, self.min_bound)
        return shifted

    def from_uspace(self, v: Vector) -> Vector:
        """Translate zero-based ``uspace`` coordinates back into this space."""
        restored = add_v(v, self.min_bound)
        assert restored in self
        return restored

    def add_interval(self, v1: Vector, interval: Interval) -> Vector:
        """Return the point reached by moving ``interval`` forward from ``v1``."""
        origin = self.to_uspace(v1)
        moved = self.uspace.add(origin, interval)
        return self.from_uspace(moved)

    def sub_interval(self, v1: Vector, interval: Interval) -> Vector:
        """Return the point reached by moving ``interval`` backward from ``v1``."""
        origin = self.to_uspace(v1)
        moved = self.uspace.sub(origin, interval)
        return self.from_uspace(moved)

    def sub(self, v1: Vector, v2: Vector) -> Interval:
        """Return the interval ``v1 - v2``, computed in ``uspace``."""
        minuend = self.to_uspace(v1)
        subtrahend = self.to_uspace(v2)
        return self.uspace.sub(minuend, subtrahend)

    def range(self, min_value: Vector, max_value: Vector, count: int):
        """Return a list of ``count`` points produced by ``uspace.range`` between the two values."""
        steps = self.uspace.range(self.to_uspace(min_value), self.to_uspace(max_value), count)
        return list(map(self.from_uspace, steps))
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def test_lex_space():
    """Self-test for LexicographicSpace and BoundedLexicographicSpace.

    Covers add/sub/divide/range on small fixed spaces, then checks on randomly
    generated bounded spaces that ``range()`` produces evenly spaced points.
    """
    # Test add
    # A 4-digit binary space: counting from zero by repeatedly adding one gives 0..15.
    binspace = LexicographicSpace((2, 2, 2, 2))
    zero = (0, 0, 0, 0)
    one = (0, 0, 0, 1)
    bin_nums = [zero]
    for i in range(15):
        last = bin_nums[-1]
        bin_nums.append(binspace.add(last, one))
    five = bin_nums[5]
    seven = bin_nums[7]
    eight = bin_nums[8]
    fifteen = bin_nums[15]

    assert binspace.add(binspace.add(one, five), one) == seven
    assert binspace.add(one, seven) == eight
    assert binspace.add(seven, eight) == fifteen

    assert binspace.sub(eight, one) == seven
    assert binspace.sub(fifteen, seven) == eight

    r = list(binspace.range(one, seven, 4))
    assert r == [one, bin_nums[3], five, seven], r

    # A 3-digit decimal space makes divide() easy to verify by eye: 452 / 2 = 226, 302 / 2 = 151.
    decspace = LexicographicSpace((10, 10, 10))
    assert decspace.divide((4, 5, 2), 2) == (2, 2, 6)
    assert decspace.divide((3, 0, 2), 2) == (1, 5, 1)

    # Restricted space

    # Each dimension spans [2, 8), i.e. 6 values, so adding 6 to the last digit carries into the first.
    rspace1 = BoundedLexicographicSpace((2, 2), (8, 8))
    assert rspace1.add_interval((2, 2), (0, 0)) == (2, 2)
    assert rspace1.add_interval((2, 2), (0, 1)) == (2, 3)
    assert rspace1.add_interval((2, 2), (0, 6)) == (3, 2)
    assert rspace1.add_interval((2, 2), (0, 7)) == (3, 3)
    # space.add((2,2), (6, 0)) # Overflow

    # Each dimension spans [4, 6): the binary space from above, shifted by 4.
    rspace2 = BoundedLexicographicSpace((4, 4, 4, 4), (6, 6, 6, 6))
    _one = (4, 4, 4, 5)
    _three = (4, 4, 5, 5)
    _five = (4, 5, 4, 5)
    _seven = (4, 5, 5, 5)
    assert rspace2.add_interval(rspace2.add_interval(_five, one), one) == _seven
    assert rspace2.sub_interval(rspace2.sub_interval(_seven, one), one) == _five

    r = list(rspace2.range(_one, _seven, 4))
    assert r == [_one, _three, _five, _seven], r

    # Test range -
    # For random bounds and min/max values, assert that range() generates steps with uniform distances
    MAX_COLUMNS = 16
    MAX_DIM = 10000
    MAX_BISECTION = 128

    for n in range(1, MAX_COLUMNS):
        min_bound = tuple(randint(0, MAX_DIM) for i in range(n))
        # Every dimension gets a size of at least 1, so the first dimension is never empty.
        size = tuple(randint(1, MAX_DIM) for i in range(n))
        max_bound = add_v(min_bound, size)

        sp = BoundedLexicographicSpace(min_bound, max_bound)

        # Draw max_value first, then min_value at or below it in every component.
        max_value = rand_v_in_range(min_bound, max_bound)
        min_value = rand_v_in_range(min_bound, max_value)
        for count in range(2, MAX_BISECTION):
            r = sp.range(min_value, max_value, count)
            assert len(r) == count
            diffs = [sp.sub(b, a) for a, b in zip(r[:-1], r[1:])]
            assert len(set(diffs)) == 1  # Uniform!
            # print('.', end='')
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
# Run the module's self-test when the file is executed as a script.
if __name__ == "__main__":
    test_lex_space()
|
data_diff/parse_time.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
from datetime import datetime, timedelta
|
|
17
|
+
from difflib import SequenceMatcher
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ParseError(ValueError):
    """Raised when a time-delta string cannot be parsed."""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Maps every accepted unit spelling (full name or shortcut) to its canonical unit name.
TIME_UNITS = dict(
    seconds="seconds",
    minutes="minutes",
    hours="hours",
    days="days",
    weeks="weeks",
    months="months",
    years="years",
    # Shortcuts
    s="seconds",
    min="minutes",
    h="hours",
    d="days",
    w="weeks",
    mon="months",
    y="years",
)

# Units that parse_time_atom rewrites as a fixed multiple of another unit:
# canonical name -> (multiplier, target unit).
EXTRAPOLATED = {"months": (30, "days"), "years": (365, "days")}
# Every extrapolated unit must itself be an accepted spelling.
assert set(EXTRAPOLATED) <= set(TIME_UNITS)

# One time atom: a run of digits immediately followed by a run of lowercase letters.
TIME_RE = re.compile(r"(\d+)([a-z]+)")

# Comma-separated, sorted list of all accepted spellings; embedded in error messages.
UNITS_STR = ", ".join(sorted(TIME_UNITS.keys()))
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def string_similarity(a: str, b: str) -> float:
    """Return the ``difflib.SequenceMatcher`` similarity ratio of ``a`` and ``b``.

    The result is a float in ``[0.0, 1.0]``; ``1.0`` means the sequences are identical.
    """
    return SequenceMatcher(None, a, b).ratio()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def parse_time_atom(count, unit):
    """Convert one ``(count, unit)`` pair into an integer count and a canonical unit name.

    Parameters:
        count: the amount, as anything ``int()`` accepts (e.g. the digits captured by ``TIME_RE``).
        unit: a unit spelling; must be a key of ``TIME_UNITS``.

    Returns:
        A ``(count, unit)`` tuple with ``unit`` canonicalised. Units listed in
        ``EXTRAPOLATED`` are rewritten to their target unit with ``count``
        multiplied accordingly (months -> 30 days, years -> 365 days).

    Raises:
        ParseError: if ``unit`` is not a recognised spelling. The message suggests
            the most similar known spelling.
    """
    count = int(count)
    try:
        unit = TIME_UNITS[unit]
    except KeyError:
        most_similar = max(TIME_UNITS, key=lambda k: string_similarity(k, unit))
        # ``from None``: the KeyError is an internal lookup detail, so it is kept out
        # of the traceback shown to the user.
        raise ParseError(
            f"'{unit}' is not a recognized time unit. Did you mean '{most_similar}'?" f"\nSupported units: {UNITS_STR}"
        ) from None

    if unit in EXTRAPOLATED:
        mul, unit = EXTRAPOLATED[unit]
        count *= mul
    return count, unit
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def parse_time_delta(t: str) -> timedelta:
    """Parse a compact time-delta string such as ``"2h30min"`` into a ``timedelta``.

    The string is a concatenation of atoms matched by ``TIME_RE`` (digits followed
    by a unit spelling from ``TIME_UNITS``). Months and years are converted to days
    by ``parse_time_atom``.

    Parameters:
        t: the string to parse.

    Returns:
        The ``timedelta`` built from all atoms.

    Raises:
        ParseError: if the string is empty, contains text that is not a valid
            atom, uses an unknown unit, or names the same unit more than once.
    """
    time_dict = {}
    seen_units = set()
    while t:
        m = TIME_RE.match(t)
        if not m:
            raise ParseError(f"Cannot parse '{t}': Not a recognized time delta")
        count, unit = parse_time_atom(*m.groups())

        # Detect duplicates on the unit the user actually wrote, not on the unit it
        # was extrapolated to: "1y6mon" names years and months once each, even though
        # both end up as days. parse_time_atom succeeded, so the spelling is a known key.
        named_unit = TIME_UNITS[m.group(2)]
        if named_unit in seen_units:
            raise ParseError(f"Time unit {named_unit} specified more than once")
        seen_units.add(named_unit)

        # Different written units may map to the same timedelta field (years, months
        # and days all become "days"), so accumulate instead of overwriting.
        time_dict[unit] = time_dict.get(unit, 0) + count
        t = t[m.end() :]

    if not time_dict:
        raise ParseError("No time difference specified")
    return timedelta(**time_dict)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def parse_time_before(time: datetime, delta: str) -> datetime:
    """Return the moment that lies ``delta`` (a time-delta string) before ``time``."""
    offset = parse_time_delta(delta)
    return time - offset
|
data_diff/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
data_diff/queries/api.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from data_diff.queries.ast_classes import *
|
|
16
|
+
from data_diff.queries.base import args_as_tuple
|
|
17
|
+
from data_diff.utils import CaseAwareMapping, CaseSensitiveDict
|
|
18
|
+
|
|
19
|
+
# Module-level ``This()`` instance. The docstring examples in this module use it as
# ``this.<column>`` to refer to a column of the table being queried.
this = This()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def join(*tables: ITable) -> Join:
    """Inner-join a sequence of table expressions.

    When joining, it's recommended to use explicit table names, instead of `this`,
    in order to avoid potential name collisions.

    Example:
        ::

            person = table('person')
            city = table('city')

            name_and_city = (
                join(person, city)
                .on(person['city_id'] == city['id'])
                .select(person['id'], city['name'])
            )
    """
    joined = Join(tables)
    return joined
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def leftjoin(*tables: ITable) -> Join:
    """Build a LEFT join over the given table expressions.

    See Also: ``join()``
    """
    join_kind = "LEFT"
    return Join(tables, join_kind)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def rightjoin(*tables: ITable) -> Join:
    """Build a RIGHT join over the given table expressions.

    See Also: ``join()``
    """
    join_kind = "RIGHT"
    return Join(tables, join_kind)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def outerjoin(*tables: ITable) -> Join:
    """Build a FULL OUTER join over the given table expressions.

    See Also: ``join()``
    """
    join_kind = "FULL OUTER"
    return Join(tables, join_kind)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def cte(expr: Expr, *, name: Optional[str] = None, params: Optional[Sequence[str]] = None) -> Cte:
    """Define a CTE (common table expression) from ``expr``.

    Parameters:
        expr: the expression forming the body of the CTE.
        name: optional name, passed through to ``Cte`` unchanged.
        params: optional sequence of names, passed through to ``Cte`` unchanged.
            NOTE(review): presumably the CTE's column names — confirm against ``Cte``.
    """
    return Cte(expr, name, params)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def table(*path: str, schema: Union[dict, CaseAwareMapping] = None) -> TablePath:
    """Defines a table with a path (dotted name), and optionally a schema.

    Parameters:
        path: the names that make up the path to the table, given either as
            separate arguments or as a single tuple.
        schema: a dictionary of {name: type}. A non-empty plain dict is wrapped
            in ``CaseSensitiveDict``; a ``CaseAwareMapping`` is used as given.

    Raises:
        TypeError: if any element of the path is not a ``str``.
    """
    # A lone tuple argument is taken to be the whole path.
    if len(path) == 1 and isinstance(path[0], tuple):
        path = path[0]

    for part in path:
        if not isinstance(part, str):
            raise TypeError(f"All elements of table path must be of type 'str'. Got: {path}")

    if schema:
        if not isinstance(schema, CaseAwareMapping):
            assert isinstance(schema, dict)
            schema = CaseSensitiveDict(schema)

    return TablePath(path, schema)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def or_(*exprs: Expr) -> Union[BinBoolOp, Expr]:
    """Combine boolean expressions with OR.

    The arguments are first normalised with ``args_as_tuple``; a single resulting
    expression is returned as is, without being wrapped.
    """
    operands = args_as_tuple(exprs)
    return operands[0] if len(operands) == 1 else BinBoolOp("OR", operands)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def and_(*exprs: Expr) -> Union[BinBoolOp, Expr]:
    """Combine boolean expressions with AND.

    The arguments are first normalised with ``args_as_tuple``; a single resulting
    expression is returned as is, without being wrapped.
    """
    operands = args_as_tuple(exprs)
    return operands[0] if len(operands) == 1 else BinBoolOp("AND", operands)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def sum_(expr: Expr) -> Func:
    """Wrap ``expr`` in a call to the SQL ``sum`` function."""
    arguments = [expr]
    return Func("sum", arguments)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def avg(expr: Expr) -> Func:
    """Wrap ``expr`` in a call to the SQL ``avg`` function."""
    arguments = [expr]
    return Func("avg", arguments)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def min_(expr: Expr) -> Func:
    """Wrap ``expr`` in a call to the SQL ``min`` function."""
    arguments = [expr]
    return Func("min", arguments)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def max_(expr: Expr) -> Func:
    """Wrap ``expr`` in a call to the SQL ``max`` function."""
    arguments = [expr]
    return Func("max", arguments)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def exists(expr: Expr) -> Func:
    """Wrap ``expr`` in a call to the SQL ``exists`` function."""
    arguments = [expr]
    return Func("exists", arguments)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def if_(cond: Expr, then: Expr, else_: Optional[Expr] = None) -> CaseWhen:
    """Build a single-branch CASE expression; shorthand for ``when(cond).then(then).else_(else_)``.

    Example:
        ::

            # SELECT CASE WHEN b THEN c ELSE d END FROM foo
            table('foo').select(if_(b, c, d))
    """
    branch = when(cond).then(then)
    return branch.else_(else_)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def when(*when_exprs: Expr) -> QB_When:
    """Begin a CASE expression by opening its first WHEN clause.

    Example:
        ::

            # SELECT CASE
            #   WHEN (type = 'text') THEN text
            #   WHEN (type = 'number') THEN number
            #   ELSE 'unknown type' END
            # FROM foo
            rows = table('foo').select(
                    when(this.type == 'text').then(this.text)
                    .when(this.type == 'number').then(this.number)
                    .else_('unknown type')
                )
    """
    # Start from a CaseWhen with no cases and delegate to its own when().
    empty_case = CaseWhen([])
    return empty_case.when(*when_exprs)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def coalesce(*exprs) -> Func:
    """Build a call to COALESCE over the given expressions (normalised with ``args_as_tuple``)."""
    arguments = args_as_tuple(exprs)
    return Func("COALESCE", arguments)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def insert_rows_in_batches(db, tbl: TablePath, rows, *, columns=None, batch_size=1024 * 8) -> None:
    """Insert ``rows`` into ``tbl``, issuing one ``db.query`` call per batch.

    Parameters:
        db: object whose ``query()`` method executes the insert expression.
        tbl: the target table; ``tbl.insert_rows(batch, columns=columns)`` builds each insert.
        rows: iterable of rows; it is materialised into a list up front.
        columns: passed through to ``tbl.insert_rows`` unchanged.
        batch_size: maximum number of rows per query; must be positive.
    """
    assert batch_size > 0
    rows = list(rows)

    # Walk the list by offset. Re-slicing the remainder (``rows = rows[batch_size:]``)
    # copies every not-yet-inserted row on each iteration, which is quadratic in the
    # number of batches.
    for start in range(0, len(rows), batch_size):
        batch = rows[start : start + batch_size]
        db.query(tbl.insert_rows(batch, columns=columns))
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def current_timestamp() -> CurrentTimestamp:
    """Create the expression for the current timestamp (CURRENT_TIMESTAMP() or NOW())."""
    now_expr = CurrentTimestamp()
    return now_expr
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def code(code: str, **kw: Expr) -> Code:
    """Inline raw SQL code.

    It allows users to use features and syntax that the query builder doesn't yet support.

    It's the user's responsibility to make sure the contents of the string given to `code()` are correct and safe for execution.

    Strings given to `code()` are actually templates, and can embed query expressions given as arguments.

    Parameters:
        code: template string of SQL code. Templated variables are signified with '{var}'.
        kw: optional parameters for SQL template; each keyword supplies the value for the
            template variable of the same name.

    Examples:
        ::

            # SELECT b, <x> FROM tmp WHERE <y>
            table('tmp').select(this.b, code("<x>")).where(code("<y>"))

        ::

            def tablesample(tbl, size):
                return code("SELECT * FROM {tbl} TABLESAMPLE BERNOULLI ({size})", tbl=tbl, size=size)

            nonzero = table('points').where(this.x > 0, this.y > 0)

            # SELECT * FROM points WHERE (x > 0) AND (y > 0) TABLESAMPLE BERNOULLI (10)
            sample_expr = tablesample(nonzero, 10)
    """
    return Code(code, kw)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
# Module-level ``Commit()`` instance exported for callers.
# NOTE(review): presumably a query element that commits the current transaction — confirm against ast_classes.Commit.
commit = Commit()
|