dcs-sdk 1.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,253 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Contains the implementation of two classes:
16
+
17
+ - LexicographicSpace - a lexicographic space of arbitrary dimensions.
18
+ - BoundedLexicographicSpace - a lexicographic space, where the lowest point may be non-zero.
19
+
20
+ A lexicographic space is a space of increasing natural values, ordered by lexicographic order.
21
+ Read more: https://mathworld.wolfram.com/LexicographicOrder.html
22
+
23
+ These abstractions were written to support compound keys in the hashdiff algorithm.
24
+ In the hashdiff algorithm, we rely on the order of the column keys, to segment the table correctly.
25
+ SQL orders the columns of tables based on lexicographic ordering.
26
+ Since we need an evenly spaced "range" function over the space, which has arbitrary dimensions, we have
27
+ to implement it ourselves.
28
+
29
+ As a further optimization, since each time we operate on segments of the ordered table, we add support
30
+ for working with a restricted space, which will reduce the likelihood of gaps in our "select", when the
31
+ keys are not evenly distributed.
32
+ """
33
+
34
+ from random import randint, randrange
35
+ from typing import Tuple
36
+
37
+ import attrs
38
+
39
+ from data_diff.utils import safezip
40
+
41
# A point in a lexicographic space: one integer per dimension, any number of dimensions.
# (`Tuple[int]` would describe a tuple of exactly one element.)
Vector = Tuple[int, ...]
# A distance between two points, laid out per dimension like a Vector.
Interval = Tuple[int, ...]
43
+
44
+
45
class Overflow(ValueError):
    """Raised when vector addition leaves a carry past the first dimension."""
47
+
48
+
49
def neg_interval(interval):
    """Return a tuple holding the negation of every component of `interval`."""
    negated = []
    for component in interval:
        negated.append(-component)
    return tuple(negated)
51
+
52
+
53
def neg_v(v: Vector):
    """Return `v` with the sign of every component flipped, as a tuple."""
    return tuple([-component for component in v])
55
+
56
+
57
def sub_v(v1: Vector, v2: Vector):
    """Subtract `v2` from `v1` component by component (no carrying between dimensions)."""
    pairs = safezip(v1, v2)
    return tuple(left - right for left, right in pairs)
59
+
60
+
61
def add_v(v1: Vector, v2: Vector):
    """Add `v1` and `v2` component by component (no carrying between dimensions)."""
    pairs = safezip(v1, v2)
    return tuple(left + right for left, right in pairs)
63
+
64
+
65
def rand_v_in_range(v1: Vector, v2: Vector):
    """Return a random vector whose i-th component is drawn by `irandrange(v1[i], v2[i])`."""
    bounds = safezip(v1, v2)
    return tuple(irandrange(low, high) for low, high in bounds)
67
+
68
+
69
def irandrange(start, stop):
    """Pick a random integer from ``[start, stop)``, or `start` itself when both bounds are equal.

    Equal bounds are handled separately because ``randrange`` rejects an empty range.
    """
    return start if start == stop else randrange(start, stop)
73
+
74
+
75
@attrs.define(frozen=True)
class LexicographicSpace:
    """Lexicographic space of arbitrary dimensions.

    All elements must be of the same length as the number of dimensions. (no rpadding)

    Attributes:
        dims: size of each dimension, most significant first. A vector ``v`` is
            inside the space when ``0 <= v[i] < dims[i]`` for every ``i``.
    """

    # Declared as an attrs field so the generated ``__init__(dims)`` populates it.
    # A hand-written ``self.dims = dims`` is rejected on a frozen attrs class.
    # Converted to a tuple because _divide() concatenates it with ``(1,)``.
    dims: Vector = attrs.field(converter=tuple)

    def __contains__(self, v: Vector) -> bool:
        """Return True when every component of `v` lies in ``[0, dim)`` of its dimension."""
        return all(0 <= i < d for i, d in safezip(v, self.dims))

    def add(self, v1: Vector, v2: Vector) -> Vector:
        """Add two vectors, carrying from each dimension into the next more significant one.

        The operands are deliberately not required to lie inside the space:
        `sub()` calls this method with a negated vector.

        Raises:
            Overflow: if a carry (or borrow) remains after the first dimension.
        """
        carry = 0
        res = []
        # Walk from the least significant (last) dimension to the first.
        for i1, i2, d in reversed(list(safezip(v1, v2, self.dims))):
            carry, n = divmod(i1 + i2 + carry, d)
            assert carry <= 1
            res.append(n)

        if carry:
            raise Overflow("Overflow")

        new_v = tuple(reversed(res))
        assert new_v in self
        return new_v

    def sub(self, v1: Vector, v2: Vector):
        """Return ``v1 - v2``, computed as ``add(v1, -v2)``."""
        return self.add(v1, neg_v(v2))

    def _divide(self, v: Vector, count: int):
        """Yield the components of ``v`` divided by `count`, first dimension first."""
        n = 0
        for x, d in zip(v, self.dims[1:] + (1,), strict=True):
            x += n
            rem = x % count
            # Carry the remainder into the next dimension, scaled by that dimension's size.
            n = rem * d
            yield x // count

    def divide(self, v: Vector, count: int) -> Vector:
        """Return ``v`` divided by `count` as a tuple; the final remainder is dropped."""
        return tuple(self._divide(v, count))

    def range(self, min_value: Vector, max_value: Vector, count: int):
        """Yield `count` points from `min_value`, advancing by a fixed interval each step.

        The interval is ``(max_value - min_value)`` divided by ``count - 1``.

        Raises:
            ValueError: if `count` is smaller than 2 (there would be no interval to divide by).
        """
        assert min_value in self and max_value in self
        if count < 2:
            raise ValueError(f"count must be at least 2, got {count}")
        count -= 1
        size = self.sub(max_value, min_value)
        interval = self.divide(size, count)
        n = min_value
        for _ in range(count):
            yield n
            n = self.add(n, interval)
        yield n
132
+
133
+
134
class BoundedLexicographicSpace:
    """Lexicographic space of arbitrary dimensions, where the lowest point may be non-zero.

    i.e. a space restricted by a "bounding-box" between two arbitrary points:
    `min_bound` (inclusive) and `max_bound` (exclusive). Arithmetic is delegated to a
    zero-based `LexicographicSpace` (`uspace`) of the same extents.
    """

    def __init__(self, min_bound: Vector, max_bound: Vector) -> None:
        """Build the space spanning `min_bound` to `max_bound`.

        Raises:
            ValueError: if any dimension has a negative extent, or the first extent is not positive.
        """
        super().__init__()

        extents = tuple(high - low for low, high in safezip(min_bound, max_bound))
        if any(extent < 0 for extent in extents):
            raise ValueError("Error: Negative dimension!")
        if extents[0] <= 0:
            raise ValueError("First dimension must be non-zero!")

        self.min_bound = min_bound
        self.max_bound = max_bound

        self.uspace = LexicographicSpace(extents)

    def __contains__(self, p: Vector) -> bool:
        """Return True when every component of `p` lies within its ``[min, max)`` bounds."""
        for value, low, high in safezip(p, self.min_bound, self.max_bound):
            if not (low <= value < high):
                return False
        return True

    def to_uspace(self, v: Vector) -> Vector:
        """Translate a point of this space into zero-based `uspace` coordinates."""
        assert v in self
        shifted = sub_v(v, self.min_bound)
        return shifted

    def from_uspace(self, v: Vector) -> Vector:
        """Translate zero-based `uspace` coordinates back into this space."""
        point = add_v(v, self.min_bound)
        assert point in self
        return point

    def add_interval(self, v1: Vector, interval: Interval) -> Vector:
        """Return the point reached by moving `interval` forward from `v1`."""
        origin = self.to_uspace(v1)
        moved = self.uspace.add(origin, interval)
        return self.from_uspace(moved)

    def sub_interval(self, v1: Vector, interval: Interval) -> Vector:
        """Return the point reached by moving `interval` backward from `v1`."""
        origin = self.to_uspace(v1)
        moved = self.uspace.sub(origin, interval)
        return self.from_uspace(moved)

    def sub(self, v1: Vector, v2: Vector) -> Interval:
        """Return the interval ``v1 - v2`` between two points of this space."""
        left = self.to_uspace(v1)
        right = self.to_uspace(v2)
        return self.uspace.sub(left, right)

    def range(self, min_value: Vector, max_value: Vector, count: int):
        """Return a list of the points produced by `uspace.range` between the two values."""
        low = self.to_uspace(min_value)
        high = self.to_uspace(max_value)
        return [self.from_uspace(step) for step in self.uspace.range(low, high, count)]
179
+
180
+
181
def test_lex_space():
    """Self-test covering LexicographicSpace and BoundedLexicographicSpace."""
    # Test add: count from 0 to 15 in a 4-bit binary space by repeatedly adding one.
    bits = LexicographicSpace((2, 2, 2, 2))
    zero = (0, 0, 0, 0)
    one = (0, 0, 0, 1)
    numbers = [zero]
    for _ in range(15):
        numbers.append(bits.add(numbers[-1], one))
    three, five, seven, eight, fifteen = (numbers[k] for k in (3, 5, 7, 8, 15))

    assert bits.add(bits.add(one, five), one) == seven
    assert bits.add(one, seven) == eight
    assert bits.add(seven, eight) == fifteen

    assert bits.sub(eight, one) == seven
    assert bits.sub(fifteen, seven) == eight

    steps = list(bits.range(one, seven, 4))
    assert steps == [one, three, five, seven], steps

    decimal = LexicographicSpace((10, 10, 10))
    assert decimal.divide((4, 5, 2), 2) == (2, 2, 6)
    assert decimal.divide((3, 0, 2), 2) == (1, 5, 1)

    # Restricted space

    boxed = BoundedLexicographicSpace((2, 2), (8, 8))
    assert boxed.add_interval((2, 2), (0, 0)) == (2, 2)
    assert boxed.add_interval((2, 2), (0, 1)) == (2, 3)
    assert boxed.add_interval((2, 2), (0, 6)) == (3, 2)
    assert boxed.add_interval((2, 2), (0, 7)) == (3, 3)
    # space.add((2,2), (6, 0))  # Overflow

    boxed_bits = BoundedLexicographicSpace((4, 4, 4, 4), (6, 6, 6, 6))
    _one = (4, 4, 4, 5)
    _three = (4, 4, 5, 5)
    _five = (4, 5, 4, 5)
    _seven = (4, 5, 5, 5)
    assert boxed_bits.add_interval(boxed_bits.add_interval(_five, one), one) == _seven
    assert boxed_bits.sub_interval(boxed_bits.sub_interval(_seven, one), one) == _five

    steps = list(boxed_bits.range(_one, _seven, 4))
    assert steps == [_one, _three, _five, _seven], steps

    # Test range -
    # For random bounds and min/max values, assert that range() generates steps with uniform distances
    MAX_COLUMNS = 16
    MAX_DIM = 10000
    MAX_BISECTION = 128

    for ndims in range(1, MAX_COLUMNS):
        min_bound = tuple(randint(0, MAX_DIM) for _ in range(ndims))
        size = tuple(randint(1, MAX_DIM) for _ in range(ndims))
        max_bound = add_v(min_bound, size)

        space = BoundedLexicographicSpace(min_bound, max_bound)

        max_value = rand_v_in_range(min_bound, max_bound)
        min_value = rand_v_in_range(min_bound, max_value)
        for count in range(2, MAX_BISECTION):
            points = space.range(min_value, max_value, count)
            assert len(points) == count
            distances = {space.sub(later, earlier) for earlier, later in zip(points, points[1:])}
            assert len(distances) == 1  # Uniform!
            # print('.', end='')
250
+
251
+
252
# Run the self-test when this module is executed as a script.
if __name__ == "__main__":
    test_lex_space()
@@ -0,0 +1,88 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import re
16
+ from datetime import datetime, timedelta
17
+ from difflib import SequenceMatcher
18
+
19
+
20
class ParseError(ValueError):
    """Raised when a time-delta string cannot be parsed."""
22
+
23
+
24
# Maps every accepted unit spelling (full names and shortcuts) to its canonical unit name.
TIME_UNITS = {
    "seconds": "seconds",
    "minutes": "minutes",
    "hours": "hours",
    "days": "days",
    "weeks": "weeks",
    "months": "months",
    "years": "years",
    # Shortcuts
    "s": "seconds",
    "min": "minutes",
    "h": "hours",
    "d": "days",
    "w": "weeks",
    "mon": "months",
    "y": "years",
}

# Units expressed as a multiple of another unit: canonical name -> (multiplier, target unit).
EXTRAPOLATED = {"months": (30, "days"), "years": (365, "days")}
assert EXTRAPOLATED.keys() <= TIME_UNITS.keys()

# One "<digits><lowercase letters>" atom, e.g. "15min".
TIME_RE = re.compile(r"(\d+)([a-z]+)")

# Comma-separated, sorted list of every accepted unit spelling, used in error messages.
UNITS_STR = ", ".join(sorted(TIME_UNITS))
48
+
49
+
50
def string_similarity(a, b) -> float:
    """Return the `difflib.SequenceMatcher` similarity ratio of `a` and `b`.

    The result is a float between 0 (nothing in common) and 1 (identical sequences).
    """
    return SequenceMatcher(None, a, b).ratio()
52
+
53
+
54
def parse_time_atom(count, unit):
    """Normalize one ``<count><unit>`` pair.

    Parameters:
        count: the amount, as an int or a string of digits.
        unit: any spelling listed in ``TIME_UNITS`` (full name or shortcut).

    Returns:
        A ``(count, unit)`` tuple with the canonical unit name. Units listed in
        ``EXTRAPOLATED`` are converted to their target unit, with the count
        multiplied accordingly.

    Raises:
        ParseError: if `unit` is not a recognized spelling.
    """
    count = int(count)
    try:
        unit = TIME_UNITS[unit]
    except KeyError:
        # Suggest the closest known spelling to the one that was given.
        most_similar = max(TIME_UNITS, key=lambda k: string_similarity(k, unit))
        # `from None`: the KeyError is an internal lookup detail, not useful to the caller.
        raise ParseError(
            f"'{unit}' is not a recognized time unit. Did you mean '{most_similar}'?" f"\nSupported units: {UNITS_STR}"
        ) from None

    if unit in EXTRAPOLATED:
        mul, unit = EXTRAPOLATED[unit]
        count *= mul
    return count, unit
68
+
69
+
70
def parse_time_delta(t: str) -> timedelta:
    """Parse a compact time-delta string such as ``"1d12h"`` into a `timedelta`.

    The string is a sequence of ``<digits><unit>`` atoms with nothing in between.
    Units listed in ``EXTRAPOLATED`` are converted by `parse_time_atom`, and their
    counts are added to any count given directly in the target unit, so ``"1y6mon"``
    and ``"1mon5d"`` are both valid.

    Raises:
        ParseError: if the string is empty, contains text that is not a valid atom,
            uses an unknown unit, or names the same unit more than once.
    """
    time_dict = {}
    seen_units = set()
    while t:
        m = TIME_RE.match(t)
        if not m:
            raise ParseError(f"Cannot parse '{t}': Not a recognized time delta")
        raw_count, raw_unit = m.groups()
        # Raises ParseError for an unknown unit, so the lookup below cannot fail.
        count, unit = parse_time_atom(raw_count, raw_unit)
        # Detect duplicates on the unit the user wrote, not on the extrapolated one;
        # otherwise "months" and "days" would collide after conversion.
        canonical = TIME_UNITS[raw_unit]
        if canonical in seen_units:
            raise ParseError(f"Time unit {canonical} specified more than once")
        seen_units.add(canonical)
        time_dict[unit] = time_dict.get(unit, 0) + count
        t = t[m.end() :]

    if not time_dict:
        raise ParseError("No time difference specified")
    return timedelta(**time_dict)
85
+
86
+
87
def parse_time_before(time: datetime, delta: str) -> datetime:
    """Return the moment that lies `delta` (a time-delta string) before `time`."""
    offset = parse_time_delta(delta)
    return time - offset
data_diff/py.typed ADDED
File without changes
@@ -0,0 +1,13 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
@@ -0,0 +1,213 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from data_diff.queries.ast_classes import *
16
+ from data_diff.queries.base import args_as_tuple
17
+ from data_diff.utils import CaseAwareMapping, CaseSensitiveDict
18
+
19
# Shared `This` instance; the examples in this module use it as `this.<name>` to refer to columns.
this = This()
20
+
21
+
22
def join(*tables: ITable) -> Join:
    """Inner-join a sequence of table expressions.

    When joining, it's recommended to use explicit tables names, instead of `this`, in order to avoid potential name collisions.

    Example:
        ::

            person = table('person')
            city = table('city')

            name_and_city = (
                join(person, city)
                .on(person['city_id'] == city['id'])
                .select(person['id'], city['name'])
            )
    """
    return Join(tables)
40
+
41
+
42
def leftjoin(*tables: ITable) -> Join:
    """Build a LEFT join over a sequence of table expressions.

    See Also: ``join()``
    """
    join_kind = "LEFT"
    return Join(tables, join_kind)
48
+
49
+
50
def rightjoin(*tables: ITable) -> Join:
    """Build a RIGHT join over a sequence of table expressions.

    See Also: ``join()``
    """
    join_kind = "RIGHT"
    return Join(tables, join_kind)
56
+
57
+
58
def outerjoin(*tables: ITable) -> Join:
    """Build a FULL OUTER join over a sequence of table expressions.

    See Also: ``join()``
    """
    join_kind = "FULL OUTER"
    return Join(tables, join_kind)
64
+
65
+
66
def cte(expr: Expr, *, name: Optional[str] = None, params: Optional[Sequence[str]] = None) -> Cte:
    """Define a CTE wrapping `expr`.

    `name` and `params` are optional and are passed to `Cte` unchanged.
    NOTE(review): `params` presumably names the CTE's columns; confirm against `Cte`.
    """
    return Cte(expr, name, params)
69
+
70
+
71
def table(*path: str, schema: Optional[Union[dict, CaseAwareMapping]] = None) -> TablePath:
    """Defines a table with a path (dotted name), and optionally a schema.

    Parameters:
        path: A list of names that make up the path to the table. A single tuple
            of names is also accepted and is unpacked.
        schema: a dictionary of {name: type}. A plain dict is wrapped in a
            ``CaseSensitiveDict``; a ``CaseAwareMapping`` is used as given.

    Raises:
        TypeError: if a path element is not a string, or if `schema` is neither
            a dict nor a ``CaseAwareMapping``.
    """
    if len(path) == 1 and isinstance(path[0], tuple):
        (path,) = path
    if not all(isinstance(i, str) for i in path):
        raise TypeError(f"All elements of table path must be of type 'str'. Got: {path}")
    if schema and not isinstance(schema, CaseAwareMapping):
        # Raise explicitly: an `assert` would be skipped when Python runs with -O.
        if not isinstance(schema, dict):
            raise TypeError(f"schema must be a dict or a CaseAwareMapping. Got: {type(schema).__name__}")
        schema = CaseSensitiveDict(schema)
    return TablePath(path, schema)
86
+
87
+
88
def or_(*exprs: Expr) -> Union[BinBoolOp, Expr]:
    """Combine a sequence of boolean expressions with OR.

    A single expression is returned as-is instead of being wrapped.
    """
    operands = args_as_tuple(exprs)
    return operands[0] if len(operands) == 1 else BinBoolOp("OR", operands)
94
+
95
+
96
def and_(*exprs: Expr) -> Union[BinBoolOp, Expr]:
    """Combine a sequence of boolean expressions with AND.

    A single expression is returned as-is instead of being wrapped.
    """
    operands = args_as_tuple(exprs)
    return operands[0] if len(operands) == 1 else BinBoolOp("AND", operands)
102
+
103
+
104
def sum_(expr: Expr) -> Func:
    """Build a call to SUM(expr)."""
    arguments = [expr]
    return Func("sum", arguments)
107
+
108
+
109
def avg(expr: Expr) -> Func:
    """Build a call to AVG(expr)."""
    arguments = [expr]
    return Func("avg", arguments)
112
+
113
+
114
def min_(expr: Expr) -> Func:
    """Build a call to MIN(expr)."""
    arguments = [expr]
    return Func("min", arguments)
117
+
118
+
119
def max_(expr: Expr) -> Func:
    """Build a call to MAX(expr)."""
    arguments = [expr]
    return Func("max", arguments)
122
+
123
+
124
def exists(expr: Expr) -> Func:
    """Build a call to EXISTS(expr)."""
    arguments = [expr]
    return Func("exists", arguments)
127
+
128
+
129
def if_(cond: Expr, then: Expr, else_: Optional[Expr] = None) -> CaseWhen:
    """Conditional expression; shorthand for ``when(cond).then(then).else_(else_)``.

    Example:
        ::

            # SELECT CASE WHEN b THEN c ELSE d END FROM foo
            table('foo').select(if_(b, c, d))
    """
    branch = when(cond).then(then)
    return branch.else_(else_)
139
+
140
+
141
def when(*when_exprs: Expr) -> QB_When:
    """Begin a when-then expression from an empty ``CaseWhen``.

    Example:
        ::

            # SELECT CASE
            #   WHEN (type = 'text') THEN text
            #   WHEN (type = 'number') THEN number
            #   ELSE 'unknown type' END
            # FROM foo
            rows = table('foo').select(
                    when(this.type == 'text').then(this.text)
                    .when(this.type == 'number').then(this.number)
                    .else_('unknown type')
                )
    """
    empty_case = CaseWhen([])
    return empty_case.when(*when_exprs)
159
+
160
+
161
def coalesce(*exprs) -> Func:
    """Build a call to COALESCE over the given expressions."""
    return Func("COALESCE", args_as_tuple(exprs))
165
+
166
+
167
def insert_rows_in_batches(db, tbl: TablePath, rows, *, columns=None, batch_size=1024 * 8) -> None:
    """Insert `rows` into `tbl`, issuing one ``db.query`` per batch.

    Parameters:
        db: object whose ``query()`` runs the insert expression.
        tbl: table whose ``insert_rows()`` builds the insert expression.
        rows: iterable of rows; it is materialized into a list first.
        columns: passed through to ``tbl.insert_rows`` unchanged.
        batch_size: maximum number of rows per query; must be positive.

    Raises:
        ValueError: if `batch_size` is not positive.
    """
    # Raise explicitly: an `assert` is skipped under -O, and a zero batch size
    # would then never consume any rows.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    rows = list(rows)

    # Slice by offset so the remaining rows are not re-copied on every batch.
    for start in range(0, len(rows), batch_size):
        db.query(tbl.insert_rows(rows[start : start + batch_size], columns=columns))
174
+
175
+
176
def current_timestamp() -> CurrentTimestamp:
    """Return a `CurrentTimestamp` expression, i.e. CURRENT_TIMESTAMP() or NOW()."""
    return CurrentTimestamp()
179
+
180
+
181
def code(code: str, **kw: Expr) -> Code:
    """Inline raw SQL code.

    It allows users to use features and syntax that Sqeleton doesn't yet support.

    It's the user's responsibility to make sure the contents of the string given to `code()` are correct and safe for execution.

    Strings given to `code()` are actually templates, and can embed query expressions given as keyword arguments.

    Parameters:
        code: template string of SQL code. Templated variables are signified with '{var}'.
        kw: optional parameters for SQL template.

    Examples:
        ::

            # SELECT b, <x> FROM tmp WHERE <y>
            table('tmp').select(this.b, code("<x>")).where(code("<y>"))

        ::

            def tablesample(tbl, size):
                return code("SELECT * FROM {tbl} TABLESAMPLE BERNOULLI ({size})", tbl=tbl, size=size)

            nonzero = table('points').where(this.x > 0, this.y > 0)

            # SELECT * FROM points WHERE (x > 0) AND (y > 0) TABLESAMPLE BERNOULLI (10)
            sample_expr = tablesample(nonzero, 10)
    """
    return Code(code, kw)
211
+
212
+
213
# Shared `Commit` instance exported by this module.
commit = Commit()