dcs-sdk 1.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,323 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import logging
16
+ import weakref
17
+ from contextlib import suppress
18
+ from itertools import zip_longest
19
+ from typing import Dict, Hashable, MutableMapping, Optional, Type, Union
20
+
21
+ import attrs
22
+ import dsnparse
23
+ import toml
24
+ from typing_extensions import Self
25
+
26
+ from data_diff.databases.base import Database, ThreadedDatabase
27
+ from data_diff.databases.bigquery import BigQuery
28
+ from data_diff.databases.clickhouse import Clickhouse
29
+ from data_diff.databases.databricks import Databricks
30
+ from data_diff.databases.duckdb import DuckDB
31
+ from data_diff.databases.mssql import MsSQL
32
+ from data_diff.databases.mysql import MySQL
33
+ from data_diff.databases.oracle import Oracle
34
+ from data_diff.databases.postgresql import PostgreSQL
35
+ from data_diff.databases.presto import Presto
36
+ from data_diff.databases.redshift import Redshift
37
+ from data_diff.databases.snowflake import Snowflake
38
+ from data_diff.databases.sybase import Sybase
39
+ from data_diff.databases.trino import Trino
40
+ from data_diff.databases.vertica import Vertica
41
+
42
+
43
@attrs.frozen
class MatchUriPath:
    """Maps the path segments and query string of a parsed URI onto the
    connection parameters declared by a database class.

    The database class is expected to expose ``CONNECT_URI_HELP``,
    ``CONNECT_URI_PARAMS`` and ``CONNECT_URI_KWPARAMS``.
    """

    # Database class whose CONNECT_URI_* attributes drive the matching.
    database_cls: Type[Database]

    def match_path(self, dsn):
        """Build the connection-parameter dict for ``dsn``.

        Positional parameters (``CONNECT_URI_PARAMS``) are taken from the URI
        path segments in order, falling back to a query parameter of the same
        name. A parameter name ending in ``?`` is optional and is stored as
        ``None`` when absent. Keyword parameters (``CONNECT_URI_KWPARAMS``)
        must be present, and non-empty, in the query string. Any remaining
        query parameters are passed through unchanged.

        Parameters:
            dsn: Parsed URI object providing ``query`` and ``paths``.

        Returns:
            dict: Parameter name -> value.

        Raises:
            ValueError: If the path has too many segments, a required
                parameter is missing or empty, or a query parameter repeats
                one that was already given positionally.
        """
        help_str = self.database_cls.CONNECT_URI_HELP
        params = self.database_cls.CONNECT_URI_PARAMS
        kwparams = self.database_cls.CONNECT_URI_KWPARAMS

        # Work on a copy so consumed query parameters can be popped.
        dsn_dict = dict(dsn.query)
        matches = {}
        # zip_longest pads the shorter side with None: a None param means an
        # extra path segment, a None arg means the path did not supply it.
        for param, arg in zip_longest(params, dsn.paths):
            if param is None:
                raise ValueError(f"Too many parts to path. Expected format: {help_str}")

            optional = param.endswith("?")
            param = param.rstrip("?")

            if arg is None:
                try:
                    arg = dsn_dict.pop(param)
                except KeyError:
                    if not optional:
                        raise ValueError(f"URI must specify '{param}'. Expected format: {help_str}") from None

                    arg = None

            # Invariant of the database class declaration, not of user input.
            assert param and param not in matches
            matches[param] = arg

        for param in kwparams:
            try:
                arg = dsn_dict.pop(param)
            except KeyError:
                raise ValueError(f"URI must specify '{param}'. Expected format: {help_str}") from None

            # The value comes from the user's URI, so reject an empty one with
            # a real exception instead of an assert (asserts vanish under -O).
            if not arg:
                raise ValueError(f"URI must specify a non-empty '{param}'. Expected format: {help_str}")

            assert param and param not in matches, (param, arg, matches.keys())
            matches[param] = arg

        # Whatever is left in the query string is forwarded as-is.
        for param, value in dsn_dict.items():
            if param in matches:
                raise ValueError(
                    f"Parameter '{param}' already provided as positional argument. Expected format: {help_str}"
                )

            matches[param] = value

        return matches
91
+
92
+
93
# Default registry: URI scheme / driver name -> database class handling it.
# Insertion order is kept as declared.
DATABASE_BY_SCHEME = dict(
    postgres=PostgreSQL,
    mysql=MySQL,
    oracle=Oracle,
    redshift=Redshift,
    snowflake=Snowflake,
    presto=Presto,
    bigquery=BigQuery,
    databricks=Databricks,
    duckdb=DuckDB,
    trino=Trino,
    clickhouse=Clickhouse,
    vertica=Vertica,
    mssql=MsSQL,
    sybase=Sybase,
)
109
+
110
+
111
@attrs.define(frozen=False, init=False)
class Connect:
    """Provides methods for connecting to a supported database using a URL or connection dict."""

    # Maps a URI scheme / driver name to the database class that handles it.
    database_by_scheme: Dict[str, Type[Database]]
    # Connections created with shared=True, held through weak references.
    conn_cache: MutableMapping[Hashable, Database]

    def __init__(self, database_by_scheme: Dict[str, Type[Database]] = DATABASE_BY_SCHEME) -> None:
        """Create a connector resolving schemes through ``database_by_scheme``."""
        super().__init__()
        self.database_by_scheme = database_by_scheme
        self.conn_cache = weakref.WeakValueDictionary()

    def for_databases(self, *dbs) -> Self:
        """Return a new connector of the same type restricted to the schemes named in ``dbs``."""
        database_by_scheme = {k: db for k, db in self.database_by_scheme.items() if k in dbs}
        return type(self)(database_by_scheme)

    def connect_to_uri(self, db_uri: str, thread_count: Optional[int] = 1, **kwargs) -> Database:
        """Connect to the given database uri

        thread_count determines the max number of worker threads per database,
        if relevant. None means no limit.

        Parameters:
            db_uri (str): The URI for the database to connect
            thread_count (int, optional): Size of the threadpool. Only passed to
                ThreadedDatabase subclasses. (default: 1)
            **kwargs: Forwarded to the database class constructor.

        Note: For non-cloud databases, a low thread-pool size may be a performance bottleneck.

        Raises:
            NotImplementedError: If the URI has several schemes or an unregistered one.
            ValueError: If the URI is malformed for its scheme, or a ``toml://``
                URI does not name a configured database.

        Supported schemes (with the default registry):
        - postgres
        - mysql
        - oracle
        - snowflake
        - bigquery
        - redshift
        - presto
        - databricks
        - trino
        - clickhouse
        - vertica
        - duckdb
        - mssql
        - sybase
        - toml (``toml://path#database``; loads a connection dict from a TOML file)
        """

        dsn = dsnparse.parse(db_uri)
        if len(dsn.schemes) > 1:
            raise NotImplementedError("No support for multiple schemes")
        (scheme,) = dsn.schemes

        if scheme == "toml":
            toml_path = dsn.path or dsn.host
            database = dsn.fragment
            if not database:
                raise ValueError("Must specify a database name, e.g. 'toml://path#database'. ")
            # TOML documents are UTF-8; don't depend on the locale's default encoding.
            with open(toml_path, encoding="utf-8") as f:
                config = toml.load(f)
            try:
                conn_dict = config["database"][database]
            except KeyError:
                raise ValueError(f"Cannot find database config named '{database}'.") from None
            return self.connect_with_dict(conn_dict, thread_count, **kwargs)

        try:
            cls = self.database_by_scheme[scheme]
        except KeyError:
            raise NotImplementedError(f"Scheme '{scheme}' currently not supported") from None

        if scheme == "databricks":
            # The access token travels in the password slot; a user name is not accepted.
            if dsn.user:
                raise ValueError(
                    "Databricks URI must not specify a user. "
                    "Expected format: databricks://:<access_token>@<server_hostname>/<http_path>"
                )
            kw = {}
            kw["access_token"] = dsn.password
            kw["http_path"] = dsn.path
            kw["server_hostname"] = dsn.host
            kw.update(dsn.query)
        elif scheme == "duckdb":
            kw = {}
            kw["filepath"] = dsn.dbname
            kw["dbname"] = dsn.user
        else:
            matcher = MatchUriPath(cls)
            kw = matcher.match_path(dsn)

        if scheme == "bigquery":
            kw["project"] = dsn.host
            # NOTE(review): this early return skips thread_count handling and the
            # _connection_created hook, and does not populate host/user/password.
            return cls(**kw, **kwargs)

        if scheme == "snowflake":
            kw["account"] = dsn.host
            if dsn.port:
                raise ValueError("Snowflake URI must not specify a port.")
            kw["user"] = dsn.user
            kw["password"] = dsn.password
        else:
            if scheme == "oracle":
                # Oracle receives dsn.hostloc as its host and no separate port
                # (presumably hostloc already carries the port -- confirm against dsnparse).
                kw["host"] = dsn.hostloc
            else:
                kw["host"] = dsn.host
                kw["port"] = dsn.port
            kw["user"] = dsn.user
            if dsn.password:
                kw["password"] = dsn.password

        # Drop unset values so constructor defaults apply.
        kw = {k: v for k, v in kw.items() if v is not None}

        if isinstance(cls, type) and issubclass(cls, ThreadedDatabase):
            db = cls(thread_count=thread_count, **kw, **kwargs)
        else:
            db = cls(**kw, **kwargs)

        return self._connection_created(db)

    def connect_with_dict(self, d, thread_count, **kwargs):
        """Connect using a configuration mapping.

        Parameters:
            d: Mapping with a ``driver`` key naming a registered scheme; every
                other item is passed to the database class constructor.
            thread_count: Passed to ThreadedDatabase subclasses only.
            **kwargs: Forwarded to the database class constructor.

        Raises:
            KeyError: If ``d`` has no ``driver`` key.
            NotImplementedError: If the driver is not registered.
        """
        d = dict(d)  # copy: the caller's mapping must keep its 'driver' entry
        driver = d.pop("driver")
        try:
            cls = self.database_by_scheme[driver]
        except KeyError:
            raise NotImplementedError(f"Driver '{driver}' currently not supported") from None

        # Same guard as connect_to_uri: issubclass() raises TypeError on non-classes.
        if isinstance(cls, type) and issubclass(cls, ThreadedDatabase):
            db = cls(thread_count=thread_count, **d, **kwargs)
        else:
            db = cls(**d, **kwargs)

        return self._connection_created(db)

    def _connection_created(self, db):
        "Nop function to be overridden by subclasses."
        return db

    def __call__(
        self, db_conf: Union[str, dict], thread_count: Optional[int] = 1, shared: bool = True, **kwargs
    ) -> Database:
        """Connect to a database using the given database configuration.

        Configuration can be given either as a URI string, or as a dict of {option: value}.

        The dictionary configuration uses the same keys as the TOML 'database' definition given with --conf.

        thread_count determines the max number of worker threads per database,
        if relevant. None means no limit.

        Parameters:
            db_conf (str | dict): The configuration for the database to connect. URI or dict.
            thread_count (int, optional): Size of the threadpool. Only passed to
                ThreadedDatabase subclasses. (default: 1)
            shared (bool): Whether to cache and return the same connection for the same db_conf.
                The cache is keyed on db_conf only; a configuration that cannot be
                hashed is never cached. (default: True)
            bigquery_credentials (google.oauth2.credentials.Credentials): Custom Google oAuth2 credential for BigQuery.
                                                                          (default: None)

        Note: For non-cloud databases, a low thread-pool size may be a performance bottleneck.

        Raises:
            TypeError: If db_conf is neither a string nor a dict.

        Supported drivers (with the default registry):
        - postgres
        - mysql
        - oracle
        - snowflake
        - bigquery
        - redshift
        - presto
        - databricks
        - trino
        - clickhouse
        - vertica
        - duckdb
        - mssql
        - sybase

        Example:
            >>> connect("mysql://localhost/db")
            <data_diff.databases.mysql.MySQL object at ...>
            >>> connect({"driver": "mysql", "host": "localhost", "database": "db"})
            <data_diff.databases.mysql.MySQL object at ...>
        """
        # A key of None means "do not use the cache for this call".
        cache_key = self.__make_cache_key(db_conf) if shared else None
        if cache_key is not None:
            with suppress(KeyError):
                conn = self.conn_cache[cache_key]
                if not conn.is_closed:
                    return conn

        if isinstance(db_conf, str):
            conn = self.connect_to_uri(db_conf, thread_count, **kwargs)
        elif isinstance(db_conf, dict):
            conn = self.connect_with_dict(db_conf, thread_count, **kwargs)
        else:
            raise TypeError(f"db configuration must be a URI string or a dictionary. Instead got '{db_conf}'.")

        if cache_key is not None:
            self.conn_cache[cache_key] = conn
        return conn

    def __make_cache_key(self, db_conf: Union[str, dict]) -> Optional[Hashable]:
        """Return a hashable cache key for ``db_conf``, or None when it cannot be hashed."""
        key = tuple(db_conf.items()) if isinstance(db_conf, dict) else db_conf
        try:
            # A dict holding e.g. a list or nested dict yields an unhashable tuple;
            # such a configuration is simply not cached instead of failing the lookup.
            hash(key)
        except TypeError:
            return None
        return key
303
+
304
+
305
@attrs.define(frozen=False, init=False)
class Connect_SetUTC(Connect):
    """Connector for supported databases, by URL or connection dict.

    After a connection is created, tries to switch its session timezone to UTC.
    """

    def _connection_created(self, db):
        """Run the parent hook, then issue the dialect's set-timezone-to-UTC statement.

        A dialect raising NotImplementedError is tolerated: the situation is
        logged at debug level and the connection is returned unchanged.
        """
        db = super()._connection_created(db)
        try:
            utc_statement = db.dialect.set_timezone_to_utc()
            db.query(utc_statement)
        except NotImplementedError:
            logging.debug(
                f"Database '{db}' does not allow setting timezone. We recommend making sure it's set to 'UTC'."
            )
        return db
321
+
322
+
323
# Module-level connector: resolves schemes through DATABASE_BY_SCHEME and, being a
# Connect_SetUTC, tries to set each new session's timezone to UTC.
connect = Connect_SetUTC(database_by_scheme=DATABASE_BY_SCHEME)