dcs-sdk 1.7.3__tar.gz → 1.7.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/PKG-INFO +2 -2
  2. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/README.md +1 -1
  3. dcs_sdk-1.7.5/dcs_core/integrations/databases/databricks.py +553 -0
  4. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/__version__.py +1 -1
  5. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/utils/utils.py +24 -12
  6. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/pyproject.toml +1 -1
  7. dcs_sdk-1.7.3/dcs_core/integrations/databases/databricks.py +0 -51
  8. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/__init__.py +0 -0
  9. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/__main__.py +0 -0
  10. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/abcs/__init__.py +0 -0
  11. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/abcs/compiler.py +0 -0
  12. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/abcs/database_types.py +0 -0
  13. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/config.py +0 -0
  14. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/__init__.py +0 -0
  15. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/_connect.py +0 -0
  16. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/base.py +0 -0
  17. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/bigquery.py +0 -0
  18. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/clickhouse.py +0 -0
  19. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/databricks.py +0 -0
  20. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/duckdb.py +0 -0
  21. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/mssql.py +0 -0
  22. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/mysql.py +0 -0
  23. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/oracle.py +0 -0
  24. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/postgresql.py +0 -0
  25. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/presto.py +0 -0
  26. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/redis.py +0 -0
  27. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/redshift.py +0 -0
  28. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/snowflake.py +0 -0
  29. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/sybase.py +0 -0
  30. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/trino.py +0 -0
  31. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/databases/vertica.py +0 -0
  32. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/diff_tables.py +0 -0
  33. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/errors.py +0 -0
  34. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/format.py +0 -0
  35. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/hashdiff_tables.py +0 -0
  36. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/info_tree.py +0 -0
  37. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/joindiff_tables.py +0 -0
  38. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/lexicographic_space.py +0 -0
  39. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/parse_time.py +0 -0
  40. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/py.typed +0 -0
  41. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/queries/__init__.py +0 -0
  42. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/queries/api.py +0 -0
  43. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/queries/ast_classes.py +0 -0
  44. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/queries/base.py +0 -0
  45. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/queries/extras.py +0 -0
  46. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/query_utils.py +0 -0
  47. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/schema.py +0 -0
  48. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/table_segment.py +0 -0
  49. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/thread_utils.py +0 -0
  50. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/utils.py +0 -0
  51. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/data_diff/version.py +0 -0
  52. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/__init__.py +0 -0
  53. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/__main__.py +0 -0
  54. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/__version__.py +0 -0
  55. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/cli/__init__.py +0 -0
  56. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/cli/cli.py +0 -0
  57. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/__init__.py +0 -0
  58. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/common/__init__.py +0 -0
  59. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/common/errors.py +0 -0
  60. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/common/models/__init__.py +0 -0
  61. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/common/models/configuration.py +0 -0
  62. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/common/models/dashboard.py +0 -0
  63. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/common/models/data_source_resource.py +0 -0
  64. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/common/models/metric.py +0 -0
  65. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/common/models/profile.py +0 -0
  66. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/common/models/validation.py +0 -0
  67. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/common/models/widget.py +0 -0
  68. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/configuration/__init__.py +0 -0
  69. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/configuration/config_loader.py +0 -0
  70. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/configuration/configuration_parser.py +0 -0
  71. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/configuration/configuration_parser_arc.py +0 -0
  72. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/datasource/__init__.py +0 -0
  73. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/datasource/base.py +0 -0
  74. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/datasource/file_datasource.py +0 -0
  75. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/datasource/manager.py +0 -0
  76. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/datasource/search_datasource.py +0 -0
  77. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/datasource/sql_datasource.py +0 -0
  78. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/inspect.py +0 -0
  79. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/logger/__init__.py +0 -0
  80. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/logger/base.py +0 -0
  81. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/logger/default_logger.py +0 -0
  82. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/metric/__init__.py +0 -0
  83. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/metric/base.py +0 -0
  84. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/metric/combined_metric.py +0 -0
  85. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/metric/custom_metric.py +0 -0
  86. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/metric/manager.py +0 -0
  87. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/metric/numeric_metric.py +0 -0
  88. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/metric/reliability_metric.py +0 -0
  89. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/profiling/__init__.py +0 -0
  90. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/profiling/datasource_profiling.py +0 -0
  91. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/profiling/numeric_field_profiling.py +0 -0
  92. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/profiling/text_field_profiling.py +0 -0
  93. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/repository/__init__.py +0 -0
  94. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/repository/metric_repository.py +0 -0
  95. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/utils/__init__.py +0 -0
  96. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/utils/log.py +0 -0
  97. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/utils/tracking.py +0 -0
  98. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/utils/utils.py +0 -0
  99. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/validation/__init__.py +0 -0
  100. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/validation/base.py +0 -0
  101. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/validation/completeness_validation.py +0 -0
  102. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/validation/custom_query_validation.py +0 -0
  103. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/validation/manager.py +0 -0
  104. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/validation/numeric_validation.py +0 -0
  105. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/validation/reliability_validation.py +0 -0
  106. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/validation/uniqueness_validation.py +0 -0
  107. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/core/validation/validity_validation.py +0 -0
  108. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/__init__.py +0 -0
  109. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/databases/__init__.py +0 -0
  110. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/databases/azure_blob.py +0 -0
  111. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/databases/bigquery.py +0 -0
  112. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/databases/db2.py +0 -0
  113. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/databases/duck_db.py +0 -0
  114. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/databases/elasticsearch.py +0 -0
  115. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/databases/mssql.py +0 -0
  116. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/databases/mysql.py +0 -0
  117. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/databases/opensearch.py +0 -0
  118. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/databases/oracle.py +0 -0
  119. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/databases/postgres.py +0 -0
  120. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/databases/redshift.py +0 -0
  121. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/databases/snowflake.py +0 -0
  122. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/databases/spark_df.py +0 -0
  123. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/databases/sybase.py +0 -0
  124. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/storage/__init__.py +0 -0
  125. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/storage/local_file.py +0 -0
  126. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/utils/__init__.py +0 -0
  127. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/integrations/utils/utils.py +0 -0
  128. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/report/__init__.py +0 -0
  129. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/report/dashboard.py +0 -0
  130. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/report/models.py +0 -0
  131. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  132. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  133. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  134. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  135. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/report/static/assets/images/docs.svg +0 -0
  136. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/report/static/assets/images/github.svg +0 -0
  137. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/report/static/assets/images/logo.svg +0 -0
  138. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/report/static/assets/images/slack.svg +0 -0
  139. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/report/static/index.js +0 -0
  140. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_core/report/static/index.js.LICENSE.txt +0 -0
  141. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/__init__.py +0 -0
  142. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/__main__.py +0 -0
  143. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/cli/__init__.py +0 -0
  144. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/cli/cli.py +0 -0
  145. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/__init__.py +0 -0
  146. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/config/__init__.py +0 -0
  147. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/config/config_loader.py +0 -0
  148. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/data_diff/__init__.py +0 -0
  149. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/data_diff/data_differ.py +0 -0
  150. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/rules/__init__.py +0 -0
  151. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/rules/rules_mappping.py +0 -0
  152. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/rules/rules_repository.py +0 -0
  153. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/rules/schema_rules.py +0 -0
  154. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/utils/__init__.py +0 -0
  155. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/utils/serializer.py +0 -0
  156. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/utils/similarity_score/__init__.py +0 -0
  157. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/utils/similarity_score/base_provider.py +0 -0
  158. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +0 -0
  159. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +0 -0
  160. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +0 -0
  161. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/utils/table.py +0 -0
  162. {dcs_sdk-1.7.3 → dcs_sdk-1.7.5}/dcs_sdk/sdk/utils/themes.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dcs-sdk
3
- Version: 1.7.3
3
+ Version: 1.7.5
4
4
  Summary: SDK for DataChecks
5
5
  Author: Waterdip Labs
6
6
  Author-email: hello@waterdip.ai
@@ -86,7 +86,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
86
86
  Description-Content-Type: text/markdown
87
87
 
88
88
  <h1 align="center">
89
- DCS SDK v1.7.3
89
+ DCS SDK v1.7.4
90
90
  </h1>
91
91
 
92
92
  > SDK for DataChecks
@@ -1,5 +1,5 @@
1
1
  <h1 align="center">
2
- DCS SDK v1.7.3
2
+ DCS SDK v1.7.4
3
3
  </h1>
4
4
 
5
5
  > SDK for DataChecks
@@ -0,0 +1,553 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import datetime
16
+ import math
17
+ from decimal import Decimal
18
+ from typing import Any, Dict, List, Optional, Tuple
19
+ from uuid import UUID
20
+
21
+ from sqlalchemy import create_engine, text
22
+ from sqlalchemy.engine import URL
23
+
24
+ from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
25
+ from dcs_core.core.common.models.data_source_resource import RawColumnInfo
26
+ from dcs_core.core.datasource.sql_datasource import SQLDataSource
27
+
28
+
29
+ class DatabricksDataSource(SQLDataSource):
30
+ def __init__(self, data_source_name: str, data_connection: Dict):
31
+ super().__init__(data_source_name, data_connection)
32
+
33
+ def connect(self) -> Any:
34
+ """
35
+ Connect to the data source
36
+ """
37
+ try:
38
+ url = URL.create(
39
+ "databricks",
40
+ username="token",
41
+ password=self.data_connection.get("token"),
42
+ host=self.data_connection.get("host"),
43
+ port=self.data_connection.get("port", 443),
44
+ database=self.data_connection.get("schema"),
45
+ query={
46
+ "http_path": self.data_connection.get("http_path"),
47
+ "catalog": self.data_connection.get("catalog"),
48
+ },
49
+ )
50
+ engine = create_engine(url, echo=True)
51
+ self.connection = engine.connect()
52
+ self.schema_name = self.data_connection.get("schema")
53
+ return self.connection
54
+ except Exception as e:
55
+ raise DataChecksDataSourcesConnectionError(
56
+ message=f"Failed to connect to Databricks data source: [{str(e)}]"
57
+ )
58
+
59
+ def quote_column(self, column: str) -> str:
60
+ return f"`{column}`"
61
+
62
+ def quote_database(self, database: str) -> str:
63
+ return f"`{database}`"
64
+
65
+ def qualified_table_name(self, table_name: str) -> str:
66
+ if self.schema_name:
67
+ return f"`{self.schema_name}`.`{table_name}`"
68
+ return f"`{table_name}`"
69
+
70
+ def query_get_database_version(self, database_version_query: Optional[str] = None) -> str:
71
+ """
72
+ Get the database version
73
+ :return: version string
74
+ """
75
+ query = database_version_query or "SELECT version()"
76
+ result = self.fetchone(query)
77
+ if result:
78
+ return result[0]
79
+ return None
80
+
81
+ def query_get_table_indexes(self, table: str, schema: str | None = None) -> dict[str, dict]:
82
+ """
83
+ Get index information for a table.
84
+ For Databricks, this primarily returns Primary Key information as traditional indexes are not exposed identically.
85
+ :param table: Table name
86
+ :param schema: Optional schema name
87
+ :return: Dictionary with index details
88
+ """
89
+ schema = schema or self.schema_name
90
+ database = self.data_connection.get("catalog") or "hive_metastore"
91
+ quoted_database = self.quote_database(database)
92
+
93
+ # Databricks Unity Catalog stores constraints in information_schema
94
+ # We will fetch Primary Key info and structure it as an "index"
95
+ query = f"""
96
+ SELECT
97
+ tc.constraint_name,
98
+ kcu.column_name,
99
+ kcu.ordinal_position
100
+ FROM {quoted_database}.information_schema.table_constraints AS tc
101
+ JOIN {quoted_database}.information_schema.key_column_usage AS kcu
102
+ ON tc.constraint_name = kcu.constraint_name
103
+ AND tc.table_schema = kcu.table_schema
104
+ WHERE tc.table_schema = '{schema}'
105
+ AND tc.table_name = '{table}'
106
+ AND tc.constraint_type = 'PRIMARY KEY'
107
+ ORDER BY kcu.ordinal_position
108
+ """
109
+
110
+ try:
111
+ rows = self.fetchall(query)
112
+ except Exception as e:
113
+ # Fallback or silent failure if table doesn't exist or info schema not accessible
114
+ return {}
115
+
116
+ indexes = {}
117
+ if rows:
118
+ # In Databricks, the PK constraint name acts as the index name for this purpose
119
+ constraint_name = rows[0][0]
120
+ indexes[constraint_name] = {"columns": [], "index_type": "PRIMARY KEY", "is_primary_key": True}
121
+
122
+ for row in rows:
123
+ col_name = row[1]
124
+ ordinal = row[2]
125
+ indexes[constraint_name]["columns"].append({"column_name": col_name, "column_order": ordinal})
126
+
127
+ return indexes
128
+
129
+ def get_table_foreign_key_info(self, table_name: str, schema: str | None = None) -> list[dict]:
130
+ """
131
+ Get foreign key information for a table.
132
+ :param table_name: Table name
133
+ :param schema: Optional schema name
134
+ :return: List of dicts with FK details
135
+ """
136
+ schema = schema or self.schema_name
137
+ database = self.data_connection.get("catalog") or "hive_metastore"
138
+ quoted_database = self.quote_database(database)
139
+
140
+ # Standard ISO SQL query for Foreign Keys using information_schema
141
+ # Works for Unity Catalog
142
+ query = f"""
143
+ SELECT
144
+ tc.constraint_name,
145
+ tc.table_name,
146
+ kcu.column_name AS fk_column,
147
+ rel_kcu.table_name AS referenced_table,
148
+ rel_kcu.column_name AS referenced_column
149
+ FROM {quoted_database}.information_schema.table_constraints tc
150
+ JOIN {quoted_database}.information_schema.key_column_usage kcu
151
+ ON tc.constraint_name = kcu.constraint_name
152
+ AND tc.table_schema = kcu.table_schema
153
+ JOIN {quoted_database}.information_schema.referential_constraints rc
154
+ ON tc.constraint_name = rc.constraint_name
155
+ AND tc.table_schema = rc.constraint_schema
156
+ JOIN {quoted_database}.information_schema.key_column_usage rel_kcu
157
+ ON rc.unique_constraint_name = rel_kcu.constraint_name
158
+ AND rc.unique_constraint_schema = rel_kcu.table_schema
159
+ AND kcu.ordinal_position = rel_kcu.ordinal_position
160
+ WHERE tc.constraint_type = 'FOREIGN KEY'
161
+ AND tc.table_name = '{table_name}'
162
+ AND tc.table_schema = '{schema}'
163
+ """
164
+
165
+ try:
166
+ rows = self.fetchall(query)
167
+ except Exception:
168
+ return []
169
+
170
+ fk_info = []
171
+ for row in rows:
172
+ fk_info.append(
173
+ {
174
+ "constraint_name": row[0],
175
+ "table_name": row[1],
176
+ "fk_column": row[2],
177
+ "referenced_table": row[3],
178
+ "referenced_column": row[4],
179
+ }
180
+ )
181
+
182
+ return fk_info
183
+
184
+ def query_get_table_names(
185
+ self,
186
+ schema: str | None = None,
187
+ with_view: bool = False,
188
+ ) -> dict:
189
+ """
190
+ Get the list of tables in the database.
191
+ :param schema: optional schema name
192
+ :param with_view: whether to include views
193
+ :return: dictionary with table names and optionally view names
194
+ """
195
+ schema = schema or self.schema_name
196
+ database = self.data_connection.get("catalog") or "hive_metastore"
197
+ quoted_database = self.quote_database(database)
198
+
199
+ if with_view:
200
+ table_type_condition = "table_type IN ('MANAGED', 'EXTERNAL', 'VIEW')"
201
+ else:
202
+ table_type_condition = "table_type IN ('MANAGED', 'EXTERNAL')"
203
+
204
+ query = (
205
+ f"SELECT table_name, table_type FROM {quoted_database}.information_schema.tables "
206
+ f"WHERE table_schema = '{schema}' "
207
+ )
208
+ if not with_view:
209
+ query += " AND table_type != 'VIEW'"
210
+
211
+ rows = self.fetchall(query)
212
+
213
+ if with_view:
214
+ result = {"table": [], "view": []}
215
+ if rows:
216
+ for row in rows:
217
+ table_name = row[0]
218
+ table_type = row[1]
219
+ if "VIEW" in table_type:
220
+ result["view"].append(table_name)
221
+ else:
222
+ result["table"].append(table_name)
223
+ else:
224
+ result = {"table": []}
225
+ if rows:
226
+ result["table"] = [row[0] for row in rows]
227
+
228
+ return result
229
+
230
+ def query_get_table_columns(
231
+ self,
232
+ table: str,
233
+ schema: str | None = None,
234
+ ) -> RawColumnInfo:
235
+ """
236
+ Get the schema of a table.
237
+ :param table: table name
238
+ :return: RawColumnInfo object containing column information
239
+ """
240
+ schema = schema or self.schema_name
241
+ database = self.data_connection.get("catalog") or "hive_metastore"
242
+ quoted_database = self.quote_database(database)
243
+
244
+ query = (
245
+ f"SELECT column_name, data_type, datetime_precision, numeric_precision, numeric_scale, "
246
+ f"character_maximum_length "
247
+ f"FROM {quoted_database}.information_schema.columns "
248
+ f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
249
+ )
250
+ rows = self.fetchall(query)
251
+
252
+ if not rows:
253
+ raise RuntimeError(f"{table}: Table, {schema}: Schema, does not exist, or has no columns")
254
+
255
+ column_info = {
256
+ r[0]: RawColumnInfo(
257
+ column_name=self.safe_get(r, 0),
258
+ data_type=self.safe_get(r, 1),
259
+ datetime_precision=self.safe_get(r, 2),
260
+ numeric_precision=self.safe_get(r, 3),
261
+ numeric_scale=self.safe_get(r, 4),
262
+ character_maximum_length=self.safe_get(r, 5),
263
+ )
264
+ for r in rows
265
+ }
266
+ return column_info
267
+
268
+ def fetch_rows(
269
+ self,
270
+ query: str,
271
+ limit: int = 1,
272
+ with_column_names: bool = False,
273
+ complete_query: Optional[str] = None,
274
+ ) -> Tuple[List, Optional[List[str]]]:
275
+ """
276
+ Fetch rows from the database.
277
+
278
+ :param query: SQL query to execute.
279
+ :param limit: Number of rows to fetch.
280
+ :param with_column_names: Whether to include column names in the result.
281
+ :return: Tuple of (rows, column_names or None)
282
+ """
283
+ query = complete_query or f"SELECT * FROM ({query}) AS subquery LIMIT {limit}"
284
+
285
+ result = self.connection.execute(text(query))
286
+ rows = result.fetchmany(limit)
287
+
288
+ if with_column_names:
289
+ column_names = result.keys()
290
+ return rows, list(column_names)
291
+ else:
292
+ return rows, None
293
+
294
+ def fetch_sample_values_from_database(
295
+ self,
296
+ table_name: str,
297
+ column_names: list[str],
298
+ limit: int = 5,
299
+ ) -> List[Tuple]:
300
+ """
301
+ Fetch sample rows for specific columns from the given table.
302
+
303
+ :param table_name: The name of the table.
304
+ :param column_names: List of column names to fetch.
305
+ :param limit: Number of rows to fetch.
306
+ :return: List of row tuples.
307
+ """
308
+ table_name = self.qualified_table_name(table_name)
309
+
310
+ if not column_names:
311
+ raise ValueError("At least one column name must be provided")
312
+
313
+ if len(column_names) == 1 and column_names[0] == "*":
314
+ query = f"SELECT * FROM {table_name} LIMIT {limit}"
315
+ else:
316
+ columns = ", ".join([self.quote_column(col) for col in column_names])
317
+ query = f"SELECT {columns} FROM {table_name} LIMIT {limit}"
318
+
319
+ result = self.connection.execute(text(query))
320
+ column_names = list(result.keys())
321
+ rows = result.fetchall()
322
+ return rows, column_names
323
+
324
+ def build_table_metrics_query(
325
+ self,
326
+ table_name: str,
327
+ column_info: list[dict],
328
+ additional_queries: Optional[List[str]] = None,
329
+ ) -> list[dict]:
330
+ query_parts = []
331
+ if not column_info:
332
+ return []
333
+
334
+ for col in column_info:
335
+ name = col["column_name"]
336
+ dtype = col["data_type"].lower()
337
+ quoted = self.quote_column(name)
338
+
339
+ if dtype in ("string", "varchar"):
340
+ distinct_expr = f"{quoted}"
341
+ else:
342
+ distinct_expr = f"{quoted}"
343
+
344
+ query_parts.append(f"COUNT(DISTINCT {distinct_expr}) AS `{name}_distinct`")
345
+ query_parts.append(f"COUNT(*) - COUNT(DISTINCT {distinct_expr}) AS `{name}_duplicate`")
346
+ query_parts.append(
347
+ f"SUM(CASE WHEN {self.quote_column(name)} IS NULL THEN 1 ELSE 0 END) AS `{name}_is_null`"
348
+ )
349
+
350
+ if dtype in (
351
+ "int",
352
+ "integer",
353
+ "bigint",
354
+ "long",
355
+ "smallint",
356
+ "tinyint",
357
+ "decimal",
358
+ "numeric",
359
+ "float",
360
+ "double",
361
+ ):
362
+ query_parts.append(f"MIN({self.quote_column(name)}) AS `{name}_min`")
363
+ query_parts.append(f"MAX({self.quote_column(name)}) AS `{name}_max`")
364
+ query_parts.append(f"AVG({self.quote_column(name)}) AS `{name}_average`")
365
+
366
+ elif dtype in ("string", "varchar", "char", "text"):
367
+ # Databricks uses length() or char_length()
368
+ query_parts.append(f"MAX(length({self.quote_column(name)})) AS `{name}_max_character_length`")
369
+
370
+ elif dtype in ("timestamp", "date", "boolean"):
371
+ query_parts.append(f"MIN({self.quote_column(name)}) AS `{name}_min`")
372
+ query_parts.append(f"MAX({self.quote_column(name)}) AS `{name}_max`")
373
+
374
+ if additional_queries:
375
+ for queries in additional_queries:
376
+ query_parts.append(queries)
377
+
378
+ qualified_table = self.qualified_table_name(table_name)
379
+ joined_parts = ",\n ".join(query_parts)
380
+ query = f"SELECT\n {joined_parts}\nFROM {qualified_table};"
381
+
382
+ result = self.connection.execute(text(query))
383
+ row = dict(list(result)[0]._mapping)
384
+
385
+ def _normalize_metrics(value):
386
+ """
387
+ Safely normalizes DB metric values into JSON-serializable Python types.
388
+ """
389
+ if value is None:
390
+ return None
391
+
392
+ if isinstance(value, Decimal):
393
+ return float(value)
394
+ if isinstance(value, (int, float, bool)):
395
+ return value
396
+
397
+ if isinstance(value, (datetime.datetime, datetime.date)):
398
+ return value.isoformat()
399
+
400
+ if isinstance(value, UUID):
401
+ return str(value)
402
+
403
+ if isinstance(value, list):
404
+ return [_normalize_metrics(v) for v in value]
405
+ if isinstance(value, dict):
406
+ return {k: _normalize_metrics(v) for k, v in value.items()}
407
+
408
+ return str(value)
409
+
410
+ column_wise = []
411
+ for col in column_info:
412
+ name = col["column_name"]
413
+ col_metrics = {}
414
+
415
+ for key, value in row.items():
416
+ if key.startswith(f"{name}_"):
417
+ metric_name = key[len(name) + 1 :]
418
+ col_metrics[metric_name] = _normalize_metrics(value)
419
+
420
+ column_wise.append({"column_name": name, "metrics": col_metrics})
421
+
422
+ for col_data in column_wise:
423
+ metrics = col_data["metrics"]
424
+ distinct_count = metrics.get("distinct")
425
+ col_name = col_data["column_name"]
426
+ dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
427
+
428
+ quoted = self.quote_column(col_name)
429
+
430
+ is_dtype_numeric = (
431
+ True
432
+ if dtype
433
+ in (
434
+ "int",
435
+ "integer",
436
+ "bigint",
437
+ "long",
438
+ "smallint",
439
+ "tinyint",
440
+ "decimal",
441
+ "numeric",
442
+ "float",
443
+ "double",
444
+ )
445
+ else False
446
+ )
447
+
448
+ if is_dtype_numeric:
449
+ col_min = metrics.get("min")
450
+ col_max = metrics.get("max")
451
+
452
+ if col_min is not None and col_max is not None and col_min != col_max:
453
+ bucket_count = 20
454
+ bucket_size = (col_max - col_min) / bucket_count
455
+
456
+ bucket_queries = []
457
+ for i in range(bucket_count):
458
+ start = col_min + i * bucket_size
459
+ end = col_min + (i + 1) * bucket_size
460
+
461
+ # Databricks SQL syntax for CASE WHEN
462
+ bucket_queries.append(
463
+ f"SUM(CASE WHEN {quoted} >= {start} AND {quoted} < {end} THEN 1 ELSE 0 END) AS `bucket_{i}`"
464
+ )
465
+
466
+ bucket_sql = f"SELECT {', '.join(bucket_queries)} FROM {qualified_table}"
467
+
468
+ try:
469
+ bucket_result = self.connection.execute(text(bucket_sql)).fetchone()
470
+ distribution = []
471
+
472
+ for i in range(bucket_count):
473
+ start_raw = col_min + i * bucket_size
474
+ end_raw = col_min + (i + 1) * bucket_size
475
+ if dtype in ("int", "integer", "bigint", "long", "smallint", "tinyint"):
476
+ start = math.floor(start_raw)
477
+ end = math.ceil(end_raw)
478
+ else:
479
+ start = round(start_raw, 2)
480
+ end = round(end_raw, 2)
481
+
482
+ # Fetch by index or name (sqlalchemy row access)
483
+ count = bucket_result[i]
484
+
485
+ distribution.append(
486
+ {
487
+ "col_val": f"{start} - {end}",
488
+ "count": count,
489
+ }
490
+ )
491
+
492
+ metrics["distribution_graph"] = distribution
493
+
494
+ except Exception as e:
495
+ print(f"Failed to generate numeric distribution for {col_name}: {e}")
496
+
497
+ continue
498
+
499
+ if isinstance(distinct_count, (int, float)) and distinct_count <= 20:
500
+ group_expr = quoted
501
+
502
+ dist_query = (
503
+ f"SELECT {group_expr}, COUNT(*) "
504
+ f"FROM {qualified_table} GROUP BY {group_expr} ORDER BY COUNT(*) DESC"
505
+ )
506
+
507
+ try:
508
+ dist_result = self.connection.execute(text(dist_query)).fetchall()
509
+
510
+ distribution = []
511
+ for r in dist_result:
512
+ val = _normalize_metrics(r[0])
513
+ distribution.append(
514
+ {
515
+ "col_val": val,
516
+ "count": r[1],
517
+ }
518
+ )
519
+
520
+ metrics["distribution_graph"] = distribution
521
+
522
+ except Exception as e:
523
+ print(f"Failed to generate distribution graph for column {col_name}: {e}")
524
+
525
+ for col_data in column_wise:
526
+ metrics = col_data["metrics"]
527
+ # Formatting as per existing pattern
528
+ is_dtype_numeric = (
529
+ True
530
+ if next(c["data_type"].lower() for c in column_info if c["column_name"] == col_data["column_name"])
531
+ in (
532
+ "int",
533
+ "integer",
534
+ "bigint",
535
+ "long",
536
+ "smallint",
537
+ "tinyint",
538
+ "decimal",
539
+ "numeric",
540
+ "float",
541
+ "double",
542
+ )
543
+ else False
544
+ )
545
+
546
+ formatted_metrics_data = {
547
+ "general_data": {key: value for key, value in metrics.items() if key != "distribution_graph"},
548
+ "is_dtype_numeric": is_dtype_numeric,
549
+ "distribution_data": metrics.get("distribution_graph", []),
550
+ }
551
+ col_data["metrics"] = formatted_metrics_data
552
+
553
+ return column_wise
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.7.3"
15
+ __version__ = "1.7.5"
@@ -439,27 +439,39 @@ def apply_custom_masking(
439
439
 
440
440
  # common masking columns
441
441
  for col in common_masking_cols:
442
- src_val = str(source.get(col, ""))
443
- tgt_val = str(target.get(col, ""))
442
+ src_val = source.get(col)
443
+ tgt_val = target.get(col)
444
444
 
445
- src_len, tgt_len = len(src_val), len(tgt_val)
446
- if src_len == tgt_len and src_val != tgt_val:
447
- masked_source[col] = mask_char * (src_len + 1)
448
- masked_target[col] = mask_char * tgt_len
445
+ if src_val is None and tgt_val is None:
446
+ continue
447
+
448
+ src_str = str(src_val) if src_val is not None else ""
449
+ tgt_str = str(tgt_val) if tgt_val is not None else ""
450
+
451
+ src_len, tgt_len = len(src_str), len(tgt_str)
452
+ if src_len == tgt_len and src_str != tgt_str:
453
+ if src_val is not None:
454
+ masked_source[col] = mask_char * (src_len + 1)
455
+ if tgt_val is not None:
456
+ masked_target[col] = mask_char * tgt_len
449
457
  else:
450
- masked_source[col] = mask_char * src_len
451
- masked_target[col] = mask_char * tgt_len
458
+ if src_val is not None:
459
+ masked_source[col] = mask_char * src_len
460
+ if tgt_val is not None:
461
+ masked_target[col] = mask_char * tgt_len
452
462
 
453
463
  # Non-common columns
454
464
  for col in source_masking_cols:
455
465
  if col not in common_masking_cols:
456
- val = source.get(col, "")
457
- masked_source[col] = mask_char * len(val)
466
+ val = source.get(col)
467
+ if val is not None:
468
+ masked_source[col] = mask_char * len(str(val))
458
469
 
459
470
  for col in target_masking_cols:
460
471
  if col not in common_masking_cols:
461
- val = target.get(col, "")
462
- masked_target[col] = mask_char * len(val)
472
+ val = target.get(col)
473
+ if val is not None:
474
+ masked_target[col] = mask_char * len(str(val))
463
475
 
464
476
  return masked_source, masked_target
465
477
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "dcs-sdk"
3
- version = "1.7.3"
3
+ version = "1.7.5"
4
4
  description = "SDK for DataChecks"
5
5
  authors = ["Waterdip Labs <hello@waterdip.ai>"]
6
6
  readme = "README.md"