dcs-sdk 1.6.5__tar.gz → 1.6.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/PKG-INFO +4 -2
  2. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/README.md +1 -1
  3. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/__init__.py +0 -2
  4. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/common/errors.py +18 -0
  5. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/common/models/configuration.py +6 -0
  6. dcs_sdk-1.6.7/dcs_core/core/datasource/file_datasource.py +26 -0
  7. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/datasource/manager.py +15 -0
  8. dcs_sdk-1.6.7/dcs_core/integrations/databases/azure_blob.py +115 -0
  9. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/databases/mssql.py +156 -6
  10. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/databases/postgres.py +90 -2
  11. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/__version__.py +1 -1
  12. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/config/config_loader.py +13 -0
  13. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/data_diff/data_differ.py +59 -12
  14. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/utils/utils.py +136 -1
  15. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/pyproject.toml +3 -1
  16. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/__main__.py +0 -0
  17. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/abcs/__init__.py +0 -0
  18. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/abcs/compiler.py +0 -0
  19. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/abcs/database_types.py +0 -0
  20. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/config.py +0 -0
  21. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/__init__.py +0 -0
  22. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/_connect.py +0 -0
  23. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/base.py +0 -0
  24. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/bigquery.py +0 -0
  25. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/clickhouse.py +0 -0
  26. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/databricks.py +0 -0
  27. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/duckdb.py +0 -0
  28. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/mssql.py +0 -0
  29. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/mysql.py +0 -0
  30. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/oracle.py +0 -0
  31. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/postgresql.py +0 -0
  32. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/presto.py +0 -0
  33. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/redis.py +0 -0
  34. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/redshift.py +0 -0
  35. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/snowflake.py +0 -0
  36. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/sybase.py +0 -0
  37. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/trino.py +0 -0
  38. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/databases/vertica.py +0 -0
  39. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/diff_tables.py +0 -0
  40. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/errors.py +0 -0
  41. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/format.py +0 -0
  42. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/hashdiff_tables.py +0 -0
  43. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/info_tree.py +0 -0
  44. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/joindiff_tables.py +0 -0
  45. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/lexicographic_space.py +0 -0
  46. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/parse_time.py +0 -0
  47. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/py.typed +0 -0
  48. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/queries/__init__.py +0 -0
  49. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/queries/api.py +0 -0
  50. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/queries/ast_classes.py +0 -0
  51. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/queries/base.py +0 -0
  52. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/queries/extras.py +0 -0
  53. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/query_utils.py +0 -0
  54. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/schema.py +0 -0
  55. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/table_segment.py +0 -0
  56. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/thread_utils.py +0 -0
  57. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/utils.py +0 -0
  58. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/data_diff/version.py +0 -0
  59. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/__init__.py +0 -0
  60. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/__main__.py +0 -0
  61. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/__version__.py +0 -0
  62. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/cli/__init__.py +0 -0
  63. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/cli/cli.py +0 -0
  64. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/__init__.py +0 -0
  65. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/common/__init__.py +0 -0
  66. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/common/models/__init__.py +0 -0
  67. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/common/models/dashboard.py +0 -0
  68. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/common/models/data_source_resource.py +0 -0
  69. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/common/models/metric.py +0 -0
  70. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/common/models/profile.py +0 -0
  71. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/common/models/validation.py +0 -0
  72. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/common/models/widget.py +0 -0
  73. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/configuration/__init__.py +0 -0
  74. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/configuration/config_loader.py +0 -0
  75. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/configuration/configuration_parser.py +0 -0
  76. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/configuration/configuration_parser_arc.py +0 -0
  77. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/datasource/__init__.py +0 -0
  78. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/datasource/base.py +0 -0
  79. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/datasource/search_datasource.py +0 -0
  80. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/datasource/sql_datasource.py +0 -0
  81. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/inspect.py +0 -0
  82. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/logger/__init__.py +0 -0
  83. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/logger/base.py +0 -0
  84. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/logger/default_logger.py +0 -0
  85. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/metric/__init__.py +0 -0
  86. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/metric/base.py +0 -0
  87. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/metric/combined_metric.py +0 -0
  88. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/metric/custom_metric.py +0 -0
  89. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/metric/manager.py +0 -0
  90. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/metric/numeric_metric.py +0 -0
  91. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/metric/reliability_metric.py +0 -0
  92. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/profiling/__init__.py +0 -0
  93. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/profiling/datasource_profiling.py +0 -0
  94. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/profiling/numeric_field_profiling.py +0 -0
  95. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/profiling/text_field_profiling.py +0 -0
  96. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/repository/__init__.py +0 -0
  97. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/repository/metric_repository.py +0 -0
  98. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/utils/__init__.py +0 -0
  99. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/utils/log.py +0 -0
  100. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/utils/tracking.py +0 -0
  101. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/utils/utils.py +0 -0
  102. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/validation/__init__.py +0 -0
  103. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/validation/base.py +0 -0
  104. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/validation/completeness_validation.py +0 -0
  105. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/validation/custom_query_validation.py +0 -0
  106. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/validation/manager.py +0 -0
  107. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/validation/numeric_validation.py +0 -0
  108. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/validation/reliability_validation.py +0 -0
  109. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/validation/uniqueness_validation.py +0 -0
  110. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/core/validation/validity_validation.py +0 -0
  111. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/__init__.py +0 -0
  112. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/databases/__init__.py +0 -0
  113. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/databases/bigquery.py +0 -0
  114. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/databases/databricks.py +0 -0
  115. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/databases/db2.py +0 -0
  116. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/databases/elasticsearch.py +0 -0
  117. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/databases/mysql.py +0 -0
  118. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/databases/opensearch.py +0 -0
  119. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/databases/oracle.py +0 -0
  120. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/databases/redshift.py +0 -0
  121. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/databases/snowflake.py +0 -0
  122. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/databases/spark_df.py +0 -0
  123. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/databases/sybase.py +0 -0
  124. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/storage/__init__.py +0 -0
  125. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/storage/local_file.py +0 -0
  126. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/utils/__init__.py +0 -0
  127. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/integrations/utils/utils.py +0 -0
  128. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/report/__init__.py +0 -0
  129. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/report/dashboard.py +0 -0
  130. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/report/models.py +0 -0
  131. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  132. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  133. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  134. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  135. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/report/static/assets/images/docs.svg +0 -0
  136. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/report/static/assets/images/github.svg +0 -0
  137. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/report/static/assets/images/logo.svg +0 -0
  138. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/report/static/assets/images/slack.svg +0 -0
  139. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/report/static/index.js +0 -0
  140. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_core/report/static/index.js.LICENSE.txt +0 -0
  141. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/__init__.py +0 -0
  142. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/__main__.py +0 -0
  143. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/cli/__init__.py +0 -0
  144. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/cli/cli.py +0 -0
  145. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/__init__.py +0 -0
  146. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/config/__init__.py +0 -0
  147. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/data_diff/__init__.py +0 -0
  148. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/rules/__init__.py +0 -0
  149. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/rules/rules_mappping.py +0 -0
  150. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/rules/rules_repository.py +0 -0
  151. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/rules/schema_rules.py +0 -0
  152. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/utils/__init__.py +0 -0
  153. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/utils/serializer.py +0 -0
  154. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/utils/similarity_score/__init__.py +0 -0
  155. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/utils/similarity_score/base_provider.py +0 -0
  156. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +0 -0
  157. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +0 -0
  158. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +0 -0
  159. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/utils/table.py +0 -0
  160. {dcs_sdk-1.6.5 → dcs_sdk-1.6.7}/dcs_sdk/sdk/utils/themes.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dcs-sdk
3
- Version: 1.6.5
3
+ Version: 1.6.7
4
4
  Summary: SDK for DataChecks
5
5
  Author: Waterdip Labs
6
6
  Author-email: hello@waterdip.ai
@@ -30,6 +30,8 @@ Provides-Extra: sybase
30
30
  Provides-Extra: trino
31
31
  Provides-Extra: vertica
32
32
  Requires-Dist: attrs (>=23.1.0)
33
+ Requires-Dist: azure-identity (>=1.25.1,<2.0.0)
34
+ Requires-Dist: azure-storage-blob (>=12.27.1,<13.0.0)
33
35
  Requires-Dist: click (>=8.1)
34
36
  Requires-Dist: clickhouse-driver (>=0.2.9) ; extra == "clickhouse" or extra == "all-dbs"
35
37
  Requires-Dist: cryptography (>=44.0.1) ; extra == "snowflake" or extra == "all-dbs"
@@ -84,7 +86,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
84
86
  Description-Content-Type: text/markdown
85
87
 
86
88
  <h1 align="center">
87
- DCS SDK v1.6.4
89
+ DCS SDK v1.6.7
88
90
  </h1>
89
91
 
90
92
  > SDK for DataChecks
@@ -1,5 +1,5 @@
1
1
  <h1 align="center">
2
- DCS SDK v1.6.4
2
+ DCS SDK v1.6.7
3
3
  </h1>
4
4
 
5
5
  > SDK for DataChecks
@@ -55,9 +55,7 @@ def connect_to_table(
55
55
  db_info.pop(k)
56
56
  if isinstance(key_columns, str):
57
57
  key_columns = (key_columns,)
58
-
59
58
  db: Database = connect(db_info, thread_count=thread_count)
60
-
61
59
  if isinstance(table_name, str):
62
60
  table_name = db.dialect.parse_table_name(table_name)
63
61
 
@@ -16,6 +16,8 @@ ERROR_RUNTIME = "runtime_error"
16
16
  ERROR_CONFIGURATION = "configuration_error"
17
17
  ERROR_DATA_SOURCES_CONNECTION = "data_sources_connection_error"
18
18
  ERROR_METRIC_GENERATION = "metric_generation_error"
19
+ ERROR_FETCHING_TABLE = "table_fetch_error"
20
+ ERROR_FETCHING_COLUMN = "column_fetch_error"
19
21
 
20
22
 
21
23
  class DataChecksRuntimeError(Exception):
@@ -48,3 +50,19 @@ class DataChecksMetricGenerationError(Exception):
48
50
  def __init__(self, message):
49
51
  super().__init__(message)
50
52
  self.error_code = ERROR_METRIC_GENERATION
53
+
54
+
55
class DatachecksTableFetchError(Exception):
    """Signals that the tables of a data source could not be fetched.

    The instance carries an ``error_code`` attribute set to the module-level
    ``ERROR_FETCHING_TABLE`` constant.
    """

    def __init__(self, message):
        # Tag the instance first; the base initializer only stores the message.
        self.error_code = ERROR_FETCHING_TABLE
        super().__init__(message)
61
+
62
+
63
class DatachecksColumnFetchError(Exception):
    """Signals that the columns of a table could not be fetched.

    The instance carries an ``error_code`` attribute set to the module-level
    ``ERROR_FETCHING_COLUMN`` constant.
    """

    def __init__(self, message):
        # Tag the instance first; the base initializer only stores the message.
        self.error_code = ERROR_FETCHING_COLUMN
        super().__init__(message)
@@ -43,6 +43,7 @@ class DataSourceType(str, Enum):
43
43
  ORACLE = "oracle"
44
44
  DB2 = "db2"
45
45
  SYBASE = "sybase"
46
+ AZURE_BLOB = "azure_blob"
46
47
 
47
48
 
48
49
  class DataSourceLanguageSupport(str, Enum):
@@ -85,6 +86,11 @@ class DataSourceConnectionConfiguration:
85
86
  security: Optional[str] = None # IBM DB2 specific configuration
86
87
  protocol: Optional[str] = None # IBM DB2 specific configuration
87
88
  server: Optional[str] = None
89
+ account_name: Optional[str] = None
90
+ container_name: Optional[str] = None
91
+ account_key: Optional[str] = None
92
+ endpoint_suffix: Optional[str] = None
93
+ subfolder_path: Optional[str] = None
88
94
 
89
95
 
90
96
  @dataclass
@@ -0,0 +1,26 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Dict
16
+
17
+ from dcs_core.core.datasource.base import DataSource
18
+
19
+
20
class FileDataSource(DataSource):
    """
    Common parent for data sources that are backed by files.

    The class adds no behaviour of its own: construction is forwarded
    unchanged to ``DataSource``.
    """

    def __init__(self, data_source_name: str, data_connection: Dict):
        # Positional pass-through keeps this independent of the parent's
        # parameter names.
        super().__init__(data_source_name, data_connection)
@@ -11,6 +11,20 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
15
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
16
+ #
17
+ # Licensed under the Apache License, Version 2.0 (the "License");
18
+ # you may not use this file except in compliance with the License.
19
+ # You may obtain a copy of the License at
20
+ #
21
+ # http://www.apache.org/licenses/LICENSE-2.0
22
+ #
23
+ # Unless required by applicable law or agreed to in writing, software
24
+ # distributed under the License is distributed on an "AS IS" BASIS,
25
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26
+ # See the License for the specific language governing permissions and
27
+ # limitations under the License.
14
28
  import importlib
15
29
  from dataclasses import asdict
16
30
  from typing import Dict, List
@@ -43,6 +57,7 @@ class DataSourceManager:
43
57
  "oracle": "OracleDataSource",
44
58
  "db2": "DB2DataSource",
45
59
  "sybase": "SybaseDataSource",
60
+ "azure_blob": "AzureBlobDataSource",
46
61
  }
47
62
 
48
63
  def __init__(self, config: Configuration):
@@ -0,0 +1,115 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import io
16
+ from typing import Any, Dict, List, Optional
17
+
18
+ import pandas as pd
19
+ from azure.storage.blob import BlobServiceClient
20
+
21
+ from dcs_core.core.common.errors import (
22
+ DatachecksColumnFetchError,
23
+ DataChecksDataSourcesConnectionError,
24
+ DatachecksTableFetchError,
25
+ )
26
+ from dcs_core.core.datasource.file_datasource import FileDataSource
27
+
28
+
29
class AzureBlobDataSource(FileDataSource):
    """File data source backed by an Azure Blob Storage container.

    Blobs whose name ends with one of ``allowed_file_extensions`` (currently
    only ``.csv``) are reported as "tables"; the header row of a CSV blob is
    reported as that table's columns.

    Connection settings are read from ``data_connection``:
    ``account_name``, ``container_name``, ``account_key``,
    ``endpoint_suffix`` (optional) and ``subfolder_path`` (optional).
    """

    # Host suffix used when the configuration does not provide one.
    _DEFAULT_ENDPOINT_SUFFIX = "core.windows.net"

    def __init__(self, data_source_name: str, data_connection: Dict):
        super().__init__(data_source_name, data_connection)
        self.allowed_file_extensions = [".csv"]
        self.blob_service_client: Optional[BlobServiceClient] = None
        self.connection = None

    def connect(self) -> Any:
        """
        Connect to the file data source.

        :return: the container client for the configured container
        :raises DataChecksDataSourcesConnectionError: if a required setting is
            missing or the clients cannot be created
        """
        try:
            account_name = self.data_connection.get("account_name")
            container_name = self.data_connection.get("container_name")
            account_key = self.data_connection.get("account_key")
            # The key may be present with a None value (e.g. a connection dict
            # built from optional fields), in which case dict.get's default is
            # not applied; ``or`` covers both the missing and the None case.
            endpoint_suffix = self.data_connection.get("endpoint_suffix") or self._DEFAULT_ENDPOINT_SUFFIX

            if not account_name or not container_name:
                raise ValueError("'account_name' and 'container_name' are required")

            account_url = f"https://{account_name}.blob.{endpoint_suffix}"
            blob_service_client = BlobServiceClient(account_url=account_url, credential=account_key)
            container_client = blob_service_client.get_container_client(container=container_name)
        except Exception as e:
            raise DataChecksDataSourcesConnectionError(f"Failed to connect to Azure Blob Storage: {e}") from e

        # Publish both clients only once both were created, so the object is
        # never left half-connected.
        self.blob_service_client = blob_service_client
        self.connection = container_client
        return self.connection

    def is_connected(self) -> bool:
        """
        Check if the file data source is connected.
        """
        return self.connection is not None

    def close(self):
        """
        Close the connection. Safe to call when not connected.
        """
        try:
            if self.connection is not None:
                self.connection.close()
            if self.blob_service_client is not None:
                self.blob_service_client.close()
        finally:
            # Always drop the references so is_connected() reports False.
            self.connection = None
            self.blob_service_client = None

    def _subfolder_prefix(self) -> str:
        """Return the configured subfolder as a blob-name prefix ('' or 'a/b/')."""
        # 'subfolder_path' is the configuration field name; 'subfolder' is the
        # key this class read previously and is kept as a fallback.
        subfolder = self.data_connection.get("subfolder_path") or self.data_connection.get("subfolder") or ""
        subfolder = subfolder.strip("/")
        # The trailing slash stops prefix "data" from matching "database.csv".
        return f"{subfolder}/" if subfolder else ""

    def query_get_table_names(self) -> dict:
        """
        Query to get table names (blob names in this case).

        Only blobs located directly in the configured subfolder (or in the
        container root when no subfolder is configured) are returned; blobs in
        nested folders are skipped.

        :return: ``{"table": [<full blob name>, ...]}``
        :raises DataChecksDataSourcesConnectionError: if not connected
        :raises DatachecksTableFetchError: if listing the blobs fails
        """
        if not self.is_connected():
            raise DataChecksDataSourcesConnectionError("Not connected to Azure Blob Storage")
        try:
            prefix = self._subfolder_prefix()
            extensions = tuple(self.allowed_file_extensions)
            blob_iterator = self.connection.list_blobs(name_starts_with=prefix or None)
            blobs = [
                blob.name
                for blob in blob_iterator
                # Depth is judged relative to the prefix, otherwise every blob
                # inside a subfolder would be rejected for containing "/".
                if "/" not in blob.name[len(prefix):] and blob.name.endswith(extensions)
            ]
            return {"table": blobs}
        except Exception as e:
            raise DatachecksTableFetchError(f"Failed to list blobs: {e}") from e

    def query_get_table_columns(self, table: str) -> List[dict]:
        """
        Get column names for a table (CSV blob in this case).

        :param table: full blob name, as returned by ``query_get_table_names``
        :return: one ``{"column_name": ..., "column_type": "string"}`` dict per
            header column
        :raises DataChecksDataSourcesConnectionError: if not connected
        :raises ValueError: if the blob name has an unsupported extension
        :raises DatachecksColumnFetchError: if the blob cannot be read or parsed
        """
        if not self.is_connected():
            raise DataChecksDataSourcesConnectionError("Not connected to Azure Blob Storage")

        if not table.endswith(tuple(self.allowed_file_extensions)):
            raise ValueError(f"Unsupported file type for {table}. Allowed: {self.allowed_file_extensions}")

        try:
            blob_client = self.connection.get_blob_client(blob=table)
            data = blob_client.download_blob().readall()
            if table.endswith(".csv"):
                # Only the header is needed; nrows=0 avoids parsing data rows.
                df = pd.read_csv(io.BytesIO(data), nrows=0)
            else:
                # Reached only if allowed_file_extensions is extended without
                # adding a matching reader here.
                raise ValueError(f"Unsupported file type for {table}. Allowed: {self.allowed_file_extensions}")

            return [{"column_name": col, "column_type": "string"} for col in df.columns.tolist()]
        except Exception as e:
            raise DatachecksColumnFetchError(f"Failed to read columns from blob '{table}': {e}") from e

    def query_get_database_version(self) -> str:
        """
        Get the database version (the storage service API version here).

        :return: version string
        :raises DataChecksDataSourcesConnectionError: if not connected
        """
        if self.blob_service_client is None:
            raise DataChecksDataSourcesConnectionError("Not connected to Azure Blob Storage")
        return self.blob_service_client.api_version
@@ -13,6 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import datetime
16
+ import math
16
17
  from decimal import Decimal
17
18
  from typing import Any, Dict, List, Optional, Tuple, Union
18
19
  from uuid import UUID
@@ -706,13 +707,15 @@ class MssqlDataSource(SQLDataSource):
706
707
  cursor = self.connection.cursor()
707
708
  try:
708
709
  cursor.execute(query)
709
- columns = [column[0] for column in cursor.description]
710
- result_row = cursor.fetchone()
710
+ if cursor.description:
711
+ columns = [column[0] for column in cursor.description]
712
+ result_row = cursor.fetchone()
713
+ row = dict(zip(columns, result_row)) if result_row else {}
714
+ else:
715
+ row = {}
711
716
  finally:
712
717
  cursor.close()
713
718
 
714
- row = dict(zip(columns, result_row))
715
-
716
719
  def _normalize_metrics(value):
717
720
  """Safely normalize DB metric values for JSON serialization."""
718
721
  if value is None:
@@ -737,11 +740,158 @@ class MssqlDataSource(SQLDataSource):
737
740
  col_metrics = {}
738
741
 
739
742
  for key, value in row.items():
740
- if key.startswith(f"{name}_"):
741
- metric_name = key[len(name) + 1 :]
743
+ clean_key = key.replace("[", "").replace("]", "")
744
+ if clean_key.startswith(f"{name}_"):
745
+ metric_name = clean_key[len(name) + 1 :]
742
746
  col_metrics[metric_name] = _normalize_metrics(value)
743
747
 
744
748
  column_wise.append({"column_name": name, "metrics": col_metrics})
749
+
750
+ for col_data in column_wise:
751
+ metrics = col_data["metrics"]
752
+ distinct_count = metrics.get("distinct")
753
+ col_name = col_data["column_name"]
754
+
755
+ dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
756
+
757
+ quoted = self.quote_column(col_name)
758
+
759
+ is_dtype_numeric = (
760
+ True
761
+ if dtype
762
+ in (
763
+ "int",
764
+ "integer",
765
+ "bigint",
766
+ "smallint",
767
+ "tinyint",
768
+ "decimal",
769
+ "numeric",
770
+ "float",
771
+ "real",
772
+ "money",
773
+ "smallmoney",
774
+ )
775
+ else False
776
+ )
777
+
778
+ if is_dtype_numeric:
779
+ col_min = metrics.get("min")
780
+ col_max = metrics.get("max")
781
+
782
+ if col_min is not None and col_max is not None and col_min != col_max:
783
+ bucket_count = 20
784
+ bucket_size = (float(col_max) - float(col_min)) / bucket_count
785
+
786
+ bucket_queries = []
787
+ for i in range(bucket_count):
788
+ start = float(col_min) + i * bucket_size
789
+ end = float(col_min) + (i + 1) * bucket_size
790
+
791
+ bucket_queries.append(
792
+ f"SUM(CASE WHEN {quoted} >= {start} AND {quoted} < {end} THEN 1 ELSE 0 END) AS bucket_{i}"
793
+ )
794
+
795
+ bucket_sql = f"SELECT {', '.join(bucket_queries)} FROM {qualified_table}"
796
+
797
+ try:
798
+ bucket_result = self.connection.execute(text(bucket_sql)).fetchone()
799
+ distribution = []
800
+
801
+ for i in range(bucket_count):
802
+ start_raw = float(col_min) + i * bucket_size
803
+ end_raw = float(col_min) + (i + 1) * bucket_size
804
+
805
+ if dtype in ("int", "integer", "bigint", "smallint", "tinyint"):
806
+ start = math.floor(start_raw)
807
+ end = math.ceil(end_raw)
808
+ else:
809
+ start = round(start_raw, 2)
810
+ end = round(end_raw, 2)
811
+
812
+ count = bucket_result[i] if bucket_result and bucket_result[i] is not None else 0
813
+
814
+ distribution.append(
815
+ {
816
+ "col_val": f"{start} - {end}",
817
+ "count": count,
818
+ }
819
+ )
820
+
821
+ metrics["distribution_graph"] = distribution
822
+
823
+ except Exception as e:
824
+ print(f"Failed to generate numeric distribution for {col_name}: {e}")
825
+
826
+ continue
827
+
828
+ if isinstance(distinct_count, (int, float)) and distinct_count <= 20:
829
+ if dtype in ("text", "ntext", "xml"):
830
+ group_expr = f"CAST({quoted} AS NVARCHAR(MAX))"
831
+ else:
832
+ group_expr = quoted
833
+
834
+ dist_query = (
835
+ f"SELECT {group_expr}, COUNT(*) "
836
+ f"FROM {qualified_table} GROUP BY {group_expr} ORDER BY COUNT(*) DESC"
837
+ )
838
+
839
+ try:
840
+ dist_cursor = self.connection.cursor()
841
+ dist_cursor.execute(dist_query)
842
+ dist_result = dist_cursor.fetchall()
843
+ dist_cursor.close()
844
+
845
+ distribution = []
846
+
847
+ for r in dist_result:
848
+ val = _normalize_metrics(r[0])
849
+ distribution.append(
850
+ {
851
+ "col_val": val,
852
+ "count": r[1],
853
+ }
854
+ )
855
+
856
+ metrics["distribution_graph"] = distribution
857
+
858
+ except Exception as e:
859
+ print(f"Failed to generate distribution graph for column {col_name}: {e}")
860
+
861
+ for col_data in column_wise:
862
+ metrics = col_data["metrics"]
863
+ distinct_count = metrics.get("distinct")
864
+ col_name = col_data["column_name"]
865
+ dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
866
+
867
+ quoted = self.quote_column(col_name)
868
+
869
+ is_dtype_numeric = (
870
+ True
871
+ if dtype
872
+ in (
873
+ "int",
874
+ "integer",
875
+ "bigint",
876
+ "smallint",
877
+ "tinyint",
878
+ "decimal",
879
+ "numeric",
880
+ "float",
881
+ "real",
882
+ "money",
883
+ "smallmoney",
884
+ )
885
+ else False
886
+ )
887
+
888
+ formatted_metrics_data = {
889
+ "general_data": {key: value for key, value in metrics.items() if key != "distribution_graph"},
890
+ "is_dtype_numeric": is_dtype_numeric,
891
+ "distribution_data": metrics.get("distribution_graph", []),
892
+ }
893
+ col_data["metrics"] = formatted_metrics_data
894
+
745
895
  return column_wise
746
896
 
747
897
  def fetch_sample_values_from_database(
@@ -13,6 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import datetime
16
+ import math
16
17
  from decimal import Decimal
17
18
  from typing import Any, Dict, List, Optional, Tuple
18
19
  from uuid import UUID
@@ -411,9 +412,73 @@ class PostgresDataSource(SQLDataSource):
411
412
  col_name = col_data["column_name"]
412
413
  dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
413
414
 
414
- if isinstance(distinct_count, (int, float)) and distinct_count < 20:
415
- quoted = self.quote_column(col_name)
415
+ quoted = self.quote_column(col_name)
416
+
417
+ is_dtype_numeric = (
418
+ True
419
+ if dtype
420
+ in (
421
+ "int",
422
+ "integer",
423
+ "bigint",
424
+ "smallint",
425
+ "decimal",
426
+ "numeric",
427
+ "float",
428
+ "double",
429
+ )
430
+ else False
431
+ )
432
+
433
+ if is_dtype_numeric:
434
+ col_min = metrics.get("min")
435
+ col_max = metrics.get("max")
436
+
437
+ if col_min is not None and col_max is not None and col_min != col_max:
438
+ bucket_count = 20
439
+ bucket_size = (col_max - col_min) / bucket_count
440
+
441
+ bucket_queries = []
442
+ for i in range(bucket_count):
443
+ start = col_min + i * bucket_size
444
+ end = col_min + (i + 1) * bucket_size
445
+
446
+ bucket_queries.append(
447
+ f"SUM(CASE WHEN {quoted} >= {start} AND {quoted} < {end} THEN 1 ELSE 0 END) AS bucket_{i}"
448
+ )
449
+
450
+ bucket_sql = f"SELECT {', '.join(bucket_queries)} FROM {qualified_table}"
451
+
452
+ try:
453
+ bucket_result = self.connection.execute(text(bucket_sql)).fetchone()
454
+ distribution = []
455
+
456
+ for i in range(bucket_count):
457
+ start_raw = col_min + i * bucket_size
458
+ end_raw = col_min + (i + 1) * bucket_size
459
+ if dtype in ("int", "integer", "bigint", "smallint"):
460
+ start = math.floor(start_raw)
461
+ end = math.ceil(end_raw)
462
+ else:
463
+ start = round(start_raw, 2)
464
+ end = round(end_raw, 2)
465
+ count = bucket_result[i]
466
+
467
+ distribution.append(
468
+ {
469
+ "col_val": f"{start} - {end}",
470
+ "count": count,
471
+ }
472
+ )
416
473
 
474
+ metrics["distribution_graph"] = distribution
475
+
476
+ except Exception as e:
477
+ print(f"Failed to generate numeric distribution for {col_name}: {e}")
478
+
479
+ continue
480
+
481
+ if isinstance(distinct_count, (int, float)) and distinct_count <= 20:
417
482
  if dtype in ("json", "jsonb"):
418
483
  group_expr = f"{quoted}::text"
419
484
  else:
@@ -444,8 +509,31 @@ class PostgresDataSource(SQLDataSource):
444
509
 
445
510
  for col_data in column_wise:
446
511
  metrics = col_data["metrics"]
512
+ distinct_count = metrics.get("distinct")
513
+ col_name = col_data["column_name"]
514
+ dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
515
+
516
+ quoted = self.quote_column(col_name)
517
+
518
+ is_dtype_numeric = (
519
+ True
520
+ if dtype
521
+ in (
522
+ "int",
523
+ "integer",
524
+ "bigint",
525
+ "smallint",
526
+ "decimal",
527
+ "numeric",
528
+ "float",
529
+ "double",
530
+ )
531
+ else False
532
+ )
533
+
447
534
  formatted_metrics_data = {
448
535
  "general_data": {key: value for key, value in metrics.items() if key != "distribution_graph"},
536
+ "is_dtype_numeric": is_dtype_numeric,
449
537
  "distribution_data": metrics.get("distribution_graph", []),
450
538
  }
451
539
  col_data["metrics"] = formatted_metrics_data
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.6.4"
15
+ __version__ = "1.6.7"
@@ -47,6 +47,7 @@ class SourceTargetConnection(BaseModel):
47
47
  port: Optional[Union[int, str]] = None
48
48
  driver: str
49
49
  table: Optional[str] = None
50
+ datasource_type: Optional[str] = None
50
51
  database: Optional[str] = None
51
52
  filepath: Optional[str] = None
52
53
  catalog: Optional[str] = None
@@ -66,6 +67,11 @@ class SourceTargetConnection(BaseModel):
66
67
  impersonate_service_account: Optional[str] = None # bigquery specific
67
68
  bigquery_credentials: Optional[str] = None # bigquery specific
68
69
  transform_columns: Dict[str, str] | None = None
70
+ account_name: Optional[str] = None
71
+ container_name: Optional[str] = None
72
+ account_key: Optional[str] = None
73
+ endpoint_suffix: Optional[str] = None
74
+ subfolder_path: Optional[str] = None
69
75
 
70
76
 
71
77
  class SimilarityConfig(BaseModel):
@@ -140,6 +146,7 @@ class DataDiffConfig:
140
146
  "mysql": "mysql",
141
147
  "sybase": "sybase",
142
148
  "bigquery": "bigquery",
149
+ "azure_blob": "duckdb",
143
150
  }
144
151
 
145
152
  def __init__(
@@ -307,6 +314,12 @@ class DataDiffConfig:
307
314
  "impersonate_service_account": connection.get("connection", {}).get("impersonate_service_account"),
308
315
  "bigquery_credentials": connection.get("connection", {}).get("bigquery_credentials"),
309
316
  "transform_columns": transform_columns,
317
+ "datasource_type": connection.get("type"),
318
+ "account_name": connection.get("connection", {}).get("account_name"),
319
+ "container_name": connection.get("connection", {}).get("container_name"),
320
+ "account_key": connection.get("connection", {}).get("account_key"),
321
+ "endpoint_suffix": connection.get("connection", {}).get("endpoint_suffix"),
322
+ "subfolder_path": connection.get("connection", {}).get("subfolder_path"),
310
323
  }
311
324
 
312
325
  def get_data_diff_configs(self) -> List[Comparison]: