dcs-sdk 1.7.0__tar.gz → 1.7.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/PKG-INFO +2 -2
  2. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/README.md +1 -1
  3. dcs_sdk-1.7.2/dcs_core/core/datasource/file_datasource.py +124 -0
  4. dcs_sdk-1.7.2/dcs_core/integrations/databases/azure_blob.py +106 -0
  5. dcs_sdk-1.7.2/dcs_core/integrations/databases/duck_db.py +141 -0
  6. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/databases/mssql.py +8 -5
  7. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/databases/oracle.py +2 -4
  8. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/databases/postgres.py +0 -2
  9. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/__version__.py +1 -1
  10. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/pyproject.toml +1 -1
  11. dcs_sdk-1.7.0/dcs_core/core/datasource/file_datasource.py +0 -30
  12. dcs_sdk-1.7.0/dcs_core/integrations/databases/azure_blob.py +0 -217
  13. dcs_sdk-1.7.0/dcs_core/integrations/databases/duck_db.py +0 -72
  14. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/__init__.py +0 -0
  15. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/__main__.py +0 -0
  16. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/abcs/__init__.py +0 -0
  17. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/abcs/compiler.py +0 -0
  18. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/abcs/database_types.py +0 -0
  19. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/config.py +0 -0
  20. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/__init__.py +0 -0
  21. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/_connect.py +0 -0
  22. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/base.py +0 -0
  23. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/bigquery.py +0 -0
  24. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/clickhouse.py +0 -0
  25. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/databricks.py +0 -0
  26. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/duckdb.py +0 -0
  27. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/mssql.py +0 -0
  28. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/mysql.py +0 -0
  29. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/oracle.py +0 -0
  30. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/postgresql.py +0 -0
  31. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/presto.py +0 -0
  32. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/redis.py +0 -0
  33. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/redshift.py +0 -0
  34. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/snowflake.py +0 -0
  35. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/sybase.py +0 -0
  36. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/trino.py +0 -0
  37. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/databases/vertica.py +0 -0
  38. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/diff_tables.py +0 -0
  39. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/errors.py +0 -0
  40. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/format.py +0 -0
  41. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/hashdiff_tables.py +0 -0
  42. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/info_tree.py +0 -0
  43. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/joindiff_tables.py +0 -0
  44. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/lexicographic_space.py +0 -0
  45. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/parse_time.py +0 -0
  46. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/py.typed +0 -0
  47. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/queries/__init__.py +0 -0
  48. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/queries/api.py +0 -0
  49. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/queries/ast_classes.py +0 -0
  50. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/queries/base.py +0 -0
  51. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/queries/extras.py +0 -0
  52. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/query_utils.py +0 -0
  53. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/schema.py +0 -0
  54. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/table_segment.py +0 -0
  55. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/thread_utils.py +0 -0
  56. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/utils.py +0 -0
  57. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/data_diff/version.py +0 -0
  58. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/__init__.py +0 -0
  59. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/__main__.py +0 -0
  60. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/__version__.py +0 -0
  61. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/cli/__init__.py +0 -0
  62. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/cli/cli.py +0 -0
  63. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/__init__.py +0 -0
  64. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/common/__init__.py +0 -0
  65. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/common/errors.py +0 -0
  66. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/common/models/__init__.py +0 -0
  67. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/common/models/configuration.py +0 -0
  68. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/common/models/dashboard.py +0 -0
  69. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/common/models/data_source_resource.py +0 -0
  70. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/common/models/metric.py +0 -0
  71. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/common/models/profile.py +0 -0
  72. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/common/models/validation.py +0 -0
  73. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/common/models/widget.py +0 -0
  74. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/configuration/__init__.py +0 -0
  75. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/configuration/config_loader.py +0 -0
  76. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/configuration/configuration_parser.py +0 -0
  77. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/configuration/configuration_parser_arc.py +0 -0
  78. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/datasource/__init__.py +0 -0
  79. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/datasource/base.py +0 -0
  80. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/datasource/manager.py +0 -0
  81. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/datasource/search_datasource.py +0 -0
  82. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/datasource/sql_datasource.py +0 -0
  83. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/inspect.py +0 -0
  84. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/logger/__init__.py +0 -0
  85. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/logger/base.py +0 -0
  86. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/logger/default_logger.py +0 -0
  87. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/metric/__init__.py +0 -0
  88. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/metric/base.py +0 -0
  89. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/metric/combined_metric.py +0 -0
  90. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/metric/custom_metric.py +0 -0
  91. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/metric/manager.py +0 -0
  92. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/metric/numeric_metric.py +0 -0
  93. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/metric/reliability_metric.py +0 -0
  94. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/profiling/__init__.py +0 -0
  95. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/profiling/datasource_profiling.py +0 -0
  96. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/profiling/numeric_field_profiling.py +0 -0
  97. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/profiling/text_field_profiling.py +0 -0
  98. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/repository/__init__.py +0 -0
  99. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/repository/metric_repository.py +0 -0
  100. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/utils/__init__.py +0 -0
  101. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/utils/log.py +0 -0
  102. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/utils/tracking.py +0 -0
  103. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/utils/utils.py +0 -0
  104. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/validation/__init__.py +0 -0
  105. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/validation/base.py +0 -0
  106. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/validation/completeness_validation.py +0 -0
  107. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/validation/custom_query_validation.py +0 -0
  108. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/validation/manager.py +0 -0
  109. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/validation/numeric_validation.py +0 -0
  110. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/validation/reliability_validation.py +0 -0
  111. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/validation/uniqueness_validation.py +0 -0
  112. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/core/validation/validity_validation.py +0 -0
  113. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/__init__.py +0 -0
  114. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/databases/__init__.py +0 -0
  115. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/databases/bigquery.py +0 -0
  116. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/databases/databricks.py +0 -0
  117. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/databases/db2.py +0 -0
  118. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/databases/elasticsearch.py +0 -0
  119. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/databases/mysql.py +0 -0
  120. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/databases/opensearch.py +0 -0
  121. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/databases/redshift.py +0 -0
  122. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/databases/snowflake.py +0 -0
  123. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/databases/spark_df.py +0 -0
  124. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/databases/sybase.py +0 -0
  125. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/storage/__init__.py +0 -0
  126. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/storage/local_file.py +0 -0
  127. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/utils/__init__.py +0 -0
  128. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/integrations/utils/utils.py +0 -0
  129. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/report/__init__.py +0 -0
  130. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/report/dashboard.py +0 -0
  131. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/report/models.py +0 -0
  132. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  133. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  134. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  135. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  136. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/report/static/assets/images/docs.svg +0 -0
  137. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/report/static/assets/images/github.svg +0 -0
  138. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/report/static/assets/images/logo.svg +0 -0
  139. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/report/static/assets/images/slack.svg +0 -0
  140. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/report/static/index.js +0 -0
  141. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_core/report/static/index.js.LICENSE.txt +0 -0
  142. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/__init__.py +0 -0
  143. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/__main__.py +0 -0
  144. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/cli/__init__.py +0 -0
  145. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/cli/cli.py +0 -0
  146. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/__init__.py +0 -0
  147. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/config/__init__.py +0 -0
  148. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/config/config_loader.py +0 -0
  149. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/data_diff/__init__.py +0 -0
  150. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/data_diff/data_differ.py +0 -0
  151. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/rules/__init__.py +0 -0
  152. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/rules/rules_mappping.py +0 -0
  153. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/rules/rules_repository.py +0 -0
  154. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/rules/schema_rules.py +0 -0
  155. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/utils/__init__.py +0 -0
  156. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/utils/serializer.py +0 -0
  157. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/utils/similarity_score/__init__.py +0 -0
  158. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/utils/similarity_score/base_provider.py +0 -0
  159. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +0 -0
  160. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +0 -0
  161. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +0 -0
  162. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/utils/table.py +0 -0
  163. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/utils/themes.py +0 -0
  164. {dcs_sdk-1.7.0 → dcs_sdk-1.7.2}/dcs_sdk/sdk/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dcs-sdk
3
- Version: 1.7.0
3
+ Version: 1.7.2
4
4
  Summary: SDK for DataChecks
5
5
  Author: Waterdip Labs
6
6
  Author-email: hello@waterdip.ai
@@ -86,7 +86,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
86
86
  Description-Content-Type: text/markdown
87
87
 
88
88
  <h1 align="center">
89
- DCS SDK v1.7.0
89
+ DCS SDK v1.7.2
90
90
  </h1>
91
91
 
92
92
  > SDK for DataChecks
@@ -1,5 +1,5 @@
1
1
  <h1 align="center">
2
- DCS SDK v1.7.0
2
+ DCS SDK v1.7.2
3
3
  </h1>
4
4
 
5
5
  > SDK for DataChecks
@@ -0,0 +1,124 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import uuid
17
+ from abc import ABC, abstractmethod
18
+ from contextlib import contextmanager
19
+ from pathlib import Path
20
+ from typing import Dict, Iterator
21
+
22
+ import duckdb
23
+ from loguru import logger
24
+
25
+ from dcs_core.core.common.models.data_source_resource import RawColumnInfo
26
+ from dcs_core.core.datasource.base import DataSource
27
+ from dcs_core.integrations.databases.duck_db import DuckDb
28
+
29
+
30
class FileDataSource(DataSource, ABC):
    """
    Abstract base class for file-backed data sources.

    Subclasses supply the vendor-specific download (``_download_to_path``);
    this class stages the downloaded file in a local temp directory and loads
    it into a freshly created DuckDB database file.
    """

    def __init__(self, data_source_name: str, data_connection: Dict):
        super().__init__(data_source_name, data_connection)
        # Relative directory name; downloaded files and generated .duckdb
        # files are both created underneath it.
        self.temp_dir_name = "tmp"

    @contextmanager
    def as_duckdb(self, table_name: str) -> Iterator["DuckDb"]:
        """
        Yield a connected ``DuckDb`` data source loaded with ``table_name``.

        The file is loaded into a new DuckDB database first; the resulting
        ``DuckDb`` data source is closed on exit even if the body raises.

        :param table_name: name of the file to load
        """
        duckdb_path = self.load_file_to_duckdb(table_name)
        duck_db_ds = DuckDb(data_source_name=self.data_source_name, data_connection={"file_path": duckdb_path})
        try:
            duck_db_ds.connect()
            yield duck_db_ds
        finally:
            duck_db_ds.close()

    @abstractmethod
    def query_get_table_names(self) -> dict:
        """
        Query to get table names
        """
        pass

    @abstractmethod
    def query_get_database_version(self) -> str:
        """
        Get the database version
        :return: version string
        """
        pass

    @abstractmethod
    def _download_to_path(self, table_name: str, path: str) -> None:
        """
        Vendor-specific download of ``table_name`` into the local file ``path``.
        """
        pass

    def load_file_to_duckdb(self, table_name: str) -> str:
        """
        Download ``table_name`` to a temporary file and load it into DuckDB.

        The temporary download is always deleted afterwards; only the DuckDB
        database file is kept.

        :param table_name: file name; must carry a file extension
        :return: path of the DuckDB database file that was created
        :raises ValueError: if ``table_name`` has no file extension
        """
        os.makedirs(self.temp_dir_name, exist_ok=True)

        ext = Path(table_name).suffix
        if not ext:
            raise ValueError(f"Invalid file name {table_name}")

        # Random name so concurrent loads never collide; the original
        # extension is preserved on the staged copy.
        temp_path = f"{self.temp_dir_name}/{uuid.uuid4()}{ext}"

        try:
            self._download_to_path(table_name, temp_path)
            return self._load_path_to_duckdb(temp_path, table_name)
        finally:
            if os.path.exists(temp_path):
                os.remove(temp_path)
                logger.info(f"Cleaned up temp file {temp_path}")

    @staticmethod
    def _create_table_from_csv(duckdb_path: str, create_sql: str, csv_path: str) -> None:
        """Run one CREATE TABLE statement and always close the connection afterwards."""
        conn = duckdb.connect(database=duckdb_path, read_only=False)
        try:
            conn.execute(create_sql, [csv_path])
        finally:
            conn.close()

    def _load_path_to_duckdb(self, path: str, table_name: str) -> str:
        """
        Load the CSV file at ``path`` into a new DuckDB database file.

        The table is named after the stem of ``table_name``. Loading is first
        attempted with ``read_csv_auto``; if that fails for any reason it is
        retried with ``ALL_VARCHAR=TRUE``.

        :param path: local path of the CSV file
        :param table_name: original file name, used to derive the table name
        :return: path of the DuckDB database file
        :raises Exception: re-raises the fallback error when both attempts fail;
            the partially created database file is removed first
        """
        duckdb_path = f"{self.temp_dir_name}/{uuid.uuid4()}.duckdb"
        # Double embedded double quotes so the stem is always a valid quoted
        # identifier and cannot break out of the CREATE TABLE statement.
        quoted_table = '"' + Path(table_name).stem.replace('"', '""') + '"'

        logger.info(f"Loading {path} into DuckDB")

        try:
            self._create_table_from_csv(
                duckdb_path,
                f"CREATE TABLE {quoted_table} AS SELECT * FROM read_csv_auto(?)",
                path,
            )
            logger.info(f"Successfully loaded data into {duckdb_path}")
            return duckdb_path
        except Exception as e:
            logger.warning(f"read_csv_auto failed: {e}. Trying with ALL_VARCHAR=TRUE")

        try:
            self._create_table_from_csv(
                duckdb_path,
                f"CREATE TABLE {quoted_table} AS SELECT * FROM read_csv(?, ALL_VARCHAR=TRUE, SAMPLE_SIZE=-1)",
                path,
            )
        except Exception as fallback_error:
            logger.error(f"Failed to load CSV into DuckDB: {fallback_error}")
            # The helper has already closed the connection, so the file is no
            # longer held open when it is removed here.
            if os.path.exists(duckdb_path):
                os.remove(duckdb_path)
            raise

        logger.info(f"Successfully loaded data with ALL_VARCHAR into {duckdb_path}")
        return duckdb_path
@@ -0,0 +1,106 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Any, Dict, Optional
16
+
17
+ from azure.storage.blob import BlobServiceClient
18
+ from loguru import logger
19
+
20
+ from dcs_core.core.common.errors import (
21
+ DataChecksDataSourcesConnectionError,
22
+ DatachecksTableFetchError,
23
+ )
24
+ from dcs_core.core.datasource.file_datasource import FileDataSource
25
+
26
+
27
class AzureBlobDataSource(FileDataSource):
    """File data source that reads CSV blobs from an Azure Blob Storage container."""

    def __init__(self, data_source_name: str, data_connection: Dict):
        super().__init__(data_source_name, data_connection)
        # Only blobs whose names end with one of these suffixes are listed as tables.
        self.allowed_file_extensions = [".csv"]
        self.blob_service_client: Optional[BlobServiceClient] = None
        self.DEFAULT_NUMERIC_PRECISION = 16383
        # Container client returned by connect(); None while disconnected.
        self.connection = None

    def connect(self) -> Any:
        """
        Connect to the configured container.

        Reads ``account_name``, ``container_name``, ``account_key`` and the
        optional ``endpoint_suffix`` (default ``core.windows.net``) from the
        data connection.

        :return: the container client
        :raises DataChecksDataSourcesConnectionError: if the clients cannot be created
        """
        try:
            account_name = self.data_connection.get("account_name")
            container_name = self.data_connection.get("container_name")
            account_key = self.data_connection.get("account_key")
            endpoint_suffix = self.data_connection.get("endpoint_suffix", "core.windows.net")
            connection_str = f"https://{account_name}.blob.{endpoint_suffix}"
            blob_service_client = BlobServiceClient(account_url=connection_str, credential=account_key)
            self.blob_service_client = blob_service_client
            self.connection = blob_service_client.get_container_client(container=container_name)
            return self.connection
        except Exception as e:
            raise DataChecksDataSourcesConnectionError(f"Failed to connect to Azure Blob Storage: {e}") from e

    def is_connected(self) -> bool:
        """
        Check if the file data source is connected
        """
        return self.connection is not None

    def close(self):
        """
        Close the container and service clients, if any.

        Safe to call when never connected or already closed; the client
        attributes are reset even if closing one of them raises.
        """
        try:
            if self.connection is not None:
                self.connection.close()
            if self.blob_service_client is not None:
                self.blob_service_client.close()
        finally:
            self.connection = None
            self.blob_service_client = None

    def query_get_table_names(self) -> dict:
        """
        List the blobs that can be used as tables.

        :return: ``{"table": [blob names]}``
        :raises DataChecksDataSourcesConnectionError: if not connected
        :raises DatachecksTableFetchError: if listing the blobs fails
        """
        if not self.is_connected():
            raise DataChecksDataSourcesConnectionError("Not connected to Azure Blob Storage")
        try:
            subfolder = self.data_connection.get("subfolder", "")
            allowed_suffixes = tuple(self.allowed_file_extensions)
            # NOTE(review): only blob names without any "/" are kept, so a
            # non-empty `subfolder` prefix that contains "/" filters out every
            # blob — confirm this is the intended behaviour.
            blobs = [
                blob.name
                for blob in self.connection.list_blobs(name_starts_with=subfolder)
                if "/" not in blob.name and blob.name.endswith(allowed_suffixes)
            ]
            return {"table": blobs}
        except Exception as e:
            raise DatachecksTableFetchError(f"Failed to list blobs: {e}") from e

    def safe_get(self, lst, idx, default=None):
        """Return ``lst[idx]`` when ``idx`` is a valid non-negative index, else ``default``."""
        return lst[idx] if 0 <= idx < len(lst) else default

    def query_get_database_version(self) -> str:
        """
        Get the database version
        :return: the ``api_version`` reported by the blob service client
        :raises DataChecksDataSourcesConnectionError: if not connected
        """
        if self.blob_service_client is None:
            raise DataChecksDataSourcesConnectionError("Not connected to Azure Blob Storage")
        return self.blob_service_client.api_version

    def _download_to_path(self, table_name: str, path: str) -> None:
        """
        Download the blob ``table_name`` into the local file ``path``.

        :raises DataChecksDataSourcesConnectionError: if not connected or the download fails
        """
        if not self.is_connected():
            raise DataChecksDataSourcesConnectionError("Not connected to Azure Blob Storage")
        blob_client = self.connection.get_blob_client(blob=table_name)
        logger.info(f"Downloading {table_name} to {path}")
        try:
            with open(path, "wb") as f:
                # Write chunk by chunk instead of reading the whole blob at once.
                stream = blob_client.download_blob()
                for chunk in stream.chunks():
                    f.write(chunk)
        except Exception as e:
            raise DataChecksDataSourcesConnectionError(f"Failed to download blob '{table_name}': {e}") from e
@@ -0,0 +1,141 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+ from pathlib import Path
16
+ from typing import Any, Dict
17
+
18
+ import duckdb
19
+ from loguru import logger
20
+
21
+ from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
22
+ from dcs_core.core.common.models.data_source_resource import RawColumnInfo
23
+ from dcs_core.core.datasource.sql_datasource import SQLDataSource
24
+
25
+
26
class DuckDb(SQLDataSource):
    """SQL data source backed by a DuckDB database file."""

    def __init__(self, data_source_name: str, data_connection: Dict):
        super().__init__(data_source_name, data_connection)
        self.connection = None
        self.use_sa_text_query = False
        # Named validation patterns, keyed by the kind of value they match.
        self.regex_patterns = {
            "uuid": r"^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$",
            "usa_phone": r"^(\+1[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}$",
            "email": r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$",
            "usa_zip_code": r"^[0-9]{5}(?:-[0-9]{4})?$",
            "ssn": r"^[0-9]{3}-[0-9]{2}-[0-9]{4}$",
            "sedol": r"^[B-DF-HJ-NP-TV-XZ0-9]{6}[0-9]$",
            "lei": r"^[A-Z0-9]{18}[0-9]{2}$",
            "cusip": r"^[0-9A-Z]{9}$",
            "figi": r"^BBG[A-Z0-9]{9}$",
            "isin": r"^[A-Z]{2}[A-Z0-9]{9}[0-9]$",
            "perm_id": r"^\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{3}$",
        }
        self.DEFAULT_NUMERIC_PRECISION = 16383

    def connect(self) -> Any:
        """
        Open the DuckDB database at the configured ``file_path``.

        :return: the DuckDB connection
        :raises DataChecksDataSourcesConnectionError: if the database cannot be opened
        """
        try:
            file_path = self.data_connection.get("file_path")
            self.connection = duckdb.connect(database=file_path)
            return self.connection
        except Exception as e:
            raise DataChecksDataSourcesConnectionError(f"Failed to connect to DuckDB: {e}") from e

    def is_connected(self) -> bool:
        """
        Check if the file data source is connected
        """
        return self.connection is not None

    def close(self):
        """
        Close the connection and delete the database file.

        The connection attribute is reset so ``is_connected()`` reports False
        afterwards. A failure to delete the file is logged, not raised.
        """
        logger.info("Closing DuckDB connection")
        if self.connection is not None:
            self.connection.close()
            self.connection = None
        try:
            fp = self.data_connection.get("file_path")
            if fp and os.path.exists(fp):
                os.remove(fp)
        except Exception as e:
            logger.error(f"Failed to remove the file {self.data_connection.get('file_path')}: {e}")

    def qualified_table_name(self, table_name: str) -> str:
        """
        Get the qualified table name
        :param table_name: name of the table
        :return: double-quoted table name, with embedded double quotes doubled
        """
        escaped = table_name.replace('"', '""')
        return f'"{escaped}"'

    def quote_column(self, column: str) -> str:
        """
        Quote the column name
        :param column: name of the column
        :return: double-quoted column name, with embedded double quotes doubled
        """
        escaped = column.replace('"', '""')
        return f'"{escaped}"'

    def query_get_table_columns(
        self,
        table: str,
        schema: str | None = None,
    ) -> Dict[str, RawColumnInfo]:
        """
        Get the schema of a table.
        :param table: table name
        :param schema: schema name; only used in the error message, defaults to ``self.schema_name``
        :return: Dictionary with column names and their types
        :raises RuntimeError: if the query returns no rows for ``table``
        """
        schema = schema or self.schema_name
        # The name is embedded as a SQL string literal, so double any single
        # quote to keep the literal well-formed.
        table_literal = table.replace("'", "''")

        query = f"""
            SELECT
                column_name,
                data_type,
                CASE WHEN data_type IN ('TIMESTAMP', 'TIME') THEN datetime_precision ELSE NULL END AS datetime_precision,
                CASE WHEN data_type = 'DECIMAL' THEN COALESCE(numeric_precision, 131072 + {self.DEFAULT_NUMERIC_PRECISION})
                    WHEN data_type IN ('DOUBLE', 'REAL', 'FLOAT') THEN numeric_precision
                    ELSE numeric_precision END AS numeric_precision,
                CASE WHEN data_type = 'DECIMAL' THEN COALESCE(numeric_scale, {self.DEFAULT_NUMERIC_PRECISION}) ELSE numeric_scale END AS numeric_scale,
                NULL AS collation_name,
                CASE WHEN data_type = 'VARCHAR' THEN character_maximum_length ELSE NULL END AS character_maximum_length
            FROM information_schema.columns
            WHERE table_name = '{table_literal}'
            ORDER BY ordinal_position
        """

        rows = self.fetchall(query)
        if not rows:
            raise RuntimeError(f"{table}: Table, {schema}: Schema, does not exist, or has no columns")

        # Row positions follow the SELECT list above.
        column_info = {
            r[0]: RawColumnInfo(
                column_name=self.safe_get(r, 0),
                data_type=self.safe_get(r, 1),
                datetime_precision=self.safe_get(r, 2),
                numeric_precision=self.safe_get(r, 3),
                numeric_scale=self.safe_get(r, 4),
                collation_name=self.safe_get(r, 5),
                character_maximum_length=self.safe_get(r, 6),
            )
            for r in rows
        }
        return column_info
@@ -167,8 +167,6 @@ class MssqlDataSource(SQLDataSource):
167
167
  :return: Dictionary with index details
168
168
  """
169
169
  schema = schema or self.schema_name
170
- table = table.upper()
171
- schema = schema.upper()
172
170
 
173
171
  query = f"""
174
172
  SELECT
@@ -673,8 +671,6 @@ class MssqlDataSource(SQLDataSource):
673
671
 
674
672
  quoted_name = self.quote_column(name)
675
673
 
676
- query_parts.append(f"COUNT(DISTINCT {quoted_name}) AS [{name}_distinct]")
677
- query_parts.append(f"COUNT({quoted_name}) - COUNT(DISTINCT {quoted_name}) AS [{name}_duplicate]")
678
674
  query_parts.append(f"SUM(CASE WHEN {quoted_name} IS NULL THEN 1 ELSE 0 END) AS [{name}_is_null]")
679
675
 
680
676
  if dtype in (
@@ -690,13 +686,20 @@ class MssqlDataSource(SQLDataSource):
690
686
  "money",
691
687
  "smallmoney",
692
688
  ):
689
+ query_parts.append(f"COUNT({quoted_name}) - COUNT(DISTINCT {quoted_name}) AS [{name}_duplicate]")
693
690
  query_parts.append(f"MIN({quoted_name}) AS [{name}_min]")
694
691
  query_parts.append(f"MAX({quoted_name}) AS [{name}_max]")
695
692
  query_parts.append(f"AVG(CAST({quoted_name} AS FLOAT)) AS [{name}_average]")
693
+ query_parts.append(f"COUNT(DISTINCT {quoted_name}) AS [{name}_distinct]")
696
694
 
697
- elif dtype in ("varchar", "nvarchar", "char", "nchar", "text", "ntext"):
695
+ elif dtype in ("varchar", "nvarchar", "char", "nchar"):
698
696
  query_parts.append(f"MAX(LEN({quoted_name})) AS [{name}_max_character_length]")
699
697
 
698
+ elif dtype in ("text", "ntext", "xml"):
699
+ query_parts.append(
700
+ f"MAX(LEN(CAST({quoted_name} AS NVARCHAR(MAX)))) " f"AS [{name}_max_character_length]"
701
+ )
702
+
700
703
  if additional_queries:
701
704
  query_parts.extend(additional_queries)
702
705
 
@@ -143,8 +143,6 @@ class OracleDataSource(SQLDataSource):
143
143
  :return: Dictionary with index details
144
144
  """
145
145
  schema = schema or self.schema_name
146
- table = table.upper()
147
- schema = schema.upper()
148
146
 
149
147
  query = f"""
150
148
  SELECT
@@ -696,8 +694,8 @@ class OracleDataSource(SQLDataSource):
696
694
  AND r_ac.OWNER = r_acc.OWNER
697
695
  AND acc.POSITION = r_acc.POSITION
698
696
  WHERE ac.CONSTRAINT_TYPE = 'R'
699
- AND ac.TABLE_NAME = '{table_name.upper()}'
700
- AND ac.OWNER = '{schema.upper()}';
697
+ AND ac.TABLE_NAME = '{table_name}'
698
+ AND ac.OWNER = '{schema}';
701
699
  """
702
700
 
703
701
  try:
@@ -136,8 +136,6 @@ class PostgresDataSource(SQLDataSource):
136
136
  :return: Dictionary with index details
137
137
  """
138
138
  schema = schema or self.schema_name
139
- table = table.lower()
140
- schema = schema.lower()
141
139
 
142
140
  query = f"""
143
141
  SELECT
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.7.0"
15
+ __version__ = "1.7.2"
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "dcs-sdk"
3
- version = "1.7.0"
3
+ version = "1.7.2"
4
4
  description = "SDK for DataChecks"
5
5
  authors = ["Waterdip Labs <hello@waterdip.ai>"]
6
6
  readme = "README.md"
@@ -1,30 +0,0 @@
1
- # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from typing import Dict
16
-
17
- from dcs_core.core.datasource.base import DataSource
18
-
19
-
20
- class FileDataSource(DataSource):
21
- """
22
- Abstract class for File data sources
23
- """
24
-
25
- def __init__(self, data_source_name: str, data_connection: Dict):
26
- super().__init__(data_source_name, data_connection)
27
-
28
- def load_file_to_duckdb(self, table_name: str):
29
- """Load the file to duckdb"""
30
- pass