dcs-sdk 1.7.0__tar.gz → 1.7.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/PKG-INFO +2 -2
  2. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/README.md +1 -1
  3. dcs_sdk-1.7.1/dcs_core/core/datasource/file_datasource.py +124 -0
  4. dcs_sdk-1.7.1/dcs_core/integrations/databases/azure_blob.py +112 -0
  5. dcs_sdk-1.7.1/dcs_core/integrations/databases/duck_db.py +141 -0
  6. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/__version__.py +1 -1
  7. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/pyproject.toml +1 -1
  8. dcs_sdk-1.7.0/dcs_core/core/datasource/file_datasource.py +0 -30
  9. dcs_sdk-1.7.0/dcs_core/integrations/databases/azure_blob.py +0 -217
  10. dcs_sdk-1.7.0/dcs_core/integrations/databases/duck_db.py +0 -72
  11. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/__init__.py +0 -0
  12. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/__main__.py +0 -0
  13. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/abcs/__init__.py +0 -0
  14. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/abcs/compiler.py +0 -0
  15. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/abcs/database_types.py +0 -0
  16. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/config.py +0 -0
  17. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/__init__.py +0 -0
  18. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/_connect.py +0 -0
  19. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/base.py +0 -0
  20. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/bigquery.py +0 -0
  21. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/clickhouse.py +0 -0
  22. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/databricks.py +0 -0
  23. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/duckdb.py +0 -0
  24. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/mssql.py +0 -0
  25. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/mysql.py +0 -0
  26. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/oracle.py +0 -0
  27. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/postgresql.py +0 -0
  28. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/presto.py +0 -0
  29. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/redis.py +0 -0
  30. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/redshift.py +0 -0
  31. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/snowflake.py +0 -0
  32. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/sybase.py +0 -0
  33. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/trino.py +0 -0
  34. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/databases/vertica.py +0 -0
  35. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/diff_tables.py +0 -0
  36. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/errors.py +0 -0
  37. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/format.py +0 -0
  38. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/hashdiff_tables.py +0 -0
  39. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/info_tree.py +0 -0
  40. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/joindiff_tables.py +0 -0
  41. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/lexicographic_space.py +0 -0
  42. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/parse_time.py +0 -0
  43. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/py.typed +0 -0
  44. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/queries/__init__.py +0 -0
  45. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/queries/api.py +0 -0
  46. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/queries/ast_classes.py +0 -0
  47. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/queries/base.py +0 -0
  48. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/queries/extras.py +0 -0
  49. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/query_utils.py +0 -0
  50. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/schema.py +0 -0
  51. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/table_segment.py +0 -0
  52. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/thread_utils.py +0 -0
  53. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/utils.py +0 -0
  54. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/data_diff/version.py +0 -0
  55. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/__init__.py +0 -0
  56. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/__main__.py +0 -0
  57. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/__version__.py +0 -0
  58. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/cli/__init__.py +0 -0
  59. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/cli/cli.py +0 -0
  60. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/__init__.py +0 -0
  61. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/common/__init__.py +0 -0
  62. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/common/errors.py +0 -0
  63. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/common/models/__init__.py +0 -0
  64. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/common/models/configuration.py +0 -0
  65. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/common/models/dashboard.py +0 -0
  66. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/common/models/data_source_resource.py +0 -0
  67. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/common/models/metric.py +0 -0
  68. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/common/models/profile.py +0 -0
  69. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/common/models/validation.py +0 -0
  70. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/common/models/widget.py +0 -0
  71. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/configuration/__init__.py +0 -0
  72. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/configuration/config_loader.py +0 -0
  73. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/configuration/configuration_parser.py +0 -0
  74. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/configuration/configuration_parser_arc.py +0 -0
  75. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/datasource/__init__.py +0 -0
  76. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/datasource/base.py +0 -0
  77. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/datasource/manager.py +0 -0
  78. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/datasource/search_datasource.py +0 -0
  79. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/datasource/sql_datasource.py +0 -0
  80. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/inspect.py +0 -0
  81. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/logger/__init__.py +0 -0
  82. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/logger/base.py +0 -0
  83. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/logger/default_logger.py +0 -0
  84. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/metric/__init__.py +0 -0
  85. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/metric/base.py +0 -0
  86. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/metric/combined_metric.py +0 -0
  87. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/metric/custom_metric.py +0 -0
  88. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/metric/manager.py +0 -0
  89. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/metric/numeric_metric.py +0 -0
  90. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/metric/reliability_metric.py +0 -0
  91. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/profiling/__init__.py +0 -0
  92. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/profiling/datasource_profiling.py +0 -0
  93. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/profiling/numeric_field_profiling.py +0 -0
  94. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/profiling/text_field_profiling.py +0 -0
  95. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/repository/__init__.py +0 -0
  96. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/repository/metric_repository.py +0 -0
  97. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/utils/__init__.py +0 -0
  98. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/utils/log.py +0 -0
  99. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/utils/tracking.py +0 -0
  100. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/utils/utils.py +0 -0
  101. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/validation/__init__.py +0 -0
  102. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/validation/base.py +0 -0
  103. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/validation/completeness_validation.py +0 -0
  104. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/validation/custom_query_validation.py +0 -0
  105. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/validation/manager.py +0 -0
  106. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/validation/numeric_validation.py +0 -0
  107. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/validation/reliability_validation.py +0 -0
  108. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/validation/uniqueness_validation.py +0 -0
  109. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/core/validation/validity_validation.py +0 -0
  110. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/__init__.py +0 -0
  111. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/databases/__init__.py +0 -0
  112. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/databases/bigquery.py +0 -0
  113. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/databases/databricks.py +0 -0
  114. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/databases/db2.py +0 -0
  115. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/databases/elasticsearch.py +0 -0
  116. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/databases/mssql.py +0 -0
  117. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/databases/mysql.py +0 -0
  118. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/databases/opensearch.py +0 -0
  119. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/databases/oracle.py +0 -0
  120. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/databases/postgres.py +0 -0
  121. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/databases/redshift.py +0 -0
  122. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/databases/snowflake.py +0 -0
  123. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/databases/spark_df.py +0 -0
  124. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/databases/sybase.py +0 -0
  125. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/storage/__init__.py +0 -0
  126. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/storage/local_file.py +0 -0
  127. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/utils/__init__.py +0 -0
  128. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/integrations/utils/utils.py +0 -0
  129. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/report/__init__.py +0 -0
  130. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/report/dashboard.py +0 -0
  131. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/report/models.py +0 -0
  132. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  133. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  134. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  135. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  136. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/report/static/assets/images/docs.svg +0 -0
  137. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/report/static/assets/images/github.svg +0 -0
  138. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/report/static/assets/images/logo.svg +0 -0
  139. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/report/static/assets/images/slack.svg +0 -0
  140. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/report/static/index.js +0 -0
  141. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_core/report/static/index.js.LICENSE.txt +0 -0
  142. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/__init__.py +0 -0
  143. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/__main__.py +0 -0
  144. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/cli/__init__.py +0 -0
  145. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/cli/cli.py +0 -0
  146. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/__init__.py +0 -0
  147. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/config/__init__.py +0 -0
  148. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/config/config_loader.py +0 -0
  149. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/data_diff/__init__.py +0 -0
  150. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/data_diff/data_differ.py +0 -0
  151. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/rules/__init__.py +0 -0
  152. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/rules/rules_mappping.py +0 -0
  153. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/rules/rules_repository.py +0 -0
  154. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/rules/schema_rules.py +0 -0
  155. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/utils/__init__.py +0 -0
  156. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/utils/serializer.py +0 -0
  157. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/utils/similarity_score/__init__.py +0 -0
  158. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/utils/similarity_score/base_provider.py +0 -0
  159. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +0 -0
  160. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +0 -0
  161. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +0 -0
  162. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/utils/table.py +0 -0
  163. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/utils/themes.py +0 -0
  164. {dcs_sdk-1.7.0 → dcs_sdk-1.7.1}/dcs_sdk/sdk/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dcs-sdk
3
- Version: 1.7.0
3
+ Version: 1.7.1
4
4
  Summary: SDK for DataChecks
5
5
  Author: Waterdip Labs
6
6
  Author-email: hello@waterdip.ai
@@ -86,7 +86,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
86
86
  Description-Content-Type: text/markdown
87
87
 
88
88
  <h1 align="center">
89
- DCS SDK v1.7.0
89
+ DCS SDK v1.7.1
90
90
  </h1>
91
91
 
92
92
  > SDK for DataChecks
@@ -1,5 +1,5 @@
1
1
  <h1 align="center">
2
- DCS SDK v1.7.0
2
+ DCS SDK v1.7.1
3
3
  </h1>
4
4
 
5
5
  > SDK for DataChecks
@@ -0,0 +1,124 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import uuid
17
+ from abc import ABC, abstractmethod
18
+ from contextlib import contextmanager
19
+ from pathlib import Path
20
+ from typing import Dict, Iterator
21
+
22
+ import duckdb
23
+ from loguru import logger
24
+
25
+ from dcs_core.core.common.models.data_source_resource import RawColumnInfo
26
+ from dcs_core.core.datasource.base import DataSource
27
+ from dcs_core.integrations.databases.duck_db import DuckDb
28
+
29
+
30
class FileDataSource(DataSource, ABC):
    """Abstract base class for file-backed data sources.

    Subclasses implement vendor-specific listing and download; this class
    provides the shared "download file -> load it into a local DuckDB
    database" workflow used to query file contents with SQL.
    """

    def __init__(self, data_source_name: str, data_connection: Dict):
        super().__init__(data_source_name, data_connection)
        # Directory used for both downloaded files and generated .duckdb files.
        self.temp_dir_name = "tmp"

    @contextmanager
    def as_duckdb(self, table_name: str) -> Iterator["DuckDb"]:
        """Yield a connected DuckDb data source for the given file name.

        The DuckDB database is built from the downloaded source file; the
        connection (and, via DuckDb.close(), the temporary .duckdb file)
        is cleaned up when the context exits, even on error.
        """
        duckdb_path = self.load_file_to_duckdb(table_name)
        duck_db_ds = DuckDb(
            data_source_name=self.data_source_name,
            data_connection={"file_path": duckdb_path},
        )
        try:
            duck_db_ds.connect()
            yield duck_db_ds
        finally:
            duck_db_ds.close()

    @abstractmethod
    def query_get_table_names(self) -> dict:
        """Return the available table (file) names, grouped by category."""
        pass

    @abstractmethod
    def query_get_database_version(self) -> str:
        """Get the database version.

        :return: version string
        """
        pass

    @abstractmethod
    def _download_to_path(self, table_name: str, path: str) -> None:
        """Vendor-specific download of ``table_name`` to the local ``path``."""
        pass

    def load_file_to_duckdb(self, table_name: str) -> str:
        """Template method: download ``table_name`` and load it into DuckDB.

        :param table_name: name of the source file (must carry an extension)
        :return: path of the generated .duckdb file
        :raises ValueError: if ``table_name`` has no file extension
        """
        os.makedirs(self.temp_dir_name, exist_ok=True)

        ext = Path(table_name).suffix
        if not ext:
            raise ValueError(f"Invalid file name {table_name}")

        temp_path = f"{self.temp_dir_name}/{uuid.uuid4()}{ext}"

        try:
            self._download_to_path(table_name, temp_path)
            return self._load_path_to_duckdb(temp_path, table_name)
        finally:
            # Always remove the downloaded file; the .duckdb file is the
            # artifact callers keep (DuckDb.close() removes it later).
            if os.path.exists(temp_path):
                os.remove(temp_path)
                logger.info(f"Cleaned up temp file {temp_path}")

    def _load_path_to_duckdb(self, path: str, table_name: str) -> str:
        """Load the CSV at ``path`` into a fresh DuckDB database file.

        Tries ``read_csv_auto`` first; on failure, retries with every
        column forced to VARCHAR (handles files with mixed/dirty types).

        :param path: local path of the downloaded CSV
        :param table_name: original file name; its stem becomes the table name
        :return: path of the generated .duckdb file
        """
        duckdb_path = f"{self.temp_dir_name}/{uuid.uuid4()}.duckdb"
        table_stem = Path(table_name).stem

        logger.info(f"Loading {path} into DuckDB")

        conn = None
        try:
            conn = duckdb.connect(database=duckdb_path, read_only=False)
            conn.execute(
                f'CREATE TABLE "{table_stem}" AS SELECT * FROM read_csv_auto(?)',
                [path],
            )
            logger.info(f"Successfully loaded data into {duckdb_path}")
            return duckdb_path
        except Exception as e:
            logger.warning(f"read_csv_auto failed: {e}. Trying with ALL_VARCHAR=TRUE")
            try:
                if conn:
                    conn.close()
                    conn = None
                conn = duckdb.connect(database=duckdb_path, read_only=False)
                conn.execute(
                    f'CREATE TABLE "{table_stem}" AS ' f"SELECT * FROM read_csv(?, ALL_VARCHAR=TRUE, SAMPLE_SIZE=-1)",
                    [path],
                )
                logger.info(f"Successfully loaded data with ALL_VARCHAR into {duckdb_path}")
                return duckdb_path
            except Exception as fallback_error:
                logger.error(f"Failed to load CSV into DuckDB: {fallback_error}")
                # Close the connection BEFORE unlinking: the original removed
                # the .duckdb file while the handle was still open, which fails
                # on platforms that forbid deleting open files (e.g. Windows).
                if conn:
                    conn.close()
                    conn = None
                if os.path.exists(duckdb_path):
                    os.remove(duckdb_path)
                raise
        finally:
            if conn:
                conn.close()
@@ -0,0 +1,112 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import uuid
17
+ from pathlib import Path
18
+ from typing import Any, Dict, Optional
19
+
20
+ import duckdb
21
+ from azure.storage.blob import BlobServiceClient
22
+ from loguru import logger
23
+
24
+ from dcs_core.core.common.errors import (
25
+ DatachecksColumnFetchError,
26
+ DataChecksDataSourcesConnectionError,
27
+ DatachecksTableFetchError,
28
+ )
29
+ from dcs_core.core.common.models.data_source_resource import RawColumnInfo
30
+ from dcs_core.core.datasource.file_datasource import FileDataSource
31
+
32
+
33
class AzureBlobDataSource(FileDataSource):
    """File data source backed by an Azure Blob Storage container.

    "Tables" are CSV blobs at the top level of the configured container;
    reading is done by downloading a blob and loading it into DuckDB
    (see FileDataSource).
    """

    def __init__(self, data_source_name: str, data_connection: Dict):
        super().__init__(data_source_name, data_connection)
        self.allowed_file_extensions = [".csv"]
        self.blob_service_client: Optional[BlobServiceClient] = None
        self.DEFAULT_NUMERIC_PRECISION = 16383
        # ContainerClient once connect() succeeds, else None.
        self.connection = None

    def connect(self) -> Any:
        """Connect to Azure Blob Storage and return a container client.

        Reads ``account_name``, ``container_name``, ``account_key`` and
        optional ``endpoint_suffix`` from the data connection config.

        :raises DataChecksDataSourcesConnectionError: on any failure
        """
        try:
            account_name = self.data_connection.get("account_name")
            container_name = self.data_connection.get("container_name")
            account_key = self.data_connection.get("account_key")
            endpoint_suffix = self.data_connection.get("endpoint_suffix", "core.windows.net")
            connection_str = f"https://{account_name}.blob.{endpoint_suffix}"
            blob_service_client = BlobServiceClient(account_url=connection_str, credential=account_key)
            self.blob_service_client = blob_service_client
            self.connection = blob_service_client.get_container_client(container=container_name)
            return self.connection
        except Exception as e:
            raise DataChecksDataSourcesConnectionError(f"Failed to connect to Azure Blob Storage: {e}")

    def is_connected(self) -> bool:
        """Return True if connect() succeeded and close() has not been called."""
        return self.connection is not None

    def close(self):
        """Close the container and service clients.

        Safe to call before connect() or more than once: the original
        unconditionally dereferenced both clients, raising AttributeError
        when never connected.
        """
        if self.connection is not None:
            self.connection.close()
            self.connection = None
        if self.blob_service_client is not None:
            self.blob_service_client.close()
            self.blob_service_client = None

    def query_get_table_names(self) -> dict:
        """List blob names ("tables") with an allowed extension.

        :raises DataChecksDataSourcesConnectionError: if not connected
        :raises DatachecksTableFetchError: if listing fails
        """
        if not self.is_connected():
            raise DataChecksDataSourcesConnectionError("Not connected to Azure Blob Storage")
        try:
            subfolder = self.data_connection.get("subfolder", "")
            blob_iterator = self.connection.list_blobs(name_starts_with=subfolder)
            # NOTE(review): the split("/") check keeps only top-level blobs,
            # so a non-empty ``subfolder`` always yields an empty list —
            # confirm whether the filter should be relative to the subfolder.
            blobs = [
                blob.name
                for blob in blob_iterator
                if len(blob.name.split("/")) == 1 and blob.name.endswith(tuple(self.allowed_file_extensions))
            ]
            return {"table": blobs}
        except Exception as e:
            raise DatachecksTableFetchError(f"Failed to list blobs: {e}")

    def safe_get(self, lst, idx, default=None):
        """Return ``lst[idx]`` if the index is in range, else ``default``."""
        return lst[idx] if 0 <= idx < len(lst) else default

    def query_get_database_version(self) -> str:
        """Get the database version.

        :return: the Azure Storage REST API version string in use
        """
        api_version = self.blob_service_client.api_version
        return api_version

    def _download_to_path(self, table_name: str, path: str):
        """Download blob ``table_name`` to the local ``path`` in chunks.

        :raises DataChecksDataSourcesConnectionError: on download failure
        """
        blob_client = self.connection.get_blob_client(blob=table_name)
        logger.info(f"Downloading {table_name} to {path}")
        try:
            with open(path, "wb") as f:
                stream = blob_client.download_blob()
                for chunk in stream.chunks():
                    f.write(chunk)
        except Exception as e:
            raise DataChecksDataSourcesConnectionError(f"Failed to download blob '{table_name}': {e}")
@@ -0,0 +1,141 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+ from pathlib import Path
16
+ from typing import Any, Dict
17
+
18
+ import duckdb
19
+ from loguru import logger
20
+
21
+ from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
22
+ from dcs_core.core.common.models.data_source_resource import RawColumnInfo
23
+ from dcs_core.core.datasource.sql_datasource import SQLDataSource
24
+
25
+
26
class DuckDb(SQLDataSource):
    """SQL data source over a local DuckDB database file.

    Used as the query engine for file-based data sources: the .duckdb
    file is generated from a downloaded file (see FileDataSource) and is
    deleted again by close().
    """

    def __init__(self, data_source_name: str, data_connection: Dict):
        super().__init__(data_source_name, data_connection)
        self.connection = None
        # DuckDB's Python API takes raw SQL strings, not SQLAlchemy text().
        self.use_sa_text_query = False
        # Regex patterns consumed by validity validations.
        self.regex_patterns = {
            "uuid": r"^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$",
            "usa_phone": r"^(\+1[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}$",
            "email": r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$",
            "usa_zip_code": r"^[0-9]{5}(?:-[0-9]{4})?$",
            "ssn": r"^[0-9]{3}-[0-9]{2}-[0-9]{4}$",
            "sedol": r"^[B-DF-HJ-NP-TV-XZ0-9]{6}[0-9]$",
            "lei": r"^[A-Z0-9]{18}[0-9]{2}$",
            "cusip": r"^[0-9A-Z]{9}$",
            "figi": r"^BBG[A-Z0-9]{9}$",
            "isin": r"^[A-Z]{2}[A-Z0-9]{9}[0-9]$",
            "perm_id": r"^\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{3}$",
        }
        self.DEFAULT_NUMERIC_PRECISION = 16383

    def connect(self) -> Any:
        """Open the DuckDB database at ``file_path`` from the connection config.

        :raises DataChecksDataSourcesConnectionError: on any failure
        """
        try:
            file_path = self.data_connection.get("file_path")
            self.connection = duckdb.connect(database=file_path)
            return self.connection
        except Exception as e:
            raise DataChecksDataSourcesConnectionError(f"Failed to connect to DuckDB: {e}")

    def is_connected(self) -> bool:
        """Return True if connect() succeeded (connection handle present)."""
        return self.connection is not None

    def close(self):
        """Close the connection and delete the backing .duckdb file.

        The file is a temporary artifact produced by FileDataSource, so it
        is removed here; removal failures are logged, not raised.
        """
        logger.info("Closing DuckDB connection")
        if self.connection:
            self.connection.close()
        try:
            fp = self.data_connection.get("file_path")
            if fp and os.path.exists(fp):
                os.remove(fp)
        except Exception as e:
            logger.error(f"Failed to remove the file {self.data_connection.get('file_path')}: {e}")

    def qualified_table_name(self, table_name: str) -> str:
        """Get the qualified table name.

        :param table_name: name of the table
        :return: qualified table name
        """
        return f'"{table_name}"'

    def quote_column(self, column: str) -> str:
        """Quote the column name.

        :param column: name of the column
        :return: quoted column name
        """
        return f'"{column}"'

    def query_get_table_columns(
        self,
        table: str,
        schema: str | None = None,
    ) -> Dict[str, RawColumnInfo]:
        """Get the schema of a table.

        :param table: table name
        :param schema: optional schema name (defaults to the datasource schema)
        :return: dictionary mapping column names to their RawColumnInfo
        :raises RuntimeError: if the table does not exist or has no columns
        """
        schema = schema or self.schema_name
        # NOTE(review): the table name is interpolated into the SQL string;
        # names come from internally generated .duckdb files, but consider
        # binding it as a parameter if user-supplied names ever reach here.
        query = f"""
            SELECT
                column_name,
                data_type,
                CASE WHEN data_type IN ('TIMESTAMP', 'TIME') THEN datetime_precision ELSE NULL END AS datetime_precision,
                CASE WHEN data_type = 'DECIMAL' THEN COALESCE(numeric_precision, 131072 + {self.DEFAULT_NUMERIC_PRECISION})
                    WHEN data_type IN ('DOUBLE', 'REAL', 'FLOAT') THEN numeric_precision
                    ELSE numeric_precision END AS numeric_precision,
                CASE WHEN data_type = 'DECIMAL' THEN COALESCE(numeric_scale, {self.DEFAULT_NUMERIC_PRECISION}) ELSE numeric_scale END AS numeric_scale,
                NULL AS collation_name,
                CASE WHEN data_type = 'VARCHAR' THEN character_maximum_length ELSE NULL END AS character_maximum_length
            FROM information_schema.columns
            WHERE table_name = '{table}'
            ORDER BY ordinal_position
        """

        rows = self.fetchall(query)
        if not rows:
            raise RuntimeError(f"{table}: Table, {schema}: Schema, does not exist, or has no columns")

        column_info = {
            r[0]: RawColumnInfo(
                column_name=self.safe_get(r, 0),
                data_type=self.safe_get(r, 1),
                datetime_precision=self.safe_get(r, 2),
                numeric_precision=self.safe_get(r, 3),
                numeric_scale=self.safe_get(r, 4),
                collation_name=self.safe_get(r, 5),
                character_maximum_length=self.safe_get(r, 6),
            )
            for r in rows
        }
        return column_info
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.7.0"
15
+ __version__ = "1.7.1"
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "dcs-sdk"
3
- version = "1.7.0"
3
+ version = "1.7.1"
4
4
  description = "SDK for DataChecks"
5
5
  authors = ["Waterdip Labs <hello@waterdip.ai>"]
6
6
  readme = "README.md"
@@ -1,30 +0,0 @@
1
- # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from typing import Dict
16
-
17
- from dcs_core.core.datasource.base import DataSource
18
-
19
-
20
- class FileDataSource(DataSource):
21
- """
22
- Abstract class for File data sources
23
- """
24
-
25
- def __init__(self, data_source_name: str, data_connection: Dict):
26
- super().__init__(data_source_name, data_connection)
27
-
28
- def load_file_to_duckdb(self, table_name: str):
29
- """Load the file to duckdb"""
30
- pass
@@ -1,217 +0,0 @@
1
- # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import io
16
- import os
17
- import uuid
18
- from concurrent.futures import ThreadPoolExecutor
19
- from pathlib import Path
20
- from queue import Empty, Queue
21
- from typing import Any, Dict, List, Optional
22
-
23
- import duckdb
24
- import pandas as pd
25
- from azure.storage.blob import BlobServiceClient
26
- from loguru import logger
27
-
28
- from dcs_core.core.common.errors import (
29
- DatachecksColumnFetchError,
30
- DataChecksDataSourcesConnectionError,
31
- DatachecksTableFetchError,
32
- )
33
- from dcs_core.core.datasource.file_datasource import FileDataSource
34
-
35
-
36
- class AzureBlobDataSource(FileDataSource):
37
- def __init__(self, data_source_name: str, data_connection: Dict):
38
- super().__init__(data_source_name, data_connection)
39
- self.allowed_file_extensions = [".csv"]
40
- self.blob_service_client: Optional[BlobServiceClient] = None
41
- self.connection = None
42
-
43
- def connect(self) -> Any:
44
- """
45
- Connect to the file data source
46
- """
47
- try:
48
- account_name = self.data_connection.get("account_name")
49
- container_name = self.data_connection.get("container_name")
50
- account_key = self.data_connection.get("account_key")
51
- endpoint_suffix = self.data_connection.get("endpoint_suffix", "core.windows.net")
52
- connection_str = f"https://{account_name}.blob.{endpoint_suffix}"
53
- blob_service_client = BlobServiceClient(account_url=connection_str, credential=account_key)
54
- self.blob_service_client = blob_service_client
55
- self.connection = blob_service_client.get_container_client(container=container_name)
56
- return self.connection
57
- except Exception as e:
58
- raise DataChecksDataSourcesConnectionError(f"Failed to connect to Azure Blob Storage: {e}")
59
-
60
- def is_connected(self) -> bool:
61
- """
62
- Check if the file data source is connected
63
- """
64
- return self.connection is not None
65
-
66
- def close(self):
67
- """
68
- Close the connection
69
- """
70
- self.connection.close()
71
- self.blob_service_client.close()
72
- self.connection = None
73
- self.blob_service_client = None
74
-
75
- def query_get_table_names(self) -> dict:
76
- """
77
- Query to get table names (blob names in this case)
78
- """
79
- if not self.is_connected():
80
- raise DataChecksDataSourcesConnectionError("Not connected to Azure Blob Storage")
81
- try:
82
- subfolder = self.data_connection.get("subfolder", "")
83
- blob_iterator = self.connection.list_blobs(name_starts_with=subfolder)
84
- blobs = [
85
- blob.name
86
- for blob in blob_iterator
87
- if len(blob.name.split("/")) == 1 and blob.name.endswith(tuple(self.allowed_file_extensions))
88
- ]
89
- return {"table": blobs}
90
- except Exception as e:
91
- raise DatachecksTableFetchError(f"Failed to list blobs: {e}")
92
-
93
- def query_get_table_columns(self, table: str) -> List[dict]:
94
- """
95
- Get column names for a table (CSV blob in this case).
96
- """
97
- if not self.is_connected():
98
- raise DataChecksDataSourcesConnectionError("Not connected to Azure Blob Storage")
99
-
100
- if not any(table.endswith(ext) for ext in self.allowed_file_extensions):
101
- raise ValueError(f"Unsupported file type for {table}. Allowed: {self.allowed_file_extensions}")
102
-
103
- try:
104
- blob_client = self.connection.get_blob_client(blob=table)
105
- download_stream = blob_client.download_blob()
106
- data = download_stream.readall()
107
- if table.endswith(".csv"):
108
- df = pd.read_csv(io.BytesIO(data))
109
- else:
110
- raise ValueError(f"Unsupported file type for {table}. Allowed: {self.allowed_file_extensions}")
111
-
112
- return [{"column_name": col, "column_type": "string"} for col in df.columns.tolist()]
113
- except Exception as e:
114
- raise DatachecksColumnFetchError(f"Failed to read columns from blob '{table}': {e}")
115
-
116
- def query_get_database_version(self) -> str:
117
- """
118
- Get the database version
119
- :return: version string
120
- """
121
- api_version = self.blob_service_client.api_version
122
- return api_version
123
-
124
- def _chunk_load_to_pandas(self, queue: Queue, result_df: list, timeout: float = 2.0):
125
- """Consumer thread: read CSV chunks from queue & build final DataFrame"""
126
- df = pd.DataFrame()
127
- try:
128
- while True:
129
- try:
130
- data = queue.get(timeout=timeout)
131
- except Empty:
132
- continue
133
-
134
- if data is None:
135
- break
136
-
137
- try:
138
- chunk = pd.read_csv(io.BytesIO(data), dtype=str)
139
- df = pd.concat([df, chunk], ignore_index=True)
140
- except Exception as e:
141
- logger.error(f"[ERROR] Failed to read CSV chunk: {e}")
142
- continue
143
-
144
- except Exception as e:
145
- logger.error(f"[FATAL] Consumer crashed: {e}")
146
-
147
- finally:
148
- result_df.append(df)
149
-
150
- def _load_blob_to_pandas(self, table_name: str):
151
- blob_client = self.connection.get_blob_client(blob=table_name)
152
- CHUNK_SIZE = 4 * 1024 * 1024
153
- blob_size = blob_client.get_blob_properties().size
154
- start = 0
155
- queue = Queue()
156
- result_df = []
157
-
158
- with ThreadPoolExecutor(max_workers=1) as executor:
159
- executor.submit(self._chunk_load_to_pandas, queue, result_df)
160
-
161
- all_data = b""
162
- while start < blob_size:
163
- end = min(start + CHUNK_SIZE - 1, blob_size - 1)
164
- data = blob_client.download_blob(offset=start, length=end - start + 1).readall()
165
- all_data += data
166
- queue.put(data)
167
- start += CHUNK_SIZE
168
-
169
- queue.put(None)
170
- if not result_df or len(result_df) == 0:
171
- raise ValueError("No data downloaded from Azure Blob Storage")
172
- return result_df[0]
173
-
174
- def _load_pd_to_duckdb(self, df: pd.DataFrame, table_name: str):
175
- dir_name = "tmp"
176
- if not os.path.exists(dir_name):
177
- os.makedirs(dir_name)
178
-
179
- duck_db_file_name = f"{dir_name}/{uuid.uuid4()}.duckdb"
180
- file_path = None
181
- try:
182
- table_name = table_name
183
-
184
- conn = duckdb.connect(database=duck_db_file_name, read_only=False)
185
-
186
- file_path = duck_db_file_name
187
-
188
- conn.register("df_view", df)
189
-
190
- conn.execute(
191
- f"""
192
- CREATE OR REPLACE TABLE "{table_name}" AS
193
- SELECT * FROM df_view;
194
- """
195
- )
196
- conn.unregister("df_view")
197
- conn.close()
198
-
199
- except Exception as e:
200
- logger.error(f"Error in loading CSV to DuckDB: {e}")
201
- raise
202
-
203
- return file_path
204
-
205
- def load_file_to_duckdb(self, table_name: str):
206
- logger.info(f"Loading {table_name} to pandas")
207
- df: pd.DataFrame = self._load_blob_to_pandas(table_name)
208
-
209
- if df is None or df.empty:
210
- raise ValueError("No data downloaded from Azure Blob Storage")
211
-
212
- name_only = Path(table_name).stem
213
-
214
- logger.info(f"Loading {table_name} to duckdb")
215
- file_path = self._load_pd_to_duckdb(df, name_only)
216
-
217
- return file_path