dcs-sdk 1.6.9__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -44,6 +44,7 @@ class DataSourceType(str, Enum):
44
44
  DB2 = "db2"
45
45
  SYBASE = "sybase"
46
46
  AZURE_BLOB = "azure_blob"
47
+ DUCK_DB = "duck_db"
47
48
 
48
49
 
49
50
  class DataSourceLanguageSupport(str, Enum):
@@ -92,6 +93,8 @@ class DataSourceConnectionConfiguration:
92
93
  endpoint_suffix: Optional[str] = None
93
94
  subfolder_path: Optional[str] = None
94
95
 
96
+ file_path: Optional[str] = None
97
+
95
98
 
96
99
  @dataclass
97
100
  class DataSourceConfiguration:
@@ -81,6 +81,7 @@ class DataSourceConfigParser(ConfigParser):
81
81
  protocol=config["connection"].get("protocol"),
82
82
  driver=config["connection"].get("driver"),
83
83
  server=config["connection"].get("server"),
84
+ file_path=config["connection"].get("file_path"),
84
85
  )
85
86
  return connection_config
86
87
 
@@ -24,3 +24,7 @@ class FileDataSource(DataSource):
24
24
 
25
25
  def __init__(self, data_source_name: str, data_connection: Dict):
26
26
  super().__init__(data_source_name, data_connection)
27
+
28
def load_file_to_duckdb(self, table_name: str):
    """
    Load the file to duckdb.

    Placeholder implementation: it ignores ``table_name``, performs no work
    and returns ``None``.

    :param table_name: name of the file/table to load (unused here)
    :return: always ``None``
    """
    return None
@@ -58,6 +58,7 @@ class DataSourceManager:
58
58
  "db2": "DB2DataSource",
59
59
  "sybase": "SybaseDataSource",
60
60
  "azure_blob": "AzureBlobDataSource",
61
+ "duck_db": "DuckDb",
61
62
  }
62
63
 
63
64
  def __init__(self, config: Configuration):
dcs_core/core/inspect.py CHANGED
@@ -130,7 +130,6 @@ class Inspect:
130
130
  try:
131
131
  self.data_source_manager.connect()
132
132
  self.validation_manager.build_validations()
133
-
134
133
  validation_infos: Dict[str, ValidationInfo] = {}
135
134
 
136
135
  for datasource, _ in self.validation_manager.get_validations.items():
@@ -13,10 +13,17 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import io
16
+ import os
17
+ import uuid
18
+ from concurrent.futures import ThreadPoolExecutor
19
+ from pathlib import Path
20
+ from queue import Empty, Queue
16
21
  from typing import Any, Dict, List, Optional
17
22
 
23
+ import duckdb
18
24
  import pandas as pd
19
25
  from azure.storage.blob import BlobServiceClient
26
+ from loguru import logger
20
27
 
21
28
  from dcs_core.core.common.errors import (
22
29
  DatachecksColumnFetchError,
@@ -113,3 +120,98 @@ class AzureBlobDataSource(FileDataSource):
113
120
  """
114
121
  api_version = self.blob_service_client.api_version
115
122
  return api_version
123
+
124
+ def _chunk_load_to_pandas(self, queue: Queue, result_df: list, timeout: float = 2.0):
125
+ """Consumer thread: read CSV chunks from queue & build final DataFrame"""
126
+ df = pd.DataFrame()
127
+ try:
128
+ while True:
129
+ try:
130
+ data = queue.get(timeout=timeout)
131
+ except Empty:
132
+ continue
133
+
134
+ if data is None:
135
+ break
136
+
137
+ try:
138
+ chunk = pd.read_csv(io.BytesIO(data), dtype=str)
139
+ df = pd.concat([df, chunk], ignore_index=True)
140
+ except Exception as e:
141
+ logger.error(f"[ERROR] Failed to read CSV chunk: {e}")
142
+ continue
143
+
144
+ except Exception as e:
145
+ logger.error(f"[FATAL] Consumer crashed: {e}")
146
+
147
+ finally:
148
+ result_df.append(df)
149
+
150
def _load_blob_to_pandas(self, table_name: str):
    """
    Download a blob in fixed-size byte ranges and return it as a DataFrame.

    The ranges are handed through a queue to ``_chunk_load_to_pandas``, which
    runs on a single worker thread and appends the resulting DataFrame to a
    shared list.

    :param table_name: name of the blob to download
    :return: the DataFrame produced by the consumer
    :raises ValueError: if the consumer produced no result
    """
    blob_client = self.connection.get_blob_client(blob=table_name)
    chunk_size = 4 * 1024 * 1024  # bytes requested per ranged download
    blob_size = blob_client.get_blob_properties().size
    queue = Queue()
    result_df = []

    with ThreadPoolExecutor(max_workers=1) as executor:
        executor.submit(self._chunk_load_to_pandas, queue, result_df)
        try:
            for start in range(0, blob_size, chunk_size):
                length = min(chunk_size, blob_size - start)
                queue.put(blob_client.download_blob(offset=start, length=length).readall())
        finally:
            # Always send the sentinel: without it a failed download would
            # leave the consumer polling forever and the executor shutdown
            # (which waits for the worker) would never return.
            queue.put(None)

    # Leaving the ``with`` block waits for the consumer, so its output is
    # complete by the time it is inspected here.
    if not result_df:
        raise ValueError("No data downloaded from Azure Blob Storage")
    return result_df[0]
173
+
174
def _load_pd_to_duckdb(self, df: pd.DataFrame, table_name: str):
    """
    Persist a DataFrame as a table in a newly created on-disk DuckDB database.

    The database is written to ``tmp/<uuid4>.duckdb`` relative to the current
    working directory. The connection is always closed before returning, and
    a database file left behind by a failed load is removed on a best-effort
    basis because its path is never handed back to the caller.

    :param df: data to store
    :param table_name: name of the table to create (or replace)
    :return: path of the DuckDB database file
    :raises Exception: any error raised while loading is logged and re-raised
    """
    dir_name = "tmp"
    # exist_ok avoids the check-then-create race when loads run concurrently.
    os.makedirs(dir_name, exist_ok=True)

    duck_db_file_name = f"{dir_name}/{uuid.uuid4()}.duckdb"
    # Double embedded quotes so the name stays one quoted SQL identifier.
    quoted_table = '"' + table_name.replace('"', '""') + '"'

    conn = None
    loaded = False
    try:
        conn = duckdb.connect(database=duck_db_file_name, read_only=False)
        conn.register("df_view", df)
        conn.execute(f"CREATE OR REPLACE TABLE {quoted_table} AS SELECT * FROM df_view;")
        conn.unregister("df_view")
        loaded = True
    except Exception as e:
        logger.error(f"Error in loading CSV to DuckDB: {e}")
        raise
    finally:
        if conn is not None:
            conn.close()
        if not loaded:
            try:
                os.remove(duck_db_file_name)
            except OSError:
                # Nothing was created or it cannot be removed; the original
                # error is the one worth reporting.
                pass

    return duck_db_file_name
204
+
205
def load_file_to_duckdb(self, table_name: str):
    """
    Fetch a blob into pandas, then store it in a DuckDB database file.

    The DuckDB table is named after the stem of ``table_name`` (the final
    path component without its suffix).

    :param table_name: name of the blob to load
    :return: path of the DuckDB database file returned by ``_load_pd_to_duckdb``
    :raises ValueError: if the downloaded frame is ``None`` or empty
    """
    logger.info(f"Loading {table_name} to pandas")
    frame: pd.DataFrame = self._load_blob_to_pandas(table_name)

    nothing_loaded = frame is None or frame.empty
    if nothing_loaded:
        raise ValueError("No data downloaded from Azure Blob Storage")

    duckdb_table = Path(table_name).stem

    logger.info(f"Loading {table_name} to duckdb")
    return self._load_pd_to_duckdb(frame, duckdb_table)
@@ -0,0 +1,72 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+ from typing import Any, Dict
16
+
17
+ import duckdb
18
+ from loguru import logger
19
+
20
+ from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
21
+ from dcs_core.core.datasource.sql_datasource import SQLDataSource
22
+
23
+
24
class DuckDb(SQLDataSource):
    """SQL data source that runs queries against a DuckDB database file."""

    def __init__(self, data_source_name: str, data_connection: Dict):
        """
        Initialise the data source without opening a connection.

        :param data_source_name: name forwarded to the parent constructor
        :param data_connection: connection settings; ``file_path`` is read from it
        """
        super().__init__(data_source_name, data_connection)
        self.connection = None
        self.use_sa_text_query = False

    def connect(self) -> Any:
        """
        Open the DuckDB database located at the configured ``file_path``.

        :return: the opened DuckDB connection
        :raises DataChecksDataSourcesConnectionError: if the connection cannot be opened
        """
        try:
            file_path = self.data_connection.get("file_path")
            self.connection = duckdb.connect(database=file_path)
            return self.connection
        except Exception as e:
            # Chain the cause so the underlying DuckDB error stays in the traceback.
            raise DataChecksDataSourcesConnectionError(f"Failed to connect to DuckDB: {e}") from e

    def is_connected(self) -> bool:
        """
        Check if the data source currently holds a connection.

        :return: True when a connection object is set
        """
        return self.connection is not None

    def close(self):
        """
        Close the connection and delete the database file.

        Safe to call when no connection was ever opened. The file at
        ``file_path`` is removed even if closing the connection fails; a
        failure to remove it is logged rather than raised.
        """
        logger.info("Closing DuckDB connection")
        file_path = self.data_connection.get("file_path")
        try:
            if self.connection is not None:
                self.connection.close()
        finally:
            # Reset so is_connected() reports the real state after closing.
            self.connection = None
            if file_path:
                try:
                    os.remove(file_path)
                except Exception as e:
                    logger.error(f"Failed to remove the file {file_path}: {e}")

    def qualified_table_name(self, table_name: str) -> str:
        """
        Get the qualified table name
        :param table_name: name of the table
        :return: the table name wrapped in double quotes
        """
        return f'"{table_name}"'

    def quote_column(self, column: str) -> str:
        """
        Quote the column name
        :param column: name of the column
        :return: the column name wrapped in double quotes
        """
        return f'"{column}"'
dcs_sdk/__version__.py CHANGED
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.6.9"
15
+ __version__ = "1.7.0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dcs-sdk
3
- Version: 1.6.9
3
+ Version: 1.7.0
4
4
  Summary: SDK for DataChecks
5
5
  Author: Waterdip Labs
6
6
  Author-email: hello@waterdip.ai
@@ -86,7 +86,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
86
86
  Description-Content-Type: text/markdown
87
87
 
88
88
  <h1 align="center">
89
- DCS SDK v1.6.9
89
+ DCS SDK v1.7.0
90
90
  </h1>
91
91
 
92
92
  > SDK for DataChecks
@@ -51,7 +51,7 @@ dcs_core/core/__init__.py,sha256=8XyOIsx-uCpaEZUgfOrb0DCdvmz1TipNQdz01h7mun0,761
51
51
  dcs_core/core/common/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
52
52
  dcs_core/core/common/errors.py,sha256=nRczSqORCjcDngAuDsqzsc3_yZQzuUX26lPov0pTE1I,2268
53
53
  dcs_core/core/common/models/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
54
- dcs_core/core/common/models/configuration.py,sha256=cFFr_SiAqYR3NIFGfz4rJVVX-LuGu-9TJC47ghL3Tes,9396
54
+ dcs_core/core/common/models/configuration.py,sha256=PDmDZMkjSxKho8XIPrE0rF_SPe3CRt5KIXOezOBHiGg,9457
55
55
  dcs_core/core/common/models/dashboard.py,sha256=_WV1kbs4cKlFZ5QcXyMdTmDSZLYxhvZWWWQzvHReMxM,814
56
56
  dcs_core/core/common/models/data_source_resource.py,sha256=rNvj5NjvEQi2irHYjClKBFZbp70LTX9oGCPDeFURlAI,1559
57
57
  dcs_core/core/common/models/metric.py,sha256=0Oxp7YvWZVy7zbmi4u_opBDeknsuzXmnOrK01pP2fQw,4843
@@ -60,15 +60,15 @@ dcs_core/core/common/models/validation.py,sha256=yGSL-hZgvKqSgj0nqNIqUm_DmNVlKbl
60
60
  dcs_core/core/common/models/widget.py,sha256=-IaZ5dAmPPZwMvpzJDQfEINfIPUsqS5rufBak1c7Y6A,1083
61
61
  dcs_core/core/configuration/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
62
62
  dcs_core/core/configuration/config_loader.py,sha256=xySV5DIJC7a1VioE2sq5X8rSInW-4qF4hm5bT-wxVlc,5637
63
- dcs_core/core/configuration/configuration_parser.py,sha256=KGOJqWbOWhTacuMwM1N55Kh6Ug-WrrjYLAaH9a8ynRk,11347
63
+ dcs_core/core/configuration/configuration_parser.py,sha256=ue7tzWkOpamhXw_DJhr5ZkqVKIEbP1AiZ2bQegsbdzg,11408
64
64
  dcs_core/core/configuration/configuration_parser_arc.py,sha256=TOoPf12pEXLdkjEGJEGV6rJOMR8yqLedla6T1x6g-Xw,14057
65
65
  dcs_core/core/datasource/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
66
66
  dcs_core/core/datasource/base.py,sha256=YD_UuGuoORFJNX30IQMk6aitiiTCHaiAddSNgUBmRtA,1935
67
- dcs_core/core/datasource/file_datasource.py,sha256=HG4av7KUFTfH2UlAl4bqcNI6MxpbSOA26cDqxmLUqh0,913
68
- dcs_core/core/datasource/manager.py,sha256=3oBjIqV0YYjXubCDGVBJP_jzrv-oBgBA-octoa8Wvaw,4795
67
+ dcs_core/core/datasource/file_datasource.py,sha256=ppjGOtzSaBV3DWIute0dvOHQxtfULgeIefsDFW3xZz8,1017
68
+ dcs_core/core/datasource/manager.py,sha256=cuh6XAOCxn2b9SQxYwYurgBb6WUD8ZS6KRIg3FAloYU,4824
69
69
  dcs_core/core/datasource/search_datasource.py,sha256=_conk1Q_kywJhKHYyEScoKlVt_yRd05zuAISvDmXqjw,15014
70
70
  dcs_core/core/datasource/sql_datasource.py,sha256=dlX-E--hadl2q8XpMNRyZmLGC35tltBsGDzlyZqzqtw,40730
71
- dcs_core/core/inspect.py,sha256=QICJKcEpQClLacsfNClFoiF08M01QnJh_U2VsXRh1iA,6427
71
+ dcs_core/core/inspect.py,sha256=H5MCR_o71hz1lvkueSMAJ5C4R93hogrEWl04iuwgrdM,6426
72
72
  dcs_core/core/logger/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
73
73
  dcs_core/core/logger/base.py,sha256=dbW48Y1t6okXy1dx8GSH3KpE5HX6pebHGuKqacJWF0E,1024
74
74
  dcs_core/core/logger/default_logger.py,sha256=7yieM99xnepxq5FSlu0TEhVd55X_kUGRqjiio5loQuU,4097
@@ -100,10 +100,11 @@ dcs_core/core/validation/uniqueness_validation.py,sha256=a6zm0_omiULKbQcDit8J913
100
100
  dcs_core/core/validation/validity_validation.py,sha256=358oAGH112oVxyPhDnfT-ypVaMAkpZ8pM73qogtdh9w,35297
101
101
  dcs_core/integrations/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
102
102
  dcs_core/integrations/databases/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
103
- dcs_core/integrations/databases/azure_blob.py,sha256=rOPj-dv3ZaGUrr_rLMn8xjZXuEjlzcdfZv2RcZgnbps,4674
103
+ dcs_core/integrations/databases/azure_blob.py,sha256=XjFLWuWYoWh5DGow1zgYypHvGtP7qN2oFwlvRth2aQ0,7973
104
104
  dcs_core/integrations/databases/bigquery.py,sha256=26RuypLMmiARZIWkV_mxtnNL2yCs94YWerSGH5Nr10Q,7337
105
105
  dcs_core/integrations/databases/databricks.py,sha256=n4fm5m_mtRCdtjLGDvbNW18u7Ev234vDBjq_lxuOxns,1978
106
106
  dcs_core/integrations/databases/db2.py,sha256=hNGivvYCitp88ouZlCxp7iRQ-vnPiK1kL8x85NyGotk,26492
107
+ dcs_core/integrations/databases/duck_db.py,sha256=J4ReqFmUFfbYbZlvjmjZfHiD5CkgX4TnzqljGf5wJCQ,2422
107
108
  dcs_core/integrations/databases/elasticsearch.py,sha256=6CTGs1WGrfgdDRNVt9DpOB0_z_znT6YoVj10E1WY-wQ,2152
108
109
  dcs_core/integrations/databases/mssql.py,sha256=g0MmoG8-xFphJ2oZl-q_OZ2oT6yz-lVY09JTIvIx4-0,38910
109
110
  dcs_core/integrations/databases/mysql.py,sha256=mUFLIGdbF_ktIlA19P7kq7holp5ZkRezGgN6TL_uiJ4,15815
@@ -133,7 +134,7 @@ dcs_core/report/static/index.js,sha256=p4wvku-zlXi0y4gWeSzV1amY0s4mjtUq2QsezARLV
133
134
  dcs_core/report/static/index.js.LICENSE.txt,sha256=bBDZBJVEDrqjCi7sfoF8CchjFn3hdcbNkP7ub7kbcXQ,201041
134
135
  dcs_sdk/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
135
136
  dcs_sdk/__main__.py,sha256=Qn8stIaQGrdLjHQ-H7xO0T-brtq5RWZoWU9QvqoarV8,683
136
- dcs_sdk/__version__.py,sha256=iSCeuxA5501nrospfk7ajv7gQWKLpPSH4npFVobI-gY,633
137
+ dcs_sdk/__version__.py,sha256=eNGN1UbpUzU92QJAufllEa8THGec2TPMG38N7GY-Rc8,633
137
138
  dcs_sdk/cli/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
138
139
  dcs_sdk/cli/cli.py,sha256=jaO52UrMWLafcF_yhqllPkmYSTuO2sksFi30fYFdAB4,4406
139
140
  dcs_sdk/sdk/__init__.py,sha256=skrZcgWWJBL6NXTUERywJ3qRJRemgpDXyW7lPg1FJk8,2107
@@ -155,7 +156,7 @@ dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py,sha256=puAWP
155
156
  dcs_sdk/sdk/utils/table.py,sha256=X8HxdYTWyx_oVrBWPsXlmA-xJKXXDBW9RrhlWNqA1As,18224
156
157
  dcs_sdk/sdk/utils/themes.py,sha256=Meo2Yldv4uyPpEqI7qdA28Aa6sxtwUU1dLKKm4QavjM,1403
157
158
  dcs_sdk/sdk/utils/utils.py,sha256=a9QGEVL8L7asbJm_VBwgKvJQknsvuqWS0uTUaHsDPiY,16463
158
- dcs_sdk-1.6.9.dist-info/METADATA,sha256=Od43VgAhrNLKIICnGJ3nSzmuUYJzHl7miH6OSysVE9U,7652
159
- dcs_sdk-1.6.9.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
160
- dcs_sdk-1.6.9.dist-info/entry_points.txt,sha256=XhODNz7UccgPOyklXgp7pIfTTXArd6-V0mImjhnhwto,80
161
- dcs_sdk-1.6.9.dist-info/RECORD,,
159
+ dcs_sdk-1.7.0.dist-info/METADATA,sha256=y4bCqzJK1nn3mIkDJ8uAQwfgHNR_Gytw_X3hic0Jhi0,7652
160
+ dcs_sdk-1.7.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
161
+ dcs_sdk-1.7.0.dist-info/entry_points.txt,sha256=XhODNz7UccgPOyklXgp7pIfTTXArd6-V0mImjhnhwto,80
162
+ dcs_sdk-1.7.0.dist-info/RECORD,,