dcs-sdk 1.6.9__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dcs_core/core/common/models/configuration.py +3 -0
- dcs_core/core/configuration/configuration_parser.py +1 -0
- dcs_core/core/datasource/file_datasource.py +4 -0
- dcs_core/core/datasource/manager.py +1 -0
- dcs_core/core/inspect.py +0 -1
- dcs_core/integrations/databases/azure_blob.py +102 -0
- dcs_core/integrations/databases/duck_db.py +72 -0
- dcs_sdk/__version__.py +1 -1
- {dcs_sdk-1.6.9.dist-info → dcs_sdk-1.7.0.dist-info}/METADATA +2 -2
- {dcs_sdk-1.6.9.dist-info → dcs_sdk-1.7.0.dist-info}/RECORD +12 -11
- {dcs_sdk-1.6.9.dist-info → dcs_sdk-1.7.0.dist-info}/WHEEL +0 -0
- {dcs_sdk-1.6.9.dist-info → dcs_sdk-1.7.0.dist-info}/entry_points.txt +0 -0
|
@@ -44,6 +44,7 @@ class DataSourceType(str, Enum):
|
|
|
44
44
|
DB2 = "db2"
|
|
45
45
|
SYBASE = "sybase"
|
|
46
46
|
AZURE_BLOB = "azure_blob"
|
|
47
|
+
DUCK_DB = "duck_db"
|
|
47
48
|
|
|
48
49
|
|
|
49
50
|
class DataSourceLanguageSupport(str, Enum):
|
|
@@ -92,6 +93,8 @@ class DataSourceConnectionConfiguration:
|
|
|
92
93
|
endpoint_suffix: Optional[str] = None
|
|
93
94
|
subfolder_path: Optional[str] = None
|
|
94
95
|
|
|
96
|
+
file_path: Optional[str] = None
|
|
97
|
+
|
|
95
98
|
|
|
96
99
|
@dataclass
|
|
97
100
|
class DataSourceConfiguration:
|
|
@@ -81,6 +81,7 @@ class DataSourceConfigParser(ConfigParser):
|
|
|
81
81
|
protocol=config["connection"].get("protocol"),
|
|
82
82
|
driver=config["connection"].get("driver"),
|
|
83
83
|
server=config["connection"].get("server"),
|
|
84
|
+
file_path=config["connection"].get("file_path"),
|
|
84
85
|
)
|
|
85
86
|
return connection_config
|
|
86
87
|
|
dcs_core/core/inspect.py
CHANGED
|
@@ -13,10 +13,17 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
import io
|
|
16
|
+
import os
|
|
17
|
+
import uuid
|
|
18
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from queue import Empty, Queue
|
|
16
21
|
from typing import Any, Dict, List, Optional
|
|
17
22
|
|
|
23
|
+
import duckdb
|
|
18
24
|
import pandas as pd
|
|
19
25
|
from azure.storage.blob import BlobServiceClient
|
|
26
|
+
from loguru import logger
|
|
20
27
|
|
|
21
28
|
from dcs_core.core.common.errors import (
|
|
22
29
|
DatachecksColumnFetchError,
|
|
@@ -113,3 +120,98 @@ class AzureBlobDataSource(FileDataSource):
|
|
|
113
120
|
"""
|
|
114
121
|
api_version = self.blob_service_client.api_version
|
|
115
122
|
return api_version
|
|
123
|
+
|
|
124
|
+
def _chunk_load_to_pandas(self, queue: Queue, result_df: list, timeout: float = 2.0):
    """Consumer thread: drain raw CSV byte chunks from ``queue`` and build one DataFrame.

    The producer slices the blob at fixed byte offsets, so a chunk can begin or
    end in the middle of a row, and only the first chunk carries the header
    line. Parsing each chunk on its own would corrupt the rows at every
    boundary and promote a data row to a header, so the chunks are collected
    and parsed once, after the ``None`` sentinel arrives.

    :param queue: queue of ``bytes`` chunks terminated by a ``None`` sentinel
    :param result_df: output list; exactly one DataFrame is appended to it. The
        frame is empty when nothing was received or the payload failed to parse.
    :param timeout: seconds to block on each ``queue.get`` before polling again
    """
    df = pd.DataFrame()
    chunks = []
    try:
        while True:
            try:
                data = queue.get(timeout=timeout)
            except Empty:
                # The producer is still downloading; keep waiting for the sentinel.
                continue

            if data is None:
                break
            chunks.append(data)

        # Join once instead of concatenating per chunk (linear, not quadratic).
        payload = b"".join(chunks)
        if payload:
            try:
                df = pd.read_csv(io.BytesIO(payload), dtype=str)
            except Exception as e:
                logger.error(f"[ERROR] Failed to read CSV chunk: {e}")

    except Exception as e:
        logger.error(f"[FATAL] Consumer crashed: {e}")

    finally:
        # Always publish a result so the caller never indexes an empty list.
        result_df.append(df)
|
|
149
|
+
|
|
150
|
+
def _load_blob_to_pandas(self, table_name: str):
    """Download a blob in fixed-size byte ranges and return it as a DataFrame.

    The downloaded ranges are handed through a queue to
    ``_chunk_load_to_pandas``, which runs on a single worker thread and appends
    its result to a shared list.

    :param table_name: name of the blob to download
    :return: the DataFrame produced by the consumer
    :raises ValueError: if the consumer produced no result
    """
    blob_client = self.connection.get_blob_client(blob=table_name)
    CHUNK_SIZE = 4 * 1024 * 1024
    blob_size = blob_client.get_blob_properties().size
    queue = Queue()
    result_df = []

    with ThreadPoolExecutor(max_workers=1) as executor:
        executor.submit(self._chunk_load_to_pandas, queue, result_df)
        try:
            for start in range(0, blob_size, CHUNK_SIZE):
                length = min(CHUNK_SIZE, blob_size - start)
                queue.put(blob_client.download_blob(offset=start, length=length).readall())
        finally:
            # Always send the sentinel: without it a failed download would leave
            # the consumer polling forever and the executor exit would hang.
            queue.put(None)

    # Leaving the ``with`` block joins the worker, so the result list is final here.
    if not result_df:
        raise ValueError("No data downloaded from Azure Blob Storage")
    return result_df[0]
|
|
173
|
+
|
|
174
|
+
def _load_pd_to_duckdb(self, df: pd.DataFrame, table_name: str):
    """Persist ``df`` as a table in a new DuckDB database file under ``tmp/``.

    :param df: DataFrame to store
    :param table_name: name of the table to create (or replace) in the new file
    :return: path of the DuckDB file that was created
    :raises Exception: re-raises any error from DuckDB after logging it
    """
    dir_name = "tmp"
    # exist_ok avoids the check-then-create race between concurrent loaders.
    os.makedirs(dir_name, exist_ok=True)

    duck_db_file_name = f"{dir_name}/{uuid.uuid4()}.duckdb"
    # Double embedded quotes so the name always stays one quoted identifier.
    quoted_table = '"' + table_name.replace('"', '""') + '"'

    try:
        conn = duckdb.connect(database=duck_db_file_name, read_only=False)
        try:
            conn.register("df_view", df)
            conn.execute(
                f"""
                CREATE OR REPLACE TABLE {quoted_table} AS
                SELECT * FROM df_view;
                """
            )
            conn.unregister("df_view")
        finally:
            # Close on failure too, so the connection is not leaked.
            conn.close()

    except Exception as e:
        logger.error(f"Error in loading CSV to DuckDB: {e}")
        # Do not leave a partial database file behind; it may not exist yet.
        try:
            os.remove(duck_db_file_name)
        except OSError:
            pass
        raise

    return duck_db_file_name
|
|
204
|
+
|
|
205
|
+
def load_file_to_duckdb(self, table_name: str):
    """Load a blob into pandas, then store it in a DuckDB file.

    The DuckDB table is named after the blob name without its directory part
    and final extension.

    :param table_name: name of the blob to load
    :return: path of the DuckDB file returned by ``_load_pd_to_duckdb``
    :raises ValueError: if the download produced no rows
    """
    logger.info(f"Loading {table_name} to pandas")
    frame: pd.DataFrame = self._load_blob_to_pandas(table_name)
    if frame is None or frame.empty:
        raise ValueError("No data downloaded from Azure Blob Storage")

    logger.info(f"Loading {table_name} to duckdb")
    return self._load_pd_to_duckdb(frame, Path(table_name).stem)
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
import os
|
|
15
|
+
from typing import Any, Dict
|
|
16
|
+
|
|
17
|
+
import duckdb
|
|
18
|
+
from loguru import logger
|
|
19
|
+
|
|
20
|
+
from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
|
|
21
|
+
from dcs_core.core.datasource.sql_datasource import SQLDataSource
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DuckDb(SQLDataSource):
    """SQL data source backed by the DuckDB file given as ``file_path``."""

    def __init__(self, data_source_name: str, data_connection: Dict):
        super().__init__(data_source_name, data_connection)
        self.connection = None
        self.use_sa_text_query = False

    def connect(self) -> Any:
        """
        Open the DuckDB database file named by ``file_path``
        :return: the DuckDB connection
        :raises DataChecksDataSourcesConnectionError: if the connection fails
        """
        try:
            file_path = self.data_connection.get("file_path")
            self.connection = duckdb.connect(database=file_path)
            return self.connection
        except Exception as e:
            # Chain the cause so the original DuckDB traceback is preserved.
            raise DataChecksDataSourcesConnectionError(f"Failed to connect to DuckDB: {e}") from e

    def is_connected(self) -> bool:
        """
        Check if a connection is currently held
        """
        return self.connection is not None

    def close(self):
        """
        Close the connection and delete the backing database file
        """
        logger.info("Closing DuckDB connection")
        # close() may be called without a successful connect(); guard against None.
        if self.connection is not None:
            try:
                self.connection.close()
            finally:
                # Reset so is_connected() reports the real state after close.
                self.connection = None

        file_path = self.data_connection.get("file_path")
        if not file_path:
            return
        try:
            os.remove(file_path)
        except Exception as e:
            logger.error(f"Failed to remove the file {file_path}: {e}")

    def qualified_table_name(self, table_name: str) -> str:
        """
        Get the qualified table name
        :param table_name: name of the table
        :return: qualified table name
        """
        # Double embedded quotes so the name stays one quoted identifier.
        escaped = table_name.replace('"', '""')
        return f'"{escaped}"'

    def quote_column(self, column: str) -> str:
        """
        Quote the column name
        :param column: name of the column
        :return: quoted column name
        """
        # Double embedded quotes so the name stays one quoted identifier.
        escaped = column.replace('"', '""')
        return f'"{escaped}"'
|
dcs_sdk/__version__.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dcs-sdk
|
|
3
|
-
Version: 1.6.9
|
|
3
|
+
Version: 1.7.0
|
|
4
4
|
Summary: SDK for DataChecks
|
|
5
5
|
Author: Waterdip Labs
|
|
6
6
|
Author-email: hello@waterdip.ai
|
|
@@ -86,7 +86,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
|
|
|
86
86
|
Description-Content-Type: text/markdown
|
|
87
87
|
|
|
88
88
|
<h1 align="center">
|
|
89
|
-
DCS SDK v1.6.9
|
|
89
|
+
DCS SDK v1.7.0
|
|
90
90
|
</h1>
|
|
91
91
|
|
|
92
92
|
> SDK for DataChecks
|
|
@@ -51,7 +51,7 @@ dcs_core/core/__init__.py,sha256=8XyOIsx-uCpaEZUgfOrb0DCdvmz1TipNQdz01h7mun0,761
|
|
|
51
51
|
dcs_core/core/common/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
52
52
|
dcs_core/core/common/errors.py,sha256=nRczSqORCjcDngAuDsqzsc3_yZQzuUX26lPov0pTE1I,2268
|
|
53
53
|
dcs_core/core/common/models/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
54
|
-
dcs_core/core/common/models/configuration.py,sha256=
|
|
54
|
+
dcs_core/core/common/models/configuration.py,sha256=PDmDZMkjSxKho8XIPrE0rF_SPe3CRt5KIXOezOBHiGg,9457
|
|
55
55
|
dcs_core/core/common/models/dashboard.py,sha256=_WV1kbs4cKlFZ5QcXyMdTmDSZLYxhvZWWWQzvHReMxM,814
|
|
56
56
|
dcs_core/core/common/models/data_source_resource.py,sha256=rNvj5NjvEQi2irHYjClKBFZbp70LTX9oGCPDeFURlAI,1559
|
|
57
57
|
dcs_core/core/common/models/metric.py,sha256=0Oxp7YvWZVy7zbmi4u_opBDeknsuzXmnOrK01pP2fQw,4843
|
|
@@ -60,15 +60,15 @@ dcs_core/core/common/models/validation.py,sha256=yGSL-hZgvKqSgj0nqNIqUm_DmNVlKbl
|
|
|
60
60
|
dcs_core/core/common/models/widget.py,sha256=-IaZ5dAmPPZwMvpzJDQfEINfIPUsqS5rufBak1c7Y6A,1083
|
|
61
61
|
dcs_core/core/configuration/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
62
62
|
dcs_core/core/configuration/config_loader.py,sha256=xySV5DIJC7a1VioE2sq5X8rSInW-4qF4hm5bT-wxVlc,5637
|
|
63
|
-
dcs_core/core/configuration/configuration_parser.py,sha256=
|
|
63
|
+
dcs_core/core/configuration/configuration_parser.py,sha256=ue7tzWkOpamhXw_DJhr5ZkqVKIEbP1AiZ2bQegsbdzg,11408
|
|
64
64
|
dcs_core/core/configuration/configuration_parser_arc.py,sha256=TOoPf12pEXLdkjEGJEGV6rJOMR8yqLedla6T1x6g-Xw,14057
|
|
65
65
|
dcs_core/core/datasource/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
66
66
|
dcs_core/core/datasource/base.py,sha256=YD_UuGuoORFJNX30IQMk6aitiiTCHaiAddSNgUBmRtA,1935
|
|
67
|
-
dcs_core/core/datasource/file_datasource.py,sha256=
|
|
68
|
-
dcs_core/core/datasource/manager.py,sha256=
|
|
67
|
+
dcs_core/core/datasource/file_datasource.py,sha256=ppjGOtzSaBV3DWIute0dvOHQxtfULgeIefsDFW3xZz8,1017
|
|
68
|
+
dcs_core/core/datasource/manager.py,sha256=cuh6XAOCxn2b9SQxYwYurgBb6WUD8ZS6KRIg3FAloYU,4824
|
|
69
69
|
dcs_core/core/datasource/search_datasource.py,sha256=_conk1Q_kywJhKHYyEScoKlVt_yRd05zuAISvDmXqjw,15014
|
|
70
70
|
dcs_core/core/datasource/sql_datasource.py,sha256=dlX-E--hadl2q8XpMNRyZmLGC35tltBsGDzlyZqzqtw,40730
|
|
71
|
-
dcs_core/core/inspect.py,sha256=
|
|
71
|
+
dcs_core/core/inspect.py,sha256=H5MCR_o71hz1lvkueSMAJ5C4R93hogrEWl04iuwgrdM,6426
|
|
72
72
|
dcs_core/core/logger/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
73
73
|
dcs_core/core/logger/base.py,sha256=dbW48Y1t6okXy1dx8GSH3KpE5HX6pebHGuKqacJWF0E,1024
|
|
74
74
|
dcs_core/core/logger/default_logger.py,sha256=7yieM99xnepxq5FSlu0TEhVd55X_kUGRqjiio5loQuU,4097
|
|
@@ -100,10 +100,11 @@ dcs_core/core/validation/uniqueness_validation.py,sha256=a6zm0_omiULKbQcDit8J913
|
|
|
100
100
|
dcs_core/core/validation/validity_validation.py,sha256=358oAGH112oVxyPhDnfT-ypVaMAkpZ8pM73qogtdh9w,35297
|
|
101
101
|
dcs_core/integrations/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
102
102
|
dcs_core/integrations/databases/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
103
|
-
dcs_core/integrations/databases/azure_blob.py,sha256=
|
|
103
|
+
dcs_core/integrations/databases/azure_blob.py,sha256=XjFLWuWYoWh5DGow1zgYypHvGtP7qN2oFwlvRth2aQ0,7973
|
|
104
104
|
dcs_core/integrations/databases/bigquery.py,sha256=26RuypLMmiARZIWkV_mxtnNL2yCs94YWerSGH5Nr10Q,7337
|
|
105
105
|
dcs_core/integrations/databases/databricks.py,sha256=n4fm5m_mtRCdtjLGDvbNW18u7Ev234vDBjq_lxuOxns,1978
|
|
106
106
|
dcs_core/integrations/databases/db2.py,sha256=hNGivvYCitp88ouZlCxp7iRQ-vnPiK1kL8x85NyGotk,26492
|
|
107
|
+
dcs_core/integrations/databases/duck_db.py,sha256=J4ReqFmUFfbYbZlvjmjZfHiD5CkgX4TnzqljGf5wJCQ,2422
|
|
107
108
|
dcs_core/integrations/databases/elasticsearch.py,sha256=6CTGs1WGrfgdDRNVt9DpOB0_z_znT6YoVj10E1WY-wQ,2152
|
|
108
109
|
dcs_core/integrations/databases/mssql.py,sha256=g0MmoG8-xFphJ2oZl-q_OZ2oT6yz-lVY09JTIvIx4-0,38910
|
|
109
110
|
dcs_core/integrations/databases/mysql.py,sha256=mUFLIGdbF_ktIlA19P7kq7holp5ZkRezGgN6TL_uiJ4,15815
|
|
@@ -133,7 +134,7 @@ dcs_core/report/static/index.js,sha256=p4wvku-zlXi0y4gWeSzV1amY0s4mjtUq2QsezARLV
|
|
|
133
134
|
dcs_core/report/static/index.js.LICENSE.txt,sha256=bBDZBJVEDrqjCi7sfoF8CchjFn3hdcbNkP7ub7kbcXQ,201041
|
|
134
135
|
dcs_sdk/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
135
136
|
dcs_sdk/__main__.py,sha256=Qn8stIaQGrdLjHQ-H7xO0T-brtq5RWZoWU9QvqoarV8,683
|
|
136
|
-
dcs_sdk/__version__.py,sha256=
|
|
137
|
+
dcs_sdk/__version__.py,sha256=eNGN1UbpUzU92QJAufllEa8THGec2TPMG38N7GY-Rc8,633
|
|
137
138
|
dcs_sdk/cli/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
138
139
|
dcs_sdk/cli/cli.py,sha256=jaO52UrMWLafcF_yhqllPkmYSTuO2sksFi30fYFdAB4,4406
|
|
139
140
|
dcs_sdk/sdk/__init__.py,sha256=skrZcgWWJBL6NXTUERywJ3qRJRemgpDXyW7lPg1FJk8,2107
|
|
@@ -155,7 +156,7 @@ dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py,sha256=puAWP
|
|
|
155
156
|
dcs_sdk/sdk/utils/table.py,sha256=X8HxdYTWyx_oVrBWPsXlmA-xJKXXDBW9RrhlWNqA1As,18224
|
|
156
157
|
dcs_sdk/sdk/utils/themes.py,sha256=Meo2Yldv4uyPpEqI7qdA28Aa6sxtwUU1dLKKm4QavjM,1403
|
|
157
158
|
dcs_sdk/sdk/utils/utils.py,sha256=a9QGEVL8L7asbJm_VBwgKvJQknsvuqWS0uTUaHsDPiY,16463
|
|
158
|
-
dcs_sdk-1.
|
|
159
|
-
dcs_sdk-1.
|
|
160
|
-
dcs_sdk-1.
|
|
161
|
-
dcs_sdk-1.
|
|
159
|
+
dcs_sdk-1.7.0.dist-info/METADATA,sha256=y4bCqzJK1nn3mIkDJ8uAQwfgHNR_Gytw_X3hic0Jhi0,7652
|
|
160
|
+
dcs_sdk-1.7.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
161
|
+
dcs_sdk-1.7.0.dist-info/entry_points.txt,sha256=XhODNz7UccgPOyklXgp7pIfTTXArd6-V0mImjhnhwto,80
|
|
162
|
+
dcs_sdk-1.7.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|