dcs-sdk 1.6.6__py3-none-any.whl → 1.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_diff/__init__.py +0 -2
- dcs_core/core/common/errors.py +18 -0
- dcs_core/core/common/models/configuration.py +6 -0
- dcs_core/core/datasource/file_datasource.py +26 -0
- dcs_core/core/datasource/manager.py +15 -0
- dcs_core/integrations/databases/azure_blob.py +115 -0
- dcs_sdk/__version__.py +1 -1
- dcs_sdk/sdk/config/config_loader.py +13 -0
- dcs_sdk/sdk/data_diff/data_differ.py +59 -12
- dcs_sdk/sdk/utils/utils.py +137 -1
- {dcs_sdk-1.6.6.dist-info → dcs_sdk-1.6.8.dist-info}/METADATA +4 -2
- {dcs_sdk-1.6.6.dist-info → dcs_sdk-1.6.8.dist-info}/RECORD +14 -12
- {dcs_sdk-1.6.6.dist-info → dcs_sdk-1.6.8.dist-info}/WHEEL +0 -0
- {dcs_sdk-1.6.6.dist-info → dcs_sdk-1.6.8.dist-info}/entry_points.txt +0 -0
data_diff/__init__.py
CHANGED
@@ -55,9 +55,7 @@ def connect_to_table(
             db_info.pop(k)
     if isinstance(key_columns, str):
         key_columns = (key_columns,)
-
    db: Database = connect(db_info, thread_count=thread_count)
-
    if isinstance(table_name, str):
        table_name = db.dialect.parse_table_name(table_name)

dcs_core/core/common/errors.py
CHANGED
@@ -16,6 +16,8 @@ ERROR_RUNTIME = "runtime_error"
 ERROR_CONFIGURATION = "configuration_error"
 ERROR_DATA_SOURCES_CONNECTION = "data_sources_connection_error"
 ERROR_METRIC_GENERATION = "metric_generation_error"
+ERROR_FETCHING_TABLE = "table_fetch_error"
+ERROR_FETCHING_COLUMN = "column_fetch_error"


 class DataChecksRuntimeError(Exception):
@@ -48,3 +50,19 @@ class DataChecksMetricGenerationError(Exception):
     def __init__(self, message):
         super().__init__(message)
         self.error_code = ERROR_METRIC_GENERATION
+
+
+class DatachecksTableFetchError(Exception):
+    """Raised when there is an error in fetching table."""
+
+    def __init__(self, message):
+        super().__init__(message)
+        self.error_code = ERROR_FETCHING_TABLE
+
+
+class DatachecksColumnFetchError(Exception):
+    """Raised when there is an error in fetching column."""
+
+    def __init__(self, message):
+        super().__init__(message)
+        self.error_code = ERROR_FETCHING_COLUMN
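The two new exception classes mirror the existing error hierarchy: each carries an error_code string from the constants above. A minimal sketch (not part of the diff) of how a caller might handle the table-fetch error, assuming a file-based data source object such as the AzureBlobDataSource added later in this diff:

from dcs_core.core.common.errors import DatachecksTableFetchError

def list_tables_safely(datasource):
    # "datasource" is assumed to expose query_get_table_names(); illustrative only.
    try:
        return datasource.query_get_table_names()
    except DatachecksTableFetchError as e:
        print(e.error_code)  # "table_fetch_error"
        return {"table": []}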
dcs_core/core/common/models/configuration.py
CHANGED

@@ -43,6 +43,7 @@ class DataSourceType(str, Enum):
     ORACLE = "oracle"
     DB2 = "db2"
     SYBASE = "sybase"
+    AZURE_BLOB = "azure_blob"


 class DataSourceLanguageSupport(str, Enum):
@@ -85,6 +86,11 @@ class DataSourceConnectionConfiguration:
     security: Optional[str] = None  # IBM DB2 specific configuration
     protocol: Optional[str] = None  # IBM DB2 specific configuration
     server: Optional[str] = None
+    account_name: Optional[str] = None
+    container_name: Optional[str] = None
+    account_key: Optional[str] = None
+    endpoint_suffix: Optional[str] = None
+    subfolder_path: Optional[str] = None


 @dataclass
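The five new fields are all optional, so existing configurations are unaffected; they only come into play for the new azure_blob data source type. A hypothetical connection mapping using these fields (values are placeholders, not from the package):

azure_blob_connection = {
    "account_name": "myaccount",            # Azure storage account
    "container_name": "mycontainer",        # blob container holding the CSV files
    "account_key": "<storage-account-key>",
    "endpoint_suffix": "core.windows.net",  # the default suffix used by the integration below
    "subfolder_path": "exports/",
}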
dcs_core/core/datasource/file_datasource.py
ADDED

@@ -0,0 +1,26 @@
+# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict
+
+from dcs_core.core.datasource.base import DataSource
+
+
+class FileDataSource(DataSource):
+    """
+    Abstract class for File data sources
+    """
+
+    def __init__(self, data_source_name: str, data_connection: Dict):
+        super().__init__(data_source_name, data_connection)
dcs_core/core/datasource/manager.py
CHANGED

@@ -11,6 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.manager
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import importlib
 from dataclasses import asdict
 from typing import Dict, List
@@ -43,6 +57,7 @@ class DataSourceManager:
         "oracle": "OracleDataSource",
         "db2": "DB2DataSource",
         "sybase": "SybaseDataSource",
+        "azure_blob": "AzureBlobDataSource",
     }

     def __init__(self, config: Configuration):
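The manager only maps the type string to a class name here; a minimal sketch of how such a mapping is typically resolved with importlib (the module path and helper below are illustrative assumptions, not the package's actual code):

import importlib

DATA_SOURCE_CLASS_BY_TYPE = {"azure_blob": "AzureBlobDataSource"}

def load_data_source_class(ds_type: str):
    # Illustrative only: import the integration module and look up the class by name.
    module = importlib.import_module("dcs_core.integrations.databases.azure_blob")
    return getattr(module, DATA_SOURCE_CLASS_BY_TYPE[ds_type])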
dcs_core/integrations/databases/azure_blob.py
ADDED

@@ -0,0 +1,115 @@
+# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import io
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+from azure.storage.blob import BlobServiceClient
+
+from dcs_core.core.common.errors import (
+    DatachecksColumnFetchError,
+    DataChecksDataSourcesConnectionError,
+    DatachecksTableFetchError,
+)
+from dcs_core.core.datasource.file_datasource import FileDataSource
+
+
+class AzureBlobDataSource(FileDataSource):
+    def __init__(self, data_source_name: str, data_connection: Dict):
+        super().__init__(data_source_name, data_connection)
+        self.allowed_file_extensions = [".csv"]
+        self.blob_service_client: Optional[BlobServiceClient] = None
+        self.connection = None
+
+    def connect(self) -> Any:
+        """
+        Connect to the file data source
+        """
+        try:
+            account_name = self.data_connection.get("account_name")
+            container_name = self.data_connection.get("container_name")
+            account_key = self.data_connection.get("account_key")
+            endpoint_suffix = self.data_connection.get("endpoint_suffix", "core.windows.net")
+            connection_str = f"https://{account_name}.blob.{endpoint_suffix}"
+            blob_service_client = BlobServiceClient(account_url=connection_str, credential=account_key)
+            self.blob_service_client = blob_service_client
+            self.connection = blob_service_client.get_container_client(container=container_name)
+            return self.connection
+        except Exception as e:
+            raise DataChecksDataSourcesConnectionError(f"Failed to connect to Azure Blob Storage: {e}")
+
+    def is_connected(self) -> bool:
+        """
+        Check if the file data source is connected
+        """
+        return self.connection is not None
+
+    def close(self):
+        """
+        Close the connection
+        """
+        self.connection.close()
+        self.blob_service_client.close()
+        self.connection = None
+        self.blob_service_client = None
+
+    def query_get_table_names(self) -> dict:
+        """
+        Query to get table names (blob names in this case)
+        """
+        if not self.is_connected():
+            raise DataChecksDataSourcesConnectionError("Not connected to Azure Blob Storage")
+        try:
+            subfolder = self.data_connection.get("subfolder", "")
+            blob_iterator = self.connection.list_blobs(name_starts_with=subfolder)
+            blobs = [
+                blob.name
+                for blob in blob_iterator
+                if len(blob.name.split("/")) == 1 and blob.name.endswith(tuple(self.allowed_file_extensions))
+            ]
+            return {"table": blobs}
+        except Exception as e:
+            raise DatachecksTableFetchError(f"Failed to list blobs: {e}")
+
+    def query_get_table_columns(self, table: str) -> List[dict]:
+        """
+        Get column names for a table (CSV blob in this case).
+        """
+        if not self.is_connected():
+            raise DataChecksDataSourcesConnectionError("Not connected to Azure Blob Storage")
+
+        if not any(table.endswith(ext) for ext in self.allowed_file_extensions):
+            raise ValueError(f"Unsupported file type for {table}. Allowed: {self.allowed_file_extensions}")
+
+        try:
+            blob_client = self.connection.get_blob_client(blob=table)
+            download_stream = blob_client.download_blob()
+            data = download_stream.readall()
+            if table.endswith(".csv"):
+                df = pd.read_csv(io.BytesIO(data))
+            else:
+                raise ValueError(f"Unsupported file type for {table}. Allowed: {self.allowed_file_extensions}")
+
+            return [{"column_name": col, "column_type": "string"} for col in df.columns.tolist()]
+        except Exception as e:
+            raise DatachecksColumnFetchError(f"Failed to read columns from blob '{table}': {e}")
+
+    def query_get_database_version(self) -> str:
+        """
+        Get the database version
+        :return: version string
+        """
+        api_version = self.blob_service_client.api_version
+        return api_version
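A short usage sketch of the new data source, following the methods defined above (account, container, and blob names are placeholders, not from the package):

from dcs_core.integrations.databases.azure_blob import AzureBlobDataSource

connection = {
    "account_name": "myaccount",
    "container_name": "mycontainer",
    "account_key": "<storage-account-key>",
    # endpoint_suffix falls back to "core.windows.net" when omitted
}
ds = AzureBlobDataSource("orders_blob", connection)
ds.connect()                                      # builds the BlobServiceClient and container client
print(ds.query_get_database_version())           # Azure Storage API version string
print(ds.query_get_table_names())                 # {"table": [...]} with top-level .csv blob names
print(ds.query_get_table_columns("orders.csv"))   # [{"column_name": ..., "column_type": "string"}, ...]
ds.close()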
dcs_sdk/__version__.py
CHANGED

dcs_sdk/sdk/config/config_loader.py
CHANGED

@@ -47,6 +47,7 @@ class SourceTargetConnection(BaseModel):
     port: Optional[Union[int, str]] = None
     driver: str
     table: Optional[str] = None
+    datasource_type: Optional[str] = None
     database: Optional[str] = None
     filepath: Optional[str] = None
     catalog: Optional[str] = None
@@ -66,6 +67,11 @@ class SourceTargetConnection(BaseModel):
     impersonate_service_account: Optional[str] = None  # bigquery specific
     bigquery_credentials: Optional[str] = None  # bigquery specific
     transform_columns: Dict[str, str] | None = None
+    account_name: Optional[str] = None
+    container_name: Optional[str] = None
+    account_key: Optional[str] = None
+    endpoint_suffix: Optional[str] = None
+    subfolder_path: Optional[str] = None


 class SimilarityConfig(BaseModel):
@@ -140,6 +146,7 @@ class DataDiffConfig:
         "mysql": "mysql",
         "sybase": "sybase",
         "bigquery": "bigquery",
+        "azure_blob": "duckdb",
     }

     def __init__(
@@ -307,6 +314,12 @@ class DataDiffConfig:
             "impersonate_service_account": connection.get("connection", {}).get("impersonate_service_account"),
             "bigquery_credentials": connection.get("connection", {}).get("bigquery_credentials"),
             "transform_columns": transform_columns,
+            "datasource_type": connection.get("type"),
+            "account_name": connection.get("connection", {}).get("account_name"),
+            "container_name": connection.get("connection", {}).get("container_name"),
+            "account_key": connection.get("connection", {}).get("account_key"),
+            "endpoint_suffix": connection.get("connection", {}).get("endpoint_suffix"),
+            "subfolder_path": connection.get("connection", {}).get("subfolder_path"),
         }

     def get_data_diff_configs(self) -> List[Comparison]:
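The new keys are read from each comparison side's connection block, and the azure_blob type is mapped to the duckdb driver, so an Azure Blob side is ultimately diffed through DuckDB. A hypothetical entry mirroring the keys the loader reads above (structure inferred from the connection.get(...) calls; not an official schema):

source_side = {
    "type": "azure_blob",   # resolved to the "duckdb" driver by the mapping above
    "connection": {
        "account_name": "myaccount",
        "container_name": "mycontainer",
        "account_key": "<storage-account-key>",
        "endpoint_suffix": "core.windows.net",
        "subfolder_path": "exports/",
    },
}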
dcs_sdk/sdk/data_diff/data_differ.py
CHANGED

@@ -18,6 +18,7 @@ import time
 from collections import defaultdict
 from contextlib import suppress
 from datetime import datetime, timezone
+from pathlib import Path
 from typing import Dict, Optional

 from loguru import logger
@@ -32,9 +33,11 @@ from dcs_sdk.sdk.utils.serializer import serialize_table_schema
 from dcs_sdk.sdk.utils.table import create_table_schema_row_count, differ_rows
 from dcs_sdk.sdk.utils.themes import theme_1
 from dcs_sdk.sdk.utils.utils import (
+    azure_to_csv_file,
     calculate_column_differences,
     convert_to_masked_if_required,
     duck_db_load_csv_to_table,
+    duck_db_load_pd_to_table,
     find_identical_columns,
     generate_table_name,
     obfuscate_sensitive_data,
@@ -67,6 +70,7 @@ class DBTableDiffer:
         self.target_db: Database = None
         self.similarity = self.config.similarity
         self.similarity_providers = None
+        self.allowed_file_comparison_types = ["azure_blob"]
         if self.similarity:
             from dcs_sdk.sdk.utils.similarity_score.base_provider import (
                 ensure_nltk_data,
@@ -88,6 +92,8 @@ class DBTableDiffer:
                 "levenshtein": LevenshteinDistanceProvider,
                 "cosine": CosineSimilarityProvider,
             }
+        self.original_source_table_name = self.config.source.table
+        self.original_target_table_name = self.config.target.table

     def create_dataset_dict(
         self,
@@ -96,6 +102,7 @@ class DBTableDiffer:
         db_name: str,
         file_path: str,
         database_type: str,
+        is_file_ds: bool = False,
     ) -> Dict:
         schema_list = [serialize_table_schema(v) for v in table.get_schema().values()]
         schema_list.sort(key=lambda x: x["column_name"].upper())
@@ -106,8 +113,8 @@ class DBTableDiffer:
             "workspace": config.workspace,
             "database_type": database_type,
             "table_name": table.table_path[0],
-            "schema": table.database.default_schema,
-            "database": db_name,
+            "schema": table.database.default_schema if not is_file_ds else None,
+            "database": db_name if not is_file_ds else None,
             "primary_keys": list(table.key_columns),
             "file_path": file_path,
             "files": [] if file_path is None else [generate_table_name(csv, False) for csv in glob.glob(file_path)],
@@ -217,15 +224,50 @@ class DBTableDiffer:
         )

     def process_duckdb(self, is_source: bool):
-
-
-
-
-
-
-
-
-
+        try:
+            ds_type = self.config.source.datasource_type if is_source else self.config.target.datasource_type
+            if ds_type in self.allowed_file_comparison_types:
+                try:
+                    if ds_type == "azure_blob":
+                        df = azure_to_csv_file(self.config, is_source)
+                        name_only = (
+                            Path(self.config.source.table).stem if is_source else Path(self.config.target.table).stem
+                        )
+
+                        if is_source:
+                            self.config.source.table = name_only
+                        else:
+                            self.config.target.table = name_only
+
+                        if not duck_db_load_pd_to_table(config=self.config, is_source=is_source, df=df):
+                            raise ValueError(
+                                f"Error loading CSV into DuckDB for the {'source' if is_source else 'target'} table."
+                            )
+                except Exception as e:
+                    raise RuntimeError(
+                        f"Failed processing Azure Blob for {'source' if is_source else 'target'}: {e}"
+                    ) from e
+
+            else:
+                try:
+                    filepath = self.config.source.filepath if is_source else self.config.target.filepath
+                    if filepath is None:
+                        raise ValueError("File path is required for file-based source.")
+
+                    if filepath.endswith(".csv"):
+                        if not duck_db_load_csv_to_table(self.config, filepath, is_source):
+                            raise ValueError(
+                                f"Error loading CSV into DuckDB for the {'source' if is_source else 'target'} table."
+                            )
+                    else:
+                        raise ValueError(f"Unsupported file format: {filepath}")
+                except Exception as e:
+                    raise RuntimeError(
+                        f"Failed processing local file for {'source' if is_source else 'target'}: {e}"
+                    ) from e
+
+        except Exception as e:
+            raise RuntimeError(f"process_duckdb failed for {'source' if is_source else 'target'}: {e}") from e

     def _prepare_source_table(self) -> Optional[str]:
         view_name = None
@@ -346,6 +388,7 @@ class DBTableDiffer:
             db1_name,
             self.source_file_path,
             "file" if self.config.source.driver == "duckdb" else self.config.source.driver,
+            True if self.config.source.driver == "duckdb" else False,
         )
         target_dataset = self.create_dataset_dict(
             self.config.target,
@@ -353,6 +396,7 @@ class DBTableDiffer:
             db2_name,
             self.target_file_path,
             "file" if self.config.target.driver == "duckdb" else self.config.target.driver,
+            True if self.config.target.driver == "duckdb" else False,
         )
         table_1_row_count = source_dataset.get("row_count", 0)
         table_2_row_count = target_dataset.get("row_count", 0)
@@ -690,7 +734,10 @@ class DBTableDiffer:

             self.response.update({"column_transforms": column_transforms})
             self.response.update({"schema_overrides": schema_overrides})
-
+            self.config.source.table = self.original_source_table_name
+            self.config.target.table = self.original_target_table_name
+            self.response["source_dataset"]["table_name"] = self.original_source_table_name
+            self.response["target_dataset"]["table_name"] = self.original_target_table_name
             return self.response
         except Exception as e:
             logger.exception(f"Error during diff_tables: {e}")
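For azure_blob sources, process_duckdb downloads the blob into a DataFrame and registers it in DuckDB under the blob's file stem; diff_tables then restores the original table names in the response. The stem is just the blob name without folders or extension, e.g.:

from pathlib import Path

print(Path("exports/orders_2024.csv").stem)  # -> "orders_2024"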
dcs_sdk/sdk/utils/utils.py
CHANGED

@@ -13,12 +13,19 @@
 # limitations under the License.

 import glob
+import io
 import os
 import uuid
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from queue import Empty, Queue
 from typing import List, Optional, Union

 import duckdb
+import pandas as pd
 import requests
+from azure.storage.blob import BlobServiceClient
+from loguru import logger

 from dcs_sdk.sdk.config.config_loader import Comparison
 from dcs_sdk.sdk.rules.rules_repository import RulesRepository
@@ -137,6 +144,135 @@ def calculate_column_differences(source_columns, target_columns, columns_mapping
     )


+def chunk_load_to_pandas(queue: Queue, result_df: list, timeout: float = 2.0):
+    """Consumer thread: read CSV chunks from queue & build final DataFrame"""
+    df = pd.DataFrame()
+    try:
+        while True:
+            try:
+                data = queue.get(timeout=timeout)
+            except Empty:
+                continue
+
+            if data is None:
+                break
+
+            try:
+                chunk = pd.read_csv(io.BytesIO(data), dtype=str)
+                df = pd.concat([df, chunk], ignore_index=True)
+            except Exception as e:
+                logger.error(f"[ERROR] Failed to read CSV chunk: {e}")
+                continue
+
+    except Exception as e:
+        logger.error(f"[FATAL] Consumer crashed: {e}")
+
+    finally:
+        result_df.append(df)
+
+
+def azure_to_csv_file(config: Comparison, is_source: bool = False) -> tuple[str, str]:
+    """Download CSV from Azure and save to local file"""
+    CHUNK_SIZE = 4 * 1024 * 1024
+    account_name = config.source.account_name if is_source else config.target.account_name
+    container_name = config.source.container_name if is_source else config.target.container_name
+    account_key = config.source.account_key if is_source else config.target.account_key
+    endpoint_suffix = config.source.endpoint_suffix if is_source else config.target.endpoint_suffix
+
+    table = config.source.table if is_source else config.target.table
+
+    connection_str = f"https://{account_name}.blob.{endpoint_suffix}"
+    blob_client = BlobServiceClient(account_url=connection_str, credential=account_key).get_blob_client(
+        container=container_name, blob=table
+    )
+    blob_size = blob_client.get_blob_properties().size
+    start = 0
+    queue = Queue()
+    result_df = []
+    with ThreadPoolExecutor(max_workers=1) as executor:
+        executor.submit(chunk_load_to_pandas, queue, result_df)
+
+        all_data = b""
+        while start < blob_size:
+            end = min(start + CHUNK_SIZE - 1, blob_size - 1)
+            data = blob_client.download_blob(offset=start, length=end - start + 1).readall()
+            all_data += data
+            queue.put(data)
+            start += CHUNK_SIZE
+
+        queue.put(None)
+    if not result_df or len(result_df) == 0:
+        raise ValueError("No data downloaded from Azure Blob Storage")
+    return result_df[0]
+
+
+def duck_db_load_pd_to_table(config: Comparison, is_source: bool = False, df: pd.DataFrame = None) -> bool:
+    if df is None:
+        logger.error("DataFrame is None, cannot load to DuckDB")
+        return False
+    dir_name = "tmp"
+    if not os.path.exists(dir_name):
+        os.makedirs(dir_name)
+
+    if is_source:
+        pk_cols = config.primary_keys_source
+    else:
+        pk_cols = config.primary_keys_target
+
+    duck_db_file_name = f"{dir_name}/{uuid.uuid4()}.duckdb"
+    create_view = False
+    query = None
+    if is_source and config.source_query:
+        create_view = True
+        query = config.source_query
+    elif not is_source and config.target_query:
+        create_view = True
+        query = config.target_query
+
+    try:
+        table_name = config.source.table if is_source else config.target.table
+
+        conn = duckdb.connect(database=duck_db_file_name, read_only=False)
+
+        conn.register("df_view", df)
+
+        conn.execute(
+            f"""
+            CREATE OR REPLACE TABLE {table_name} AS
+            SELECT * FROM df_view;
+            """
+        )
+
+        if pk_cols and len(pk_cols) > 0:
+            pk_cols_str = ", ".join(pk_cols)
+            conn.execute(
+                f"""
+                CREATE INDEX idx_{table_name} ON {table_name} ({pk_cols_str});
+                """
+            )
+
+        if create_view:
+            view_name = f"{table_name}_query"
+            conn.execute(
+                f"""
+                CREATE VIEW {view_name} AS {query};
+                """
+            )
+
+        conn.unregister("df_view")
+        conn.close()
+
+    except Exception as e:
+        logger.error(f"Error in loading CSV to DuckDB: {e}")
+        return False
+
+    if is_source:
+        config.source.filepath = duck_db_file_name
+    else:
+        config.target.filepath = duck_db_file_name
+    return True
+
+
 def duck_db_load_csv_to_table(config: Comparison, path, is_source: bool = False) -> bool:
     dir_name = "tmp"
     if not os.path.exists(dir_name):
@@ -194,7 +330,7 @@ def duck_db_load_csv_to_table(config: Comparison, path, is_source: bool = False)
         )
         conn.close()
     except Exception as e:
-
+        logger.error(f"Error in loading CSV to DuckDB: {e}")
         return False

     if is_source:
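azure_to_csv_file streams the blob in 4 MiB ranged downloads on the main thread while a single chunk_load_to_pandas consumer drains the queue and concatenates the chunks; a None sentinel tells the consumer to stop. A stripped-down, self-contained sketch of that pattern (not part of the diff), with a fabricated in-memory chunk standing in for a downloaded byte range:

import io
from concurrent.futures import ThreadPoolExecutor
from queue import Queue

import pandas as pd

def consume(queue: Queue, out: list):
    df = pd.DataFrame()
    while True:
        data = queue.get()
        if data is None:          # sentinel: producer is done
            break
        chunk = pd.read_csv(io.BytesIO(data), dtype=str)
        df = pd.concat([df, chunk], ignore_index=True)
    out.append(df)

queue, result = Queue(), []
with ThreadPoolExecutor(max_workers=1) as executor:
    executor.submit(consume, queue, result)
    queue.put(b"id,name\n1,a\n2,b\n")   # stand-in for a downloaded 4 MiB range
    queue.put(None)                      # signal end of stream
print(result[0])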
{dcs_sdk-1.6.6.dist-info → dcs_sdk-1.6.8.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dcs-sdk
-Version: 1.6.6
+Version: 1.6.8
 Summary: SDK for DataChecks
 Author: Waterdip Labs
 Author-email: hello@waterdip.ai
@@ -30,6 +30,8 @@ Provides-Extra: sybase
 Provides-Extra: trino
 Provides-Extra: vertica
 Requires-Dist: attrs (>=23.1.0)
+Requires-Dist: azure-identity (>=1.25.1,<2.0.0)
+Requires-Dist: azure-storage-blob (>=12.27.1,<13.0.0)
 Requires-Dist: click (>=8.1)
 Requires-Dist: clickhouse-driver (>=0.2.9) ; extra == "clickhouse" or extra == "all-dbs"
 Requires-Dist: cryptography (>=44.0.1) ; extra == "snowflake" or extra == "all-dbs"
@@ -84,7 +86,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
 Description-Content-Type: text/markdown

 <h1 align="center">
-DCS SDK v1.6.6
+DCS SDK v1.6.8
 </h1>

 > SDK for DataChecks
{dcs_sdk-1.6.6.dist-info → dcs_sdk-1.6.8.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-data_diff/__init__.py,sha256=
+data_diff/__init__.py,sha256=NcZ2rwvDST7cMyaaLANvNhoaFn-jC_WDg9pxDLXhZ04,10411
 data_diff/__main__.py,sha256=UvFvBKU74202bfRcIO_Wk-SU8WmnNuDK_1YVJpueMlc,16969
 data_diff/abcs/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
 data_diff/abcs/compiler.py,sha256=RuGhGlLTQuCzOJfYxa4gjcADsyvbZ9yZPuDuY6XH8Rk,785
@@ -49,9 +49,9 @@ dcs_core/cli/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
 dcs_core/cli/cli.py,sha256=dSr3D62XhjCEn4G5Jb0O4q05G1_YAMJgaOnLqciMAmI,6020
 dcs_core/core/__init__.py,sha256=8XyOIsx-uCpaEZUgfOrb0DCdvmz1TipNQdz01h7mun0,761
 dcs_core/core/common/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
-dcs_core/core/common/errors.py,sha256=
+dcs_core/core/common/errors.py,sha256=nRczSqORCjcDngAuDsqzsc3_yZQzuUX26lPov0pTE1I,2268
 dcs_core/core/common/models/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
-dcs_core/core/common/models/configuration.py,sha256=
+dcs_core/core/common/models/configuration.py,sha256=cFFr_SiAqYR3NIFGfz4rJVVX-LuGu-9TJC47ghL3Tes,9396
 dcs_core/core/common/models/dashboard.py,sha256=_WV1kbs4cKlFZ5QcXyMdTmDSZLYxhvZWWWQzvHReMxM,814
 dcs_core/core/common/models/data_source_resource.py,sha256=rNvj5NjvEQi2irHYjClKBFZbp70LTX9oGCPDeFURlAI,1559
 dcs_core/core/common/models/metric.py,sha256=0Oxp7YvWZVy7zbmi4u_opBDeknsuzXmnOrK01pP2fQw,4843
@@ -64,7 +64,8 @@ dcs_core/core/configuration/configuration_parser.py,sha256=KGOJqWbOWhTacuMwM1N55
 dcs_core/core/configuration/configuration_parser_arc.py,sha256=TOoPf12pEXLdkjEGJEGV6rJOMR8yqLedla6T1x6g-Xw,14057
 dcs_core/core/datasource/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
 dcs_core/core/datasource/base.py,sha256=YD_UuGuoORFJNX30IQMk6aitiiTCHaiAddSNgUBmRtA,1935
-dcs_core/core/datasource/
+dcs_core/core/datasource/file_datasource.py,sha256=HG4av7KUFTfH2UlAl4bqcNI6MxpbSOA26cDqxmLUqh0,913
+dcs_core/core/datasource/manager.py,sha256=3oBjIqV0YYjXubCDGVBJP_jzrv-oBgBA-octoa8Wvaw,4795
 dcs_core/core/datasource/search_datasource.py,sha256=_conk1Q_kywJhKHYyEScoKlVt_yRd05zuAISvDmXqjw,15014
 dcs_core/core/datasource/sql_datasource.py,sha256=dlX-E--hadl2q8XpMNRyZmLGC35tltBsGDzlyZqzqtw,40730
 dcs_core/core/inspect.py,sha256=QICJKcEpQClLacsfNClFoiF08M01QnJh_U2VsXRh1iA,6427
@@ -99,6 +100,7 @@ dcs_core/core/validation/uniqueness_validation.py,sha256=a6zm0_omiULKbQcDit8J913
 dcs_core/core/validation/validity_validation.py,sha256=358oAGH112oVxyPhDnfT-ypVaMAkpZ8pM73qogtdh9w,35297
 dcs_core/integrations/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
 dcs_core/integrations/databases/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
+dcs_core/integrations/databases/azure_blob.py,sha256=rOPj-dv3ZaGUrr_rLMn8xjZXuEjlzcdfZv2RcZgnbps,4674
 dcs_core/integrations/databases/bigquery.py,sha256=26RuypLMmiARZIWkV_mxtnNL2yCs94YWerSGH5Nr10Q,7337
 dcs_core/integrations/databases/databricks.py,sha256=n4fm5m_mtRCdtjLGDvbNW18u7Ev234vDBjq_lxuOxns,1978
 dcs_core/integrations/databases/db2.py,sha256=hNGivvYCitp88ouZlCxp7iRQ-vnPiK1kL8x85NyGotk,26492
@@ -131,14 +133,14 @@ dcs_core/report/static/index.js,sha256=p4wvku-zlXi0y4gWeSzV1amY0s4mjtUq2QsezARLV
 dcs_core/report/static/index.js.LICENSE.txt,sha256=bBDZBJVEDrqjCi7sfoF8CchjFn3hdcbNkP7ub7kbcXQ,201041
 dcs_sdk/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
 dcs_sdk/__main__.py,sha256=Qn8stIaQGrdLjHQ-H7xO0T-brtq5RWZoWU9QvqoarV8,683
-dcs_sdk/__version__.py,sha256=
+dcs_sdk/__version__.py,sha256=sFz5TIfT8CPd_5meGnOcu6HHSYeHzkBf8FK68f4xbvU,633
 dcs_sdk/cli/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
 dcs_sdk/cli/cli.py,sha256=jaO52UrMWLafcF_yhqllPkmYSTuO2sksFi30fYFdAB4,4406
 dcs_sdk/sdk/__init__.py,sha256=skrZcgWWJBL6NXTUERywJ3qRJRemgpDXyW7lPg1FJk8,2107
 dcs_sdk/sdk/config/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
-dcs_sdk/sdk/config/config_loader.py,sha256=
+dcs_sdk/sdk/config/config_loader.py,sha256=ZbSGQ56LsHv4_mxNhYrf6eoegO2R4PaqAs8iAghU73M,22435
 dcs_sdk/sdk/data_diff/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
-dcs_sdk/sdk/data_diff/data_differ.py,sha256=
+dcs_sdk/sdk/data_diff/data_differ.py,sha256=00lKfGU4xMeXuS_Wpvjf-TAgMiZ7r5_bv1EQsv1EdjQ,39050
 dcs_sdk/sdk/rules/__init__.py,sha256=_BkKcE_jfdDQI_ECdOamJaefMKEXrKpYjPpnBQXl_Xs,657
 dcs_sdk/sdk/rules/rules_mappping.py,sha256=fxakVkf7B2cVkYSO946LTim_HmMsl6lBDBqZjTTsSPI,1292
 dcs_sdk/sdk/rules/rules_repository.py,sha256=x0Rli-wdnHAmXm5526go_qC3P-eFRt-4L7fs4hNqC-g,7564
@@ -152,8 +154,8 @@ dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py,sha256=Jd0TvIGOULNTsiCL_F
 dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py,sha256=puAWPnoWfNo4BN4-kXIUHrtrt5jLv3Vkw_NfHvjYrn4,1185
 dcs_sdk/sdk/utils/table.py,sha256=X8HxdYTWyx_oVrBWPsXlmA-xJKXXDBW9RrhlWNqA1As,18224
 dcs_sdk/sdk/utils/themes.py,sha256=Meo2Yldv4uyPpEqI7qdA28Aa6sxtwUU1dLKKm4QavjM,1403
-dcs_sdk/sdk/utils/utils.py,sha256=
-dcs_sdk-1.6.
-dcs_sdk-1.6.
-dcs_sdk-1.6.
-dcs_sdk-1.6.
+dcs_sdk/sdk/utils/utils.py,sha256=1QsHT1Rg1LTfZDskuESrk8DfL34a_71RgCt-ceNftIE,16317
+dcs_sdk-1.6.8.dist-info/METADATA,sha256=-20PhrAQqYbjGBwAgA032BxOxu7DIY244iAxJMb-ZvE,7670
+dcs_sdk-1.6.8.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+dcs_sdk-1.6.8.dist-info/entry_points.txt,sha256=XhODNz7UccgPOyklXgp7pIfTTXArd6-V0mImjhnhwto,80
+dcs_sdk-1.6.8.dist-info/RECORD,,

{dcs_sdk-1.6.6.dist-info → dcs_sdk-1.6.8.dist-info}/WHEEL
File without changes

{dcs_sdk-1.6.6.dist-info → dcs_sdk-1.6.8.dist-info}/entry_points.txt
File without changes