dcs-sdk 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. dcs_sdk-0.3.0/PKG-INFO +271 -0
  2. dcs_sdk-0.3.0/README.md +218 -0
  3. dcs_sdk-0.3.0/data_diff/__init__.py +201 -0
  4. dcs_sdk-0.3.0/data_diff/__main__.py +517 -0
  5. dcs_sdk-0.3.0/data_diff/abcs/__init__.py +13 -0
  6. dcs_sdk-0.3.0/data_diff/abcs/compiler.py +27 -0
  7. dcs_sdk-0.3.0/data_diff/abcs/database_types.py +321 -0
  8. dcs_sdk-0.3.0/data_diff/config.py +141 -0
  9. dcs_sdk-0.3.0/data_diff/databases/__init__.py +38 -0
  10. dcs_sdk-0.3.0/data_diff/databases/_connect.py +319 -0
  11. dcs_sdk-0.3.0/data_diff/databases/base.py +1314 -0
  12. dcs_sdk-0.3.0/data_diff/databases/bigquery.py +330 -0
  13. dcs_sdk-0.3.0/data_diff/databases/clickhouse.py +217 -0
  14. dcs_sdk-0.3.0/data_diff/databases/databricks.py +262 -0
  15. dcs_sdk-0.3.0/data_diff/databases/duckdb.py +207 -0
  16. dcs_sdk-0.3.0/data_diff/databases/mssql.py +243 -0
  17. dcs_sdk-0.3.0/data_diff/databases/mysql.py +171 -0
  18. dcs_sdk-0.3.0/data_diff/databases/oracle.py +209 -0
  19. dcs_sdk-0.3.0/data_diff/databases/postgresql.py +289 -0
  20. dcs_sdk-0.3.0/data_diff/databases/presto.py +209 -0
  21. dcs_sdk-0.3.0/data_diff/databases/redshift.py +233 -0
  22. dcs_sdk-0.3.0/data_diff/databases/snowflake.py +222 -0
  23. dcs_sdk-0.3.0/data_diff/databases/trino.py +73 -0
  24. dcs_sdk-0.3.0/data_diff/databases/vertica.py +174 -0
  25. dcs_sdk-0.3.0/data_diff/diff_tables.py +411 -0
  26. dcs_sdk-0.3.0/data_diff/errors.py +17 -0
  27. dcs_sdk-0.3.0/data_diff/format.py +369 -0
  28. dcs_sdk-0.3.0/data_diff/hashdiff_tables.py +285 -0
  29. dcs_sdk-0.3.0/data_diff/info_tree.py +76 -0
  30. dcs_sdk-0.3.0/data_diff/joindiff_tables.py +434 -0
  31. dcs_sdk-0.3.0/data_diff/lexicographic_space.py +253 -0
  32. dcs_sdk-0.3.0/data_diff/parse_time.py +88 -0
  33. dcs_sdk-0.3.0/data_diff/py.typed +0 -0
  34. dcs_sdk-0.3.0/data_diff/queries/__init__.py +13 -0
  35. dcs_sdk-0.3.0/data_diff/queries/api.py +213 -0
  36. dcs_sdk-0.3.0/data_diff/queries/ast_classes.py +811 -0
  37. dcs_sdk-0.3.0/data_diff/queries/base.py +38 -0
  38. dcs_sdk-0.3.0/data_diff/queries/extras.py +43 -0
  39. dcs_sdk-0.3.0/data_diff/query_utils.py +70 -0
  40. dcs_sdk-0.3.0/data_diff/schema.py +67 -0
  41. dcs_sdk-0.3.0/data_diff/table_segment.py +302 -0
  42. dcs_sdk-0.3.0/data_diff/thread_utils.py +112 -0
  43. dcs_sdk-0.3.0/data_diff/utils.py +612 -0
  44. dcs_sdk-0.3.0/data_diff/version.py +15 -0
  45. dcs_sdk-0.3.0/dcs_sdk/__init__.py +13 -0
  46. dcs_sdk-0.3.0/dcs_sdk/__main__.py +20 -0
  47. dcs_sdk-0.3.0/dcs_sdk/__version__.py +15 -0
  48. dcs_sdk-0.3.0/dcs_sdk/cli/__init__.py +13 -0
  49. dcs_sdk-0.3.0/dcs_sdk/cli/cli.py +132 -0
  50. dcs_sdk-0.3.0/dcs_sdk/sdk/__init__.py +44 -0
  51. dcs_sdk-0.3.0/dcs_sdk/sdk/config/__init__.py +13 -0
  52. dcs_sdk-0.3.0/dcs_sdk/sdk/config/config_loader.py +237 -0
  53. dcs_sdk-0.3.0/dcs_sdk/sdk/data_diff/__init__.py +13 -0
  54. dcs_sdk-0.3.0/dcs_sdk/sdk/data_diff/data_differ.py +350 -0
  55. dcs_sdk-0.3.0/dcs_sdk/sdk/utils/__init__.py +13 -0
  56. dcs_sdk-0.3.0/dcs_sdk/sdk/utils/serializer.py +25 -0
  57. dcs_sdk-0.3.0/dcs_sdk/sdk/utils/table.py +222 -0
  58. dcs_sdk-0.3.0/dcs_sdk/sdk/utils/themes.py +40 -0
  59. dcs_sdk-0.3.0/dcs_sdk/sdk/utils/utils.py +214 -0
  60. dcs_sdk-0.3.0/pyproject.toml +105 -0
dcs_sdk-0.3.0/PKG-INFO ADDED
@@ -0,0 +1,271 @@
1
+ Metadata-Version: 2.1
2
+ Name: dcs-sdk
3
+ Version: 0.3.0
4
+ Summary: SDK for DataChecks
5
+ Author: Waterdip Labs
6
+ Author-email: hello@waterdip.ai
7
+ Requires-Python: >=3.10,<3.12
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Provides-Extra: all-dbs
12
+ Provides-Extra: clickhouse
13
+ Provides-Extra: databricks
14
+ Provides-Extra: mssql
15
+ Provides-Extra: mysql
16
+ Provides-Extra: oracle
17
+ Provides-Extra: postgresql
18
+ Provides-Extra: preql
19
+ Provides-Extra: presto
20
+ Provides-Extra: redshift
21
+ Provides-Extra: snowflake
22
+ Provides-Extra: trino
23
+ Provides-Extra: vertica
24
+ Requires-Dist: attrs (>=23.1.0)
25
+ Requires-Dist: click (>=8.1)
26
+ Requires-Dist: clickhouse-driver ; extra == "clickhouse" or extra == "all-dbs"
27
+ Requires-Dist: cryptography ; extra == "snowflake" or extra == "all-dbs"
28
+ Requires-Dist: databricks-sql-connector (>=3.3.0,<4.0.0) ; extra == "databricks"
29
+ Requires-Dist: dsnparse (<0.2.0)
30
+ Requires-Dist: duckdb (>=0.9.0)
31
+ Requires-Dist: keyring
32
+ Requires-Dist: mashumaro[msgpack] (>=2.9,<3.11.0)
33
+ Requires-Dist: mysql-connector-python (>=8.0.29) ; extra == "mysql" or extra == "all-dbs"
34
+ Requires-Dist: oracledb ; extra == "oracle" or extra == "all-dbs"
35
+ Requires-Dist: packaging (>=24.1,<25.0)
36
+ Requires-Dist: preql (>=0.2.19) ; extra == "preql" or extra == "all-dbs"
37
+ Requires-Dist: presto-python-client ; extra == "presto" or extra == "all-dbs"
38
+ Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgresql" or extra == "redshift" or extra == "all-dbs"
39
+ Requires-Dist: pydantic (>=1.10.12)
40
+ Requires-Dist: pyodbc (>=4.0.39) ; extra == "mssql" or extra == "all-dbs"
41
+ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
42
+ Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
43
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
44
+ Requires-Dist: rich
45
+ Requires-Dist: snowflake-connector-python (>=3.0.2,<4.0.0) ; extra == "snowflake" or extra == "all-dbs"
46
+ Requires-Dist: tabulate (>=0.9.0)
47
+ Requires-Dist: toml (>=0.10.2)
48
+ Requires-Dist: trino (>=0.314.0) ; extra == "trino" or extra == "all-dbs"
49
+ Requires-Dist: typing-extensions (>=4.0.1)
50
+ Requires-Dist: urllib3 (<2)
51
+ Requires-Dist: vertica-python ; extra == "vertica" or extra == "all-dbs"
52
+ Description-Content-Type: text/markdown
53
+
54
+ <h1 align="center">
55
+ DCS CLI v0.3.0
56
+ </h1>
57
+
58
+ > SDK for DataChecks
59
+
60
+
61
+ ## Installation
62
+
63
+ > Python version `>=3.10,<3.12`
64
+
65
+ ```bash
66
+
67
+ $ pip install dcs-cli[all-dbs]
68
+
69
+ ```
70
+
71
+ ## Supported Databases
72
+
73
+ > Availability Status
74
+
75
+ | Database | Code Name | Supported |
76
+ | ---------- | ------------ | --------- |
77
+ | PostgreSQL | `postgres` | ✅ |
78
+ | Snowflake | `snowflake` | ✅ |
79
+ | Trino | `trino` | ✅ |
80
+ | Databricks | `databricks` | ✅ |
81
+ | File | `file` | ✅ |
82
+
83
+
84
+
85
+ ## Available Commands
86
+
87
+
88
+
89
+ | Option | Short Option | Required | Default | Description | Example |
90
+ | :-----------: | :----------: | :------: | :-------------: | :------------------------------------------------: | :----------------------------------------------------------------------------------------------: |
91
+ | --config-path | -C | **Yes** | None | Specify the file path for the configuration | dcs_cli run --config-path config.yaml --compare comp_name |
92
+ | --compare | | **Yes** | None | Run only specific comparison using comparison name | dcs_cli run --config-path config.yaml --compare comp_name |
93
+ | --save-json | -j | No | False | Save the data into a JSON file | dcs_cli run --config-path config.yaml --compare comp_name --save-json |
94
+ | --json-path | -jp | No | dcs_report.json | Specify the file path for JSON file | dcs_cli run --config-path config.yaml --compare comp_name --save-json --json-path output.json |
95
+ | --stats | | No | False | Print stats about data diff | dcs_cli run --config-path config.yaml --compare comp_name --stats |
96
+ | --url | | No | None | Specify url to send data to server | dcs_cli run --config-path config.yaml --compare comp_name --url https://compare/send/data |
97
+ | --html-report | | No | False | Save table as HTML | dcs_cli run --config-path config.yaml --compare comp_name --html-report |
98
+ | --report-path | | No | dcs_report.html | Specify the file path for HTML report | dcs_cli run --config-path config.yaml --compare comp_name --html-report --report-path table.html |
99
+
100
+
101
+
102
+ ### Example Command [CLI]
103
+
104
+ ```sh
105
+ $ dcs_cli --version
106
+
107
+ $ dcs_cli --help
108
+
109
+ $ dcs_cli run -C example.yaml --compare comparison_one --stats -j -jp output.json --html-report --report-path result.html --url https://compare/send/data
110
+ ```
111
+
112
+ <details>
113
+ <summary><h2>Example Configuration</h2></summary>
114
+
115
+ ```yml
116
+ data_sources:
117
+ - name: iris_snowflake
118
+ type: snowflake
119
+ id: f533c099-196f-48da-b231-1d4c380f84bf
120
+ workspace: default
121
+ connection:
122
+ account: bp54281.central-india.azure
123
+ username: !ENV ${SNOWFLAKE_USER}
124
+ password: !ENV ${SNOWFLAKE_PASS}
125
+ database: TEST_DCS
126
+ schema: PUBLIC
127
+ warehouse: compute_wh
128
+ role: accountadmin
129
+
130
+ - name: pgsql_azure
131
+ type: postgres
132
+ id: 4679b79a-7174-48fd-9c71-81cf806ef617
133
+ workspace: default
134
+ connection:
135
+ host: !ENV ${POSTGRES_HOST_ONE}
136
+ port: !ENV ${POSTGRES_PORT_ONE}
137
+ username: !ENV ${POSTGRES_USER_ONE}
138
+ password: !ENV ${POSTGRES_PASSWORD_ONE}
139
+ database: !ENV ${POSTGRES_DB_ONE}
140
+
141
+ - name: trino_test
142
+ type: trino
143
+ id: 9d86df86-6802-4551-a1ce-b98cdf3ec15f
144
+ workspace: default
145
+ connection:
146
+ host: localhost
147
+ port: 8080
148
+ username: admin
149
+ catalog: tpch
150
+ schema: sf100
151
+
152
+ - name: file_source_raw
153
+ id: b5a76a0a-1b8f-4222-a31d-a31740f23168
154
+ workspace: default
155
+ type: file
156
+ file_path: "nk.kyc_data/RAW_EMPLOYEE.csv"
157
+
158
+ - name: file_source_tl
159
+ id: 52c1f3c7-fd1e-4f3c-aed3-b01d8e1cfa4d
160
+ workspace: default
161
+ type: file
162
+ file_path: "TL_EMPLOYEE.csv"
163
+
164
+ - name: databricks_test
165
+ type: databricks
166
+ id: 6f1fd8d6-5a59-4ba5-be37-aec044b000e7
167
+ workspace: default
168
+ connection:
169
+ host: !ENV ${DATABRICKS_HOST}
170
+ port: !ENV ${DATABRICKS_PORT}
171
+ catalog: hive_metastore
172
+ schema: default
173
+ access_token: !ENV ${DATABRICKS_ACCESS_TOKEN}
174
+ http_path: !ENV ${DATABRICKS_HTTP_PATH}
175
+
176
+ comparisons:
177
+ # DB TO DB (SNOWFLAKE)
178
+ comparison_one:
179
+ source:
180
+ data_source: iris_snowflake
181
+ table: RAW_EMPLOYEE
182
+
183
+ target:
184
+ data_source: iris_snowflake
185
+ table: TL_EMPLOYEE
186
+ key_columns:
187
+ - CUSTID
188
+ columns:
189
+ - FIRSTNAME
190
+ - LASTNAME
191
+ - DESIGNATION
192
+ - SALARY
193
+
194
+ # DB TO DB (Postgres Azure)
195
+ comparison_two:
196
+ source:
197
+ data_source: pgsql_azure
198
+ table: actor
199
+ target:
200
+ data_source: pgsql_azure
201
+ table: actor2
202
+ key_columns:
203
+ - actor_id
204
+ columns:
205
+ - first_name
206
+ - last_name
207
+ - last_update
208
+ columns_mappings:
209
+ - source_column: actor_id
210
+ target_column: actor_id1
211
+ - source_column: first_name
212
+ target_column: first_name1
213
+ - source_column: last_name
214
+ target_column: last_name1
215
+ - source_column: last_update
216
+ target_column: last_update1
217
+
218
+ # FILE TO FILE
219
+ comparison_three:
220
+ source:
221
+ data_source: file_source_raw
222
+ table: RAW_EMPLOYEE
223
+
224
+ target:
225
+ data_source: file_source_tl
226
+ table: TL_EMPLOYEE
227
+ key_columns:
228
+ - custid
229
+ columns:
230
+ - FIRSTNAME
231
+ - lastname
232
+ - designation
233
+ - salary
234
+ columns_mappings:
235
+ - source_column: FIRSTNAME
236
+ target_column: firstname
237
+
238
+ # DB TO DB (Trino)
239
+ comparison_trino:
240
+ source:
241
+ data_source: trino_test
242
+ table: nation
243
+ target:
244
+ data_source: trino_test
245
+ table: region
246
+ key_columns:
247
+ - regionkey
248
+ columns:
249
+ - name
250
+
251
+ # DB TO DB (Databricks)
252
+ comparison_databricks:
253
+ source:
254
+ data_source: databricks_test
255
+ table: RAW_EMPLOYEE
256
+
257
+ target:
258
+ data_source: databricks_test
259
+ table: TL_EMPLOYEE
260
+ key_columns:
261
+ - custid
262
+ columns:
263
+ - FIRSTNAME
264
+ - lastname
265
+ - designation
266
+ - salary
267
+ columns_mappings:
268
+ - source_column: FIRSTNAME
269
+ target_column: firstname
270
+ ```
271
+ </details>
@@ -0,0 +1,218 @@
1
+ <h1 align="center">
2
+ DCS CLI v0.3.0
3
+ </h1>
4
+
5
+ > SDK for DataChecks
6
+
7
+
8
+ ## Installation
9
+
10
+ > Python version `>=3.10,<3.12`
11
+
12
+ ```bash
13
+
14
+ $ pip install dcs-cli[all-dbs]
15
+
16
+ ```
17
+
18
+ ## Supported Databases
19
+
20
+ > Availability Status
21
+
22
+ | Database | Code Name | Supported |
23
+ | ---------- | ------------ | --------- |
24
+ | PostgreSQL | `postgres` | ✅ |
25
+ | Snowflake | `snowflake` | ✅ |
26
+ | Trino | `trino` | ✅ |
27
+ | Databricks | `databricks` | ✅ |
28
+ | File | `file` | ✅ |
29
+
30
+
31
+
32
+ ## Available Commands
33
+
34
+
35
+
36
+ | Option | Short Option | Required | Default | Description | Example |
37
+ | :-----------: | :----------: | :------: | :-------------: | :------------------------------------------------: | :----------------------------------------------------------------------------------------------: |
38
+ | --config-path | -C | **Yes** | None | Specify the file path for the configuration | dcs_cli run --config-path config.yaml --compare comp_name |
39
+ | --compare | | **Yes** | None | Run only specific comparison using comparison name | dcs_cli run --config-path config.yaml --compare comp_name |
40
+ | --save-json | -j | No | False | Save the data into a JSON file | dcs_cli run --config-path config.yaml --compare comp_name --save-json |
41
+ | --json-path | -jp | No | dcs_report.json | Specify the file path for JSON file | dcs_cli run --config-path config.yaml --compare comp_name --save-json --json-path output.json |
42
+ | --stats | | No | False | Print stats about data diff | dcs_cli run --config-path config.yaml --compare comp_name --stats |
43
+ | --url | | No | None | Specify url to send data to server | dcs_cli run --config-path config.yaml --compare comp_name --url https://compare/send/data |
44
+ | --html-report | | No | False | Save table as HTML | dcs_cli run --config-path config.yaml --compare comp_name --html-report |
45
+ | --report-path | | No | dcs_report.html | Specify the file path for HTML report | dcs_cli run --config-path config.yaml --compare comp_name --html-report --report-path table.html |
46
+
47
+
48
+
49
+ ### Example Command [CLI]
50
+
51
+ ```sh
52
+ $ dcs_cli --version
53
+
54
+ $ dcs_cli --help
55
+
56
+ $ dcs_cli run -C example.yaml --compare comparison_one --stats -j -jp output.json --html-report --report-path result.html --url https://compare/send/data
57
+ ```
58
+
59
+ <details>
60
+ <summary><h2>Example Configuration</h2></summary>
61
+
62
+ ```yml
63
+ data_sources:
64
+ - name: iris_snowflake
65
+ type: snowflake
66
+ id: f533c099-196f-48da-b231-1d4c380f84bf
67
+ workspace: default
68
+ connection:
69
+ account: bp54281.central-india.azure
70
+ username: !ENV ${SNOWFLAKE_USER}
71
+ password: !ENV ${SNOWFLAKE_PASS}
72
+ database: TEST_DCS
73
+ schema: PUBLIC
74
+ warehouse: compute_wh
75
+ role: accountadmin
76
+
77
+ - name: pgsql_azure
78
+ type: postgres
79
+ id: 4679b79a-7174-48fd-9c71-81cf806ef617
80
+ workspace: default
81
+ connection:
82
+ host: !ENV ${POSTGRES_HOST_ONE}
83
+ port: !ENV ${POSTGRES_PORT_ONE}
84
+ username: !ENV ${POSTGRES_USER_ONE}
85
+ password: !ENV ${POSTGRES_PASSWORD_ONE}
86
+ database: !ENV ${POSTGRES_DB_ONE}
87
+
88
+ - name: trino_test
89
+ type: trino
90
+ id: 9d86df86-6802-4551-a1ce-b98cdf3ec15f
91
+ workspace: default
92
+ connection:
93
+ host: localhost
94
+ port: 8080
95
+ username: admin
96
+ catalog: tpch
97
+ schema: sf100
98
+
99
+ - name: file_source_raw
100
+ id: b5a76a0a-1b8f-4222-a31d-a31740f23168
101
+ workspace: default
102
+ type: file
103
+ file_path: "nk.kyc_data/RAW_EMPLOYEE.csv"
104
+
105
+ - name: file_source_tl
106
+ id: 52c1f3c7-fd1e-4f3c-aed3-b01d8e1cfa4d
107
+ workspace: default
108
+ type: file
109
+ file_path: "TL_EMPLOYEE.csv"
110
+
111
+ - name: databricks_test
112
+ type: databricks
113
+ id: 6f1fd8d6-5a59-4ba5-be37-aec044b000e7
114
+ workspace: default
115
+ connection:
116
+ host: !ENV ${DATABRICKS_HOST}
117
+ port: !ENV ${DATABRICKS_PORT}
118
+ catalog: hive_metastore
119
+ schema: default
120
+ access_token: !ENV ${DATABRICKS_ACCESS_TOKEN}
121
+ http_path: !ENV ${DATABRICKS_HTTP_PATH}
122
+
123
+ comparisons:
124
+ # DB TO DB (SNOWFLAKE)
125
+ comparison_one:
126
+ source:
127
+ data_source: iris_snowflake
128
+ table: RAW_EMPLOYEE
129
+
130
+ target:
131
+ data_source: iris_snowflake
132
+ table: TL_EMPLOYEE
133
+ key_columns:
134
+ - CUSTID
135
+ columns:
136
+ - FIRSTNAME
137
+ - LASTNAME
138
+ - DESIGNATION
139
+ - SALARY
140
+
141
+ # DB TO DB (Postgres Azure)
142
+ comparison_two:
143
+ source:
144
+ data_source: pgsql_azure
145
+ table: actor
146
+ target:
147
+ data_source: pgsql_azure
148
+ table: actor2
149
+ key_columns:
150
+ - actor_id
151
+ columns:
152
+ - first_name
153
+ - last_name
154
+ - last_update
155
+ columns_mappings:
156
+ - source_column: actor_id
157
+ target_column: actor_id1
158
+ - source_column: first_name
159
+ target_column: first_name1
160
+ - source_column: last_name
161
+ target_column: last_name1
162
+ - source_column: last_update
163
+ target_column: last_update1
164
+
165
+ # FILE TO FILE
166
+ comparison_three:
167
+ source:
168
+ data_source: file_source_raw
169
+ table: RAW_EMPLOYEE
170
+
171
+ target:
172
+ data_source: file_source_tl
173
+ table: TL_EMPLOYEE
174
+ key_columns:
175
+ - custid
176
+ columns:
177
+ - FIRSTNAME
178
+ - lastname
179
+ - designation
180
+ - salary
181
+ columns_mappings:
182
+ - source_column: FIRSTNAME
183
+ target_column: firstname
184
+
185
+ # DB TO DB (Trino)
186
+ comparison_trino:
187
+ source:
188
+ data_source: trino_test
189
+ table: nation
190
+ target:
191
+ data_source: trino_test
192
+ table: region
193
+ key_columns:
194
+ - regionkey
195
+ columns:
196
+ - name
197
+
198
+ # DB TO DB (Databricks)
199
+ comparison_databricks:
200
+ source:
201
+ data_source: databricks_test
202
+ table: RAW_EMPLOYEE
203
+
204
+ target:
205
+ data_source: databricks_test
206
+ table: TL_EMPLOYEE
207
+ key_columns:
208
+ - custid
209
+ columns:
210
+ - FIRSTNAME
211
+ - lastname
212
+ - designation
213
+ - salary
214
+ columns_mappings:
215
+ - source_column: FIRSTNAME
216
+ target_column: firstname
217
+ ```
218
+ </details>
@@ -0,0 +1,201 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Iterator, Optional, Sequence, Tuple, Union
16
+
17
+ from data_diff.abcs.database_types import DbPath, DbTime
18
+ from data_diff.databases import Database
19
+ from data_diff.databases._connect import connect
20
+ from data_diff.diff_tables import Algorithm
21
+ from data_diff.hashdiff_tables import (
22
+ DEFAULT_BISECTION_FACTOR,
23
+ DEFAULT_BISECTION_THRESHOLD,
24
+ HashDiffer,
25
+ )
26
+ from data_diff.joindiff_tables import TABLE_WRITE_LIMIT, JoinDiffer
27
+ from data_diff.table_segment import TableSegment
28
+ from data_diff.utils import Vector, eval_name_template
29
+
30
+
31
+ def connect_to_table(
32
+ db_info: Union[str, dict],
33
+ table_name: Union[DbPath, str],
34
+ key_columns: str = ("id",),
35
+ thread_count: Optional[int] = 1,
36
+ **kwargs,
37
+ ) -> TableSegment:
38
+ """Connects to the given database, and creates a TableSegment instance
39
+
40
+ Parameters:
41
+ db_info: Either a URI string, or a dict of connection options.
42
+ table_name: Name of the table as a string, or a tuple that signifies the path.
43
+ key_columns: Names of the key columns
44
+ thread_count: Number of threads for this connection (only if using a threadpooled db implementation)
45
+
46
+ See Also:
47
+ :meth:`connect`
48
+ """
49
+ if isinstance(db_info, dict):
50
+ keys_to_remove = [k for k, v in db_info.items() if v is None]
51
+ for k in keys_to_remove:
52
+ db_info.pop(k)
53
+ if isinstance(key_columns, str):
54
+ key_columns = (key_columns,)
55
+
56
+ db: Database = connect(db_info, thread_count=thread_count)
57
+
58
+ if isinstance(table_name, str):
59
+ table_name = db.dialect.parse_table_name(table_name)
60
+
61
+ return TableSegment(db, table_name, key_columns, **kwargs)
62
+
63
+
64
+ def diff_tables(
65
+ table1: TableSegment,
66
+ table2: TableSegment,
67
+ *,
68
+ # Name of the key column, which uniquely identifies each row (usually id)
69
+ key_columns: Sequence[str] = None,
70
+ # Name of updated column, which signals that rows changed (usually updated_at or last_update)
71
+ update_column: str = None,
72
+ # Extra columns to compare
73
+ extra_columns: Tuple[str, ...] = None,
74
+ # Start/end key_column values, used to restrict the segment
75
+ min_key: Vector = None,
76
+ max_key: Vector = None,
77
+ # Start/end update_column values, used to restrict the segment
78
+ min_update: DbTime = None,
79
+ max_update: DbTime = None,
80
+ # Enable/disable threaded diffing. Needed to take advantage of database threads.
81
+ threaded: bool = True,
82
+ # Maximum size of each threadpool. None = auto. Only relevant when threaded is True.
83
+ # There may be many pools, so number of actual threads can be a lot higher.
84
+ max_threadpool_size: Optional[int] = 1,
85
+ # Algorithm
86
+ algorithm: Algorithm = Algorithm.AUTO,
87
+ # An additional 'where' expression to restrict the search space.
88
+ where: str = None,
89
+ # Into how many segments to bisect per iteration (hashdiff only)
90
+ bisection_factor: int = DEFAULT_BISECTION_FACTOR,
91
+ # When should we stop bisecting and compare locally (in row count; hashdiff only)
92
+ bisection_threshold: int = DEFAULT_BISECTION_THRESHOLD,
93
+ # Enable/disable validating that the key columns are unique. (joindiff only)
94
+ validate_unique_key: bool = True,
95
+ # Enable/disable sampling of exclusive rows. Creates a temporary table. (joindiff only)
96
+ sample_exclusive_rows: bool = False,
97
+ # Path of new table to write diff results to. Disabled if not provided. (joindiff only)
98
+ materialize_to_table: Union[str, DbPath] = None,
99
+ # Materialize every row, not just those that are different. (joindiff only)
100
+ materialize_all_rows: bool = False,
101
+ # Maximum number of rows to write when materializing, per thread. (joindiff only)
102
+ table_write_limit: int = TABLE_WRITE_LIMIT,
103
+ # Skips diffing any rows with null keys. (joindiff only)
104
+ skip_null_keys: bool = False,
105
+ ) -> Iterator:
106
+ """Finds the diff between table1 and table2.
107
+
108
+ Parameters:
109
+ key_columns (Tuple[str, ...]): Name of the key column, which uniquely identifies each row (usually id)
110
+ update_column (str, optional): Name of updated column, which signals that rows changed.
111
+ Usually updated_at or last_update. Used by `min_update` and `max_update`.
112
+ extra_columns (Tuple[str, ...], optional): Extra columns to compare
113
+ min_key (:data:`Vector`, optional): Lowest key value, used to restrict the segment
114
+ max_key (:data:`Vector`, optional): Highest key value, used to restrict the segment
115
+ min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment
116
+ max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment
117
+ threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads.
118
+ max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto.
119
+ Only relevant when `threaded` is ``True``.
120
+ There may be many pools, so number of actual threads can be a lot higher.
121
+ where (str, optional): An additional 'where' expression to restrict the search space.
122
+ algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`. Default=`AUTO`)
123
+ bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`)
124
+ bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download
125
+ and compare locally. (Used when algorithm is `HASHDIFF`).
126
+ validate_unique_key (bool): Enable/disable validating that the key columns are unique. (used for `JOINDIFF`. default: True)
127
+ Single query, and can't be threaded, so it's very slow on non-cloud dbs.
128
+ Future versions will detect UNIQUE constraints in the schema.
129
+ sample_exclusive_rows (bool): Enable/disable sampling of exclusive rows. Creates a temporary table. (used for `JOINDIFF`. default: False)
130
+ materialize_to_table (Union[str, DbPath], optional): Path of new table to write diff results to. Disabled if not provided. Used for `JOINDIFF`.
131
+ materialize_all_rows (bool): Materialize every row, not just those that are different. (used for `JOINDIFF`. default: False)
132
+ table_write_limit (int): Maximum number of rows to write when materializing, per thread.
133
+ skip_null_keys (bool): Skips diffing any rows with null PKs (displays a warning if any are null) (used for `JOINDIFF`. default: False)
134
+
135
+ Note:
136
+ The following parameters are used to override the corresponding attributes of the given :class:`TableSegment` instances:
137
+ `key_columns`, `update_column`, `extra_columns`, `min_key`, `max_key`, `where`.
138
+ If different values are needed per table, it's possible to omit them here, and instead set
139
+ them directly when creating each :class:`TableSegment`.
140
+
141
+ Example:
142
+ >>> table1 = connect_to_table('postgresql:///', 'Rating', 'id')
143
+ >>> list(diff_tables(table1, table1))
144
+ []
145
+
146
+ See Also:
147
+ :class:`TableSegment`
148
+ :class:`HashDiffer`
149
+ :class:`JoinDiffer`
150
+
151
+ """
152
+ if isinstance(key_columns, str):
153
+ key_columns = (key_columns,)
154
+
155
+ tables = [table1, table2]
156
+ override_attrs = {
157
+ k: v
158
+ for k, v in dict(
159
+ key_columns=key_columns,
160
+ update_column=update_column,
161
+ extra_columns=extra_columns,
162
+ min_key=min_key,
163
+ max_key=max_key,
164
+ min_update=min_update,
165
+ max_update=max_update,
166
+ where=where,
167
+ ).items()
168
+ if v is not None
169
+ }
170
+
171
+ segments = [t.new(**override_attrs) for t in tables] if override_attrs else tables
172
+
173
+ algorithm = Algorithm(algorithm)
174
+ if algorithm == Algorithm.AUTO:
175
+ algorithm = Algorithm.JOINDIFF if table1.database is table2.database else Algorithm.HASHDIFF
176
+
177
+ if algorithm == Algorithm.HASHDIFF:
178
+ differ = HashDiffer(
179
+ bisection_factor=bisection_factor,
180
+ bisection_threshold=bisection_threshold,
181
+ threaded=threaded,
182
+ max_threadpool_size=max_threadpool_size,
183
+ )
184
+ elif algorithm == Algorithm.JOINDIFF:
185
+ if isinstance(materialize_to_table, str):
186
+ table_name = eval_name_template(materialize_to_table)
187
+ materialize_to_table = table1.database.dialect.parse_table_name(table_name)
188
+ differ = JoinDiffer(
189
+ threaded=threaded,
190
+ max_threadpool_size=max_threadpool_size,
191
+ validate_unique_key=validate_unique_key,
192
+ sample_exclusive_rows=sample_exclusive_rows,
193
+ materialize_to_table=materialize_to_table,
194
+ materialize_all_rows=materialize_all_rows,
195
+ table_write_limit=table_write_limit,
196
+ skip_null_keys=skip_null_keys,
197
+ )
198
+ else:
199
+ raise ValueError(f"Unknown algorithm: {algorithm}")
200
+
201
+ return differ.diff_tables(*segments)