dcs-sdk 1.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. dcs_sdk-1.7.0/PKG-INFO +144 -0
  2. dcs_sdk-1.7.0/README.md +56 -0
  3. dcs_sdk-1.7.0/data_diff/__init__.py +219 -0
  4. dcs_sdk-1.7.0/data_diff/__main__.py +517 -0
  5. dcs_sdk-1.7.0/data_diff/abcs/__init__.py +13 -0
  6. dcs_sdk-1.7.0/data_diff/abcs/compiler.py +27 -0
  7. dcs_sdk-1.7.0/data_diff/abcs/database_types.py +402 -0
  8. dcs_sdk-1.7.0/data_diff/config.py +141 -0
  9. dcs_sdk-1.7.0/data_diff/databases/__init__.py +38 -0
  10. dcs_sdk-1.7.0/data_diff/databases/_connect.py +323 -0
  11. dcs_sdk-1.7.0/data_diff/databases/base.py +1417 -0
  12. dcs_sdk-1.7.0/data_diff/databases/bigquery.py +376 -0
  13. dcs_sdk-1.7.0/data_diff/databases/clickhouse.py +217 -0
  14. dcs_sdk-1.7.0/data_diff/databases/databricks.py +262 -0
  15. dcs_sdk-1.7.0/data_diff/databases/duckdb.py +207 -0
  16. dcs_sdk-1.7.0/data_diff/databases/mssql.py +343 -0
  17. dcs_sdk-1.7.0/data_diff/databases/mysql.py +189 -0
  18. dcs_sdk-1.7.0/data_diff/databases/oracle.py +238 -0
  19. dcs_sdk-1.7.0/data_diff/databases/postgresql.py +293 -0
  20. dcs_sdk-1.7.0/data_diff/databases/presto.py +222 -0
  21. dcs_sdk-1.7.0/data_diff/databases/redis.py +93 -0
  22. dcs_sdk-1.7.0/data_diff/databases/redshift.py +233 -0
  23. dcs_sdk-1.7.0/data_diff/databases/snowflake.py +222 -0
  24. dcs_sdk-1.7.0/data_diff/databases/sybase.py +720 -0
  25. dcs_sdk-1.7.0/data_diff/databases/trino.py +73 -0
  26. dcs_sdk-1.7.0/data_diff/databases/vertica.py +174 -0
  27. dcs_sdk-1.7.0/data_diff/diff_tables.py +489 -0
  28. dcs_sdk-1.7.0/data_diff/errors.py +17 -0
  29. dcs_sdk-1.7.0/data_diff/format.py +369 -0
  30. dcs_sdk-1.7.0/data_diff/hashdiff_tables.py +1026 -0
  31. dcs_sdk-1.7.0/data_diff/info_tree.py +76 -0
  32. dcs_sdk-1.7.0/data_diff/joindiff_tables.py +434 -0
  33. dcs_sdk-1.7.0/data_diff/lexicographic_space.py +253 -0
  34. dcs_sdk-1.7.0/data_diff/parse_time.py +88 -0
  35. dcs_sdk-1.7.0/data_diff/py.typed +0 -0
  36. dcs_sdk-1.7.0/data_diff/queries/__init__.py +13 -0
  37. dcs_sdk-1.7.0/data_diff/queries/api.py +213 -0
  38. dcs_sdk-1.7.0/data_diff/queries/ast_classes.py +811 -0
  39. dcs_sdk-1.7.0/data_diff/queries/base.py +38 -0
  40. dcs_sdk-1.7.0/data_diff/queries/extras.py +43 -0
  41. dcs_sdk-1.7.0/data_diff/query_utils.py +70 -0
  42. dcs_sdk-1.7.0/data_diff/schema.py +67 -0
  43. dcs_sdk-1.7.0/data_diff/table_segment.py +583 -0
  44. dcs_sdk-1.7.0/data_diff/thread_utils.py +112 -0
  45. dcs_sdk-1.7.0/data_diff/utils.py +1022 -0
  46. dcs_sdk-1.7.0/data_diff/version.py +15 -0
  47. dcs_sdk-1.7.0/dcs_core/__init__.py +13 -0
  48. dcs_sdk-1.7.0/dcs_core/__main__.py +17 -0
  49. dcs_sdk-1.7.0/dcs_core/__version__.py +15 -0
  50. dcs_sdk-1.7.0/dcs_core/cli/__init__.py +13 -0
  51. dcs_sdk-1.7.0/dcs_core/cli/cli.py +165 -0
  52. dcs_sdk-1.7.0/dcs_core/core/__init__.py +19 -0
  53. dcs_sdk-1.7.0/dcs_core/core/common/__init__.py +13 -0
  54. dcs_sdk-1.7.0/dcs_core/core/common/errors.py +68 -0
  55. dcs_sdk-1.7.0/dcs_core/core/common/models/__init__.py +13 -0
  56. dcs_sdk-1.7.0/dcs_core/core/common/models/configuration.py +293 -0
  57. dcs_sdk-1.7.0/dcs_core/core/common/models/dashboard.py +24 -0
  58. dcs_sdk-1.7.0/dcs_core/core/common/models/data_source_resource.py +75 -0
  59. dcs_sdk-1.7.0/dcs_core/core/common/models/metric.py +160 -0
  60. dcs_sdk-1.7.0/dcs_core/core/common/models/profile.py +75 -0
  61. dcs_sdk-1.7.0/dcs_core/core/common/models/validation.py +216 -0
  62. dcs_sdk-1.7.0/dcs_core/core/common/models/widget.py +44 -0
  63. dcs_sdk-1.7.0/dcs_core/core/configuration/__init__.py +13 -0
  64. dcs_sdk-1.7.0/dcs_core/core/configuration/config_loader.py +139 -0
  65. dcs_sdk-1.7.0/dcs_core/core/configuration/configuration_parser.py +263 -0
  66. dcs_sdk-1.7.0/dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  67. dcs_sdk-1.7.0/dcs_core/core/datasource/__init__.py +13 -0
  68. dcs_sdk-1.7.0/dcs_core/core/datasource/base.py +62 -0
  69. dcs_sdk-1.7.0/dcs_core/core/datasource/file_datasource.py +30 -0
  70. dcs_sdk-1.7.0/dcs_core/core/datasource/manager.py +128 -0
  71. dcs_sdk-1.7.0/dcs_core/core/datasource/search_datasource.py +421 -0
  72. dcs_sdk-1.7.0/dcs_core/core/datasource/sql_datasource.py +1094 -0
  73. dcs_sdk-1.7.0/dcs_core/core/inspect.py +162 -0
  74. dcs_sdk-1.7.0/dcs_core/core/logger/__init__.py +13 -0
  75. dcs_sdk-1.7.0/dcs_core/core/logger/base.py +32 -0
  76. dcs_sdk-1.7.0/dcs_core/core/logger/default_logger.py +94 -0
  77. dcs_sdk-1.7.0/dcs_core/core/metric/__init__.py +13 -0
  78. dcs_sdk-1.7.0/dcs_core/core/metric/base.py +220 -0
  79. dcs_sdk-1.7.0/dcs_core/core/metric/combined_metric.py +98 -0
  80. dcs_sdk-1.7.0/dcs_core/core/metric/custom_metric.py +34 -0
  81. dcs_sdk-1.7.0/dcs_core/core/metric/manager.py +137 -0
  82. dcs_sdk-1.7.0/dcs_core/core/metric/numeric_metric.py +403 -0
  83. dcs_sdk-1.7.0/dcs_core/core/metric/reliability_metric.py +90 -0
  84. dcs_sdk-1.7.0/dcs_core/core/profiling/__init__.py +13 -0
  85. dcs_sdk-1.7.0/dcs_core/core/profiling/datasource_profiling.py +136 -0
  86. dcs_sdk-1.7.0/dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  87. dcs_sdk-1.7.0/dcs_core/core/profiling/text_field_profiling.py +67 -0
  88. dcs_sdk-1.7.0/dcs_core/core/repository/__init__.py +13 -0
  89. dcs_sdk-1.7.0/dcs_core/core/repository/metric_repository.py +77 -0
  90. dcs_sdk-1.7.0/dcs_core/core/utils/__init__.py +13 -0
  91. dcs_sdk-1.7.0/dcs_core/core/utils/log.py +29 -0
  92. dcs_sdk-1.7.0/dcs_core/core/utils/tracking.py +105 -0
  93. dcs_sdk-1.7.0/dcs_core/core/utils/utils.py +44 -0
  94. dcs_sdk-1.7.0/dcs_core/core/validation/__init__.py +13 -0
  95. dcs_sdk-1.7.0/dcs_core/core/validation/base.py +230 -0
  96. dcs_sdk-1.7.0/dcs_core/core/validation/completeness_validation.py +153 -0
  97. dcs_sdk-1.7.0/dcs_core/core/validation/custom_query_validation.py +24 -0
  98. dcs_sdk-1.7.0/dcs_core/core/validation/manager.py +282 -0
  99. dcs_sdk-1.7.0/dcs_core/core/validation/numeric_validation.py +276 -0
  100. dcs_sdk-1.7.0/dcs_core/core/validation/reliability_validation.py +91 -0
  101. dcs_sdk-1.7.0/dcs_core/core/validation/uniqueness_validation.py +61 -0
  102. dcs_sdk-1.7.0/dcs_core/core/validation/validity_validation.py +738 -0
  103. dcs_sdk-1.7.0/dcs_core/integrations/__init__.py +13 -0
  104. dcs_sdk-1.7.0/dcs_core/integrations/databases/__init__.py +13 -0
  105. dcs_sdk-1.7.0/dcs_core/integrations/databases/azure_blob.py +217 -0
  106. dcs_sdk-1.7.0/dcs_core/integrations/databases/bigquery.py +187 -0
  107. dcs_sdk-1.7.0/dcs_core/integrations/databases/databricks.py +51 -0
  108. dcs_sdk-1.7.0/dcs_core/integrations/databases/db2.py +652 -0
  109. dcs_sdk-1.7.0/dcs_core/integrations/databases/duck_db.py +72 -0
  110. dcs_sdk-1.7.0/dcs_core/integrations/databases/elasticsearch.py +61 -0
  111. dcs_sdk-1.7.0/dcs_core/integrations/databases/mssql.py +979 -0
  112. dcs_sdk-1.7.0/dcs_core/integrations/databases/mysql.py +409 -0
  113. dcs_sdk-1.7.0/dcs_core/integrations/databases/opensearch.py +64 -0
  114. dcs_sdk-1.7.0/dcs_core/integrations/databases/oracle.py +719 -0
  115. dcs_sdk-1.7.0/dcs_core/integrations/databases/postgres.py +570 -0
  116. dcs_sdk-1.7.0/dcs_core/integrations/databases/redshift.py +53 -0
  117. dcs_sdk-1.7.0/dcs_core/integrations/databases/snowflake.py +48 -0
  118. dcs_sdk-1.7.0/dcs_core/integrations/databases/spark_df.py +111 -0
  119. dcs_sdk-1.7.0/dcs_core/integrations/databases/sybase.py +1069 -0
  120. dcs_sdk-1.7.0/dcs_core/integrations/storage/__init__.py +13 -0
  121. dcs_sdk-1.7.0/dcs_core/integrations/storage/local_file.py +149 -0
  122. dcs_sdk-1.7.0/dcs_core/integrations/utils/__init__.py +13 -0
  123. dcs_sdk-1.7.0/dcs_core/integrations/utils/utils.py +36 -0
  124. dcs_sdk-1.7.0/dcs_core/report/__init__.py +13 -0
  125. dcs_sdk-1.7.0/dcs_core/report/dashboard.py +211 -0
  126. dcs_sdk-1.7.0/dcs_core/report/models.py +88 -0
  127. dcs_sdk-1.7.0/dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  128. dcs_sdk-1.7.0/dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  129. dcs_sdk-1.7.0/dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  130. dcs_sdk-1.7.0/dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  131. dcs_sdk-1.7.0/dcs_core/report/static/assets/images/docs.svg +6 -0
  132. dcs_sdk-1.7.0/dcs_core/report/static/assets/images/github.svg +4 -0
  133. dcs_sdk-1.7.0/dcs_core/report/static/assets/images/logo.svg +7 -0
  134. dcs_sdk-1.7.0/dcs_core/report/static/assets/images/slack.svg +13 -0
  135. dcs_sdk-1.7.0/dcs_core/report/static/index.js +2 -0
  136. dcs_sdk-1.7.0/dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  137. dcs_sdk-1.7.0/dcs_sdk/__init__.py +13 -0
  138. dcs_sdk-1.7.0/dcs_sdk/__main__.py +18 -0
  139. dcs_sdk-1.7.0/dcs_sdk/__version__.py +15 -0
  140. dcs_sdk-1.7.0/dcs_sdk/cli/__init__.py +13 -0
  141. dcs_sdk-1.7.0/dcs_sdk/cli/cli.py +163 -0
  142. dcs_sdk-1.7.0/dcs_sdk/sdk/__init__.py +58 -0
  143. dcs_sdk-1.7.0/dcs_sdk/sdk/config/__init__.py +13 -0
  144. dcs_sdk-1.7.0/dcs_sdk/sdk/config/config_loader.py +504 -0
  145. dcs_sdk-1.7.0/dcs_sdk/sdk/data_diff/__init__.py +13 -0
  146. dcs_sdk-1.7.0/dcs_sdk/sdk/data_diff/data_differ.py +874 -0
  147. dcs_sdk-1.7.0/dcs_sdk/sdk/rules/__init__.py +15 -0
  148. dcs_sdk-1.7.0/dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  149. dcs_sdk-1.7.0/dcs_sdk/sdk/rules/rules_repository.py +214 -0
  150. dcs_sdk-1.7.0/dcs_sdk/sdk/rules/schema_rules.py +65 -0
  151. dcs_sdk-1.7.0/dcs_sdk/sdk/utils/__init__.py +13 -0
  152. dcs_sdk-1.7.0/dcs_sdk/sdk/utils/serializer.py +25 -0
  153. dcs_sdk-1.7.0/dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  154. dcs_sdk-1.7.0/dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  155. dcs_sdk-1.7.0/dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  156. dcs_sdk-1.7.0/dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  157. dcs_sdk-1.7.0/dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  158. dcs_sdk-1.7.0/dcs_sdk/sdk/utils/table.py +475 -0
  159. dcs_sdk-1.7.0/dcs_sdk/sdk/utils/themes.py +40 -0
  160. dcs_sdk-1.7.0/dcs_sdk/sdk/utils/utils.py +485 -0
  161. dcs_sdk-1.7.0/pyproject.toml +168 -0
dcs_sdk-1.7.0/PKG-INFO ADDED
@@ -0,0 +1,144 @@
1
+ Metadata-Version: 2.4
2
+ Name: dcs-sdk
3
+ Version: 1.7.0
4
+ Summary: SDK for DataChecks
5
+ Author: Waterdip Labs
6
+ Author-email: hello@waterdip.ai
7
+ Requires-Python: >=3.10,<3.13
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Provides-Extra: all-dbs
13
+ Provides-Extra: bigquery
14
+ Provides-Extra: clickhouse
15
+ Provides-Extra: databricks
16
+ Provides-Extra: db2
17
+ Provides-Extra: elasticsearch
18
+ Provides-Extra: impyla
19
+ Provides-Extra: mssql
20
+ Provides-Extra: mysql
21
+ Provides-Extra: opensearch
22
+ Provides-Extra: oracle
23
+ Provides-Extra: postgresql
24
+ Provides-Extra: preql
25
+ Provides-Extra: presto
26
+ Provides-Extra: redshift
27
+ Provides-Extra: snowflake
28
+ Provides-Extra: spark
29
+ Provides-Extra: sybase
30
+ Provides-Extra: trino
31
+ Provides-Extra: vertica
32
+ Requires-Dist: attrs (>=23.1.0)
33
+ Requires-Dist: azure-identity (>=1.25.1,<2.0.0)
34
+ Requires-Dist: azure-storage-blob (>=12.27.1,<13.0.0)
35
+ Requires-Dist: click (>=8.1)
36
+ Requires-Dist: clickhouse-driver (>=0.2.9) ; extra == "clickhouse" or extra == "all-dbs"
37
+ Requires-Dist: cryptography (>=44.0.1) ; extra == "snowflake" or extra == "all-dbs"
38
+ Requires-Dist: databricks-sql-connector (>=3.3.0,<4.0.0) ; extra == "databricks" or extra == "all-dbs"
39
+ Requires-Dist: dsnparse (<0.2.0)
40
+ Requires-Dist: duckdb (>=0.9.0)
41
+ Requires-Dist: elasticsearch (>=9.1.0,<10.0.0) ; extra == "elasticsearch" or extra == "all-dbs"
42
+ Requires-Dist: google-cloud-bigquery (>=3.31.0,<4.0.0) ; extra == "bigquery" or extra == "all-dbs"
43
+ Requires-Dist: h11 (>=0.16.0,<0.17.0)
44
+ Requires-Dist: ibm-db (>=3.2.3,<4.0.0) ; extra == "db2" or extra == "all-dbs"
45
+ Requires-Dist: ibm-db-sa (>=0.4.1,<0.5.0) ; extra == "db2" or extra == "all-dbs"
46
+ Requires-Dist: impyla (>=0.20.0,<0.21.0) ; extra == "impyla" or extra == "all-dbs"
47
+ Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
48
+ Requires-Dist: keyring (>=25.3.0)
49
+ Requires-Dist: loguru (==0.7.2)
50
+ Requires-Dist: mashumaro[msgpack] (>=2.9,<3.11.0)
51
+ Requires-Dist: mysql-connector-python (>=9.0.1) ; extra == "mysql" or extra == "all-dbs"
52
+ Requires-Dist: nltk (>=3.9.1,<4.0.0)
53
+ Requires-Dist: numpy (==1.26.4)
54
+ Requires-Dist: opensearch-py (>=2.2.0,<3.0.0) ; extra == "opensearch" or extra == "all-dbs"
55
+ Requires-Dist: oracledb (>=2.4.1) ; extra == "oracle" or extra == "all-dbs"
56
+ Requires-Dist: packaging (>=24.1,<25.0)
57
+ Requires-Dist: preql (>=0.2.19) ; extra == "preql" or extra == "all-dbs"
58
+ Requires-Dist: presto-python-client (>=0.8.4) ; extra == "presto" or extra == "all-dbs"
59
+ Requires-Dist: protobuf (>=5.29.5,<6.0.0)
60
+ Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgresql" or extra == "redshift" or extra == "all-dbs"
61
+ Requires-Dist: pydantic (>=1.10.12)
62
+ Requires-Dist: pymysql[rsa] (>=1.1.0,<2.0.0) ; extra == "mysql" or extra == "all-dbs"
63
+ Requires-Dist: pyodbc (>=4.0.39) ; extra == "mssql" or extra == "sybase" or extra == "all-dbs"
64
+ Requires-Dist: pyparsing (>=3.1.1,<4.0.0)
65
+ Requires-Dist: pyspark (>=3.2.1,<4.0.0) ; extra == "spark" or extra == "all-dbs"
66
+ Requires-Dist: python-dateutil (>=2.8.2,<3.0.0)
67
+ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
68
+ Requires-Dist: pytz (>=2024.1)
69
+ Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
70
+ Requires-Dist: redis[hiredis] (>=5.2.1,<6.0.0)
71
+ Requires-Dist: requests (>=2.32.4,<3.0.0)
72
+ Requires-Dist: rich (>=13.8.0)
73
+ Requires-Dist: setuptools (>=78.1.1)
74
+ Requires-Dist: snowflake-connector-python (>=3.17.2) ; extra == "snowflake" or extra == "all-dbs"
75
+ Requires-Dist: snowflake-sqlalchemy (>=1.5.3,<2.0.0) ; extra == "snowflake" or extra == "all-dbs"
76
+ Requires-Dist: sqlalchemy (>=2.0.14,<2.1.0)
77
+ Requires-Dist: sqlalchemy-bigquery (>=1.8.0,<2.0.0) ; extra == "bigquery" or extra == "all-dbs"
78
+ Requires-Dist: sqlalchemy-sybase (>=2.0.0,<3.0.0) ; extra == "sybase" or extra == "all-dbs"
79
+ Requires-Dist: tabulate (>=0.9.0)
80
+ Requires-Dist: toml (>=0.10.2)
81
+ Requires-Dist: tornado (>=6.5,<7.0)
82
+ Requires-Dist: trino (>=0.314.0) ; extra == "trino" or extra == "all-dbs"
83
+ Requires-Dist: typing-extensions (>=4.0.1)
84
+ Requires-Dist: urllib3 (>=2.5.0,<3.0.0)
85
+ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-dbs"
86
+ Description-Content-Type: text/markdown
87
+
88
+ <h1 align="center">
89
+ DCS SDK v1.7.0
90
+ </h1>
91
+
92
+ > SDK for DataChecks
93
+
94
+ ## Installation
95
+
96
+ > Python version `>=3.10,<3.13`
97
+
98
+ ```bash
99
+
100
+ $ pip install dcs-sdk[all-dbs]
101
+
102
+ ```
103
+
104
+ ## Supported Databases
105
+
106
+ > Availability Status
107
+
108
+ | Database | Code Name | Supported |
109
+ | ----------------- | ------------ | --------- |
110
+ | PostgreSQL | `postgres` | ✅ |
111
+ | Snowflake | `snowflake` | ✅ |
112
+ | Trino | `trino` | ✅ |
113
+ | Databricks | `databricks` | ✅ |
114
+ | Oracle | `oracle` | ✅ |
115
+ | MSSQL | `mssql` | ✅ |
116
+ | MySQL | `mysql` | ✅ |
117
+ | SAP Sybase IQ/ASE | `sybase` | ✅ |
118
+ | File | `file` | ✅ |
119
+ | BigQuery | `bigquery` | ✅ |
120
+
121
+ ## Available Commands
122
+
123
+ | Option | Short Option | Required | Default | Description | Example |
124
+ | :-----------: | :----------: | :------: | :-------------: | :------------------------------------------------: | :------------------------------------------------------------------------------------------------------: |
125
+ | --config-path | -C | **Yes** | None | Specify the file path for the configuration | dcs-sdk run --config-path config.yaml --compare comp_name |
126
+ | --compare | | **Yes** | None | Run only specific comparison using comparison name | dcs-sdk run --config-path config.yaml --compare comp_name |
127
+ | --save-json | -j | No | False | Save the data into a JSON file | dcs-sdk run --config-path config.yaml --compare comp_name --save-json |
128
+ | --json-path | -jp | No | dcs_report.json | Specify the file path for JSON file | dcs-sdk run --config-path config.yaml --compare comp_name --save-json --json-path output.json |
129
+ | --stats | | No | False | Print stats about data diff | dcs-sdk run --config-path config.yaml --compare comp_name --stats |
130
+ | --url | | No | None | Specify the URL to send data to the server | dcs-sdk run --config-path config.yaml --compare comp_name --url=https://compare/send/data |
131
+ | --html-report | | No | False | Save table as HTML | dcs-sdk run --config-path config.yaml --compare comp_name --html-report |
132
+ | --report-path | | No | dcs_report.html | Specify the file path for HTML report | dcs-sdk run --config-path config.yaml --compare comp_name --html-report --report-path table.html |
133
+ | --table | | No | False | Display Comparison in table format | dcs-sdk run --config-path config.yaml --compare comp_name --html-report --report-path table.html --table |
134
+
135
+ ### Example Command [CLI]
136
+
137
+ ```sh
138
+ $ dcs-sdk --version
139
+
140
+ $ dcs-sdk --help
141
+
142
+ $ dcs-sdk run -C example.yaml --compare comparison_one --stats -j -jp output.json --html-report --report-path result.html --table --url=https://compare/send/data
143
+ ```
144
+
@@ -0,0 +1,56 @@
1
+ <h1 align="center">
2
+ DCS SDK v1.7.0
3
+ </h1>
4
+
5
+ > SDK for DataChecks
6
+
7
+ ## Installation
8
+
9
+ > Python version `>=3.10,<3.13`
10
+
11
+ ```bash
12
+
13
+ $ pip install dcs-sdk[all-dbs]
14
+
15
+ ```
16
+
17
+ ## Supported Databases
18
+
19
+ > Availability Status
20
+
21
+ | Database | Code Name | Supported |
22
+ | ----------------- | ------------ | --------- |
23
+ | PostgreSQL | `postgres` | ✅ |
24
+ | Snowflake | `snowflake` | ✅ |
25
+ | Trino | `trino` | ✅ |
26
+ | Databricks | `databricks` | ✅ |
27
+ | Oracle | `oracle` | ✅ |
28
+ | MSSQL | `mssql` | ✅ |
29
+ | MySQL | `mysql` | ✅ |
30
+ | SAP Sybase IQ/ASE | `sybase` | ✅ |
31
+ | File | `file` | ✅ |
32
+ | BigQuery | `bigquery` | ✅ |
33
+
34
+ ## Available Commands
35
+
36
+ | Option | Short Option | Required | Default | Description | Example |
37
+ | :-----------: | :----------: | :------: | :-------------: | :------------------------------------------------: | :------------------------------------------------------------------------------------------------------: |
38
+ | --config-path | -C | **Yes** | None | Specify the file path for the configuration | dcs-sdk run --config-path config.yaml --compare comp_name |
39
+ | --compare | | **Yes** | None | Run only specific comparison using comparison name | dcs-sdk run --config-path config.yaml --compare comp_name |
40
+ | --save-json | -j | No | False | Save the data into a JSON file | dcs-sdk run --config-path config.yaml --compare comp_name --save-json |
41
+ | --json-path | -jp | No | dcs_report.json | Specify the file path for JSON file | dcs-sdk run --config-path config.yaml --compare comp_name --save-json --json-path output.json |
42
+ | --stats | | No | False | Print stats about data diff | dcs-sdk run --config-path config.yaml --compare comp_name --stats |
43
+ | --url | | No | None | Specify the URL to send data to the server | dcs-sdk run --config-path config.yaml --compare comp_name --url=https://compare/send/data |
44
+ | --html-report | | No | False | Save table as HTML | dcs-sdk run --config-path config.yaml --compare comp_name --html-report |
45
+ | --report-path | | No | dcs_report.html | Specify the file path for HTML report | dcs-sdk run --config-path config.yaml --compare comp_name --html-report --report-path table.html |
46
+ | --table | | No | False | Display Comparison in table format | dcs-sdk run --config-path config.yaml --compare comp_name --html-report --report-path table.html --table |
47
+
48
+ ### Example Command [CLI]
49
+
50
+ ```sh
51
+ $ dcs-sdk --version
52
+
53
+ $ dcs-sdk --help
54
+
55
+ $ dcs-sdk run -C example.yaml --compare comparison_one --stats -j -jp output.json --html-report --report-path result.html --table --url=https://compare/send/data
56
+ ```
@@ -0,0 +1,219 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Iterator, Optional, Sequence, Tuple, Union
16
+
17
+ from data_diff.abcs.database_types import DbPath, DbTime
18
+ from data_diff.databases import Database
19
+ from data_diff.databases._connect import connect
20
+ from data_diff.diff_tables import Algorithm
21
+ from data_diff.hashdiff_tables import (
22
+ DEAFULT_TIMEOUT,
23
+ DEFAULT_BISECTION_FACTOR,
24
+ DEFAULT_BISECTION_THRESHOLD,
25
+ DEFAULT_ENGRESS_LIMIT,
26
+ DEFAULT_PER_COLUMN_DIFF_LIMIT,
27
+ HashDiffer,
28
+ )
29
+ from data_diff.joindiff_tables import TABLE_WRITE_LIMIT, JoinDiffer
30
+ from data_diff.table_segment import TableSegment
31
+ from data_diff.utils import Vector, eval_name_template
32
+
33
+
34
def connect_to_table(
    db_info: Union[str, dict],
    table_name: Union[DbPath, str],
    key_columns: Union[str, Sequence[str]] = ("id",),
    thread_count: Optional[int] = 1,
    **kwargs,
) -> TableSegment:
    """Connects to the given database, and creates a TableSegment instance

    Parameters:
        db_info: Either a URI string, or a dict of connection options.
            When a dict is given, entries whose value is ``None`` are ignored.
            The caller's dict is not modified.
        table_name: Name of the table as a string, or a tuple that signifies the path.
        key_columns: Name of the key column, or a sequence of key column names.
            A single string is wrapped into a one-element tuple.
        thread_count: Number of threads for this connection (only if using a threadpooled db implementation)
        **kwargs: Extra keyword arguments, forwarded unchanged to :class:`TableSegment`.

    Returns:
        A :class:`TableSegment` bound to the connected database and the parsed table path.

    See Also:
        :meth:`connect`
    """
    if isinstance(db_info, dict):
        # Filter into a new dict instead of popping in place, so the mapping
        # owned by the caller is left untouched.
        db_info = {k: v for k, v in db_info.items() if v is not None}

    if isinstance(key_columns, str):
        key_columns = (key_columns,)

    db: Database = connect(db_info, thread_count=thread_count)

    if isinstance(table_name, str):
        # Let the dialect split a dotted/quoted name into a path.
        table_name = db.dialect.parse_table_name(table_name)

    return TableSegment(db, table_name, key_columns, **kwargs)
63
+
64
+
65
def diff_tables(
    table1: TableSegment,
    table2: TableSegment,
    *,
    # Name of the key column, which uniquely identifies each row (usually id)
    key_columns: Optional[Sequence[str]] = None,
    # Name of updated column, which signals that rows changed (usually updated_at or last_update)
    update_column: Optional[str] = None,
    # Extra columns to compare
    extra_columns: Optional[Tuple[str, ...]] = None,
    # Start/end key_column values, used to restrict the segment
    min_key: Optional[Vector] = None,
    max_key: Optional[Vector] = None,
    # Start/end update_column values, used to restrict the segment
    min_update: Optional[DbTime] = None,
    max_update: Optional[DbTime] = None,
    # Enable/disable threaded diffing. Needed to take advantage of database threads.
    threaded: bool = True,
    # Maximum size of each threadpool. None = auto. Only relevant when threaded is True.
    # There may be many pools, so number of actual threads can be a lot higher.
    max_threadpool_size: Optional[int] = 1,
    # Algorithm
    algorithm: Algorithm = Algorithm.AUTO,
    # An additional 'where' expression to restrict the search space.
    where: Optional[str] = None,
    # Into how many segments to bisect per iteration (hashdiff only)
    bisection_factor: int = DEFAULT_BISECTION_FACTOR,
    # When should we stop bisecting and compare locally (in row count; hashdiff only)
    bisection_threshold: int = DEFAULT_BISECTION_THRESHOLD,
    # Enable/disable validating that the key columns are unique. (joindiff only)
    validate_unique_key: bool = True,
    # Enable/disable sampling of exclusive rows. Creates a temporary table. (joindiff only)
    sample_exclusive_rows: bool = False,
    # Path of new table to write diff results to. Disabled if not provided. (joindiff only)
    materialize_to_table: Optional[Union[str, DbPath]] = None,
    # Materialize every row, not just those that are different. (joindiff only)
    materialize_all_rows: bool = False,
    # Maximum number of rows to write when materializing, per thread. (joindiff only)
    table_write_limit: int = TABLE_WRITE_LIMIT,
    # Skips diffing any rows with null keys. (joindiff only)
    skip_null_keys: bool = False,
    # Type check
    strict: bool = True,
    # Maximum number diff per column
    per_column_diff_limit: int = DEFAULT_PER_COLUMN_DIFF_LIMIT,
    # Maximum number of rows to download
    egress_limit: int = DEFAULT_ENGRESS_LIMIT,
    # Timeout limit in minutes
    # (used for diffing large tables, to avoid long-running queries)
    timeout_limit: int = DEAFULT_TIMEOUT,
    in_memory_diff: bool = False,
) -> Iterator:
    """Finds the diff between table1 and table2.

    Parameters:
        key_columns (Tuple[str, ...]): Name of the key column, which uniquely identifies each row (usually id)
        update_column (str, optional): Name of updated column, which signals that rows changed.
            Usually updated_at or last_update. Used by `min_update` and `max_update`.
        extra_columns (Tuple[str, ...], optional): Extra columns to compare
        min_key (:data:`Vector`, optional): Lowest key value, used to restrict the segment
        max_key (:data:`Vector`, optional): Highest key value, used to restrict the segment
        min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment
        max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment
        threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads.
        max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto.
            Only relevant when `threaded` is ``True``.
            There may be many pools, so number of actual threads can be a lot higher.
        where (str, optional): An additional 'where' expression to restrict the search space.
        algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`. Default=`AUTO`).
            With `AUTO`, `JOINDIFF` is chosen when both tables share the same database object,
            and `HASHDIFF` otherwise.
        bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`)
        bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download
            and compare locally. (Used when algorithm is `HASHDIFF`).
        validate_unique_key (bool): Enable/disable validating that the key columns are unique. (used for `JOINDIFF`. default: True)
            Single query, and can't be threaded, so it's very slow on non-cloud dbs.
            Future versions will detect UNIQUE constraints in the schema.
        sample_exclusive_rows (bool): Enable/disable sampling of exclusive rows. Creates a temporary table. (used for `JOINDIFF`. default: False)
        materialize_to_table (Union[str, DbPath], optional): Path of new table to write diff results to. Disabled if not provided. Used for `JOINDIFF`.
        materialize_all_rows (bool): Materialize every row, not just those that are different. (used for `JOINDIFF`. default: False)
        table_write_limit (int): Maximum number of rows to write when materializing, per thread.
        skip_null_keys (bool): Skips diffing any rows with null PKs (displays a warning if any are null) (used for `JOINDIFF`. default: False)
        strict (bool): Type check. (Passed to `HASHDIFF`)
        per_column_diff_limit (int): Maximum number of diffs per column. (Passed to `HASHDIFF`)
        egress_limit (int): Maximum number of rows to download. (Passed to `HASHDIFF`)
        timeout_limit (int): Timeout limit in minutes, used for diffing large tables
            to avoid long-running queries. (Passed to `HASHDIFF`)
        in_memory_diff (bool): Forwarded as-is to :class:`HashDiffer`. (Passed to `HASHDIFF`)

    Returns:
        The iterator produced by the selected differ's ``diff_tables`` method.

    Raises:
        ValueError: If `algorithm` is not `AUTO`, `HASHDIFF` or `JOINDIFF`.

    Note:
        The following parameters are used to override the corresponding attributes of the given :class:`TableSegment` instances:
        `key_columns`, `update_column`, `extra_columns`, `min_key`, `max_key`, `min_update`, `max_update`, `where`.
        If different values are needed per table, it's possible to omit them here, and instead set
        them directly when creating each :class:`TableSegment`.

    Example:
        >>> table1 = connect_to_table('postgresql:///', 'Rating', 'id')
        >>> list(diff_tables(table1, table1))
        []

    See Also:
        :class:`TableSegment`
        :class:`HashDiffer`
        :class:`JoinDiffer`

    """
    if isinstance(key_columns, str):
        key_columns = (key_columns,)

    # Only explicitly provided values override the segments' own attributes.
    requested_overrides = dict(
        key_columns=key_columns,
        update_column=update_column,
        extra_columns=extra_columns,
        min_key=min_key,
        max_key=max_key,
        min_update=min_update,
        max_update=max_update,
        where=where,
    )
    override_attrs = {name: value for name, value in requested_overrides.items() if value is not None}

    if override_attrs:
        segment1, segment2 = table1.new(**override_attrs), table2.new(**override_attrs)
    else:
        segment1, segment2 = table1, table2

    algorithm = Algorithm(algorithm)
    if algorithm == Algorithm.AUTO:
        # Same database object on both sides: a join can be done in-database.
        algorithm = Algorithm.JOINDIFF if table1.database is table2.database else Algorithm.HASHDIFF

    if algorithm == Algorithm.HASHDIFF:
        differ = HashDiffer(
            bisection_factor=bisection_factor,
            bisection_threshold=bisection_threshold,
            threaded=threaded,
            max_threadpool_size=max_threadpool_size,
            strict=strict,
            # Count the segments that are actually diffed, so that overrides
            # such as `where` / `min_key` / `max_key` are reflected in the counts.
            t1_row_count=segment1.count(),
            t2_row_count=segment2.count(),
            per_column_diff_limit=per_column_diff_limit,
            egress_limit=egress_limit,
            timeout_limit=timeout_limit,
            in_memory_diff=in_memory_diff,
        )
    elif algorithm == Algorithm.JOINDIFF:
        if isinstance(materialize_to_table, str):
            # Expand the name template, then let the dialect turn it into a path.
            table_name = eval_name_template(materialize_to_table)
            materialize_to_table = table1.database.dialect.parse_table_name(table_name)
        differ = JoinDiffer(
            threaded=threaded,
            max_threadpool_size=max_threadpool_size,
            validate_unique_key=validate_unique_key,
            sample_exclusive_rows=sample_exclusive_rows,
            materialize_to_table=materialize_to_table,
            materialize_all_rows=materialize_all_rows,
            table_write_limit=table_write_limit,
            skip_null_keys=skip_null_keys,
        )
    else:
        raise ValueError(f"Unknown algorithm: {algorithm}")

    return differ.diff_tables(segment1, segment2)