dcs-sdk 1.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
data_diff/version.py ADDED
@@ -0,0 +1,15 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ __version__ = "0.11.2"
dcs_core/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
dcs_core/__main__.py ADDED
@@ -0,0 +1,17 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dcs_core.cli.cli import main
15
+
16
+ if __name__ == "__main__":
17
+ main()
dcs_core/__version__.py ADDED
@@ -0,0 +1,15 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ __version__ = "0.9.9"
dcs_core/cli/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
dcs_core/cli/cli.py ADDED
@@ -0,0 +1,165 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+ import sys
16
+ import traceback
17
+ import uuid
18
+ import warnings
19
+ from typing import Union
20
+
21
+ import click
22
+ from loguru import logger
23
+ from rich import print
24
+ from rich.table import Table, Text
25
+
26
+ from dcs_core.__version__ import __version__
27
+ from dcs_core.core import Configuration, Inspect
28
+ from dcs_core.core.configuration.configuration_parser import load_configuration
29
+
30
+ # from datachecks.core.common.models.metric import DataSourceMetrics
31
+ from dcs_core.core.inspect import InspectOutput
32
+ from dcs_core.report.dashboard import DashboardInfoBuilder, html_template
33
+ from dcs_core.report.models import TemplateParams
34
+
35
# Reset loguru's handlers, then log to stderr at WARNING level and above only.
logger.remove()
logger.add(sys.stderr, level="WARNING")

# Suppress all Python warnings for the whole process once this module is imported.
warnings.filterwarnings("ignore")
38
+
39
+
40
# The version is taken from the package's own ``__version__`` instead of asking
# the installed-distribution metadata for a "datachecks" package, which fails
# at ``--version`` time when no distribution of that name is installed.
@click.version_option(version=__version__, prog_name="datachecks")
@click.group(help=f"Datachecks CLI version {__version__}")
def main():
    """Root click group of the CLI; sub-commands are registered on it."""
44
+
45
+
46
@main.command(
    short_help="Starts the datachecks inspection",
)
@click.option(
    "-C",
    "--config-path",
    required=True,
    default=None,
    help="Specify the file path for configuration",
)
# Disabled for now TODO: Enable in future for validations
# @click.option(
#     "--auto-profile",
#     is_flag=True,
#     help="Specify if the inspection should do auto-profile of all data sources",
# )
# @click.option(
#     "--html-report",
#     is_flag=True,
#     help="Specify if the inspection should generate HTML report",
# )
# @click.option(
#     "--report-path",
#     required=False,
#     default="datachecks_report.html",
#     help="Specify the file path for HTML report",
# )
def inspect(
    config_path: Union[str, None],
    # auto_profile: bool = False,  # Disabled for now
    # html_report: bool = False,
    # report_path: str = "datachecks_report.html",
):
    """
    Starts the datachecks inspection
    """
    # NOTE: the docstring above is left untouched on purpose; click uses a
    # command's docstring as its help text when no ``help=`` is passed.
    #
    # Flow: check the configuration file exists, load it, run the inspection,
    # print the validation table and exit with status 0. Any ``Exception`` is
    # reported in red and turned into exit status 1.

    # Local import so this block does not rely on a new module-level import.
    from rich.markup import escape

    try:
        if not os.path.exists(config_path):
            raise Exception(f"Invalid value for '-C' / '--config-path': File '{config_path}' does not exist.")
        configuration: Configuration = load_configuration(config_path)
        inspector = Inspect(configuration=configuration)

        print("Starting [bold blue]datachecks[/bold blue] inspection...", ":zap:")
        output: InspectOutput = inspector.run()

        print("[bold green]Inspection completed successfully![/bold green] :tada:")
        print(f"Inspection took {inspector.execution_time_taken} seconds")
        # Disable for now
        # if html_report:
        #     print(f"Generating HTML report at {report_path}")
        #     _build_html_report(inspect_output=output, report_path=report_path)
        #     print(f"HTML report generated at {report_path}")
        # else:
        print(_build_metric_cli_table(inspect_output=output))
        # SystemExit is not an ``Exception`` subclass, so the handler below
        # does not intercept this successful exit.
        sys.exit(0)

    except Exception as e:
        # Escape the message: it is arbitrary text and may contain square
        # brackets that Rich would otherwise try to parse as markup tags.
        print(f"[bold red]Failed to run datachecks inspection: {escape(str(e))} [/bold red]")
        sys.exit(1)
106
+
107
+
108
def _build_metric_cli_table(*, inspect_output: InspectOutput):
    """Build the rich table listing every validation of an inspection run.

    One row is added per entry of ``inspect_output.validations``. The
    "Is Valid" cell shows "-" (unstyled) when ``is_valid`` is None, "Passed"
    in green when it is truthy and "Failed" in red otherwise. A missing
    reason is rendered as "-".
    """
    table = Table(
        title="List of Validations",
        show_header=True,
        header_style="bold blue",
    )

    # (header, keyword arguments) for each column, in display order.
    column_specs = (
        ("Validation Name", {"style": "cyan", "no_wrap": True}),
        ("Data Source", {"style": "magenta"}),
        ("Validation Type", {"style": "magenta"}),
        ("Value", {"justify": "right", "style": "green"}),
        ("Is Valid", {"justify": "right"}),
        ("Reason", {"justify": "right"}),
    )
    for header, column_options in column_specs:
        table.add_column(header, **column_options)

    for _key, info in inspect_output.validations.items():
        if info.is_valid is None:
            status_label, status_style = "-", ""
        elif not info.is_valid:
            status_label, status_style = "Failed", "red"
        else:
            status_label, status_style = "Passed", "green"

        reason_cell = info.reason if info.reason is not None else "-"

        table.add_row(
            info.name,
            info.data_source_name,
            info.validation_function,
            str(info.value),
            Text(status_label, style=status_style),
            reason_cell,
        )

    # Rendering of per-data-source metrics is currently disabled; only
    # validations are listed.
    return table
156
+
157
+
158
def _build_html_report(*, inspect_output: InspectOutput, report_path: str):
    """Render the inspection result as an HTML dashboard written to ``report_path``.

    The dashboard id is "dcs_dashboard_" followed by a random UUID4 with its
    dashes removed. The target file is opened for writing as UTF-8.
    """
    unique_suffix = str(uuid.uuid4()).replace("-", "")
    dashboard_info = DashboardInfoBuilder(inspect_output).build()
    template_params = TemplateParams(
        dashboard_id="dcs_dashboard_" + unique_suffix,
        dashboard_info=dashboard_info,
    )

    # The template is rendered after the file has been opened, as before.
    with open(report_path, "w", encoding="utf-8") as report_file:
        report_file.write(html_template(template_params))
dcs_core/core/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dcs_core.core.configuration.configuration_parser import (
16
+ Configuration,
17
+ load_configuration,
18
+ )
19
+ from dcs_core.core.inspect import Inspect
dcs_core/core/common/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
dcs_core/core/common/errors.py ADDED
@@ -0,0 +1,50 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ ERROR_RUNTIME = "runtime_error"
16
+ ERROR_CONFIGURATION = "configuration_error"
17
+ ERROR_DATA_SOURCES_CONNECTION = "data_sources_connection_error"
18
+ ERROR_METRIC_GENERATION = "metric_generation_error"
19
+
20
+
21
class DataChecksRuntimeError(Exception):
    """Raised when there is an error at runtime.

    Every instance carries ``error_code`` set to ``ERROR_RUNTIME``.
    """

    def __init__(self, message):
        super().__init__(message)
        self.error_code = ERROR_RUNTIME
27
+
28
+
29
class DataChecksConfigurationError(Exception):
    """Signals a problem with the configuration file.

    Every instance carries ``error_code`` set to ``ERROR_CONFIGURATION``.
    """

    def __init__(self, message):
        self.error_code = ERROR_CONFIGURATION
        super().__init__(message)
35
+
36
+
37
class DataChecksDataSourcesConnectionError(Exception):
    """Signals a problem with the data sources.

    Every instance carries ``error_code`` set to
    ``ERROR_DATA_SOURCES_CONNECTION``.
    """

    def __init__(self, message):
        self.error_code = ERROR_DATA_SOURCES_CONNECTION
        super().__init__(message)
43
+
44
+
45
class DataChecksMetricGenerationError(Exception):
    """Signals a failure in the metric generation process.

    Every instance carries ``error_code`` set to ``ERROR_METRIC_GENERATION``.
    """

    def __init__(self, message):
        self.error_code = ERROR_METRIC_GENERATION
        super().__init__(message)
dcs_core/core/common/models/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
dcs_core/core/common/models/configuration.py ADDED
@@ -0,0 +1,284 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import re
15
+ from dataclasses import dataclass, field
16
+ from enum import Enum
17
+ from typing import Any, Dict, List, Optional, Union
18
+
19
+ from markdown_it.rules_block import reference
20
+
21
+ from dcs_core.core.common.models.data_source_resource import Field, Index, Table
22
+ from dcs_core.core.common.models.metric import MetricsType
23
+ from dcs_core.core.common.models.validation import (
24
+ Threshold,
25
+ Validation,
26
+ ValidationFunction,
27
+ ValidationFunctionType,
28
+ )
29
+
30
+
31
class DataSourceType(str, Enum):
    """Identifiers of the data source kinds a configuration may declare.

    Members are plain strings (the enum mixes in ``str``), so they compare
    equal to their lower-case value.
    """

    # Search engines
    OPENSEARCH = "opensearch"
    ELASTICSEARCH = "elasticsearch"

    # Databases and warehouses
    POSTGRES = "postgres"
    MYSQL = "mysql"
    MSSQL = "mssql"
    BIGQUERY = "bigquery"
    # TEMPORARILY INACTIVE
    # REDSHIFT = "redshift"
    SNOWFLAKE = "snowflake"
    DATABRICKS = "databricks"
    SPARK_DF = "spark_df"
    ORACLE = "oracle"
    DB2 = "db2"
    SYBASE = "sybase"
46
+
47
+
48
class DataSourceLanguageSupport(str, Enum):
    """Query language identifiers that a data source can be tagged with."""

    SQL = "sql"
    DSL_ES = "dsl_es"
51
+
52
+
53
@dataclass
class DataSourceConnectionConfiguration:
    """
    Connection settings of a single data source.

    Every field is optional and defaults to None; the section comments below
    name the backend each group of settings is specific to.
    """

    # --- Settings with no backend named in the source ---
    host: Optional[str] = None
    port: Optional[int] = None
    database: Optional[str] = None
    username: Optional[str] = None
    password: Optional[str] = None
    schema: Optional[str] = None

    # --- BigQuery specific configuration ---
    project: Optional[str] = None
    dataset: Optional[str] = None
    credentials_base64: Optional[str] = None
    keyfile: Optional[str] = None

    # --- Databricks specific configuration ---
    token: Optional[str] = None
    catalog: Optional[str] = None
    http_path: Optional[str] = None

    # --- Snowflake specific configuration ---
    account: Optional[str] = None
    warehouse: Optional[str] = None
    role: Optional[str] = None

    # --- SQL Server specific configuration ---
    driver: Optional[str] = None

    # --- Spark specific configuration ---
    spark_session: Optional[Any] = None

    # --- Oracle specific configuration ---
    service_name: Optional[str] = None

    # --- IBM DB2 specific configuration ---
    security: Optional[str] = None
    protocol: Optional[str] = None

    # Backend for this setting is not stated in the source.
    server: Optional[str] = None
88
+
89
+
90
@dataclass
class DataSourceConfiguration:
    """
    Description of one configured data source.
    """

    # Name identifying the data source.
    name: str
    # Kind of backend.
    type: DataSourceType
    # Connection settings for that backend.
    connection_config: DataSourceConnectionConfiguration
    # Optional query language tag; None when not specified.
    language_support: Optional[DataSourceLanguageSupport] = None
100
+
101
+
102
@dataclass
class ValidationConfig:
    """Configuration of a single validation.

    The ``on`` expression is parsed on construction. It is either the name of
    a dataset-level function (for example the value of
    ``ValidationFunction.COUNT_ROWS``) or ``<function>(<field>)`` for a
    field-level function. Either form may be prefixed with ``delta`` and
    whitespace, in which case the resolved function is ``delta_<function>``.

    The optional ``ref`` is ``<datasource_name>.<dataset_name>`` optionally
    followed by ``.<field_name>``.

    Raises:
        ValueError: if ``on`` is missing or malformed, names an unknown
            function, passes a field to a dataset-level function, or if
            ``ref`` does not have two or three dot-separated parts.
    """

    name: str
    on: str
    threshold: Optional[Threshold] = None
    where: Optional[str] = None
    query: Optional[str] = None
    regex: Optional[str] = None
    values: Optional[List] = None
    ref: Optional[str] = None

    def _ref_field_validation(self):
        """Split ``ref`` into data source, dataset and optional field name."""
        if self.ref is not None:
            reference_resources = self.ref.strip().split(".")
            if len(reference_resources) < 2 or len(reference_resources) > 3:
                raise ValueError("ref field should be in the format of <datasource_name>.<dataset_name>.<field_name>")
            self._ref_data_source_name = reference_resources[0]
            self._ref_dataset_name = reference_resources[1]
            self._ref_field_name = None

            if len(reference_resources) == 3:
                self._ref_field_name = reference_resources[2]

    def _on_field_validation(self):
        """Parse ``on`` into function type, function and optional field name."""
        if self.on is None:
            raise ValueError("on field is required")
        dataset_validation_functions = [
            ValidationFunction.FAILED_ROWS,
            ValidationFunction.COUNT_ROWS,
            ValidationFunction.COUNT_DOCUMENTS,
            ValidationFunction.CUSTOM_SQL,
            ValidationFunction.DELTA_COUNT_ROWS,
        ]

        on_value = self.on.strip()
        if on_value.startswith("delta"):
            delta_match = re.search(r"^delta\s+(.+)", on_value)
            if delta_match is None:
                # "delta" with nothing after it, or not followed by
                # whitespace (e.g. "delta_count_rows"): previously this
                # crashed with AttributeError on ``None.group``.
                raise ValueError(f"on field must be in the format of 'delta <function>', was {on_value}")
            self._is_delta_validation = True
            on_statement = delta_match.group(1)
        else:
            self._is_delta_validation = False
            on_statement = on_value

        if on_statement not in dataset_validation_functions:
            self._validation_function_type = ValidationFunctionType.FIELD
            # Match once and reuse both groups: function name and field name.
            function_match = re.match(r"^(\w+)\(([ \w-]+)\)$", on_statement)
            if not function_match:
                raise ValueError(f"on field must be a valid function, was {on_statement}")
            column_validation_function = function_match.group(1)

            if column_validation_function not in [v for v in ValidationFunction]:
                raise ValueError(f"{column_validation_function} is not a valid validation function")

            if column_validation_function in dataset_validation_functions:
                raise ValueError(f"{column_validation_function} is a table function, should not have column name")

            self._validation_function = ValidationFunction(
                column_validation_function
                if not self._is_delta_validation
                else f"delta_{column_validation_function}"
            )
            self._validation_field_name = function_match.group(2)
        else:
            self._validation_function_type = ValidationFunctionType.DATASET
            self._validation_function = ValidationFunction(
                on_statement if not self._is_delta_validation else f"delta_{on_statement}"
            )
            self._validation_field_name = None

    def __post_init__(self):
        self._on_field_validation()
        self._ref_field_validation()

    @property
    def get_validation_function(self) -> ValidationFunction:
        """The resolved validation function (with ``delta_`` prefix if delta)."""
        return ValidationFunction(self._validation_function)

    @property
    def get_is_delta_validation(self):
        """True when ``on`` carried the ``delta`` prefix."""
        return self._is_delta_validation

    @property
    def get_ref_data_source_name(self):
        """First part of ``ref``, or None when ``ref`` is not set."""
        return self._ref_data_source_name if self.ref is not None else None

    @property
    def get_ref_dataset_name(self):
        """Second part of ``ref``, or None when ``ref`` is not set."""
        return self._ref_dataset_name if self.ref is not None else None

    @property
    def get_ref_field_name(self):
        """Third part of ``ref`` if present, otherwise None."""
        return self._ref_field_name if self.ref is not None else None

    @property
    def get_validation_function_type(self) -> ValidationFunctionType:
        """Whether the validation targets a whole dataset or a single field."""
        return self._validation_function_type

    @property
    def get_validation_field_name(self) -> Optional[str]:
        """Field named inside ``on``; None for dataset-level validations."""
        return self._validation_field_name if self._validation_field_name else None
200
+
201
+
202
@dataclass
class ValidationConfigByDataset:
    """
    Group of validation configurations for one dataset of one data source.
    """

    # Data source the group belongs to.
    data_source: str
    # Dataset the group belongs to.
    dataset: str
    # Validation configurations in the group, keyed by string.
    validations: Dict[str, ValidationConfig]
211
+
212
+
213
@dataclass
class MetricsFilterConfiguration:
    """
    Filter settings attached to a metric.
    """

    # Optional filter expression; None when no filter is configured.
    where: Optional[str] = None
220
+
221
+
222
@dataclass
class MetricConfiguration:
    """
    Settings describing one metric.

    At least one of ``expression`` and ``resource`` has to be supplied;
    construction raises ValueError otherwise.
    """

    name: str
    metric_type: MetricsType
    expression: Optional[str] = None
    query: Optional[str] = None
    resource: Optional[Union[Table, Index, Field]] = None
    validation: Optional[Validation] = None
    filters: Optional[MetricsFilterConfiguration] = None

    def __post_init__(self):
        has_expression = self.expression is not None
        has_resource = self.resource is not None
        if not (has_expression or has_resource):
            raise ValueError("Either expression or resource should be provided for a metric")
239
+
240
+
241
class MetricStorageType(str, Enum):
    """
    Kinds of storage a metric storage configuration can select.
    """

    LOCAL_FILE = "local_file"
247
+
248
+
249
@dataclass
class LocalFileStorageParameters:
    """
    Parameters of the local-file metric storage.
    """

    # Path used by the local-file storage.
    path: str
256
+
257
+
258
@dataclass
class MetricStorageConfiguration:
    """
    Selects a metric storage kind together with its parameters.
    """

    # Which storage kind to use.
    type: MetricStorageType
    # Parameters for that storage; a single-member Union is the member itself.
    params: LocalFileStorageParameters
266
+
267
+
268
@dataclass
class Configuration:
    """
    Configuration for the data checks

    ``data_sources`` and ``validations`` default to empty dicts; ``metrics``
    and ``storage`` default to None.
    """

    data_sources: Optional[Dict[str, DataSourceConfiguration]] = field(default_factory=dict)
    validations: Optional[Dict[str, ValidationConfigByDataset]] = field(default_factory=dict)
    metrics: Optional[Dict[str, MetricConfiguration]] = None
    storage: Optional[MetricStorageConfiguration] = None

    def add_spark_session(self, data_source_name: str, spark_session):
        """Register ``spark_session`` as a ``SPARK_DF`` data source.

        Any existing entry stored under ``data_source_name`` is replaced.
        """
        # ``data_sources`` is declared Optional, so it may have been passed
        # as None explicitly; start from an empty mapping in that case
        # instead of failing on the item assignment below.
        if self.data_sources is None:
            self.data_sources = {}

        self.data_sources[data_source_name] = DataSourceConfiguration(
            name=data_source_name,
            type=DataSourceType.SPARK_DF,
            connection_config=DataSourceConnectionConfiguration(spark_session=spark_session),
        )
dcs_core/core/common/models/dashboard.py ADDED
@@ -0,0 +1,24 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass
16
+ from typing import List
17
+
18
+ from dcs_core.core.common.models.widget import BaseWidgetInfo
19
+
20
+
21
@dataclass
class DashboardInfo:
    """Name of a dashboard together with the widgets it holds."""

    # Dashboard name.
    name: str
    # Widgets belonging to the dashboard.
    widgets: List[BaseWidgetInfo]