dataops-testgen 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270)
  1. dataops_testgen-2.2.0.dist-info/LICENSE +203 -0
  2. dataops_testgen-2.2.0.dist-info/METADATA +287 -0
  3. dataops_testgen-2.2.0.dist-info/NOTICE +5 -0
  4. dataops_testgen-2.2.0.dist-info/RECORD +270 -0
  5. dataops_testgen-2.2.0.dist-info/WHEEL +5 -0
  6. dataops_testgen-2.2.0.dist-info/entry_points.txt +2 -0
  7. dataops_testgen-2.2.0.dist-info/top_level.txt +1 -0
  8. testgen/__init__.py +0 -0
  9. testgen/__main__.py +770 -0
  10. testgen/commands/__init__.py +0 -0
  11. testgen/commands/queries/__init__.py +0 -0
  12. testgen/commands/queries/execute_cat_tests_query.py +95 -0
  13. testgen/commands/queries/execute_tests_query.py +160 -0
  14. testgen/commands/queries/generate_tests_query.py +94 -0
  15. testgen/commands/queries/profiling_query.py +366 -0
  16. testgen/commands/queries/test_parameter_validation_query.py +88 -0
  17. testgen/commands/run_execute_cat_tests.py +162 -0
  18. testgen/commands/run_execute_tests.py +168 -0
  19. testgen/commands/run_generate_tests.py +107 -0
  20. testgen/commands/run_get_entities.py +122 -0
  21. testgen/commands/run_launch_db_config.py +84 -0
  22. testgen/commands/run_observability_exporter.py +330 -0
  23. testgen/commands/run_profiling_bridge.py +495 -0
  24. testgen/commands/run_quick_start.py +168 -0
  25. testgen/commands/run_setup_profiling_tools.py +96 -0
  26. testgen/commands/run_test_definition.py +146 -0
  27. testgen/commands/run_test_parameter_validation.py +135 -0
  28. testgen/commands/run_upgrade_db_config.py +156 -0
  29. testgen/common/__init__.py +8 -0
  30. testgen/common/clean_sql.py +53 -0
  31. testgen/common/credentials.py +25 -0
  32. testgen/common/database/__init__.py +0 -0
  33. testgen/common/database/database_service.py +629 -0
  34. testgen/common/database/flavor/__init__.py +0 -0
  35. testgen/common/database/flavor/flavor_service.py +75 -0
  36. testgen/common/database/flavor/mssql_flavor_service.py +34 -0
  37. testgen/common/database/flavor/postgresql_flavor_service.py +5 -0
  38. testgen/common/database/flavor/redshift_flavor_service.py +22 -0
  39. testgen/common/database/flavor/snowflake_flavor_service.py +69 -0
  40. testgen/common/database/flavor/trino_flavor_service.py +21 -0
  41. testgen/common/date_service.py +68 -0
  42. testgen/common/display_service.py +85 -0
  43. testgen/common/docker_service.py +76 -0
  44. testgen/common/encrypt.py +55 -0
  45. testgen/common/get_pipeline_parms.py +57 -0
  46. testgen/common/logs.py +79 -0
  47. testgen/common/process_service.py +62 -0
  48. testgen/common/read_file.py +69 -0
  49. testgen/settings.py +440 -0
  50. testgen/template/dbsetup/010_create_base_schema.sql +2 -0
  51. testgen/template/dbsetup/020_create_standard_functions_sprocs.sql +179 -0
  52. testgen/template/dbsetup/030_initialize_new_schema_structure.sql +735 -0
  53. testgen/template/dbsetup/040_populate_new_schema_project.sql +59 -0
  54. testgen/template/dbsetup/050_populate_new_schema_metadata.sql +1517 -0
  55. testgen/template/dbsetup/060_create_standard_views.sql +248 -0
  56. testgen/template/dbsetup/070_create_default_users.sql +17 -0
  57. testgen/template/dbsetup/075_grant_role_rights.sql +43 -0
  58. testgen/template/dbsetup/080_set_current_revision.sql +5 -0
  59. testgen/template/dbupgrade/0100_incremental_upgrade.sql +5 -0
  60. testgen/template/dbupgrade/0101_incremental_upgrade.sql +15 -0
  61. testgen/template/dbupgrade/0102_incremental_upgrade.sql +4 -0
  62. testgen/template/dbupgrade/0103_incremental_upgrade.sql +22 -0
  63. testgen/template/dbupgrade/0104_incremental_upgrade.sql +44 -0
  64. testgen/template/dbupgrade/0105_incremental_upgrade.sql +1 -0
  65. testgen/template/dbupgrade/0106_incremental_upgrade.sql +5 -0
  66. testgen/template/dbupgrade/0107_incremental_upgrade.sql +3 -0
  67. testgen/template/dbupgrade_helpers/get_tg_revision.sql +2 -0
  68. testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql +116 -0
  69. testgen/template/exec_cat_tests/ex_cat_get_distinct_tables.sql +11 -0
  70. testgen/template/exec_cat_tests/ex_cat_results_parse.sql +69 -0
  71. testgen/template/exec_cat_tests/ex_cat_retrieve_agg_test_parms.sql +6 -0
  72. testgen/template/exec_cat_tests/ex_cat_test_query.sql +8 -0
  73. testgen/template/execution/ex_finalize_test_run_results.sql +37 -0
  74. testgen/template/execution/ex_get_tests_non_cat.sql +47 -0
  75. testgen/template/execution/ex_update_test_record_in_testrun_table.sql +27 -0
  76. testgen/template/execution/ex_write_test_record_to_testrun_table.sql +6 -0
  77. testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_no_drops_generic.sql +48 -0
  78. testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_num_incr_generic.sql +34 -0
  79. testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_above_generic.sql +49 -0
  80. testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_within_generic.sql +49 -0
  81. testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql +49 -0
  82. testgen/template/flavors/generic/exec_query_tests/ex_custom_query_generic.sql +39 -0
  83. testgen/template/flavors/generic/exec_query_tests/ex_data_match_2way_generic.sql +58 -0
  84. testgen/template/flavors/generic/exec_query_tests/ex_data_match_generic.sql +44 -0
  85. testgen/template/flavors/generic/exec_query_tests/ex_prior_match_generic.sql +37 -0
  86. testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql +53 -0
  87. testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql +46 -0
  88. testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql +59 -0
  89. testgen/template/flavors/generic/profiling/contingency_counts.sql +3 -0
  90. testgen/template/flavors/generic/validate_tests/ex_get_project_column_list_generic.sql +3 -0
  91. testgen/template/flavors/mssql/exec_query_tests/ex_relative_entropy_mssql.sql +53 -0
  92. testgen/template/flavors/mssql/profiling/project_ddf_query_mssql.sql +35 -0
  93. testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml +246 -0
  94. testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql +36 -0
  95. testgen/template/flavors/mssql/setup_profiling_tools/00_drop_existing_functions_mssql.sql +8 -0
  96. testgen/template/flavors/mssql/setup_profiling_tools/01_create_functions_mssql.sql +12 -0
  97. testgen/template/flavors/mssql/setup_profiling_tools/02_create_functions_mssql.sql +54 -0
  98. testgen/template/flavors/mssql/setup_profiling_tools/create_qc_schema_mssql.sql +4 -0
  99. testgen/template/flavors/mssql/setup_profiling_tools/grant_execute_privileges_mssql.sql +1 -0
  100. testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_no_drops_postgresql.sql +46 -0
  101. testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql +59 -0
  102. testgen/template/flavors/postgresql/profiling/project_ddf_query_postgresql.sql +42 -0
  103. testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml +225 -0
  104. testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql +28 -0
  105. testgen/template/flavors/postgresql/setup_profiling_tools/create_functions_postgresql.sql +157 -0
  106. testgen/template/flavors/postgresql/setup_profiling_tools/create_qc_schema_postgresql.sql +1 -0
  107. testgen/template/flavors/postgresql/setup_profiling_tools/grant_execute_privileges_postgresql.sql +2 -0
  108. testgen/template/flavors/redshift/profiling/project_ddf_query_redshift.sql +38 -0
  109. testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml +221 -0
  110. testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql +29 -0
  111. testgen/template/flavors/redshift/setup_profiling_tools/create_functions_redshift.sql +115 -0
  112. testgen/template/flavors/redshift/setup_profiling_tools/create_qc_schema_redshift.sql +1 -0
  113. testgen/template/flavors/redshift/setup_profiling_tools/grant_execute_privileges_redshift.sql +2 -0
  114. testgen/template/flavors/snowflake/profiling/project_ddf_query_snowflake.sql +38 -0
  115. testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml +220 -0
  116. testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql +29 -0
  117. testgen/template/flavors/snowflake/setup_profiling_tools/create_functions_snowflake.sql +69 -0
  118. testgen/template/flavors/snowflake/setup_profiling_tools/create_qc_schema_snowflake.sql +1 -0
  119. testgen/template/flavors/snowflake/setup_profiling_tools/grant_execute_privileges_snowflake.sql +6 -0
  120. testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml +219 -0
  121. testgen/template/flavors/trino/setup_profiling_tools/create_functions_trino.sql +92 -0
  122. testgen/template/flavors/trino/setup_profiling_tools/create_qc_schema_trino.sql +1 -0
  123. testgen/template/gen_funny_cat_tests/gen_test_constant.sql +104 -0
  124. testgen/template/gen_funny_cat_tests/gen_test_distinct_value_ct.sql +98 -0
  125. testgen/template/gen_funny_cat_tests/gen_test_row_ct.sql +57 -0
  126. testgen/template/gen_funny_cat_tests/gen_test_row_ct_pct.sql +59 -0
  127. testgen/template/generation/gen_delete_old_tests.sql +5 -0
  128. testgen/template/generation/gen_insert_test_suite.sql +5 -0
  129. testgen/template/generation/gen_retrieve_or_insert_test_suite.sql +58 -0
  130. testgen/template/generation/gen_standard_test_type_list.sql +13 -0
  131. testgen/template/generation/gen_standard_tests.sql +48 -0
  132. testgen/template/get_entities/get_connection.sql +21 -0
  133. testgen/template/get_entities/get_connections_list.sql +9 -0
  134. testgen/template/get_entities/get_latest.sql +4 -0
  135. testgen/template/get_entities/get_profile.sql +12 -0
  136. testgen/template/get_entities/get_profile_info.sql +17 -0
  137. testgen/template/get_entities/get_profile_list.sql +17 -0
  138. testgen/template/get_entities/get_profile_screen.sql +275 -0
  139. testgen/template/get_entities/get_project_list.sql +6 -0
  140. testgen/template/get_entities/get_table_group_list.sql +10 -0
  141. testgen/template/get_entities/get_test_generation_list.sql +18 -0
  142. testgen/template/get_entities/get_test_info.sql +41 -0
  143. testgen/template/get_entities/get_test_results_for_run_cli.sql +16 -0
  144. testgen/template/get_entities/get_test_run_list.sql +24 -0
  145. testgen/template/get_entities/get_test_suite.sql +13 -0
  146. testgen/template/get_entities/get_test_suite_list.sql +18 -0
  147. testgen/template/get_entities/list_test_types.sql +4 -0
  148. testgen/template/observability/get_event_data.sql +23 -0
  149. testgen/template/observability/get_test_results.sql +41 -0
  150. testgen/template/observability/update_test_results_exported_to_observability.sql +12 -0
  151. testgen/template/parms/parms_profiling.sql +34 -0
  152. testgen/template/parms/parms_test_execution.sql +13 -0
  153. testgen/template/parms/parms_test_gen.sql +23 -0
  154. testgen/template/profiling/contingency_columns.sql +7 -0
  155. testgen/template/profiling/datatype_suggestions.sql +56 -0
  156. testgen/template/profiling/functional_datatype.sql +523 -0
  157. testgen/template/profiling/functional_tabletype_stage.sql +48 -0
  158. testgen/template/profiling/functional_tabletype_update.sql +8 -0
  159. testgen/template/profiling/pii_flag.sql +133 -0
  160. testgen/template/profiling/profile_anomalies_screen_column.sql +22 -0
  161. testgen/template/profiling/profile_anomalies_screen_multi_column.sql +58 -0
  162. testgen/template/profiling/profile_anomalies_screen_table.sql +22 -0
  163. testgen/template/profiling/profile_anomalies_screen_table_dates.sql +30 -0
  164. testgen/template/profiling/profile_anomalies_screen_variants.sql +40 -0
  165. testgen/template/profiling/profile_anomaly_types_get.sql +3 -0
  166. testgen/template/profiling/project_get_table_sample_count.sql +22 -0
  167. testgen/template/profiling/project_profile_run_record_insert.sql +8 -0
  168. testgen/template/profiling/project_profile_run_record_update.sql +5 -0
  169. testgen/template/profiling/project_profile_run_record_update_status.sql +5 -0
  170. testgen/template/profiling/project_update_profile_results_to_estimates.sql +32 -0
  171. testgen/template/profiling/refresh_anomalies.sql +33 -0
  172. testgen/template/profiling/refresh_data_chars_from_profiling.sql +156 -0
  173. testgen/template/profiling/secondary_profiling_columns.sql +12 -0
  174. testgen/template/profiling/secondary_profiling_delete.sql +4 -0
  175. testgen/template/profiling/secondary_profiling_update.sql +18 -0
  176. testgen/template/quick_start/populate_target_data.sql +1077 -0
  177. testgen/template/quick_start/recreate_target_data_schema.sql +167 -0
  178. testgen/template/quick_start/update_target_data.sql +100 -0
  179. testgen/template/updates/create_tmp_test_definition.sql +19 -0
  180. testgen/template/updates/get_test_def_parms.sql +38 -0
  181. testgen/template/updates/populate_stg_test_definitions.sql +184 -0
  182. testgen/template/validate_tests/ex_disable_tests_test_definitions.sql +5 -0
  183. testgen/template/validate_tests/ex_flag_tests_test_definitions.sql +64 -0
  184. testgen/template/validate_tests/ex_get_project_column_list_generic.sql +3 -0
  185. testgen/template/validate_tests/ex_get_test_column_list_tg.sql +65 -0
  186. testgen/template/validate_tests/ex_write_test_val_errors.sql +22 -0
  187. testgen/ui/__init__.py +0 -0
  188. testgen/ui/app.py +98 -0
  189. testgen/ui/assets/dk_logo.svg +46 -0
  190. testgen/ui/assets/question_mark.png +0 -0
  191. testgen/ui/assets/scripts.js +68 -0
  192. testgen/ui/assets/style.css +140 -0
  193. testgen/ui/bootstrap.py +109 -0
  194. testgen/ui/components/__init__.py +0 -0
  195. testgen/ui/components/frontend/css/KFOlCnqEu92Fr1MmEU9fBBc4.woff2 +0 -0
  196. testgen/ui/components/frontend/css/KFOlCnqEu92Fr1MmEU9fChc4EsA.woff2 +0 -0
  197. testgen/ui/components/frontend/css/KFOmCnqEu92Fr1Mu4mxK.woff2 +0 -0
  198. testgen/ui/components/frontend/css/KFOmCnqEu92Fr1Mu7GxKOzY.woff2 +0 -0
  199. testgen/ui/components/frontend/css/material-symbols-rounded.css +24 -0
  200. testgen/ui/components/frontend/css/material-symbols-rounded.woff2 +0 -0
  201. testgen/ui/components/frontend/css/roboto-font-faces.css +35 -0
  202. testgen/ui/components/frontend/css/shared.css +36 -0
  203. testgen/ui/components/frontend/img/dk_logo.svg +46 -0
  204. testgen/ui/components/frontend/index.html +17 -0
  205. testgen/ui/components/frontend/js/components/breadcrumbs.js +86 -0
  206. testgen/ui/components/frontend/js/components/button.js +66 -0
  207. testgen/ui/components/frontend/js/components/location.js +62 -0
  208. testgen/ui/components/frontend/js/components/select.js +75 -0
  209. testgen/ui/components/frontend/js/components/sidebar.js +358 -0
  210. testgen/ui/components/frontend/js/main.js +99 -0
  211. testgen/ui/components/frontend/js/streamlit.js +19 -0
  212. testgen/ui/components/frontend/js/van.min.js +1 -0
  213. testgen/ui/components/utils/__init__.py +0 -0
  214. testgen/ui/components/utils/callbacks.py +51 -0
  215. testgen/ui/components/utils/component.py +13 -0
  216. testgen/ui/components/widgets/__init__.py +6 -0
  217. testgen/ui/components/widgets/breadcrumbs.py +32 -0
  218. testgen/ui/components/widgets/location.py +65 -0
  219. testgen/ui/components/widgets/modal.py +97 -0
  220. testgen/ui/components/widgets/sidebar.py +69 -0
  221. testgen/ui/navigation/__init__.py +0 -0
  222. testgen/ui/navigation/menu.py +42 -0
  223. testgen/ui/navigation/page.py +20 -0
  224. testgen/ui/navigation/router.py +63 -0
  225. testgen/ui/queries/__init__.py +0 -0
  226. testgen/ui/queries/authentication_queries.py +47 -0
  227. testgen/ui/queries/connection_queries.py +121 -0
  228. testgen/ui/queries/profiling_queries.py +148 -0
  229. testgen/ui/queries/project_queries.py +9 -0
  230. testgen/ui/queries/table_group_queries.py +186 -0
  231. testgen/ui/queries/test_definition_queries.py +270 -0
  232. testgen/ui/queries/test_run_queries.py +32 -0
  233. testgen/ui/queries/test_suite_queries.py +145 -0
  234. testgen/ui/scripts/__init__.py +0 -0
  235. testgen/ui/scripts/patch_streamlit.py +111 -0
  236. testgen/ui/services/__init__.py +0 -0
  237. testgen/ui/services/authentication_service.py +119 -0
  238. testgen/ui/services/connection_service.py +220 -0
  239. testgen/ui/services/database_service.py +282 -0
  240. testgen/ui/services/form_service.py +1008 -0
  241. testgen/ui/services/javascript_service.py +44 -0
  242. testgen/ui/services/query_service.py +316 -0
  243. testgen/ui/services/string_service.py +12 -0
  244. testgen/ui/services/table_group_service.py +130 -0
  245. testgen/ui/services/test_definition_service.py +117 -0
  246. testgen/ui/services/test_run_service.py +13 -0
  247. testgen/ui/services/test_suite_service.py +76 -0
  248. testgen/ui/services/toolbar_service.py +77 -0
  249. testgen/ui/session.py +46 -0
  250. testgen/ui/views/__init__.py +0 -0
  251. testgen/ui/views/app_log_modal.py +92 -0
  252. testgen/ui/views/connections.py +72 -0
  253. testgen/ui/views/connections_base.py +367 -0
  254. testgen/ui/views/login.py +40 -0
  255. testgen/ui/views/not_found.py +16 -0
  256. testgen/ui/views/overview.py +34 -0
  257. testgen/ui/views/profiling_anomalies.py +501 -0
  258. testgen/ui/views/profiling_details.py +335 -0
  259. testgen/ui/views/profiling_modal.py +40 -0
  260. testgen/ui/views/profiling_results.py +206 -0
  261. testgen/ui/views/profiling_summary.py +177 -0
  262. testgen/ui/views/project_settings.py +74 -0
  263. testgen/ui/views/table_groups.py +530 -0
  264. testgen/ui/views/test_definitions.py +1020 -0
  265. testgen/ui/views/test_results.py +908 -0
  266. testgen/ui/views/test_runs.py +195 -0
  267. testgen/ui/views/test_suites.py +545 -0
  268. testgen/utils/__init__.py +0 -0
  269. testgen/utils/plugins.py +17 -0
  270. testgen/utils/singleton.py +14 -0
@@ -0,0 +1,221 @@
1
+ ---
2
+ # Opening keyword of the generated profiling statement. The sampling and
+ # non-sampling variants hold the same text in this file.
+ # NOTE(review): presumably the numbered templates are concatenated in order by the query builder -- confirm.
+ strTemplate01_sampling: "SELECT "
3
+ strTemplate01_else: "SELECT "
4
+ # Identifying literals for the profiled column (connection, project, table group,
+ # schema, run date, table, position, name, types) followed by the basic counts:
+ # all rows, non-NULL values, distinct values, and NULLs (NVL2 yields 0 for a
+ # non-NULL value and 1 for NULL, so the SUM is the NULL count).
+ strTemplate02_all: |
5
+ {CONNECTION_ID} as connection_id,
6
+ '{PROJECT_CODE}' as project_code,
7
+ '{TABLE_GROUPS_ID}' as table_groups_id,
8
+ '{DATA_SCHEMA}' AS schema_name,
9
+ '{RUN_DATE}' AS run_date,
10
+ '{DATA_TABLE}' AS table_name,
11
+ {COL_POS} AS position,
12
+ '{COL_NAME_SANITIZED}' AS column_name,
13
+ '{COL_TYPE}' AS column_type,
14
+ '{COL_GEN_TYPE}' AS general_type,
15
+ COUNT(*) AS record_ct,
16
+ COUNT("{COL_NAME}") AS value_ct,
17
+ COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct,
18
+ SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct,
19
+ # Length statistics. avg_length maps zero-length values to NULL (NULLIF ... 0),
+ # so empty strings are left out of the average.
+ # NOTE(review): the _ADN suffix presumably means alpha/date/numeric columns -- confirm against the query builder.
+ strTemplate03_ADN: MIN(LEN("{COL_NAME}")) AS min_length,
20
+ MAX(LEN("{COL_NAME}")) AS max_length,
21
+ AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS avg_length,
22
+ # Placeholder columns emitted when the length statistics are not computed.
+ strTemplate03_else: NULL as min_length,
23
+ NULL as max_length,
24
+ NULL as avg_length,
25
+ # zero_value_ct for text columns: counts trimmed values that are "0" or "0." followed
+ # only by zeros. The backslash before the period is doubled, like every other
+ # pattern in this file, so that the regex engine receives an escaped literal period;
+ # with a single backslash the period can reach the engine unescaped and match any
+ # character (e.g. "05" would be counted as a zero value).
+ strTemplate04_A: SUM(CASE
26
+ WHEN TRIM("{COL_NAME}") ~ '^0(\\.0*)?$' THEN 1 ELSE 0
27
+ END) AS zero_value_ct,
28
+ # zero_value_ct for numeric columns: SIGN() is 0 only for a zero value, so
+ # 1 - ABS(SIGN(x)) contributes 1 per zero and 0 otherwise.
+ strTemplate04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct,
29
+ # Placeholder column when no zero count is computed.
+ strTemplate04_else: NULL as zero_value_ct,
30
+ # Text-column profile. Produces, in order:
+ #   distinct_std_value_ct - distinct values after upper-casing and removing space, quote, comma, period and hyphen
+ #   zero_length_ct        - values equal to ''
+ #   lead_space_ct         - values that sort between ' !' and '!'
+ #   quoted_value_ct       - values wrapped in double or single quotes
+ #   includes_digit_ct     - values containing at least one digit
+ #   filled_value_ct       - filler values ('.', '?', ' ', repeated characters, and words such as
+ #                           blank/error/missing/tbd/n/a/#na/none/null/unknown, bare, parenthesised or bracketed)
+ #   min_text / max_text   - MIN/MAX of non-empty values, cut to 100 characters
+ #   numeric_ct / date_ct  - sums of the fndk_isnum / fndk_isdate helper results over a left-truncated value
+ #   std_pattern_match     - first label whose share of non-NULL values exceeds its threshold
+ #                           (0.8 for STREET_ADDR, CREDIT_CARD, DELIMITED_DATA; 0.9 for the others)
+ # NOTE(review): the SIMILAR TO pattern for repeated characters contains '^', '.' and '$'. SIMILAR TO
+ # matches the whole string, so confirm how these three characters are interpreted and whether
+ # '.{2,}' is meant to match repeated periods.
+ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct,
31
+ SUM(CASE
32
+ WHEN "{COL_NAME}" = '' THEN 1
33
+ ELSE 0
34
+ END) AS zero_length_ct,
35
+ SUM( CASE
36
+ WHEN "{COL_NAME}" BETWEEN ' !' AND '!' THEN 1
37
+ ELSE 0
38
+ END ) AS lead_space_ct,
39
+ SUM( CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END ) as quoted_value_ct,
40
+ SUM( CASE WHEN "{COL_NAME}" ~ '[0-9]' THEN 1 ELSE 0 END ) as includes_digit_ct,
41
+ SUM( CASE
42
+ WHEN "{COL_NAME}" IN ('.', '?', ' ') THEN 1
43
+ WHEN LOWER("{COL_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1
44
+ WHEN LOWER("{COL_NAME}") IN ('blank','error','missing','tbd',
45
+ 'n/a','#na','none','null','unknown') THEN 1
46
+ WHEN LOWER("{COL_NAME}") IN ('(blank)','(error)','(missing)','(tbd)',
47
+ '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1
48
+ WHEN LOWER("{COL_NAME}") IN ('[blank]','[error]','[missing]','[tbd]',
49
+ '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1
50
+ ELSE 0
51
+ END ) AS filled_value_ct,
52
+ LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text,
53
+ LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text,
54
+ SUM({DATA_QC_SCHEMA}.fndk_isnum(LEFT("{COL_NAME}", 31))) AS numeric_ct,
55
+ SUM({DATA_QC_SCHEMA}.fndk_isdate(LEFT("{COL_NAME}", 26))) AS date_ct,
56
+ CASE
57
+ WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'
58
+ THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'STREET_ADDR'
59
+ WHEN SUM(CASE WHEN "{COL_NAME}" IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA')
60
+ THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'STATE_USA'
61
+ WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^([\\+]1 |1-|)[\\+]?[(]?[0-9]{3}[)][ ]?[-\\s\\.]?[0-9]{3}[-\\s\.]?[0-9]{4,6}$'
62
+ OR "{COL_NAME}" ~ '^([\\+]1 |1-|)[2-9][01][0-9][-| ]?[0-9]{3}[-| ]?[0-9]{4}$'
63
+ THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'PHONE_USA'
64
+ WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$'
65
+ THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'EMAIL'
66
+ WHEN SUM( CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999')
67
+ THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'ZIP_USA'
68
+ WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[\\w\\s\-]+\\.(txt|csv|tsv|dat|doc|pdf|xlsx)$'
69
+ THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'FILE_NAME'
70
+ WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^([0-9]{4}[- ]){3}[0-9]{4}$'
71
+ THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'CREDIT_CARD'
72
+ WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'
73
+ AND "{COL_NAME}" !~ '\\s(and|but|or|yet)\\s'
74
+ THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'DELIMITED_DATA'
75
+ WHEN SUM ( CASE WHEN "{COL_NAME}" ~ '^[0-8][0-9]{2}-[0-9]{2}-[0-9]{4}$'
76
+ AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749'
77
+ AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN'
78
+ END as std_pattern_match,
79
+ # Placeholder columns, in the same order as above, when the text profile is not computed.
+ strTemplate05_else: NULL as distinct_std_value_ct,
80
+ NULL as zero_length_ct,
81
+ NULL as lead_space_ct,
82
+ NULL as quoted_value_ct,
83
+ NULL as includes_digit_ct,
84
+ NULL as filled_value_ct,
85
+ NULL as min_text,
86
+ NULL as max_text,
87
+ NULL as numeric_ct,
88
+ NULL as date_ct,
89
+ NULL as std_pattern_match,
90
+ # Top 5 character-shape patterns: each value is reduced to a shape by replacing lowercase
+ # letters with 'a', uppercase with 'A' and digits with 'N'; shapes are counted, the 5 most
+ # frequent are rendered as "count | pattern", joined with ' | ' and cut to 1000 characters.
+ # Rows are only considered when the column's maximum length over the whole table is between
+ # 3 and {PARM_MAX_PATTERN_LENGTH}. The scalar subqueries read {DATA_SCHEMA}.{DATA_TABLE} directly.
+ strTemplate06_A_patterns: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats
91
+ FROM ( SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(10)) || ' | ' || pattern AS pattern,
92
+ COUNT(*) AS ct
93
+ FROM ( SELECT REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE(
94
+ "{COL_NAME}", '[a-z]', 'a'),
95
+ '[A-Z]', 'A'),
96
+ '[0-9]', 'N') AS pattern
97
+ FROM {DATA_SCHEMA}.{DATA_TABLE}
98
+ WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}"))
99
+ FROM {DATA_SCHEMA}.{DATA_TABLE}) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p
100
+ GROUP BY pattern
101
+ HAVING pattern > ' '
102
+ ORDER BY COUNT(*) DESC) as ps) AS top_patterns,
103
+ # Placeholder column when patterns are not computed.
+ strTemplate06_else: NULL as top_patterns,
104
+ # Top 10 most frequent non-blank values, each rendered as "count | value", joined with
+ # ' | ' in descending count order and cut to 1000 characters.
+ # The inner ORDER BY must sort COUNT(*) in DESCENDING order: TOP 10 keeps the first ten
+ # rows of that ordering, so an ascending sort would keep the ten rarest values instead.
+ # Ties on count are broken by the value itself.
+ strTemplate07_A_freq: ( SELECT LEFT(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) as concat_vals
105
+ FROM (
106
+ SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val,
107
+ COUNT(*) as ct
108
+ FROM {DATA_SCHEMA}.{DATA_TABLE}
109
+ WHERE "{COL_NAME}" > ' '
110
+ GROUP BY "{COL_NAME}"
111
+ HAVING "{COL_NAME}" > ' '
112
+ ORDER BY COUNT(*) DESC, "{COL_NAME}"
113
+ ) ps
114
+ ) AS top_freq_values,
115
+ # Placeholder column when top frequent values are not computed.
+ strTemplate07_else: NULL as top_freq_values,
116
+ # Numeric statistics: min, smallest value above zero, max, average and standard deviation
+ # (both computed on the value cast to FLOAT), plus the quartiles.
+ # pct_25 / pct_50 / pct_75 are not computed here: they are the columns of the "pctile"
+ # derived table defined in strTemplate99_N; MIN() is used to fold them into this aggregate row.
+ strTemplate08_N: MIN("{COL_NAME}") AS min_value,
117
+ MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0,
118
+ MAX("{COL_NAME}") AS max_value,
119
+ AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value,
120
+ STDDEV(CAST("{COL_NAME}" AS FLOAT)) AS stdev_value,
121
+ MIN(pct_25) as percentile_25,
122
+ MIN(pct_50) as percentile_50,
123
+ MIN(pct_75) as percentile_75,
124
+ # Placeholder columns when numeric statistics are not computed.
+ strTemplate08_else: NULL as min_value,
125
+ NULL as min_value_over_0,
126
+ NULL as max_value,
127
+ NULL as avg_value,
128
+ NULL as stdev_value,
129
+ NULL as percentile_25,
130
+ NULL as percentile_50,
131
+ NULL as percentile_75,
132
+ # Sum of the fractional parts of the values (MOD by 1), each rounded to 5 decimal places.
+ strTemplate10_N_dec: SUM(ROUND(MOD("{COL_NAME}", 1), 5)) as fractional_sum,
133
+
134
+ # Placeholder column when the fractional sum is not computed.
+ strTemplate10_else: NULL as fractional_sum,
135
+
136
+ # Date statistics, all measured relative to '{RUN_DATE}':
+ #   min_date (floored at '0001-01-01' via GREATEST) and max_date
+ #   before_1yr / before_5yr / before_20yr - values more than 12 / 60 / 240 months before the run date
+ #   within_1yr / within_1mo               - values 0..365 / 0..30 days before the run date
+ #   future_date_ct                        - values later than the run date
+ #   date_days/weeks/months_present        - number of distinct day / week / month offsets that occur
+ strTemplate11_D: CASE
137
+ WHEN MIN("{COL_NAME}") IS NULL THEN NULL
138
+ ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01')
139
+ END as min_date,
140
+ MAX("{COL_NAME}") as max_date,
141
+ SUM(CASE
142
+ WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 12 THEN 1
143
+ ELSE 0
144
+ END) AS before_1yr_date_ct,
145
+ SUM(CASE
146
+ WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 60 THEN 1
147
+ ELSE 0
148
+ END) AS before_5yr_date_ct,
149
+ SUM(CASE
150
+ WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 240 THEN 1
151
+ ELSE 0
152
+ END) AS before_20yr_date_ct,
153
+ SUM(CASE
154
+ WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1
155
+ ELSE 0
156
+ END) AS within_1yr_date_ct,
157
+ SUM(CASE
158
+ WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 30 THEN 1
159
+ ELSE 0
160
+ END) AS within_1mo_date_ct,
161
+ SUM(CASE
162
+ WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 ELSE 0
163
+ END) AS future_date_ct,
164
+ COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present,
165
+ COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present,
166
+ COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present,
167
+
168
+ # Placeholder columns, in the same order as above, when date statistics are not computed.
+ strTemplate11_else: NULL as min_date,
169
+ NULL as max_date,
170
+ NULL as before_1yr_date_ct,
171
+ NULL as before_5yr_date_ct,
172
+ NULL as before_20yr_date_ct,
173
+ NULL as within_1yr_date_ct,
174
+ NULL as within_1mo_date_ct,
175
+ NULL as future_date_ct,
176
+ NULL as date_days_present,
177
+ NULL as date_weeks_present,
178
+ NULL as date_months_present,
179
+
180
+ # Count of true values: each value is cast to INTEGER and summed.
+ strTemplate12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct,
181
+
182
+ # Placeholder column when the true-count is not computed.
+ strTemplate12_else: NULL as boolean_true_ct,
183
+
184
+ # datatype_suggestion is always emitted as NULL by this query.
+ # NOTE(review): presumably filled in by a later profiling step -- confirm.
+ strTemplate13_ALL: NULL AS datatype_suggestion,
185
+ # distinct_pattern_ct: number of distinct character shapes (lowercase -> 'a', uppercase -> 'A',
+ # digit -> 'N') among non-blank values, read directly from {DATA_SCHEMA}.{DATA_TABLE}.
+ # embedded_space_ct: values whose trimmed text contains at least one space (SIGN of the space count).
+ # avg_embedded_spaces: average number of spaces inside the trimmed text.
+ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE(
186
+ "{COL_NAME}", '[a-z]', 'a'),
187
+ '[A-Z]', 'A'),
188
+ '[0-9]', 'N')
189
+ ) AS pattern_ct
190
+ FROM {DATA_SCHEMA}.{DATA_TABLE}
191
+ WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct,
192
+ SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct,
193
+ AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces,
194
+
195
+ # Same embedded-space measures, but without the distinct pattern count.
+ strTemplate14_A_no_patterns: NULL as distinct_pattern_ct,
196
+ SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct,
197
+ AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces,
198
+
199
+ # Placeholder columns when none of these measures are computed.
+ strTemplate14_else: NULL as distinct_pattern_ct,
200
+ NULL as embedded_space_ct,
201
+ NULL as avg_embedded_spaces,
202
+
203
+ # functional_data_type and functional_table_type are always emitted as NULL by this query.
+ # NOTE(review): presumably filled in by a later profiling step -- confirm.
+ strTemplate15_ALL: NULL as functional_data_type,
204
+ NULL as functional_table_type,
205
+
206
+ # Last column of the select list (no trailing comma): the profiling run identifier.
+ strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id"
207
+
208
+ # FROM clause. The sampling variant differs from the other only by a trailing space;
+ # the sampling predicate itself is in strTemplate100_sampling.
+ strTemplate98_sampling: ' FROM {DATA_SCHEMA}.{DATA_TABLE} '
209
+
210
+ strTemplate98_else: ' FROM {DATA_SCHEMA}.{DATA_TABLE}'
211
+
212
+ # Extra comma-joined derived table "pctile" supplying pct_25 / pct_50 / pct_75 (the columns
+ # that strTemplate08_N folds with MIN()). PERCENTILE_CONT is evaluated as a window function over
+ # the whole table, so every row carries the same three values and LIMIT 1 keeps a single row.
+ strTemplate99_N: |
213
+ , (SELECT
214
+ PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25,
215
+ PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50,
216
+ PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75
217
+ FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile
218
+
219
+ # Nothing is added to the FROM clause when percentiles are not computed.
+ strTemplate99_else: ' '
220
+
221
+ # Row-sampling predicate: keeps each row with probability 1 / {PROFILE_SAMPLE_RATIO}.
+ # RANDOM() is the random-number function documented for Amazon Redshift (value in [0, 1));
+ # RAND() is not part of that function list.
+ strTemplate100_sampling: 'WHERE RANDOM() <= 1.0 / {PROFILE_SAMPLE_RATIO}'
@@ -0,0 +1,29 @@
1
+ -- Get Freqs for selected columns
+ -- Returns one row for the column: a newline-separated frequency list of its top values
+ -- plus an MD5 hash over the full set of distinct values.
2
+ -- ranked_vals: one row per distinct non-blank value with its row count and a rank
+ -- (most frequent first; ties broken by the value itself).
+ WITH ranked_vals AS (
3
+ SELECT "{COL_NAME}",
4
+ COUNT(*) AS ct,
5
+ ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn
6
+ FROM {DATA_SCHEMA}.{DATA_TABLE}
7
+ WHERE "{COL_NAME}" > ' '
8
+ GROUP BY "{COL_NAME}"
9
+ ),
10
+ -- consol_vals: ranks 1-10 keep their own "| value | count" line; every other value falls
+ -- into the NULL group and is consolidated into a single "| Other Values (n) | total" line.
+ -- min_rn preserves the ranking order for the final concatenation.
+ consol_vals AS (
11
+ SELECT COALESCE(CASE WHEN rn <= 10 THEN '| ' || "{COL_NAME}" || ' | ' || CAST(ct AS VARCHAR)
12
+ ELSE NULL
13
+ END, '| Other Values (' || CAST(COUNT(DISTINCT "{COL_NAME}") as VARCHAR) || ') | ' || CAST(SUM(ct) as VARCHAR) ) AS val,
14
+ MIN(rn) as min_rn
15
+ FROM ranked_vals
16
+ GROUP BY CASE WHEN rn <= 10 THEN '| ' || "{COL_NAME}" || ' | ' || CAST(ct AS VARCHAR)
17
+ ELSE NULL
18
+ END
19
+ )
20
+ SELECT '{PROJECT_CODE}' as project_code,
21
+ '{DATA_SCHEMA}' as schema_name,
22
+ '{RUN_DATE}' as run_date,
23
+ '{DATA_TABLE}' as table_name,
24
+ '{COL_NAME}' as column_name,
25
+ -- Lines are joined with the marker '^#^' in rank order, then the marker is replaced by CHR(10).
+ REPLACE(LISTAGG(val, '^#^') WITHIN GROUP (ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values,
26
+ -- Hash of all distinct values of the column (sorted, '|'-separated), read from the whole table.
+ -- NOTE(review): LISTAGG output is size-limited; confirm behavior for columns with very many distinct values.
+ ( SELECT MD5(LISTAGG(DISTINCT "{COL_NAME}", '|')
27
+ WITHIN GROUP (ORDER BY "{COL_NAME}")) as dvh
28
+ FROM {DATA_SCHEMA}.{DATA_TABLE} ) as distinct_value_hash
29
+ FROM consol_vals;
@@ -0,0 +1,115 @@
1
+ CREATE OR REPLACE FUNCTION {DATA_QC_SCHEMA}.fndk_isnum(VARCHAR) -- returns 1 when the text looks like a number, else 0
2
+ RETURNS INTEGER
3
+ IMMUTABLE
4
+ AS
5
+ $$
6
+ SELECT CASE -- pattern allows surrounding whitespace, an optional sign, an optional dollar sign, digits with optional comma groups of three, an optional decimal part and an optional percent sign
7
+ WHEN $1 ~ '^\\s*[+-]?\\$?\\s*[0-9]+(,[0-9]{3})*(\\.[0-9]*)?[\\%]?\\s*$' THEN 1
8
+ ELSE 0 -- also reached for NULL input, because the match then evaluates to NULL
9
+ END;
10
+ $$
11
+ LANGUAGE sql;
12
+ -- fndk_isdate(VARCHAR) returns 1 when the text matches one of the date layouts below and passes that layout's year and day-of-month range checks, else 0.
13
+ -- Branches are tried in order and the first layout that matches decides the result. NULL input matches nothing and falls through to the final ELSE 0.
14
+ CREATE OR REPLACE FUNCTION {DATA_QC_SCHEMA}.fndk_isdate(VARCHAR)
15
+ RETURNS INTEGER
16
+ IMMUTABLE
17
+ AS $$
18
+ SELECT CASE
19
+ -- YYYY-MM-DD HH:MM:SS SSSSSS or YYYY-MM-DD HH:MM:SS
20
+ WHEN $1 ~
21
+ '^(\\d{4})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])\\s(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\s[0-9]{6})?$'
22
+ THEN CASE
23
+ WHEN LEFT($1, 4):: INT BETWEEN 1800 AND 2200
24
+ AND (
25
+ (SUBSTRING($1, 6, 2) IN ('01', '03', '05', '07', '08',
26
+ '10', '12')
27
+ AND SUBSTRING($1, 9, 2):: INT BETWEEN 1 AND 31)
28
+ OR (SUBSTRING($1, 6, 2) IN ('04', '06', '09', '11') -- 30-day months, November included so November timestamps are no longer rejected
29
+ AND SUBSTRING($1, 9, 2):: INT BETWEEN 1 AND 30)
30
+ OR (SUBSTRING($1, 6, 2) = '02'
31
+ AND SUBSTRING($1, 9, 2):: INT BETWEEN 1 AND 29) -- leap years are not checked, day 29 is always allowed
32
+ )
33
+ THEN 1
34
+ ELSE 0
35
+ END
36
+ -- YYYYMMDDHHMMSSSSSS or YYYYMMDDHH -- NOTE(review) this comment used to say YYYYMMDD, but the second pattern below requires a trailing two-digit hour, confirm which was intended
37
+ WHEN $1 ~
38
+ '^(\\d{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])([0-5][0-9])([0-5][0-9])([0-9]{6})$'
39
+ OR $1 ~ '^(\\d{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])$'
40
+ THEN CASE
41
+ WHEN LEFT($1, 4)::INT BETWEEN 1800 AND 2200
42
+ AND (
43
+ (SUBSTRING($1, 5, 2) IN ('01', '03', '05', '07', '08',
44
+ '10', '12')
45
+ AND SUBSTRING($1, 7, 2)::INT BETWEEN 1 AND 31)
46
+ OR (SUBSTRING($1, 5, 2) IN ('04', '06', '09', '11') -- 30-day months, November included as in the dashed layouts further down
47
+ AND SUBSTRING($1, 7, 2)::INT BETWEEN 1 AND 30)
48
+ OR (SUBSTRING($1, 5, 2) = '02'
49
+ AND SUBSTRING($1, 7, 2)::INT BETWEEN 1 AND 29)
50
+ )
51
+ THEN 1
52
+ ELSE 0
53
+ END
54
+ -- Exclude anything else long
55
+ WHEN LENGTH($1) > 11 THEN 0
56
+ -- YYYY-MMM/MM-DD -- month names are swapped for 12 only to test the shape. NOTE(review) the shape pattern is not anchored, so stray characters reach the INT casts below
57
+ WHEN REGEXP_REPLACE(UPPER($1), '(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)', '12')
58
+ ~ '[12][09][0-9][0-9]-[0-1]?[0-9]-[0-3]?[0-9]'
59
+ THEN CASE
60
+ WHEN SPLIT_PART($1, '-', 1)::INT BETWEEN 1800 AND 2200
61
+ AND (
62
+ (UPPER(SPLIT_PART($1, '-', 2)) IN ('01', '03', '05', '07', '08',
63
+ '1', '3', '5', '7', '8', '10', '12',
64
+ 'JAN', 'MAR', 'MAY', 'JUL', 'AUG',
65
+ 'OCT', 'DEC')
66
+ AND SPLIT_PART($1, '-', 3)::INT BETWEEN 1 AND 31)
67
+ OR (UPPER(SPLIT_PART($1, '-', 2)) IN ('04', '06', '09', '4', '6', '9', '11',
68
+ 'APR', 'JUN', 'SEP', 'NOV')
69
+ AND SPLIT_PART($1, '-', 3)::INT BETWEEN 1 AND 30)
70
+ OR (UPPER(SPLIT_PART($1, '-', 2)) IN ('02', '2', 'FEB')
71
+ AND SPLIT_PART($1, '-', 3)::INT BETWEEN 1 AND 29)
72
+ )
73
+ THEN 1
74
+ ELSE 0
75
+ END
76
+ -- MM/-DD/-YY/YYYY -- dashes are normalised to slashes before the parts are split
77
+ WHEN REPLACE($1, '-', '/') ~ '^[0-1]?[0-9]/[0-3]?[0-9]/[12][09][0-9][0-9]$'
78
+ OR REPLACE($1, '-', '/') ~ '^[0-1]?[0-9]/[0-3]?[0-9]/[0-9][0-9]$'
79
+ THEN
80
+ CASE
81
+ WHEN SPLIT_PART(REPLACE($1, '-', '/'), '/', 1)::INT BETWEEN 1 AND 12
82
+ AND (
83
+ (SPLIT_PART(REPLACE($1, '-', '/'), '/', 1)::INT IN (1, 3, 5, 7, 8, 10, 12)
84
+ AND SPLIT_PART(REPLACE($1, '-', '/'), '/', 2)::INT BETWEEN 1 AND 31)
85
+ OR (SPLIT_PART(REPLACE($1, '-', '/'), '/', 1)::INT IN (4, 6, 9, 11)
86
+ AND SPLIT_PART(REPLACE($1, '-', '/'), '/', 2)::INT BETWEEN 1 AND 30)
87
+ OR (SPLIT_PART(REPLACE($1, '-', '/'), '/', 1)::INT = 2
88
+ AND SPLIT_PART(REPLACE($1, '-', '/'), '/', 2)::INT BETWEEN 1 AND 29)
89
+ )
90
+ AND
91
+ ('20' + RIGHT(SPLIT_PART(REPLACE($1, '-', '/'), '/', 3), 2))::INT BETWEEN 1800 AND 2200 -- NOTE(review) 20 is prefixed to the last two year digits, so the century of a four-digit year is ignored and this test always passes
92
+ THEN 1
93
+ ELSE 0
94
+ END
95
+ -- DD-MMM-YYYY -- NOTE(review) pattern is not anchored, so stray characters reach the INT casts below
96
+ WHEN UPPER($1) ~ '[0-3]?[0-9]-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-[12][09][0-9][0-9]'
97
+ THEN
98
+ CASE
99
+ WHEN SPLIT_PART($1, '-', 3)::INT BETWEEN 1800 AND 2200
100
+ AND (
101
+ (UPPER(SPLIT_PART($1, '-', 2)) IN ('JAN', 'MAR', 'MAY', 'JUL', 'AUG', 'OCT', 'DEC')
102
+ AND SPLIT_PART($1, '-', 1)::INT BETWEEN 1 AND 31)
103
+ OR (UPPER(SPLIT_PART($1, '-', 2)) IN ('APR', 'JUN', 'SEP', 'NOV')
104
+ AND SPLIT_PART($1, '-', 1)::INT BETWEEN 1 AND 30)
105
+ OR (UPPER(SPLIT_PART($1, '-', 2)) = 'FEB'
106
+ AND SPLIT_PART($1, '-', 1)::INT BETWEEN 1 AND 29)
107
+ )
108
+ THEN 1
109
+ ELSE 0
110
+ END
111
+ ELSE 0
112
+ END
113
+ AS isdate;
114
+ $$
115
+ LANGUAGE sql;
@@ -0,0 +1 @@
1
+ CREATE SCHEMA IF NOT EXISTS {DATA_QC_SCHEMA};
@@ -0,0 +1,2 @@
1
+ GRANT ALL PRIVILEGES ON SCHEMA {DATA_QC_SCHEMA} TO {DB_USER}; -- schema-level privileges first, then execute rights on every function in that schema
2
+ GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA {DATA_QC_SCHEMA} TO {DB_USER};
@@ -0,0 +1,38 @@
1
+ SELECT '{PROJECT_CODE}' as project_code, -- one row per column of the target schema, with a normalised type name and a one-letter type class
2
+ CURRENT_TIMESTAMP as refresh_timestamp,
3
+ c.table_schema,
4
+ c.table_name,
5
+ c.column_name,
6
+ CASE
7
+ WHEN c.data_type ILIKE 'timestamp%' THEN lower(c.data_type)
8
+ WHEN c.data_type ILIKE 'date' THEN lower(c.data_type)
9
+ WHEN c.data_type ILIKE 'boolean' THEN 'boolean'
10
+ WHEN c.data_type = 'TEXT'
11
+ THEN 'varchar(' || CAST(c.character_maximum_length AS VARCHAR) || ')' -- TEXT is reported as varchar(n) using its maximum length
12
+ WHEN c.data_type ILIKE 'char%' THEN 'char(' || CAST(c.character_maximum_length AS VARCHAR) || ')'
13
+ WHEN c.data_type = 'NUMBER' AND c.numeric_precision = 38 AND c.numeric_scale = 0 THEN 'bigint' -- NUMBER(38,0) is reported as bigint, checked before the generic numeric branch
14
+ WHEN c.data_type ILIKE 'num%' THEN 'numeric(' || CAST(c.numeric_precision AS VARCHAR) || ',' ||
15
+ CAST(c.numeric_scale AS VARCHAR) || ')'
16
+ ELSE c.data_type
17
+ END AS data_type,
18
+ c.character_maximum_length,
19
+ c.ordinal_position,
20
+ CASE -- general_type codes are A character, B boolean, D date or timestamp, T time, N numeric, X anything else
21
+ WHEN c.data_type ILIKE '%char%' OR c.data_type = 'TEXT'
22
+ THEN 'A'
23
+ WHEN c.data_type ILIKE 'boolean'
24
+ THEN 'B'
25
+ WHEN c.data_type ILIKE 'date'
26
+ OR c.data_type ILIKE 'timestamp%'
27
+ THEN 'D'
28
+ WHEN c.data_type = 'time without time zone' -- NOTE(review) exact, case-sensitive match on a lower-case name while other branches test upper-case names such as TEXT and NUMBER, confirm this value is ever reported here
29
+ THEN 'T'
30
+ WHEN lower(c.data_type) IN ('bigint', 'double precision', 'integer', 'smallint', 'real', 'float')
31
+ OR c.data_type ILIKE 'num%'
32
+ THEN 'N'
33
+ ELSE
34
+ 'X' END AS general_type,
35
+ numeric_scale > 0 as is_decimal -- evaluates to NULL, not false, when numeric_scale is NULL
36
+ FROM information_schema.columns c -- limited to one schema, plus whatever the TABLE_CRITERIA placeholder expands to
37
+ WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA}
38
+ ORDER BY c.table_schema, c.table_name, c.ordinal_position;
@@ -0,0 +1,220 @@
1
+ ---
2
+ strTemplate01_sampling: "SELECT "
3
+ strTemplate01_else: "SELECT "
4
+ strTemplate02_all: |
5
+ {CONNECTION_ID} as connection_id,
6
+ '{PROJECT_CODE}' as project_code,
7
+ '{TABLE_GROUPS_ID}' as table_groups_id,
8
+ '{DATA_SCHEMA}' AS schema_name,
9
+ '{RUN_DATE}' AS run_date,
10
+ '{DATA_TABLE}' AS table_name,
11
+ {COL_POS} AS position,
12
+ '{COL_NAME_SANITIZED}' AS column_name,
13
+ '{COL_TYPE}' AS column_type,
14
+ '{COL_GEN_TYPE}' AS general_type,
15
+ COUNT(*) AS record_ct,
16
+ COUNT("{COL_NAME}") AS value_ct,
17
+ COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct,
18
+ SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct,
19
+ strTemplate03_ADN: MIN(LEN("{COL_NAME}")) AS min_length,
20
+ MAX(LEN("{COL_NAME}")) AS max_length,
21
+ AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS avg_length,
22
+ strTemplate03_else: NULL as min_length,
23
+ NULL as max_length,
24
+ NULL as avg_length,
25
+ strTemplate04_A: SUM(CASE
26
+ WHEN REGEXP_LIKE(TRIM("{COL_NAME}"::VARCHAR), '^0(\\.0*)?$') THEN 1 ELSE 0
27
+ END) AS zero_value_ct, # counts text values that are a literal zero (0, 0. or 0.000); the backslash is doubled so the regex sees an escaped dot, not a wildcard
28
+ strTemplate04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct,
29
+ strTemplate04_else: NULL as zero_value_ct,
30
+ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct,
31
+ SUM(CASE
32
+ WHEN "{COL_NAME}" = '' THEN 1
33
+ ELSE 0
34
+ END) AS zero_length_ct,
35
+ SUM( CASE
36
+ WHEN "{COL_NAME}" BETWEEN ' !' AND '!' THEN 1
37
+ ELSE 0
38
+ END ) AS lead_space_ct,
39
+ SUM( CASE WHEN "{COL_NAME}"::VARCHAR ILIKE '"%"' OR "{COL_NAME}"::VARCHAR ILIKE '''%''' THEN 1 ELSE 0 END ) as quoted_value_ct,
40
+ SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '.*[0-9].*') THEN 1 ELSE 0 END ) as includes_digit_ct,
41
+ SUM( CASE
42
+ WHEN "{COL_NAME}" IN ('.', '?', ' ') THEN 1
43
+ WHEN LOWER("{COL_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COL_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COL_NAME}"::VARCHAR) REGEXP '9{2,}'
44
+ OR LOWER("{COL_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COL_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1
45
+ WHEN LOWER("{COL_NAME}") IN ('blank','error','missing','tbd',
46
+ 'n/a','#na','none','null','unknown') THEN 1
47
+ WHEN LOWER("{COL_NAME}") IN ('(blank)','(error)','(missing)','(tbd)',
48
+ '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1
49
+ WHEN LOWER("{COL_NAME}") IN ('[blank]','[error]','[missing]','[tbd]',
50
+ '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1
51
+ ELSE 0
52
+ END ) AS filled_value_ct,
53
+ LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text,
54
+ LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text,
55
+ SUM({DATA_QC_SCHEMA}.fndk_isnum(LEFT("{COL_NAME}", 31))) AS numeric_ct,
56
+ SUM({DATA_QC_SCHEMA}.fndk_isdate(LEFT("{COL_NAME}", 26))) AS date_ct,
57
+ CASE
58
+ WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$')
59
+ THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'STREET_ADDR'
60
+ WHEN CAST(SUM(CASE WHEN "{COL_NAME}" IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA')
61
+ THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'STATE_USA'
62
+ WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^([\\+]1 |1-|)[\\+]?[(]?[0-9]{3}[)][ ]?[-\\s\\.]?[0-9]{3}[-\\s\\.]?[0-9]{4,6}$')
63
+ OR REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^([\+]1 |1-|)[2-9][01][0-9][-| ]?[0-9]{3}[-| ]?[0-9]{4}$')
64
+ THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'PHONE_USA'
65
+ WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$')
66
+ THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'EMAIL'
67
+ WHEN CAST(SUM( CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999')
68
+ THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'ZIP_USA'
69
+ WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^[\\w\\s\-]+\\.(txt|csv|tsv|dat|doc|pdf|xlsx)$')
70
+ THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'FILE_NAME'
71
+ WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^([0-9]{4}[- ]?){3}[0-9]{4}$')
72
+ THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'CREDIT_CARD'
73
+ WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$')
74
+ AND NOT REGEXP_LIKE("{COL_NAME}"::VARCHAR, '.*\\s(and|but|or|yet)\\s.*')
75
+ THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'DELIMITED_DATA'
76
+ WHEN SUM ( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^[0-8][0-9]{2}-[0-9]{2}-[0-9]{4}$')
77
+ AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749'
78
+ AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN'
79
+ END as std_pattern_match,
80
+ strTemplate05_else: NULL as distinct_std_value_ct,
81
+ NULL as zero_length_ct,
82
+ NULL as lead_space_ct,
83
+ NULL as quoted_value_ct,
84
+ NULL as includes_digit_ct,
85
+ NULL as filled_value_ct,
86
+ NULL as min_text,
87
+ NULL as max_text,
88
+ NULL as numeric_ct,
89
+ NULL as date_ct,
90
+ NULL as std_pattern_match,
91
+ strTemplate06_A_patterns: ( SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats
92
+ FROM (
93
+ SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(10)) || ' | ' || pattern AS pattern,
94
+ COUNT(*) AS ct
95
+ FROM ( SELECT REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE(
96
+ "{COL_NAME}"::VARCHAR, '[a-z]', 'a'),
97
+ '[A-Z]', 'A'),
98
+ '[0-9]', 'N') AS pattern
99
+ FROM {DATA_SCHEMA}.{DATA_TABLE}
100
+ WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}"))
101
+ FROM {DATA_SCHEMA}.{DATA_TABLE}) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p
102
+ GROUP BY pattern
103
+ HAVING pattern > ' '
104
+ ORDER BY COUNT(*) DESC) as ps) AS top_patterns,
105
+ strTemplate06_else: NULL as top_patterns,
106
+ strTemplate07_A_freq: ( SELECT LEFT(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) as concat_vals
107
+ FROM (
108
+ SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val,
109
+ COUNT(*) as ct
110
+ FROM {DATA_SCHEMA}.{DATA_TABLE}
111
+ WHERE "{COL_NAME}" > ' '
112
+ GROUP BY "{COL_NAME}"
113
+ HAVING "{COL_NAME}" > ' '
114
+ ORDER BY COUNT(*) DESC, "{COL_NAME}"
115
+ ) ps
116
+ ) AS top_freq_values, # ten most frequent non-blank values as "count | value", highest count first; TOP 10 now keeps the largest counts instead of the smallest
117
+ strTemplate07_else: NULL as top_freq_values,
118
+ strTemplate08_N: MIN("{COL_NAME}") AS min_value,
119
+ MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0,
120
+ MAX("{COL_NAME}") AS max_value,
121
+ AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value,
122
+ STDDEV(CAST("{COL_NAME}" AS FLOAT)) AS stdev_value,
123
+ MIN(pct_25) as percentile_25,
124
+ MIN(pct_50) as percentile_50,
125
+ MIN(pct_75) as percentile_75,
126
+ strTemplate08_else: NULL as min_value,
127
+ NULL as min_value_over_0,
128
+ NULL as max_value,
129
+ NULL as avg_value,
130
+ NULL as stdev_value,
131
+ NULL as percentile_25,
132
+ NULL as percentile_50,
133
+ NULL as percentile_75,
134
+ strTemplate10_N_dec: SUM(ROUND(MOD("{COL_NAME}", 1), 5)) as fractional_sum,
135
+
136
+ strTemplate10_else: NULL as fractional_sum,
137
+
138
+ strTemplate11_D: GREATEST(MIN("{COL_NAME}"), '0001-01-01') as min_date,
139
+ MAX("{COL_NAME}") as max_date,
140
+ SUM(CASE
141
+ WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 12 THEN 1
142
+ ELSE 0
143
+ END) AS before_1yr_date_ct,
144
+ SUM(CASE
145
+ WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 60 THEN 1
146
+ ELSE 0
147
+ END) AS before_5yr_date_ct,
148
+ SUM(CASE
149
+ WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 240 THEN 1
150
+ ELSE 0
151
+ END) AS before_20yr_date_ct,
152
+ SUM(CASE
153
+ WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1
154
+ ELSE 0
155
+ END) AS within_1yr_date_ct,
156
+ SUM(CASE
157
+ WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 30 THEN 1
158
+ ELSE 0
159
+ END) AS within_1mo_date_ct,
160
+ SUM(CASE
161
+ WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 ELSE 0
162
+ END) AS future_date_ct,
163
+ COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present,
164
+ COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present,
165
+ COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present,
166
+
167
+ strTemplate11_else: NULL as min_date,
168
+ NULL as max_date,
169
+ NULL as before_1yr_date_ct,
170
+ NULL as before_5yr_date_ct,
171
+ NULL as before_20yr_date_ct,
172
+ NULL as within_1yr_date_ct,
173
+ NULL as within_1mo_date_ct,
174
+ NULL as future_date_ct,
175
+ NULL as date_days_present,
176
+ NULL as date_weeks_present,
177
+ NULL as date_months_present,
178
+
179
+ strTemplate12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct,
180
+
181
+ strTemplate12_else: NULL as boolean_true_ct,
182
+
183
+ strTemplate13_ALL: NULL AS datatype_suggestion,
184
+ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE(
185
+ "{COL_NAME}"::VARCHAR, '[a-z]', 'a'),
186
+ '[A-Z]', 'A'),
187
+ '[0-9]', 'N')
188
+ ) AS pattern_ct
189
+ FROM {DATA_SCHEMA}.{DATA_TABLE}
190
+ WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct,
191
+ SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' '))::BIGINT) AS embedded_space_ct,
192
+ AVG(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' ')::FLOAT) AS avg_embedded_spaces,
193
+
194
+ strTemplate14_A_no_patterns: NULL as distinct_pattern_ct,
195
+ SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' '))::BIGINT) AS embedded_space_ct,
196
+ AVG(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' ')::FLOAT) AS avg_embedded_spaces,
197
+
198
+ strTemplate14_else: NULL as distinct_pattern_ct,
199
+ NULL as embedded_space_ct,
200
+ NULL as avg_embedded_spaces,
201
+
202
+ strTemplate15_ALL: NULL as functional_data_type,
203
+ NULL as functional_table_type,
204
+
205
+ strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id "
206
+ # FROM clause variants: the sampling form reads a fixed number of rows through SAMPLE, the other form reads the whole table.
207
+ strTemplate98_sampling: ' FROM {DATA_SCHEMA}.{DATA_TABLE} SAMPLE ({SAMPLE_SIZE} rows)'
208
+
209
+ strTemplate98_else: ' FROM {DATA_SCHEMA}.{DATA_TABLE}'
210
+ # Comma-joined one-row subquery named pctile exposing pct_25, pct_50 and pct_75, the names read by strTemplate08_N. It reads the table without SAMPLE. Presumably used for numeric columns only (the _N suffix).
211
+ strTemplate99_N: |
212
+ ,
213
+ (SELECT
214
+ PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25,
215
+ PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50,
216
+ PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75
217
+ FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile
218
+ strTemplate99_else: ;
219
+ # NOTE(review): strTemplate99_else is a bare semicolon while strTemplate100_sampling is a single space; presumably the semicolon ends the assembled statement. Confirm against the code that concatenates these templates.
220
+ strTemplate100_sampling: ' '