@clickzetta/cz-cli-darwin-arm64 0.5.16 → 0.5.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/lakehouse-doc-en/SKILL.md +6 -11
- package/bin/skills/lakehouse-doc-en/references/AIGateway.md +58 -13
- package/bin/skills/lakehouse-doc-en/references/Computation.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/DataSource_Amazon_DocumentDB.md +3 -1
- package/bin/skills/lakehouse-doc-en/references/Foreach.md +14 -14
- package/bin/skills/lakehouse-doc-en/references/JDBC-Driver.md +0 -1
- package/bin/skills/lakehouse-doc-en/references/LakehouseAI-overview.md +21 -8
- package/bin/skills/lakehouse-doc-en/references/LakehouseDataGPT-tour.md +4 -9
- package/bin/skills/lakehouse-doc-en/references/LakehouseStudio-tour.md +14 -19
- package/bin/skills/lakehouse-doc-en/references/Lakehouse_Zilliz_MakeDataReadyforBIandAI.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/Logstash.md +3 -3
- package/bin/skills/lakehouse-doc-en/references/Migrate_Spark_DataEngineeringBestPractices_Project_to_Lakehouse.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/Notebook.md +17 -17
- package/bin/skills/lakehouse-doc-en/references/RemoteFunction-as-udf.md +14 -14
- package/bin/skills/lakehouse-doc-en/references/SQL_External_Catalog_Guide.md +1 -9
- package/bin/skills/lakehouse-doc-en/references/SUMMARY.md +59 -29
- package/bin/skills/lakehouse-doc-en/references/WINDOWFUNCTION.md +99 -57
- package/bin/skills/lakehouse-doc-en/references/Zettapark_Data_Engineering_Demo.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/access-control-configuration.md +1 -8
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-2-5-1.0.md +16 -0
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-3-29-1.0.2.md +14 -0
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-3-8-1.0.1.md +16 -0
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-4-28-1.1.md +29 -0
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-5-12-1.1.1.md +18 -0
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-5-15-1.2.md +9 -0
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-5-21-1.3.md +9 -0
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-5-28-1.4.md +10 -0
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-6-3-1.5.md +9 -0
- package/bin/skills/lakehouse-doc-en/references/alicloud-arn-externalid.md +0 -5
- package/bin/skills/lakehouse-doc-en/references/answer-accuracy-improve.md +120 -103
- package/bin/skills/lakehouse-doc-en/references/application-list.md +1 -3
- package/bin/skills/lakehouse-doc-en/references/approval-list.md +16 -17
- package/bin/skills/lakehouse-doc-en/references/batch-load-parquet-file-into-lakehouse.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/batch_sync.md +9 -9
- package/bin/skills/lakehouse-doc-en/references/batch_sync_Sop.md +2 -2
- package/bin/skills/lakehouse-doc-en/references/batchloadparquetfileintoLakehouse.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/bulkloadv1-python-sdk.md +3 -3
- package/bin/skills/lakehouse-doc-en/references/chart-auto-refresh-guide.md +12 -6
- package/bin/skills/lakehouse-doc-en/references/clickzetta-sample-data.md +3 -3
- package/bin/skills/lakehouse-doc-en/references/code_approval.md +1 -5
- package/bin/skills/lakehouse-doc-en/references/composite_task.md +31 -42
- package/bin/skills/lakehouse-doc-en/references/comprehensive_guide_to_ingesting_environment_and_data_generate.md +6 -9
- package/bin/skills/lakehouse-doc-en/references/comprehensive_guide_to_ingesting_javasdk_bulkload_realtime.md +4 -10
- package/bin/skills/lakehouse-doc-en/references/comprehensive_guide_to_ingesting_kafka_realtime_sync.md +1 -10
- package/bin/skills/lakehouse-doc-en/references/comprehensive_guide_to_ingesting_local_file_into_table_by_studio.md +0 -6
- package/bin/skills/lakehouse-doc-en/references/comprehensive_guide_to_ingesting_studio_batchload_public_network.md +0 -5
- package/bin/skills/lakehouse-doc-en/references/comprehensive_guide_to_ingesting_studio_python_node.md +2 -7
- package/bin/skills/lakehouse-doc-en/references/comprehensive_guide_to_ingesting_studio_realtime_cdc_public_network.md +13 -18
- package/bin/skills/lakehouse-doc-en/references/comprehensive_guide_to_ingesting_studio_sql_insert.md +0 -1
- package/bin/skills/lakehouse-doc-en/references/concepts.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/config-datasource.md +5 -7
- package/bin/skills/lakehouse-doc-en/references/connect-with-cli.md +116 -72
- package/bin/skills/lakehouse-doc-en/references/connect-with-cz-cli.md +151 -0
- package/bin/skills/lakehouse-doc-en/references/continue-job.md +9 -17
- package/bin/skills/lakehouse-doc-en/references/create-api-connection.md +315 -286
- package/bin/skills/lakehouse-doc-en/references/create-catalog-connection.md +1 -0
- package/bin/skills/lakehouse-doc-en/references/create-dynamic-table.md +4 -4
- package/bin/skills/lakehouse-doc-en/references/create-external-catalog.md +85 -22
- package/bin/skills/lakehouse-doc-en/references/create-table-ddl.md +45 -0
- package/bin/skills/lakehouse-doc-en/references/creating_alicloud_privatelinkendpoint.md +4 -6
- package/bin/skills/lakehouse-doc-en/references/creating_alicloud_privatelinkservice.md +4 -7
- package/bin/skills/lakehouse-doc-en/references/creating_tencentcloud_privatelinkendpoint.md +2 -7
- package/bin/skills/lakehouse-doc-en/references/creating_tencentcloud_privatelinkservice.md +1 -5
- package/bin/skills/lakehouse-doc-en/references/cz-cli-agent.md +15 -10
- package/bin/skills/lakehouse-doc-en/references/cz-cli-datasource.md +0 -8
- package/bin/skills/lakehouse-doc-en/references/cz-cli-sql.md +2 -45
- package/bin/skills/lakehouse-doc-en/references/cz-cli.md +53 -42
- package/bin/skills/lakehouse-doc-en/references/dashboard-version-management-guide.md +12 -4
- package/bin/skills/lakehouse-doc-en/references/data-integration-intro.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/data-integration.md +29 -27
- package/bin/skills/lakehouse-doc-en/references/data-load-summary.md +3 -3
- package/bin/skills/lakehouse-doc-en/references/data-quality.md +25 -25
- package/bin/skills/lakehouse-doc-en/references/data-sharing.md +31 -54
- package/bin/skills/lakehouse-doc-en/references/data-sources.md +45 -45
- package/bin/skills/lakehouse-doc-en/references/data_catalog.md +23 -25
- package/bin/skills/lakehouse-doc-en/references/data_privacy.md +5 -2
- package/bin/skills/lakehouse-doc-en/references/data_sharing_between_accounts_guide.md +0 -4
- package/bin/skills/lakehouse-doc-en/references/data_visualization.md +4 -15
- package/bin/skills/lakehouse-doc-en/references/dataagent.md +39 -7
- package/bin/skills/lakehouse-doc-en/references/databricks-delta-to-lakehouse-migration.md +168 -0
- package/bin/skills/lakehouse-doc-en/references/databricks-dlt-to-lakehouse-migration.md +331 -0
- package/bin/skills/lakehouse-doc-en/references/databricks-external-catalog-practice.md +367 -0
- package/bin/skills/lakehouse-doc-en/references/databricks-jobs-to-studio-migration.md +199 -0
- package/bin/skills/lakehouse-doc-en/references/databricks-notebook-to-studio-migration.md +350 -0
- package/bin/skills/lakehouse-doc-en/references/databricks-uc-governance-to-lakehouse-migration.md +327 -0
- package/bin/skills/lakehouse-doc-en/references/datagpt-model-config.md +34 -0
- package/bin/skills/lakehouse-doc-en/references/datagpt_data_source.md +50 -37
- package/bin/skills/lakehouse-doc-en/references/datagpt_introduction.md +55 -79
- package/bin/skills/lakehouse-doc-en/references/datagpt_quickstart.md +50 -64
- package/bin/skills/lakehouse-doc-en/references/datalake-acceleration.md +75 -2
- package/bin/skills/lakehouse-doc-en/references/dbt-databricks-to-clickzetta-migration.md +242 -0
- package/bin/skills/lakehouse-doc-en/references/dynamic-mask.md +30 -30
- package/bin/skills/lakehouse-doc-en/references/dynamic-table-bestpractice.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/dynamic-table-introduce.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/dynamic_table_summary.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/eco_integration/streamlit.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/eco_integration/superset.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/ecosystem-all.md +1 -3
- package/bin/skills/lakehouse-doc-en/references/ecosystem.md +145 -0
- package/bin/skills/lakehouse-doc-en/references/external-catalog-summary.md +33 -38
- package/bin/skills/lakehouse-doc-en/references/external-function-combo-practice.md +466 -0
- package/bin/skills/lakehouse-doc-en/references/f6fc6447ee.md +7 -9
- package/bin/skills/lakehouse-doc-en/references/federation-query.md +56 -6
- package/bin/skills/lakehouse-doc-en/references/finebi-mysql.md +2 -0
- package/bin/skills/lakehouse-doc-en/references/get-started-with-sample-data.md +10 -11
- package/bin/skills/lakehouse-doc-en/references/gitfolder.md +2 -3
- package/bin/skills/lakehouse-doc-en/references/grant-privileges.md +2 -0
- package/bin/skills/lakehouse-doc-en/references/iceberg-rest-catalog-databricks.md +166 -0
- package/bin/skills/lakehouse-doc-en/references/ide.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/if_else_task.md +59 -57
- package/bin/skills/lakehouse-doc-en/references/input_output.md +10 -7
- package/bin/skills/lakehouse-doc-en/references/jobprofile-bestpractices.md +60 -64
- package/bin/skills/lakehouse-doc-en/references/kafka-connection.md +0 -1
- package/bin/skills/lakehouse-doc-en/references/key-concepts.md +146 -117
- package/bin/skills/lakehouse-doc-en/references/lakehouse-ai-gateway-cz-cli.md +317 -0
- package/bin/skills/lakehouse-doc-en/references/lakehouse-ai-sql-analysis.md +345 -0
- package/bin/skills/lakehouse-doc-en/references/lakehouse-dqc-guide.md +300 -0
- package/bin/skills/lakehouse-doc-en/references/lakehouse-medallion-sql-dt-guide.md +543 -0
- package/bin/skills/lakehouse-doc-en/references/lakehouse-multi-cloud-acceleration.md +274 -0
- package/bin/skills/lakehouse-doc-en/references/lakehouse-multimodal-ai-pipeline.md +198 -0
- package/bin/skills/lakehouse-doc-en/references/lakehouse-quick-experience_guide.md +49 -52
- package/bin/skills/lakehouse-doc-en/references/lakehouse-volume-pipe-acceleration-guide.md +380 -0
- package/bin/skills/lakehouse-doc-en/references/langchain-plug-installation.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/management.md +4 -9
- package/bin/skills/lakehouse-doc-en/references/medallion-lakehouse-from-scratch.md +2 -1
- package/bin/skills/lakehouse-doc-en/references/metrics_answer_build.md +58 -21
- package/bin/skills/lakehouse-doc-en/references/migrate-spark-data-engineering-best-practices-to-lakehouse.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/mindsdb.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/monitoring_and_alerting.md +65 -60
- package/bin/skills/lakehouse-doc-en/references/monitoring_item_specification.md +33 -33
- package/bin/skills/lakehouse-doc-en/references/multitable_batch_sync.md +16 -16
- package/bin/skills/lakehouse-doc-en/references/multitable_realtime_sync.md +65 -72
- package/bin/skills/lakehouse-doc-en/references/multitable_realtime_sync_sop.md +54 -52
- package/bin/skills/lakehouse-doc-en/references/navicat-mysql.md +2 -0
- package/bin/skills/lakehouse-doc-en/references/om-dynamic-table.md +71 -66
- package/bin/skills/lakehouse-doc-en/references/om-vcluster.md +2 -0
- package/bin/skills/lakehouse-doc-en/references/open-api-create-session.md +79 -0
- package/bin/skills/lakehouse-doc-en/references/open-api-generate-auth-token.md +63 -0
- package/bin/skills/lakehouse-doc-en/references/open-api-overview.md +96 -0
- package/bin/skills/lakehouse-doc-en/references/open-api-quick-start.md +286 -0
- package/bin/skills/lakehouse-doc-en/references/open-api-response-guide.md +264 -0
- package/bin/skills/lakehouse-doc-en/references/open-api-safe-question-poll.md +201 -0
- package/bin/skills/lakehouse-doc-en/references/open-api-text2insight-query.md +99 -0
- package/bin/skills/lakehouse-doc-en/references/open-api-text2insight-stop.md +74 -0
- package/bin/skills/lakehouse-doc-en/references/overview.md +6 -7
- package/bin/skills/lakehouse-doc-en/references/permission-application.md +5 -5
- package/bin/skills/lakehouse-doc-en/references/pipe-introduction.md +1 -0
- package/bin/skills/lakehouse-doc-en/references/pipe-kafka-table-stream.md +72 -70
- package/bin/skills/lakehouse-doc-en/references/pipe-kafka.md +105 -110
- package/bin/skills/lakehouse-doc-en/references/pipe-overview.md +40 -40
- package/bin/skills/lakehouse-doc-en/references/pipe-storage-object.md +43 -48
- package/bin/skills/lakehouse-doc-en/references/pipe-summary.md +14 -4
- package/bin/skills/lakehouse-doc-en/references/pipe-syntax.md +58 -151
- package/bin/skills/lakehouse-doc-en/references/practice_python_task.md +4 -4
- package/bin/skills/lakehouse-doc-en/references/pricing-ai-gateway.md +181 -0
- package/bin/skills/lakehouse-doc-en/references/pricing-lakehouse.md +316 -0
- package/bin/skills/lakehouse-doc-en/references/pricing.md +44 -288
- package/bin/skills/lakehouse-doc-en/references/private-link-general.md +0 -2
- package/bin/skills/lakehouse-doc-en/references/pyspark-to-zettapark-migration-f1.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/python-igs.md +7 -3
- package/bin/skills/lakehouse-doc-en/references/python-sample-put-github-rt-events.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/python-task.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/python_reference/connector.md +3 -3
- package/bin/skills/lakehouse-doc-en/references/python_reference/connector_advanced.md +2 -2
- package/bin/skills/lakehouse-doc-en/references/python_reference/connector_examples.md +2 -2
- package/bin/skills/lakehouse-doc-en/references/python_sdk_guide.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/python_shell_datasource.md +11 -9
- package/bin/skills/lakehouse-doc-en/references/quick_start_batch_sync_data.md +9 -18
- package/bin/skills/lakehouse-doc-en/references/quick_start_bi_analysis.md +8 -25
- package/bin/skills/lakehouse-doc-en/references/quick_start_create_workspace.md +4 -6
- package/bin/skills/lakehouse-doc-en/references/quick_start_data_quality.md +8 -8
- package/bin/skills/lakehouse-doc-en/references/quick_start_etl.md +16 -20
- package/bin/skills/lakehouse-doc-en/references/quick_start_monitoring_and_alerting.md +10 -18
- package/bin/skills/lakehouse-doc-en/references/quick_start_sql_query.md +7 -10
- package/bin/skills/lakehouse-doc-en/references/quick_start_upload_data.md +5 -7
- package/bin/skills/lakehouse-doc-en/references/quick_start_user_management.md +8 -8
- package/bin/skills/lakehouse-doc-en/references/quick_start_workspace.md +0 -5
- package/bin/skills/lakehouse-doc-en/references/quick_start_workspace_user.md +8 -8
- package/bin/skills/lakehouse-doc-en/references/quickstart.md +69 -56
- package/bin/skills/lakehouse-doc-en/references/quickstart_datashare_between_companies.md +0 -5
- package/bin/skills/lakehouse-doc-en/references/quickstart_envirment_for_team.md +0 -24
- package/bin/skills/lakehouse-doc-en/references/realtime-pipeline-selection-guide.md +1 -2
- package/bin/skills/lakehouse-doc-en/references/realtime-sales-dashboard-with-dynamic-table.md +3 -3
- package/bin/skills/lakehouse-doc-en/references/realtime_sync.md +0 -1
- package/bin/skills/lakehouse-doc-en/references/release-note-2026-05-19.md +5 -3
- package/bin/skills/lakehouse-doc-en/references/revoke-privileges.md +3 -1
- package/bin/skills/lakehouse-doc-en/references/roles.md +2 -3
- package/bin/skills/lakehouse-doc-en/references/row-filter.md +165 -0
- package/bin/skills/lakehouse-doc-en/references/row_level_permission.md +30 -19
- package/bin/skills/lakehouse-doc-en/references/scheduled_task.md +28 -21
- package/bin/skills/lakehouse-doc-en/references/security_overview.md +99 -21
- package/bin/skills/lakehouse-doc-en/references/set-command.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/setup.md +13 -15
- package/bin/skills/lakehouse-doc-en/references/show-grants.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/snowflake-dynamic-tables-to-lakehouse.md +2 -2
- package/bin/skills/lakehouse-doc-en/references/spark-connector-summary.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/sql_functions/context_functions/current_vcluster.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/sso-configuration.md +2 -2
- package/bin/skills/lakehouse-doc-en/references/streaming_pipeline_with_dynamic_table.md +0 -1
- package/bin/skills/lakehouse-doc-en/references/studio-incremental-sync-practice.md +27 -23
- package/bin/skills/lakehouse-doc-en/references/studio-shell-task.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/supported-cloud-platforms.md +32 -0
- package/bin/skills/lakehouse-doc-en/references/table_rendering.md +18 -12
- package/bin/skills/lakehouse-doc-en/references/task-develop.md +89 -91
- package/bin/skills/lakehouse-doc-en/references/task_development.md +19 -17
- package/bin/skills/lakehouse-doc-en/references/task_group.md +16 -14
- package/bin/skills/lakehouse-doc-en/references/task_instance.md +21 -21
- package/bin/skills/lakehouse-doc-en/references/task_param.md +38 -35
- package/bin/skills/lakehouse-doc-en/references/task_param_reference.md +81 -79
- package/bin/skills/lakehouse-doc-en/references/task_scheduling_dependency.md +20 -21
- package/bin/skills/lakehouse-doc-en/references/tencentcloud_arn_and_externalid.md +1 -5
- package/bin/skills/lakehouse-doc-en/references/trial-account-quotas-and-limits.md +1 -3
- package/bin/skills/lakehouse-doc-en/references/tutorial_connect_to_lakehouse.md +69 -0
- package/bin/skills/lakehouse-doc-en/references/tutorials.md +4 -1
- package/bin/skills/lakehouse-doc-en/references/unique-key.md +167 -0
- package/bin/skills/lakehouse-doc-en/references/usageandbillingview.md +138 -0
- package/bin/skills/lakehouse-doc-en/references/use-dbt-dev.md +3 -3
- package/bin/skills/lakehouse-doc-en/references/use-java-sdk-realtime-uploaddata.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/use-java-sdk-upload-data-local.md +3 -3
- package/bin/skills/lakehouse-doc-en/references/use-models.md +128 -0
- package/bin/skills/lakehouse-doc-en/references/use-mysql-client.md +81 -81
- package/bin/skills/lakehouse-doc-en/references/use-python-sdk-upload-data.md +10 -12
- package/bin/skills/lakehouse-doc-en/references/user-identification.md +2 -3
- package/bin/skills/lakehouse-doc-en/references/user_permission_grand_guide.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/using-udf-in-dynamic-table.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/vc_cache.md +18 -22
- package/bin/skills/lakehouse-doc-en/references/vcluster_size_description.md +33 -31
- package/bin/skills/lakehouse-doc-en/references/virtual-cluster.md +43 -45
- package/bin/skills/lakehouse-doc-en/references/web-job-history.md +94 -108
- package/bin/skills/lakehouse-doc-en/references/web_search.md +16 -7
- package/bin/skills/lakehouse-doc-en/references/zettapark-data-engineering-demo.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/zettapark-dataframe-guide.md +144 -70
- package/bin/skills/lakehouse-doc-en/references/zettapark-dynamic-table-guide.md +2 -2
- package/bin/skills/lakehouse-doc-en/references/zettapark-etl-guide.md +73 -33
- package/bin/skills/lakehouse-doc-en/references/zettapark-feature-engineering.md +2 -2
- package/bin/skills/lakehouse-doc-en/references/zettapark-functions-guide.md +75 -46
- package/bin/skills/lakehouse-doc-en/references/zettapark-quick-start.md +2 -2
- package/bin/skills/lakehouse-doc-en/references/zettapark-stream-guide.md +4 -4
- package/bin/skills/lakehouse-doc-en/references/zettapark-volume-guide.md +93 -29
- package/package.json +1 -1
- package/bin/skills/lakehouse-doc-en/references/CLAUDE.md +0 -606
- package/bin/skills/lakehouse-doc-en/references/modelprice.md +0 -155
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
# Data Quality Checks (DQC): SQL-Driven Automated Validation
|
|
2
|
+
|
|
3
|
+
Your data pipeline is running — but is the data actually correct? Are row counts consistent? Do critical fields contain null values? Are aggregated metrics reasonable? These are questions every data engineer faces every day. Singdata Lakehouse's Data Quality Check (DQC) uses pure SQL to implement automated validation, integrating quality monitoring into the data pipeline so that problems are caught before they impact downstream consumers.
|
|
4
|
+
|
|
5
|
+
This article uses the NHL Medallion architecture as an example to demonstrate how to build a complete DQC framework across the Bronze → Silver → Gold three-layer model.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## DQC Core Concepts
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
Data Pipeline DQC Gate
|
|
13
|
+
──────────── ─────────
|
|
14
|
+
Bronze (raw data) ── after load ──→ row count + freshness
|
|
15
|
+
│
|
|
16
|
+
▼
|
|
17
|
+
Silver (clean DT) ── after refresh ──→ null rate + uniqueness + value range
|
|
18
|
+
│
|
|
19
|
+
▼
|
|
20
|
+
Gold (aggregate DT) ── after refresh ──→ aggregation consistency + volatility
|
|
21
|
+
│
|
|
22
|
+
▼
|
|
23
|
+
BI / Apps ←── consume PASS only
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
DQC is not a one-time activity — it is an **automated process embedded in the pipeline**. After each data refresh, checks run automatically and results are written to a `dqc_results` table, with anomalies exposed through a monitoring DT.
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## SQL Commands Used
|
|
31
|
+
|
|
32
|
+
| Command / Function | Purpose | Use case |
|
|
33
|
+
|------------|------|---------|
|
|
34
|
+
| `CREATE SCHEMA` | Create a dedicated DQC layer | Isolate quality check tables |
|
|
35
|
+
| `CREATE TABLE` | Create a DQC results table | Store the history of each check run |
|
|
36
|
+
| `INSERT INTO ... SELECT` | Write DQC check results | One record per check |
|
|
37
|
+
| `CASE WHEN` | Determine PASS/WARN/FAIL | Core logic for all check rules |
|
|
38
|
+
| `COUNT(*)` / `SUM(CASE WHEN)` | Row count, conditional count | Row count validation, null rate, uniqueness |
|
|
39
|
+
| `MIN` / `MAX` | Value range upper and lower bounds | Value range checks, freshness checks |
|
|
40
|
+
| `CREATE DYNAMIC TABLE` | Create a DQC dashboard DT | Auto-refresh quality status summary |
|
|
41
|
+
| `REFRESH DYNAMIC TABLE` | Manually trigger DT refresh | Initialize data after first creation |
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## DQC Check Types
|
|
46
|
+
|
|
47
|
+
| Type | Description | Example |
|
|
48
|
+
|---|---|---|
|
|
49
|
+
| **Row count consistency** | Whether upstream and downstream row counts match | Bronze team_info(33) = Silver dim_team(33) |
|
|
50
|
+
| **Null rate** | Proportion of NULL values in critical fields | goals null rate should be 0% |
|
|
51
|
+
| **Uniqueness** | Whether ID fields contain duplicates | player_id should be unique |
|
|
52
|
+
| **Value range** | Whether numeric values fall within reasonable bounds | goals >= 0, save_pct in [0,1] |
|
|
53
|
+
| **Freshness** | Whether the data is up to date (the most recent expected period has arrived) | Latest season >= 2019 |
|
|
54
|
+
| **Aggregation consistency** | Whether summary metrics are self-consistent | wins + losses = games played |
|
|
55
|
+
| **Referential integrity** | JOIN key match rate | skater_stats.player_id exists in player_info |
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## Creating the DQC Results Table
|
|
60
|
+
|
|
61
|
+
```sql
|
|
62
|
+
CREATE SCHEMA IF NOT EXISTS dqc COMMENT 'Data Quality Check layer';
|
|
63
|
+
|
|
64
|
+
CREATE TABLE dqc.dqc_results (
|
|
65
|
+
check_id STRING COMMENT 'Check ID, e.g. DQC-001',
|
|
66
|
+
check_name STRING COMMENT 'Check type: row_match/null_rate/uniqueness...',
|
|
67
|
+
layer STRING COMMENT 'Data layer: bronze/silver/gold',
|
|
68
|
+
metric STRING COMMENT 'Metric name',
|
|
69
|
+
expected STRING COMMENT 'Expected value or range',
|
|
70
|
+
actual STRING COMMENT 'Actual value',
|
|
71
|
+
status STRING COMMENT 'PASS / WARN / FAIL',
|
|
72
|
+
detail STRING COMMENT 'Check description',
|
|
73
|
+
checked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP()
|
|
74
|
+
) COMMENT 'DQC check results table';
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Defining Check Rules
|
|
80
|
+
|
|
81
|
+
### Row Count Consistency
|
|
82
|
+
|
|
83
|
+
Verify that the Silver layer dimension table row count matches the Bronze source table:
|
|
84
|
+
|
|
85
|
+
```sql
|
|
86
|
+
INSERT INTO dqc.dqc_results (check_id, check_name, layer, metric, expected, actual, status, detail)
|
|
87
|
+
SELECT 'DQC-001', 'dim_row_match', 'silver', 'dim_team_rows',
|
|
88
|
+
CAST((SELECT COUNT(*) FROM nhl_game_data.team_info) AS STRING),
|
|
89
|
+
CAST((SELECT COUNT(*) FROM silver.dim_team) AS STRING),
|
|
90
|
+
CASE WHEN (SELECT COUNT(*) FROM nhl_game_data.team_info)
|
|
91
|
+
= (SELECT COUNT(*) FROM silver.dim_team)
|
|
92
|
+
THEN 'PASS' ELSE 'FAIL' END,
|
|
93
|
+
'Bronze team_info row count should match Silver dim_team';
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Null Rate
|
|
97
|
+
|
|
98
|
+
```sql
|
|
99
|
+
INSERT INTO dqc.dqc_results (check_id, check_name, layer, metric, expected, actual, status, detail)
|
|
100
|
+
SELECT 'DQC-003', 'null_rate', 'silver', 'skater_goals_null_pct',
|
|
101
|
+
'=0',
|
|
102
|
+
CAST(ROUND(SUM(CASE WHEN goals IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS STRING),
|
|
103
|
+
CASE WHEN SUM(CASE WHEN goals IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*) = 0
|
|
104
|
+
THEN 'PASS' ELSE 'WARN' END,
|
|
105
|
+
'Silver fact_skater_stats.goals should have no NULLs'
|
|
106
|
+
FROM silver.fact_skater_stats;
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
> **PASS vs WARN vs FAIL severity levels**:
|
|
110
|
+
> - `PASS`: Fully meets expectations
|
|
111
|
+
> - `WARN`: Deviation within tolerance, needs attention but should not block (e.g., null rate < 1%)
|
|
112
|
+
> - `FAIL`: Severely out of range, should block downstream consumption
|
|
113
|
+
|
|
114
|
+
### Uniqueness
|
|
115
|
+
|
|
116
|
+
```sql
|
|
117
|
+
INSERT INTO dqc.dqc_results (check_id, check_name, layer, metric, expected, actual, status, detail)
|
|
118
|
+
SELECT 'DQC-005', 'uniqueness', 'silver', 'dim_player_id_unique',
|
|
119
|
+
'TRUE',
|
|
120
|
+
CAST(CASE WHEN COUNT(*) = COUNT(DISTINCT player_id) THEN 'TRUE' ELSE 'FALSE' END AS STRING),
|
|
121
|
+
CASE WHEN COUNT(*) = COUNT(DISTINCT player_id) THEN 'PASS' ELSE 'FAIL' END,
|
|
122
|
+
'Silver dim_player.player_id should be unique'
|
|
123
|
+
FROM silver.dim_player;
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Value Range
|
|
127
|
+
|
|
128
|
+
```sql
|
|
129
|
+
INSERT INTO dqc.dqc_results (check_id, check_name, layer, metric, expected, actual, status, detail)
|
|
130
|
+
SELECT 'DQC-006', 'value_range', 'silver', 'skater_goals_positive',
|
|
131
|
+
'>=0',
|
|
132
|
+
CAST(MIN(goals) AS STRING),
|
|
133
|
+
CASE WHEN MIN(goals) >= 0 THEN 'PASS' ELSE 'FAIL' END,
|
|
134
|
+
'Silver fact_skater_stats.goals should not be negative'
|
|
135
|
+
FROM silver.fact_skater_stats;
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Freshness
|
|
139
|
+
|
|
140
|
+
```sql
|
|
141
|
+
INSERT INTO dqc.dqc_results (check_id, check_name, layer, metric, expected, actual, status, detail)
|
|
142
|
+
SELECT 'DQC-008', 'freshness', 'bronze', 'max_season',
|
|
143
|
+
'>=2019',
|
|
144
|
+
CAST(MAX(season) AS STRING),
|
|
145
|
+
CASE WHEN MAX(season) >= 2019 THEN 'PASS' ELSE 'WARN' END,
|
|
146
|
+
'Bronze latest season should not be earlier than 2019'
|
|
147
|
+
FROM nhl_game_data.game;
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## DQC Dashboard
|
|
153
|
+
|
|
154
|
+
Aggregate `dqc_results` into a Dynamic Table dashboard for a one-stop view of quality status across layers:
|
|
155
|
+
|
|
156
|
+
```sql
|
|
157
|
+
CREATE OR REPLACE DYNAMIC TABLE dqc.dqc_dashboard
|
|
158
|
+
REFRESH INTERVAL 1 DAY VCLUSTER DEFAULT
|
|
159
|
+
COMMENT 'DQC Dashboard - quality status summary by layer'
|
|
160
|
+
AS
|
|
161
|
+
SELECT
|
|
162
|
+
layer,
|
|
163
|
+
COUNT(*) AS total_checks,
|
|
164
|
+
SUM(CASE WHEN status = 'PASS' THEN 1 ELSE 0 END) AS pass_cnt,
|
|
165
|
+
SUM(CASE WHEN status = 'WARN' THEN 1 ELSE 0 END) AS warn_cnt,
|
|
166
|
+
SUM(CASE WHEN status = 'FAIL' THEN 1 ELSE 0 END) AS fail_cnt,
|
|
167
|
+
ROUND(SUM(CASE WHEN status = 'PASS' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS pass_rate
|
|
168
|
+
FROM dqc.dqc_results
|
|
169
|
+
GROUP BY layer;
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
**View the dashboard:**
|
|
173
|
+
|
|
174
|
+
```sql
|
|
175
|
+
SELECT * FROM dqc.dqc_dashboard ORDER BY layer;
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## Validation Results
|
|
181
|
+
|
|
182
|
+
Results of running 10 DQC checks on the NHL Medallion architecture (Bronze: 10 tables → Silver: 4 DTs → Gold: 5 DTs):
|
|
183
|
+
|
|
184
|
+
| ID | Type | Layer | Metric | Expected | Actual | Status |
|
|
185
|
+
|---|---|---|---|---|---|---|
|
|
186
|
+
| DQC-001 | Row count | silver | dim_team_rows | 33 | 33 | PASS |
|
|
187
|
+
| DQC-002 | Row count | silver | dim_player_rows | 3925 | 3925 | PASS |
|
|
188
|
+
| DQC-003 | Null | silver | goals_null_pct | =0 | 0.00% | PASS |
|
|
189
|
+
| DQC-004 | Null | silver | player_name_null_pct | <1% | 0.00% | PASS |
|
|
190
|
+
| DQC-005 | Uniqueness | silver | player_id_unique | TRUE | TRUE | PASS |
|
|
191
|
+
| DQC-006 | Value range | silver | goals >= 0 | >=0 | 0 | PASS |
|
|
192
|
+
| DQC-007 | Value range | silver | points >= 0 | >=0 | 0 | PASS |
|
|
193
|
+
| DQC-008 | Freshness | bronze | max_season | >=2019 | 2020 | PASS |
|
|
194
|
+
| DQC-009 | Aggregation | gold | unique_seasons | >0 | 19 | PASS |
|
|
195
|
+
| DQC-010 | Aggregation | gold | wins >= 0 | >=0 | 0 | PASS |
|
|
196
|
+
|
|
197
|
+
**All PASS, 100% pass rate.**
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Integrating with the Data Pipeline
|
|
202
|
+
|
|
203
|
+
### Option 1: Manual trigger (suitable for development validation)
|
|
204
|
+
|
|
205
|
+
```sql
|
|
206
|
+
-- Run all DQC checks, then view results
|
|
207
|
+
SELECT check_id, status, metric, actual
|
|
208
|
+
FROM dqc.dqc_results
|
|
209
|
+
WHERE status != 'PASS'; -- show anomalies only
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### Option 2: Dynamic Table automated execution
|
|
213
|
+
|
|
214
|
+
Wrap the DQC check logic in a Dynamic Table so that it is re-evaluated automatically on the DT's refresh schedule (once a day in the example below):
|
|
215
|
+
|
|
216
|
+
```sql
|
|
217
|
+
-- DQC check DT: null rate monitoring
|
|
218
|
+
CREATE OR REPLACE DYNAMIC TABLE dqc.skater_null_monitor
|
|
219
|
+
REFRESH INTERVAL 1 DAY VCLUSTER DEFAULT
|
|
220
|
+
COMMENT 'Silver layer player stats null rate monitoring'
|
|
221
|
+
AS
|
|
222
|
+
SELECT
|
|
223
|
+
'DQC-003' AS check_id,
|
|
224
|
+
'null_rate' AS check_name,
|
|
225
|
+
'silver' AS layer,
|
|
226
|
+
'skater_goals_null_pct' AS metric,
|
|
227
|
+
'=0' AS expected,
|
|
228
|
+
CAST(ROUND(SUM(CASE WHEN goals IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS STRING) AS actual,
|
|
229
|
+
CASE WHEN SUM(CASE WHEN goals IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*) = 0
|
|
230
|
+
THEN 'PASS' ELSE 'WARN' END AS status
|
|
231
|
+
FROM silver.fact_skater_stats;
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
> **Note**: DQC results in DT mode are overwritten with each REFRESH. To retain historical records, INSERT results into the `dqc_results` table instead of using a DT.
|
|
235
|
+
|
|
236
|
+
### Option 3: Studio task scheduling
|
|
237
|
+
|
|
238
|
+
Create a DQC task in Studio with a Cron schedule and a dependency on the ETL task:
|
|
239
|
+
|
|
240
|
+
```
|
|
241
|
+
00_sync (Cron 02:00)
|
|
242
|
+
↓
|
|
243
|
+
04_etl (Cron 02:30, depends on 00)
|
|
244
|
+
↓
|
|
245
|
+
05_dqc (Cron 03:00, depends on 04) ← DQC runs after ETL completes
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
You can configure Studio alert rules to send a notification whenever a DQC check returns FAIL.
|
|
249
|
+
|
|
250
|
+
---
|
|
251
|
+
|
|
252
|
+
## DQC Checklist
|
|
253
|
+
|
|
254
|
+
| Layer | Must check after load | Recommended checks |
|
|
255
|
+
|---|---|---|
|
|
256
|
+
| **Bronze** | Row count >= source, latest data date | `_op` distribution (I/U/D), file count |
|
|
257
|
+
| **Silver** | Row count <= Bronze, critical field NULL < 1%, ID unique | LEFT JOIN match rate, value range, type conversion success rate |
|
|
258
|
+
| **Gold** | Aggregation results non-null, metrics >= 0 | Period-over-period change < 20%, TOP N results reasonable |
|
|
259
|
+
|
|
260
|
+
---
|
|
261
|
+
|
|
262
|
+
## Integrating with Alerts
|
|
263
|
+
|
|
264
|
+
```sql
|
|
265
|
+
-- Query all FAIL checks
|
|
266
|
+
SELECT * FROM dqc.dqc_results WHERE status = 'FAIL';
|
|
267
|
+
|
|
268
|
+
-- Summarize FAIL/WARN counts per layer for checks run in the last 24 hours (only layers with at least one FAIL)
|
|
269
|
+
SELECT layer,
|
|
270
|
+
SUM(CASE WHEN status = 'FAIL' THEN 1 ELSE 0 END) AS fails,
|
|
271
|
+
SUM(CASE WHEN status = 'WARN' THEN 1 ELSE 0 END) AS warns
|
|
272
|
+
FROM dqc.dqc_results
|
|
273
|
+
WHERE checked_at > CURRENT_TIMESTAMP() - INTERVAL 1 DAY
|
|
274
|
+
GROUP BY layer
|
|
275
|
+
HAVING SUM(CASE WHEN status = 'FAIL' THEN 1 ELSE 0 END) > 0;
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
In Studio, you can configure an alert rule so that any FAIL in the DQC task results triggers a WeCom/DingTalk/Feishu notification.
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
282
|
+
## Notes
|
|
283
|
+
|
|
284
|
+
| Note | Description |
|
|
285
|
+
|---|---|
|
|
286
|
+
| **DQC results table should be a regular table** | Retaining historical records enables trend analysis; DTs overwrite history |
|
|
287
|
+
| **WARN does not block, FAIL should block** | WARN means "needs attention"; FAIL means "cannot be published" |
|
|
288
|
+
| **DQC checks have their own cost** | Each check is a full table scan; keep the number of checks manageable (3-5 per layer recommended) |
|
|
289
|
+
| **Thresholds need business calibration** | NULL tolerance varies across business domains; use historical data to establish a baseline first |
|
|
290
|
+
| **Mind the timezone in freshness checks** | `CURRENT_TIMESTAMP()` is UTC, which may differ from the business timezone |
|
|
291
|
+
|
|
292
|
+
---
|
|
293
|
+
|
|
294
|
+
## Related Documentation
|
|
295
|
+
|
|
296
|
+
- [Medallion Pure-SQL DT Architecture](lakehouse-medallion-sql-dt-guide.md) — Bronze → Silver → Gold three-layer modeling
|
|
297
|
+
- [Volume + Pipe Data Lake Acceleration](lakehouse-volume-pipe-acceleration-guide.md) — Data ingestion pipeline
|
|
298
|
+
- [AI-Enhanced Data Analysis](lakehouse-ai-sql-analysis.md) — Calling LLMs in SQL for intelligent analysis
|
|
299
|
+
- [Studio Task Scheduling](task_scheduling.md) — DQC task Cron configuration
|
|
300
|
+
- [Monitoring and Alerting](monitoring_and_alerting.md) — Configuring DQC anomaly notifications
|