@clickzetta/cz-cli-darwin-x64 0.5.16 → 0.5.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/lakehouse-doc-en/SKILL.md +6 -11
- package/bin/skills/lakehouse-doc-en/references/AIGateway.md +58 -13
- package/bin/skills/lakehouse-doc-en/references/Computation.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/DataSource_Amazon_DocumentDB.md +3 -1
- package/bin/skills/lakehouse-doc-en/references/Foreach.md +14 -14
- package/bin/skills/lakehouse-doc-en/references/JDBC-Driver.md +0 -1
- package/bin/skills/lakehouse-doc-en/references/LakehouseAI-overview.md +21 -8
- package/bin/skills/lakehouse-doc-en/references/LakehouseDataGPT-tour.md +4 -9
- package/bin/skills/lakehouse-doc-en/references/LakehouseStudio-tour.md +14 -19
- package/bin/skills/lakehouse-doc-en/references/Lakehouse_Zilliz_MakeDataReadyforBIandAI.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/Logstash.md +3 -3
- package/bin/skills/lakehouse-doc-en/references/Migrate_Spark_DataEngineeringBestPractices_Project_to_Lakehouse.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/Notebook.md +17 -17
- package/bin/skills/lakehouse-doc-en/references/RemoteFunction-as-udf.md +14 -14
- package/bin/skills/lakehouse-doc-en/references/SQL_External_Catalog_Guide.md +1 -9
- package/bin/skills/lakehouse-doc-en/references/SUMMARY.md +59 -29
- package/bin/skills/lakehouse-doc-en/references/WINDOWFUNCTION.md +99 -57
- package/bin/skills/lakehouse-doc-en/references/Zettapark_Data_Engineering_Demo.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/access-control-configuration.md +1 -8
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-2-5-1.0.md +16 -0
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-3-29-1.0.2.md +14 -0
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-3-8-1.0.1.md +16 -0
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-4-28-1.1.md +29 -0
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-5-12-1.1.1.md +18 -0
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-5-15-1.2.md +9 -0
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-5-21-1.3.md +9 -0
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-5-28-1.4.md +10 -0
- package/bin/skills/lakehouse-doc-en/references/aigw-2026-6-3-1.5.md +9 -0
- package/bin/skills/lakehouse-doc-en/references/alicloud-arn-externalid.md +0 -5
- package/bin/skills/lakehouse-doc-en/references/answer-accuracy-improve.md +120 -103
- package/bin/skills/lakehouse-doc-en/references/application-list.md +1 -3
- package/bin/skills/lakehouse-doc-en/references/approval-list.md +16 -17
- package/bin/skills/lakehouse-doc-en/references/batch-load-parquet-file-into-lakehouse.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/batch_sync.md +9 -9
- package/bin/skills/lakehouse-doc-en/references/batch_sync_Sop.md +2 -2
- package/bin/skills/lakehouse-doc-en/references/batchloadparquetfileintoLakehouse.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/bulkloadv1-python-sdk.md +3 -3
- package/bin/skills/lakehouse-doc-en/references/chart-auto-refresh-guide.md +12 -6
- package/bin/skills/lakehouse-doc-en/references/clickzetta-sample-data.md +3 -3
- package/bin/skills/lakehouse-doc-en/references/code_approval.md +1 -5
- package/bin/skills/lakehouse-doc-en/references/composite_task.md +31 -42
- package/bin/skills/lakehouse-doc-en/references/comprehensive_guide_to_ingesting_environment_and_data_generate.md +6 -9
- package/bin/skills/lakehouse-doc-en/references/comprehensive_guide_to_ingesting_javasdk_bulkload_realtime.md +4 -10
- package/bin/skills/lakehouse-doc-en/references/comprehensive_guide_to_ingesting_kafka_realtime_sync.md +1 -10
- package/bin/skills/lakehouse-doc-en/references/comprehensive_guide_to_ingesting_local_file_into_table_by_studio.md +0 -6
- package/bin/skills/lakehouse-doc-en/references/comprehensive_guide_to_ingesting_studio_batchload_public_network.md +0 -5
- package/bin/skills/lakehouse-doc-en/references/comprehensive_guide_to_ingesting_studio_python_node.md +2 -7
- package/bin/skills/lakehouse-doc-en/references/comprehensive_guide_to_ingesting_studio_realtime_cdc_public_network.md +13 -18
- package/bin/skills/lakehouse-doc-en/references/comprehensive_guide_to_ingesting_studio_sql_insert.md +0 -1
- package/bin/skills/lakehouse-doc-en/references/concepts.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/config-datasource.md +5 -7
- package/bin/skills/lakehouse-doc-en/references/connect-with-cli.md +116 -72
- package/bin/skills/lakehouse-doc-en/references/connect-with-cz-cli.md +151 -0
- package/bin/skills/lakehouse-doc-en/references/continue-job.md +9 -17
- package/bin/skills/lakehouse-doc-en/references/create-api-connection.md +315 -286
- package/bin/skills/lakehouse-doc-en/references/create-catalog-connection.md +1 -0
- package/bin/skills/lakehouse-doc-en/references/create-dynamic-table.md +4 -4
- package/bin/skills/lakehouse-doc-en/references/create-external-catalog.md +85 -22
- package/bin/skills/lakehouse-doc-en/references/create-table-ddl.md +45 -0
- package/bin/skills/lakehouse-doc-en/references/creating_alicloud_privatelinkendpoint.md +4 -6
- package/bin/skills/lakehouse-doc-en/references/creating_alicloud_privatelinkservice.md +4 -7
- package/bin/skills/lakehouse-doc-en/references/creating_tencentcloud_privatelinkendpoint.md +2 -7
- package/bin/skills/lakehouse-doc-en/references/creating_tencentcloud_privatelinkservice.md +1 -5
- package/bin/skills/lakehouse-doc-en/references/cz-cli-agent.md +15 -10
- package/bin/skills/lakehouse-doc-en/references/cz-cli-datasource.md +0 -8
- package/bin/skills/lakehouse-doc-en/references/cz-cli-sql.md +2 -45
- package/bin/skills/lakehouse-doc-en/references/cz-cli.md +53 -42
- package/bin/skills/lakehouse-doc-en/references/dashboard-version-management-guide.md +12 -4
- package/bin/skills/lakehouse-doc-en/references/data-integration-intro.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/data-integration.md +29 -27
- package/bin/skills/lakehouse-doc-en/references/data-load-summary.md +3 -3
- package/bin/skills/lakehouse-doc-en/references/data-quality.md +25 -25
- package/bin/skills/lakehouse-doc-en/references/data-sharing.md +31 -54
- package/bin/skills/lakehouse-doc-en/references/data-sources.md +45 -45
- package/bin/skills/lakehouse-doc-en/references/data_catalog.md +23 -25
- package/bin/skills/lakehouse-doc-en/references/data_privacy.md +5 -2
- package/bin/skills/lakehouse-doc-en/references/data_sharing_between_accounts_guide.md +0 -4
- package/bin/skills/lakehouse-doc-en/references/data_visualization.md +4 -15
- package/bin/skills/lakehouse-doc-en/references/dataagent.md +39 -7
- package/bin/skills/lakehouse-doc-en/references/databricks-delta-to-lakehouse-migration.md +168 -0
- package/bin/skills/lakehouse-doc-en/references/databricks-dlt-to-lakehouse-migration.md +331 -0
- package/bin/skills/lakehouse-doc-en/references/databricks-external-catalog-practice.md +367 -0
- package/bin/skills/lakehouse-doc-en/references/databricks-jobs-to-studio-migration.md +199 -0
- package/bin/skills/lakehouse-doc-en/references/databricks-notebook-to-studio-migration.md +350 -0
- package/bin/skills/lakehouse-doc-en/references/databricks-uc-governance-to-lakehouse-migration.md +327 -0
- package/bin/skills/lakehouse-doc-en/references/datagpt-model-config.md +34 -0
- package/bin/skills/lakehouse-doc-en/references/datagpt_data_source.md +50 -37
- package/bin/skills/lakehouse-doc-en/references/datagpt_introduction.md +55 -79
- package/bin/skills/lakehouse-doc-en/references/datagpt_quickstart.md +50 -64
- package/bin/skills/lakehouse-doc-en/references/datalake-acceleration.md +75 -2
- package/bin/skills/lakehouse-doc-en/references/dbt-databricks-to-clickzetta-migration.md +242 -0
- package/bin/skills/lakehouse-doc-en/references/dynamic-mask.md +30 -30
- package/bin/skills/lakehouse-doc-en/references/dynamic-table-bestpractice.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/dynamic-table-introduce.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/dynamic_table_summary.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/eco_integration/streamlit.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/eco_integration/superset.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/ecosystem-all.md +1 -3
- package/bin/skills/lakehouse-doc-en/references/ecosystem.md +145 -0
- package/bin/skills/lakehouse-doc-en/references/external-catalog-summary.md +33 -38
- package/bin/skills/lakehouse-doc-en/references/external-function-combo-practice.md +466 -0
- package/bin/skills/lakehouse-doc-en/references/f6fc6447ee.md +7 -9
- package/bin/skills/lakehouse-doc-en/references/federation-query.md +56 -6
- package/bin/skills/lakehouse-doc-en/references/finebi-mysql.md +2 -0
- package/bin/skills/lakehouse-doc-en/references/get-started-with-sample-data.md +10 -11
- package/bin/skills/lakehouse-doc-en/references/gitfolder.md +2 -3
- package/bin/skills/lakehouse-doc-en/references/grant-privileges.md +2 -0
- package/bin/skills/lakehouse-doc-en/references/iceberg-rest-catalog-databricks.md +166 -0
- package/bin/skills/lakehouse-doc-en/references/ide.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/if_else_task.md +59 -57
- package/bin/skills/lakehouse-doc-en/references/input_output.md +10 -7
- package/bin/skills/lakehouse-doc-en/references/jobprofile-bestpractices.md +60 -64
- package/bin/skills/lakehouse-doc-en/references/kafka-connection.md +0 -1
- package/bin/skills/lakehouse-doc-en/references/key-concepts.md +146 -117
- package/bin/skills/lakehouse-doc-en/references/lakehouse-ai-gateway-cz-cli.md +317 -0
- package/bin/skills/lakehouse-doc-en/references/lakehouse-ai-sql-analysis.md +345 -0
- package/bin/skills/lakehouse-doc-en/references/lakehouse-dqc-guide.md +300 -0
- package/bin/skills/lakehouse-doc-en/references/lakehouse-medallion-sql-dt-guide.md +543 -0
- package/bin/skills/lakehouse-doc-en/references/lakehouse-multi-cloud-acceleration.md +274 -0
- package/bin/skills/lakehouse-doc-en/references/lakehouse-multimodal-ai-pipeline.md +198 -0
- package/bin/skills/lakehouse-doc-en/references/lakehouse-quick-experience_guide.md +49 -52
- package/bin/skills/lakehouse-doc-en/references/lakehouse-volume-pipe-acceleration-guide.md +380 -0
- package/bin/skills/lakehouse-doc-en/references/langchain-plug-installation.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/management.md +4 -9
- package/bin/skills/lakehouse-doc-en/references/medallion-lakehouse-from-scratch.md +2 -1
- package/bin/skills/lakehouse-doc-en/references/metrics_answer_build.md +58 -21
- package/bin/skills/lakehouse-doc-en/references/migrate-spark-data-engineering-best-practices-to-lakehouse.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/mindsdb.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/monitoring_and_alerting.md +65 -60
- package/bin/skills/lakehouse-doc-en/references/monitoring_item_specification.md +33 -33
- package/bin/skills/lakehouse-doc-en/references/multitable_batch_sync.md +16 -16
- package/bin/skills/lakehouse-doc-en/references/multitable_realtime_sync.md +65 -72
- package/bin/skills/lakehouse-doc-en/references/multitable_realtime_sync_sop.md +54 -52
- package/bin/skills/lakehouse-doc-en/references/navicat-mysql.md +2 -0
- package/bin/skills/lakehouse-doc-en/references/om-dynamic-table.md +71 -66
- package/bin/skills/lakehouse-doc-en/references/om-vcluster.md +2 -0
- package/bin/skills/lakehouse-doc-en/references/open-api-create-session.md +79 -0
- package/bin/skills/lakehouse-doc-en/references/open-api-generate-auth-token.md +63 -0
- package/bin/skills/lakehouse-doc-en/references/open-api-overview.md +96 -0
- package/bin/skills/lakehouse-doc-en/references/open-api-quick-start.md +286 -0
- package/bin/skills/lakehouse-doc-en/references/open-api-response-guide.md +264 -0
- package/bin/skills/lakehouse-doc-en/references/open-api-safe-question-poll.md +201 -0
- package/bin/skills/lakehouse-doc-en/references/open-api-text2insight-query.md +99 -0
- package/bin/skills/lakehouse-doc-en/references/open-api-text2insight-stop.md +74 -0
- package/bin/skills/lakehouse-doc-en/references/overview.md +6 -7
- package/bin/skills/lakehouse-doc-en/references/permission-application.md +5 -5
- package/bin/skills/lakehouse-doc-en/references/pipe-introduction.md +1 -0
- package/bin/skills/lakehouse-doc-en/references/pipe-kafka-table-stream.md +72 -70
- package/bin/skills/lakehouse-doc-en/references/pipe-kafka.md +105 -110
- package/bin/skills/lakehouse-doc-en/references/pipe-overview.md +40 -40
- package/bin/skills/lakehouse-doc-en/references/pipe-storage-object.md +43 -48
- package/bin/skills/lakehouse-doc-en/references/pipe-summary.md +14 -4
- package/bin/skills/lakehouse-doc-en/references/pipe-syntax.md +58 -151
- package/bin/skills/lakehouse-doc-en/references/practice_python_task.md +4 -4
- package/bin/skills/lakehouse-doc-en/references/pricing-ai-gateway.md +181 -0
- package/bin/skills/lakehouse-doc-en/references/pricing-lakehouse.md +316 -0
- package/bin/skills/lakehouse-doc-en/references/pricing.md +44 -288
- package/bin/skills/lakehouse-doc-en/references/private-link-general.md +0 -2
- package/bin/skills/lakehouse-doc-en/references/pyspark-to-zettapark-migration-f1.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/python-igs.md +7 -3
- package/bin/skills/lakehouse-doc-en/references/python-sample-put-github-rt-events.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/python-task.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/python_reference/connector.md +3 -3
- package/bin/skills/lakehouse-doc-en/references/python_reference/connector_advanced.md +2 -2
- package/bin/skills/lakehouse-doc-en/references/python_reference/connector_examples.md +2 -2
- package/bin/skills/lakehouse-doc-en/references/python_sdk_guide.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/python_shell_datasource.md +11 -9
- package/bin/skills/lakehouse-doc-en/references/quick_start_batch_sync_data.md +9 -18
- package/bin/skills/lakehouse-doc-en/references/quick_start_bi_analysis.md +8 -25
- package/bin/skills/lakehouse-doc-en/references/quick_start_create_workspace.md +4 -6
- package/bin/skills/lakehouse-doc-en/references/quick_start_data_quality.md +8 -8
- package/bin/skills/lakehouse-doc-en/references/quick_start_etl.md +16 -20
- package/bin/skills/lakehouse-doc-en/references/quick_start_monitoring_and_alerting.md +10 -18
- package/bin/skills/lakehouse-doc-en/references/quick_start_sql_query.md +7 -10
- package/bin/skills/lakehouse-doc-en/references/quick_start_upload_data.md +5 -7
- package/bin/skills/lakehouse-doc-en/references/quick_start_user_management.md +8 -8
- package/bin/skills/lakehouse-doc-en/references/quick_start_workspace.md +0 -5
- package/bin/skills/lakehouse-doc-en/references/quick_start_workspace_user.md +8 -8
- package/bin/skills/lakehouse-doc-en/references/quickstart.md +69 -56
- package/bin/skills/lakehouse-doc-en/references/quickstart_datashare_between_companies.md +0 -5
- package/bin/skills/lakehouse-doc-en/references/quickstart_envirment_for_team.md +0 -24
- package/bin/skills/lakehouse-doc-en/references/realtime-pipeline-selection-guide.md +1 -2
- package/bin/skills/lakehouse-doc-en/references/realtime-sales-dashboard-with-dynamic-table.md +3 -3
- package/bin/skills/lakehouse-doc-en/references/realtime_sync.md +0 -1
- package/bin/skills/lakehouse-doc-en/references/release-note-2026-05-19.md +5 -3
- package/bin/skills/lakehouse-doc-en/references/revoke-privileges.md +3 -1
- package/bin/skills/lakehouse-doc-en/references/roles.md +2 -3
- package/bin/skills/lakehouse-doc-en/references/row-filter.md +165 -0
- package/bin/skills/lakehouse-doc-en/references/row_level_permission.md +30 -19
- package/bin/skills/lakehouse-doc-en/references/scheduled_task.md +28 -21
- package/bin/skills/lakehouse-doc-en/references/security_overview.md +99 -21
- package/bin/skills/lakehouse-doc-en/references/set-command.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/setup.md +13 -15
- package/bin/skills/lakehouse-doc-en/references/show-grants.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/snowflake-dynamic-tables-to-lakehouse.md +2 -2
- package/bin/skills/lakehouse-doc-en/references/spark-connector-summary.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/sql_functions/context_functions/current_vcluster.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/sso-configuration.md +2 -2
- package/bin/skills/lakehouse-doc-en/references/streaming_pipeline_with_dynamic_table.md +0 -1
- package/bin/skills/lakehouse-doc-en/references/studio-incremental-sync-practice.md +27 -23
- package/bin/skills/lakehouse-doc-en/references/studio-shell-task.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/supported-cloud-platforms.md +32 -0
- package/bin/skills/lakehouse-doc-en/references/table_rendering.md +18 -12
- package/bin/skills/lakehouse-doc-en/references/task-develop.md +89 -91
- package/bin/skills/lakehouse-doc-en/references/task_development.md +19 -17
- package/bin/skills/lakehouse-doc-en/references/task_group.md +16 -14
- package/bin/skills/lakehouse-doc-en/references/task_instance.md +21 -21
- package/bin/skills/lakehouse-doc-en/references/task_param.md +38 -35
- package/bin/skills/lakehouse-doc-en/references/task_param_reference.md +81 -79
- package/bin/skills/lakehouse-doc-en/references/task_scheduling_dependency.md +20 -21
- package/bin/skills/lakehouse-doc-en/references/tencentcloud_arn_and_externalid.md +1 -5
- package/bin/skills/lakehouse-doc-en/references/trial-account-quotas-and-limits.md +1 -3
- package/bin/skills/lakehouse-doc-en/references/tutorial_connect_to_lakehouse.md +69 -0
- package/bin/skills/lakehouse-doc-en/references/tutorials.md +4 -1
- package/bin/skills/lakehouse-doc-en/references/unique-key.md +167 -0
- package/bin/skills/lakehouse-doc-en/references/usageandbillingview.md +138 -0
- package/bin/skills/lakehouse-doc-en/references/use-dbt-dev.md +3 -3
- package/bin/skills/lakehouse-doc-en/references/use-java-sdk-realtime-uploaddata.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/use-java-sdk-upload-data-local.md +3 -3
- package/bin/skills/lakehouse-doc-en/references/use-models.md +128 -0
- package/bin/skills/lakehouse-doc-en/references/use-mysql-client.md +81 -81
- package/bin/skills/lakehouse-doc-en/references/use-python-sdk-upload-data.md +10 -12
- package/bin/skills/lakehouse-doc-en/references/user-identification.md +2 -3
- package/bin/skills/lakehouse-doc-en/references/user_permission_grand_guide.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/using-udf-in-dynamic-table.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/vc_cache.md +18 -22
- package/bin/skills/lakehouse-doc-en/references/vcluster_size_description.md +33 -31
- package/bin/skills/lakehouse-doc-en/references/virtual-cluster.md +43 -45
- package/bin/skills/lakehouse-doc-en/references/web-job-history.md +94 -108
- package/bin/skills/lakehouse-doc-en/references/web_search.md +16 -7
- package/bin/skills/lakehouse-doc-en/references/zettapark-data-engineering-demo.md +1 -1
- package/bin/skills/lakehouse-doc-en/references/zettapark-dataframe-guide.md +144 -70
- package/bin/skills/lakehouse-doc-en/references/zettapark-dynamic-table-guide.md +2 -2
- package/bin/skills/lakehouse-doc-en/references/zettapark-etl-guide.md +73 -33
- package/bin/skills/lakehouse-doc-en/references/zettapark-feature-engineering.md +2 -2
- package/bin/skills/lakehouse-doc-en/references/zettapark-functions-guide.md +75 -46
- package/bin/skills/lakehouse-doc-en/references/zettapark-quick-start.md +2 -2
- package/bin/skills/lakehouse-doc-en/references/zettapark-stream-guide.md +4 -4
- package/bin/skills/lakehouse-doc-en/references/zettapark-volume-guide.md +93 -29
- package/package.json +1 -1
- package/bin/skills/lakehouse-doc-en/references/CLAUDE.md +0 -606
- package/bin/skills/lakehouse-doc-en/references/modelprice.md +0 -155
|
@@ -0,0 +1,543 @@
|
|
|
1
|
+
# Singdata Lakehouse Medallion Architecture in Practice: Pure SQL Dynamic Table Approach
|
|
2
|
+
|
|
3
|
+
The Medallion architecture (Bronze → Silver → Gold) is a data lake organization pattern popularized by Databricks. On Singdata Lakehouse, beyond implementing it with the ZettaPark Python API, there is a cleaner alternative: **building all three layers declaratively using SQL Dynamic Tables**—no Python code required, no scheduling platform configuration needed, and all three layers refresh automatically and incrementally along the dependency chain.
|
|
4
|
+
|
|
5
|
+
This article uses the NHL (National Hockey League) real-world dataset (10 tables, ~14 million rows) to fully demonstrate this approach.
|
|
6
|
+
|
|
7
|
+
> 💡 If you are familiar with Databricks Medallion but prefer not to write Python/ZettaPark, or want to manage data pipelines with pure SQL, this article is your reference. It complements the [ZettaPark migration approach](medallion-lakehouse-from-scratch.md), with the two covering different technical preferences.
|
|
8
|
+
|
|
9
|
+
### Data Lake Acceleration Overview: Where This Article Fits
|
|
10
|
+
|
|
11
|
+
A typical data lake acceleration pipeline looks like this: **Object storage files → Volume (mount) → Pipe (continuous ingestion) → Target table → Dynamic Table (incremental aggregation)**. The Volume and Pipe stages handle "automatic data loading," while this article focuses on the final stage—cleansing, modeling, and aggregation after data is loaded, using Dynamic Tables to declaratively build the Bronze → Silver → Gold three-layer pipeline.
|
|
12
|
+
|
|
13
|
+
If you have not set up data ingestion yet, start with [Volume + Pipe End-to-End Practice](lakehouse-volume-pipe-acceleration-guide.md) to get file auto-loading working first. If your data is already in Lakehouse tables (like the NHL dataset in this article), start directly here.
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Why Use Dynamic Tables to Build Medallion
|
|
18
|
+
|
|
19
|
+
Traditional Medallion architecture typically relies on scheduling platforms (Airflow/Databricks Workflows) to execute Python Notebooks or SQL scripts sequentially. Dynamic Tables offer a different paradigm:
|
|
20
|
+
|
|
21
|
+
| Dimension | Traditional ETL Scheduling | Dynamic Table Approach |
|
|
22
|
+
|---|---|---|
|
|
23
|
+
| Coding style | Python/ZettaPark or SQL scripts | Pure SQL (`CREATE DYNAMIC TABLE ... AS SELECT`) |
|
|
24
|
+
| Scheduling config | Requires DAG and Cron configuration | Declarative `REFRESH INTERVAL`, system auto-schedules |
|
|
25
|
+
| Incremental computation | Manual incremental logic required | System CBO automatically detects incremental changes |
|
|
26
|
+
| Dependency management | Manual orchestration of upstream/downstream order | DT automatically determines refresh order by reference |
|
|
27
|
+
| Data lineage | Requires additional tools to track | `SHOW DYNAMIC TABLE REFRESH HISTORY` built-in |
|
|
28
|
+
| Code as assets | Notebooks/scripts scattered and hard to manage | Centralized in Studio, searchable, comparable, reusable |
|
|
29
|
+
|
|
30
|
+
The core difference: **you do not need to worry about "when to run" or "what to run"—you only need to declare "what result you want"**. The system handles computation orchestration, incremental detection, and parallel scheduling.
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## Dataset Overview
|
|
35
|
+
|
|
36
|
+
NHL hockey data from the `nhl_game_data` schema (Bronze layer, already loaded):
|
|
37
|
+
|
|
38
|
+
| Table | Rows | Description |
|
|
39
|
+
|---|---|---|
|
|
40
|
+
| `game` | 26,305 | Main game table (matchups, scores, venues, seasons) |
|
|
41
|
+
| `player_info` | 3,925 | Player profiles (name, nationality, position, height/weight) |
|
|
42
|
+
| `team_info` | 33 | Team information (name, abbreviation) |
|
|
43
|
+
| `game_skater_stats` | 945,830 | Skater stats (goals, assists, shots, hits, +/-, etc.) |
|
|
44
|
+
| `game_goalie_stats` | 56,656 | Goalie stats (saves, goals against, save percentage) |
|
|
45
|
+
| `game_goals` | 148,992 | Goal details |
|
|
46
|
+
| `game_plays` | 5,050,529 | Game events (play-by-play) |
|
|
47
|
+
| `game_plays_players` | 7,586,604 | Player participation details per event |
|
|
48
|
+
| `game_penalties` | 247,828 | Penalty records |
|
|
49
|
+
| `game_teams_stats` | 52,610 | Team game-level statistics |
|
|
50
|
+
|
|
51
|
+
Data relationships: `game` is the core fact table, linked to other tables via `game_id`, `player_id`, and `team_id`. The dataset covers 10 seasons from 2010 to 2020.
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Architecture Design
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
Bronze (nhl_game_data.*) Silver (silver.*) DT Gold (gold.*) DT
|
|
59
|
+
═══════════════════════ ══════════════════ ══════════════════
|
|
60
|
+
Raw data, zero transformation Cleansed + dimension joins Business metrics
|
|
61
|
+
|
|
62
|
+
game ─────────┐ ┌─ dim_team (33) ┌─ scoring_leaders
|
|
63
|
+
team_info ────┤ LEFT JOIN ──→├─ dim_player (3,925) ├─ player_career_stats
|
|
64
|
+
player_info ──┘ ├─ fact_skater_stats ├─ team_season_summary
|
|
65
|
+
skater_stats ── LEFT JOIN ──→ └─ fact_goalie_stats ├─ goalie_season_rankings
|
|
66
|
+
goalie_stats ── LEFT JOIN ──→ └─ team_home_away_split
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Three-layer responsibilities:
|
|
70
|
+
|
|
71
|
+
| Layer | Schema | Table Type | Responsibility |
|
|
72
|
+
|---|---|---|---|
|
|
73
|
+
| **Bronze** | `nhl_game_data` | Regular table | Raw data, no transformation |
|
|
74
|
+
| **Silver** | `silver` | Dynamic Table | JOIN dimension tables for names, cleanse field types (STRING→INT), standardize |
|
|
75
|
+
| **Gold** | `gold` | Dynamic Table | Aggregated metrics: top scorers, team records, goalie rankings, career stats |
|
|
76
|
+
|
|
77
|
+
> ⚠️ Silver and Gold both use Dynamic Tables; **materialized views are not recommended**. DT supports incremental refresh and Time Travel; materialized views do not.
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Implementation Steps
|
|
82
|
+
|
|
83
|
+
### Prerequisites
|
|
84
|
+
|
|
85
|
+
- Virtual Cluster available (use `DEFAULT`, GP type, Serverless on-demand wake-up)
|
|
86
|
+
- Bronze data loaded (`nhl_game_data.*` 10 tables)
|
|
87
|
+
- Permissions for CREATE SCHEMA / CREATE DYNAMIC TABLE
|
|
88
|
+
|
|
89
|
+
### Step 1: Create Schemas
|
|
90
|
+
|
|
91
|
+
Use separate schemas to physically isolate each layer:
|
|
92
|
+
|
|
93
|
+
```sql
|
|
94
|
+
CREATE SCHEMA IF NOT EXISTS silver COMMENT 'Medallion Silver cleansed layer';
|
|
95
|
+
CREATE SCHEMA IF NOT EXISTS gold COMMENT 'Medallion Gold aggregated metrics layer';
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Step 2: Silver Layer — Dimension Tables
|
|
99
|
+
|
|
100
|
+
The simplest DT: directly filter/transform columns from Bronze tables. These two tables are small (33 rows and 3,925 rows), so even FULL refreshes are effortless.
|
|
101
|
+
|
|
102
|
+
```sql
|
|
103
|
+
-- Team dimension
|
|
104
|
+
CREATE OR REPLACE DYNAMIC TABLE silver.dim_team
|
|
105
|
+
REFRESH INTERVAL 1 DAY vcluster DEFAULT
|
|
106
|
+
COMMENT 'Silver team dimension table'
|
|
107
|
+
AS
|
|
108
|
+
SELECT
|
|
109
|
+
team_id,
|
|
110
|
+
franchiseid,
|
|
111
|
+
shortname,
|
|
112
|
+
teamname,
|
|
113
|
+
abbreviation,
|
|
114
|
+
link
|
|
115
|
+
FROM nhl_game_data.team_info;
|
|
116
|
+
|
|
117
|
+
-- Player dimension (standardized + full name column added)
|
|
118
|
+
CREATE OR REPLACE DYNAMIC TABLE silver.dim_player
|
|
119
|
+
REFRESH INTERVAL 1 DAY vcluster DEFAULT
|
|
120
|
+
COMMENT 'Silver player dimension table — standardized fields + full name'
|
|
121
|
+
AS
|
|
122
|
+
SELECT
|
|
123
|
+
player_id,
|
|
124
|
+
firstname,
|
|
125
|
+
lastname,
|
|
126
|
+
CONCAT(firstname, ' ', lastname) AS full_name,
|
|
127
|
+
nationality,
|
|
128
|
+
birthcity,
|
|
129
|
+
primaryposition AS position,
|
|
130
|
+
birthdate,
|
|
131
|
+
height,
|
|
132
|
+
height_cm,
|
|
133
|
+
CAST(NULLIF(REGEXP_REPLACE(weight, ',', ''), '') AS INT) AS weight_kg,
|
|
134
|
+
shootscatches
|
|
135
|
+
FROM nhl_game_data.player_info;
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
> **Why use `REGEXP_REPLACE(weight, ',', '')`?** In NHL raw data, numeric fields (such as hits, weight) may contain thousands separators (e.g., "1,234"). A direct CAST would throw an error, so removing the comma before casting to INT is a necessary cleansing step. The surrounding `NULLIF(..., '')` converts empty strings to NULL before the cast.
|
|
139
|
+
|
|
140
|
+
### Step 3: Silver Layer — Fact Tables
|
|
141
|
+
|
|
142
|
+
The core work of fact tables: **JOIN dimension tables to resolve names + type standardization**. Using skater stats as an example:
|
|
143
|
+
|
|
144
|
+
```sql
|
|
145
|
+
CREATE OR REPLACE DYNAMIC TABLE silver.fact_skater_stats
|
|
146
|
+
REFRESH INTERVAL 1 DAY vcluster DEFAULT
|
|
147
|
+
COMMENT 'Silver skater stats fact table — joined with player name + team name + season'
|
|
148
|
+
AS
|
|
149
|
+
SELECT
|
|
150
|
+
s.game_id,
|
|
151
|
+
s.player_id,
|
|
152
|
+
p.full_name AS player_name,
|
|
153
|
+
p.position,
|
|
154
|
+
s.team_id,
|
|
155
|
+
t.teamname AS team_name,
|
|
156
|
+
t.abbreviation AS team_abbr,
|
|
157
|
+
g.season,
|
|
158
|
+
g.date_time_gmt AS game_date,
|
|
159
|
+
s.timeonice,
|
|
160
|
+
s.goals,
|
|
161
|
+
s.assists,
|
|
162
|
+
s.goals + s.assists AS points, -- computed field: points
|
|
163
|
+
s.shots,
|
|
164
|
+
CAST(NULLIF(REGEXP_REPLACE(s.hits, ',', ''), '') AS INT) AS hits,
|
|
165
|
+
s.powerplaygoals,
|
|
166
|
+
s.penaltyminutes,
|
|
167
|
+
s.plusminus,
|
|
168
|
+
s.eventimeonice,
|
|
169
|
+
s.powerplaytimeonice
|
|
170
|
+
FROM nhl_game_data.game_skater_stats s
|
|
171
|
+
LEFT JOIN nhl_game_data.game g
|
|
172
|
+
ON s.game_id = g.game_id
|
|
173
|
+
LEFT JOIN silver.dim_player p
|
|
174
|
+
ON s.player_id = p.player_id
|
|
175
|
+
LEFT JOIN silver.dim_team t
|
|
176
|
+
ON s.team_id = t.team_id;
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
> ⚠️ **The Silver fact table references Silver dimension tables** (`silver.dim_player`, `silver.dim_team`). This means the system refreshes dimension tables first, then fact tables—DT handles the dependency chain automatically.
|
|
180
|
+
|
|
181
|
+
The goalie stats fact table follows the same pattern, with an additional save percentage calculation:
|
|
182
|
+
|
|
183
|
+
```sql
|
|
184
|
+
CREATE OR REPLACE DYNAMIC TABLE silver.fact_goalie_stats
|
|
185
|
+
REFRESH INTERVAL 1 DAY vcluster DEFAULT
|
|
186
|
+
COMMENT 'Silver goalie stats fact table — includes save percentage calculation'
|
|
187
|
+
AS
|
|
188
|
+
SELECT
|
|
189
|
+
gs.game_id,
|
|
190
|
+
gs.player_id,
|
|
191
|
+
p.full_name AS player_name,
|
|
192
|
+
t.teamname AS team_name,
|
|
193
|
+
t.abbreviation AS team_abbr,
|
|
194
|
+
g.season,
|
|
195
|
+
g.date_time_gmt AS game_date,
|
|
196
|
+
gs.timeonice,
|
|
197
|
+
gs.shots AS shots_faced,
|
|
198
|
+
gs.saves,
|
|
199
|
+
CASE WHEN gs.shots > 0
|
|
200
|
+
THEN ROUND(gs.saves * 1.0 / gs.shots, 3)
|
|
201
|
+
ELSE NULL
|
|
202
|
+
END AS save_pct, -- computed field: save percentage
|
|
203
|
+
gs.decision
|
|
204
|
+
FROM nhl_game_data.game_goalie_stats gs
|
|
205
|
+
LEFT JOIN nhl_game_data.game g
|
|
206
|
+
ON gs.game_id = g.game_id
|
|
207
|
+
LEFT JOIN silver.dim_player p
|
|
208
|
+
ON gs.player_id = p.player_id
|
|
209
|
+
LEFT JOIN silver.dim_team t
|
|
210
|
+
ON gs.team_id = t.team_id;
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
### Step 4: Initial Refresh of Silver Layer
|
|
214
|
+
|
|
215
|
+
After DT creation, only the computation logic is defined—there is no data yet. You need to manually trigger the first refresh:
|
|
216
|
+
|
|
217
|
+
```sql
|
|
218
|
+
REFRESH DYNAMIC TABLE silver.dim_team;
|
|
219
|
+
REFRESH DYNAMIC TABLE silver.dim_player;
|
|
220
|
+
REFRESH DYNAMIC TABLE silver.fact_skater_stats;
|
|
221
|
+
REFRESH DYNAMIC TABLE silver.fact_goalie_stats;
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
> 💡 Refresh dimension tables first, then fact tables—since fact tables reference dimension tables. Although order does not matter when executing manually (the system waits for dependencies to be ready), following the dependency order is recommended for faster initial completion.
|
|
225
|
+
|
|
226
|
+
### Step 5: Gold Layer — Aggregated Metrics
|
|
227
|
+
|
|
228
|
+
The Gold layer reads data from the Silver layer and uses aggregate functions to generate business metrics. All tables use a `1 DAY` refresh interval (T+1 scenario).
|
|
229
|
+
|
|
230
|
+
#### Top Scorers: TOP 20 Scorers Per Season
|
|
231
|
+
|
|
232
|
+
Use the `RANK()` window function to rank players by total points within each season:
|
|
233
|
+
|
|
234
|
+
```sql
|
|
235
|
+
CREATE OR REPLACE DYNAMIC TABLE gold.scoring_leaders
|
|
236
|
+
REFRESH INTERVAL 1 DAY vcluster DEFAULT
|
|
237
|
+
COMMENT 'Gold top 20 scorers per season — ranked by points (goals + assists)'
|
|
238
|
+
AS
|
|
239
|
+
SELECT season, rank, player_id, player_name, position, team_abbr,
|
|
240
|
+
games_played, goals, assists, points,
|
|
241
|
+
ROUND(points * 1.0 / games_played, 2) AS pts_per_game
|
|
242
|
+
FROM (
|
|
243
|
+
SELECT
|
|
244
|
+
season, player_id, player_name, position, team_abbr,
|
|
245
|
+
COUNT(*) AS games_played,
|
|
246
|
+
SUM(goals) AS goals,
|
|
247
|
+
SUM(assists) AS assists,
|
|
248
|
+
SUM(points) AS points,
|
|
249
|
+
RANK() OVER (PARTITION BY season ORDER BY SUM(points) DESC) AS rank
|
|
250
|
+
FROM silver.fact_skater_stats
|
|
251
|
+
GROUP BY season, player_id, player_name, position, team_abbr
|
|
252
|
+
) t
|
|
253
|
+
WHERE rank <= 20;
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
**Validation results** (2019-20 season):
|
|
257
|
+
|
|
258
|
+
| rank | player | team | goals | assists | points |
|
|
259
|
+
|---|---|---|---|---|---|
|
|
260
|
+
| 1 | Nikita Kucherov | TBL | 160 | 316 | 476 |
|
|
261
|
+
| 2 | Nathan MacKinnon | COL | 176 | 296 | 472 |
|
|
262
|
+
| 3 | Leon Draisaitl | EDM | 181 | 274 | 455 |
|
|
263
|
+
| 4 | David Pastrnak | BOS | 204 | 216 | 420 |
|
|
264
|
+
| 5 | Connor McDavid | EDM | 153 | 262 | 415 |
|
|
265
|
+
|
|
266
|
+
> ✅ Rankings match NHL official records, data accuracy validation passed.
|
|
267
|
+
|
|
268
|
+
#### Team Season Records
|
|
269
|
+
|
|
270
|
+
The Bronze `game` table stores one row per game that carries both the home team and the away team. Each game needs to be expanded into two rows (one for the home team, one for the away team) and then aggregated by team and season. This is implemented with `UNION ALL` + `CASE WHEN`:
|
|
271
|
+
|
|
272
|
+
```sql
|
|
273
|
+
CREATE OR REPLACE DYNAMIC TABLE gold.team_season_summary
|
|
274
|
+
REFRESH INTERVAL 1 DAY vcluster DEFAULT
|
|
275
|
+
COMMENT 'Gold team season records — wins/losses/goals/goals-against/points'
|
|
276
|
+
AS
|
|
277
|
+
SELECT
|
|
278
|
+
g.season, g.team_id,
|
|
279
|
+
t.teamname AS team_name,
|
|
280
|
+
t.abbreviation AS team_abbr,
|
|
281
|
+
COUNT(*) AS games_played,
|
|
282
|
+
SUM(CASE WHEN g.side = 'home' AND g.outcome LIKE 'home win%' THEN 1
|
|
283
|
+
WHEN g.side = 'away' AND g.outcome LIKE 'away win%' THEN 1
|
|
284
|
+
ELSE 0 END) AS wins,
|
|
285
|
+
SUM(CASE WHEN g.side = 'home' AND g.outcome LIKE 'away win%' THEN 1
|
|
286
|
+
WHEN g.side = 'away' AND g.outcome LIKE 'home win%' THEN 1
|
|
287
|
+
ELSE 0 END) AS losses,
|
|
288
|
+
SUM(CASE WHEN g.side = 'home' THEN g.home_goals
|
|
289
|
+
ELSE g.away_goals END) AS goals_for,
|
|
290
|
+
SUM(CASE WHEN g.side = 'home' THEN g.away_goals
|
|
291
|
+
ELSE g.home_goals END) AS goals_against,
|
|
292
|
+
SUM(CASE WHEN g.side = 'home' AND g.outcome LIKE 'home win%' THEN 2
|
|
293
|
+
WHEN g.side = 'away' AND g.outcome LIKE 'away win%' THEN 2
|
|
294
|
+
ELSE 0 END) AS points
|
|
295
|
+
FROM (
|
|
296
|
+
SELECT season, home_team_id AS team_id, outcome,
|
|
297
|
+
home_goals, away_goals, 'home' AS side
|
|
298
|
+
FROM nhl_game_data.game
|
|
299
|
+
UNION ALL
|
|
300
|
+
SELECT season, away_team_id AS team_id, outcome,
|
|
301
|
+
home_goals, away_goals, 'away' AS side
|
|
302
|
+
FROM nhl_game_data.game
|
|
303
|
+
) g
|
|
304
|
+
LEFT JOIN silver.dim_team t ON g.team_id = t.team_id
|
|
305
|
+
GROUP BY g.season, g.team_id, t.teamname, t.abbreviation;
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
> ⚠️ **Note**: An early version used `outcome LIKE '%win%'` to match wins, but this caused the away team row to also be counted as a win when the home team won. You must cross-match `side` and `outcome`: home team rows only match `'home win%'`, and away team rows only match `'away win%'`.
|
|
309
|
+
|
|
310
|
+
**Validation results** (2019-20 season TOP 5):
|
|
311
|
+
|
|
312
|
+
| team | games | wins | losses | points |
|
|
313
|
+
|---|---|---|---|---|
|
|
314
|
+
| Lightning (TBL) | 190 | 122 | 68 | 244 |
|
|
315
|
+
| Stars (DAL) | 192 | 104 | 88 | 208 |
|
|
316
|
+
| Golden Knights (VGK) | 182 | 102 | 80 | 204 |
|
|
317
|
+
| Avalanche (COL) | 170 | 102 | 68 | 204 |
|
|
318
|
+
| Flyers (PHI) | 170 | 102 | 68 | 204 |
|
|
319
|
+
|
|
320
|
+
#### Goalie Season Rankings + Player Career Stats + Home/Away Split
|
|
321
|
+
|
|
322
|
+
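The full DDL for these three tables is in the appendix. The goalie rankings follow the same pattern as the top scorers: aggregate from the Silver layer → `RANK() OVER (PARTITION BY season ...)` → filter TOP N. The player career stats table is a plain aggregation over `silver.fact_skater_stats`, and the home/away split reuses the `UNION ALL` expansion shown above.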
|
|
323
|
+
|
|
324
|
+
### Step 6: Validate the Full Pipeline
|
|
325
|
+
|
|
326
|
+
```sql
|
|
327
|
+
-- Row count comparison across layers
|
|
328
|
+
SELECT 'Bronze game' AS layer, COUNT(*) FROM nhl_game_data.game
|
|
329
|
+
UNION ALL SELECT 'Silver dim_team', COUNT(*) FROM silver.dim_team
|
|
330
|
+
UNION ALL SELECT 'Silver fact_skater', COUNT(*) FROM silver.fact_skater_stats
|
|
331
|
+
UNION ALL SELECT 'Gold scoring_leaders', COUNT(*) FROM gold.scoring_leaders
|
|
332
|
+
UNION ALL SELECT 'Gold team_season', COUNT(*) FROM gold.team_season_summary;
|
|
333
|
+
|
|
334
|
+
-- View DT refresh history
|
|
335
|
+
SHOW DYNAMIC TABLE REFRESH HISTORY WHERE name = 'scoring_leaders';
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
**Complete validation results:**
|
|
339
|
+
|
|
340
|
+
| Layer | Table | Rows | Refresh Mode | Status |
|
|
341
|
+
|---|---|---|---|---|
|
|
342
|
+
| Silver | dim_team | 33 | FULL | ✅ Matches Bronze |
|
|
343
|
+
| Silver | dim_player | 3,925 | FULL | ✅ Matches Bronze |
|
|
344
|
+
| Silver | fact_skater_stats | 1,130,682 | FULL | ✅ Includes player_name/team_name/points |
|
|
345
|
+
| Silver | fact_goalie_stats | 67,642 | FULL | ✅ Includes computed save_pct |
|
|
346
|
+
| Gold | scoring_leaders | 399 | FULL | ✅ TOP 20 per season |
|
|
347
|
+
| Gold | player_career_stats | 3,353 | FULL | ✅ Career summary |
|
|
348
|
+
| Gold | team_season_summary | 580 | FULL | ✅ One row per team per season (at most 33 teams × 18 seasons) |
|
|
349
|
+
| Gold | goalie_season_rankings | 294 | FULL | ✅ TOP 15 per season |
|
|
350
|
+
| Gold | team_home_away_split | 580 | FULL | ✅ Home/away split |
|
|
351
|
+
|
|
352
|
+
> 💡 **Why all FULL?** On the first refresh there is no incremental baseline, so DT must perform a full scan of the source tables to establish the initial state. After the Bronze layer receives new data, DT automatically switches to INCREMENTAL mode and processes only the changed parts. Source tables must have `change_tracking` enabled to support incremental refresh (`ALTER TABLE table_name SET PROPERTIES ('change_tracking' = 'true')`).
|
|
353
|
+
|
|
354
|
+
---
|
|
355
|
+
|
|
356
|
+
## Design Principles
|
|
357
|
+
|
|
358
|
+
### 1. Cross-Layer Reference Rules
|
|
359
|
+
|
|
360
|
+
| Reference Direction | Allowed | Example |
|
|
361
|
+
|---|---|---|
|
|
362
|
+
| Silver → Bronze | ✅ | `FROM nhl_game_data.game` |
|
|
363
|
+
| Gold → Silver | ✅ | `FROM silver.fact_skater_stats` |
|
|
364
|
+
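| Gold → Bronze | ⚠️ Not recommended | Should access indirectly through Silver layer. Note that `gold.team_season_summary` and `gold.team_home_away_split` in this article read `nhl_game_data.game` directly; add a Silver game table to follow this rule strictly |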
|
|
365
|
+
| Gold → Gold | ⚠️ Use with caution | Only for multi-level aggregation |
|
|
366
|
+
| Bronze → Silver | ❌ Forbidden | Lower layers should not depend on upper layers |
|
|
367
|
+
|
|
368
|
+
### 2. LEFT JOIN: Filters on the Right-Hand Table Must Go in the ON Clause
|
|
369
|
+
|
|
370
|
+
```sql
|
|
371
|
+
-- ❌ Wrong: WHERE filter degrades LEFT JOIN to INNER JOIN
|
|
372
|
+
SELECT * FROM skater_stats s
|
|
373
|
+
LEFT JOIN team_info t ON s.team_id = t.team_id
|
|
374
|
+
WHERE t.abbreviation = 'TBL';
|
|
375
|
+
|
|
376
|
+
-- ✅ Correct: filter condition in ON clause
|
|
377
|
+
SELECT * FROM skater_stats s
|
|
378
|
+
LEFT JOIN team_info t
|
|
379
|
+
ON s.team_id = t.team_id AND t.abbreviation = 'TBL';
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
### 3. First Refresh Baseline Time
|
|
383
|
+
|
|
384
|
+
`REFRESH INTERVAL 1 DAY` calculates the next trigger time from the creation time and does not align to clock hours. Run `REFRESH DYNAMIC TABLE` immediately after creation to manually trigger the first refresh and reset the baseline time:
|
|
385
|
+
|
|
386
|
+
```sql
|
|
387
|
+
CREATE DYNAMIC TABLE gold.scoring_leaders ...;
|
|
388
|
+
REFRESH DYNAMIC TABLE gold.scoring_leaders;
|
|
389
|
+
```
|
|
390
|
+
|
|
391
|
+
### 4. String Cleansing
|
|
392
|
+
|
|
393
|
+
When raw data comes from external systems, numeric fields may contain non-standard characters:
|
|
394
|
+
|
|
395
|
+
```sql
|
|
396
|
+
CAST(NULLIF(REGEXP_REPLACE(hits, ',', ''), '') AS INT)
|
|
397
|
+
```
|
|
398
|
+
|
|
399
|
+
Three-step cleansing: remove commas → turn empty strings into NULL with `NULLIF` → `CAST` to the target type. `NULLIF` prevents CAST failures caused by empty strings.
|
|
400
|
+
|
|
401
|
+
---
|
|
402
|
+
|
|
403
|
+
## Cost Analysis
|
|
404
|
+
|
|
405
|
+
| Layer | DT Count | Refresh Frequency | Estimated CRU |
|
|
406
|
+
|---|---|---|---|
|
|
407
|
+
| Silver | 4 | 1 DAY | Low (full refresh, but small data volume) |
|
|
408
|
+
| Gold | 5 | 1 DAY | Medium (involves aggregation, ~14M row scan) |
|
|
409
|
+
|
|
410
|
+
All DTs run on a GP-type Virtual Cluster (`DEFAULT`) with serverless, on-demand billing. In T+1 scenarios with only one refresh per day, this costs less than traditional hourly ETL.
|
|
411
|
+
|
|
412
|
+
> 💡 To reduce Gold layer costs, infrequently used metrics (such as `goalie_season_rankings` and `team_home_away_split`) can be switched to a `7 DAY` refresh interval.
|
|
413
|
+
|
|
414
|
+
---
|
|
415
|
+
|
|
416
|
+
## Comparison with ZettaPark Approach
|
|
417
|
+
|
|
418
|
+
| | ZettaPark Approach | Pure SQL DT Approach (this article) |
|
|
419
|
+
|---|---|---|
|
|
420
|
+
| Target audience | Python developers, Data Scientists | SQL developers, Data Analysts |
|
|
421
|
+
| Code volume | Python scripts + Spark API | Pure SQL (DDL) |
|
|
422
|
+
| Scheduling | Requires external scheduling (Studio/Notebook) | DT auto-refresh, no scheduling needed |
|
|
423
|
+
| Incremental computation | Manual CDC management required | System handles automatically |
|
|
424
|
+
| Flexibility | High (Python can call any library) | Medium (within SQL expression capabilities) |
|
|
425
|
+
| Learning curve | Pandas/PySpark/ZettaPark | Pure SQL |
|
|
426
|
+
| Use cases | Complex transformations, ML feature engineering, external API calls | Standard ETL, aggregation, JOINs, window functions |
|
|
427
|
+
|
|
428
|
+
**Both approaches coexist without conflict**: use ZettaPark for complex cleansing, use DT for aggregated metrics, leveraging the strengths of each within the same Medallion architecture.
|
|
429
|
+
|
|
430
|
+
---
|
|
431
|
+
|
|
432
|
+
## Notes
|
|
433
|
+
|
|
434
|
+
| Note | Description |
|
|
435
|
+
|---|---|
|
|
436
|
+
| Bronze data changes trigger DT automatically | All 9 DTs in the pipeline refresh in dependency order, no manual trigger needed |
|
|
437
|
+
| DT does not support ALTER to modify SQL | Use `CREATE OR REPLACE` to rebuild |
|
|
438
|
+
| Virtual Cluster must be GP type | AP type does not support small file merging, queries slow down over time |
|
|
439
|
+
| Silver fact tables reference Silver dimension tables | System automatically ensures dimension tables refresh first |
|
|
440
|
+
| String numeric fields need cleansing | Remove commas → NULLIF → CAST, three steps |
|
|
441
|
+
| UNION ALL row expansion requires careful business logic | When splitting home/away teams, win/loss determination must cross-match side and outcome |
|
|
442
|
+
| Manual REFRESH required after initial creation | `REFRESH INTERVAL` does not immediately trigger the first computation |
|
|
443
|
+
|
|
444
|
+
---
|
|
445
|
+
|
|
446
|
+
## Appendix: Complete Gold Layer DDL
|
|
447
|
+
|
|
448
|
+
### Player Career Stats
|
|
449
|
+
|
|
450
|
+
```sql
|
|
451
|
+
CREATE OR REPLACE DYNAMIC TABLE gold.player_career_stats
|
|
452
|
+
REFRESH INTERVAL 1 DAY vcluster DEFAULT
|
|
453
|
+
COMMENT 'Gold player career overview — all-season totals + per-game efficiency'
|
|
454
|
+
AS
|
|
455
|
+
SELECT
|
|
456
|
+
player_id, player_name, position,
|
|
457
|
+
COUNT(*) AS games_played,
|
|
458
|
+
SUM(goals) AS total_goals,
|
|
459
|
+
SUM(assists) AS total_assists,
|
|
460
|
+
SUM(points) AS total_points,
|
|
461
|
+
ROUND(SUM(points) * 1.0 / COUNT(*), 2) AS pts_per_game,
|
|
462
|
+
ROUND(SUM(goals) * 1.0 / NULLIF(SUM(shots), 0), 3) AS shooting_pct,
|
|
463
|
+
AVG(timeonice) AS avg_timeonice_sec,
|
|
464
|
+
SUM(penaltyminutes) AS total_pim,
|
|
465
|
+
AVG(plusminus) AS avg_plusminus
|
|
466
|
+
FROM silver.fact_skater_stats
|
|
467
|
+
GROUP BY player_id, player_name, position;
|
|
468
|
+
```
|
|
469
|
+
|
|
470
|
+
### Goalie Season Rankings
|
|
471
|
+
|
|
472
|
+
```sql
|
|
473
|
+
CREATE OR REPLACE DYNAMIC TABLE gold.goalie_season_rankings
|
|
474
|
+
REFRESH INTERVAL 1 DAY vcluster DEFAULT
|
|
475
|
+
COMMENT 'Gold goalie season rankings TOP 15 — ranked by wins'
|
|
476
|
+
AS
|
|
477
|
+
SELECT season, rank, player_id, player_name, team_abbr,
|
|
478
|
+
games_played, wins, saves, shots_faced,
|
|
479
|
+
ROUND(save_pct, 3) AS save_pct
|
|
480
|
+
FROM (
|
|
481
|
+
SELECT
|
|
482
|
+
season, player_id, player_name, team_abbr,
|
|
483
|
+
COUNT(*) AS games_played,
|
|
484
|
+
SUM(CASE WHEN decision = 'W' THEN 1 ELSE 0 END) AS wins,
|
|
485
|
+
SUM(saves) AS saves,
|
|
486
|
+
SUM(shots_faced) AS shots_faced,
|
|
487
|
+
CASE WHEN SUM(shots_faced) > 0
|
|
488
|
+
THEN SUM(saves) * 1.0 / SUM(shots_faced)
|
|
489
|
+
ELSE NULL END AS save_pct,
|
|
490
|
+
RANK() OVER (PARTITION BY season ORDER BY
|
|
491
|
+
SUM(CASE WHEN decision = 'W' THEN 1 ELSE 0 END) DESC) AS rank
|
|
492
|
+
FROM silver.fact_goalie_stats
|
|
493
|
+
GROUP BY season, player_id, player_name, team_abbr
|
|
494
|
+
) t
|
|
495
|
+
WHERE rank <= 15;
|
|
496
|
+
```
|
|
497
|
+
|
|
498
|
+
### Home/Away Split
|
|
499
|
+
|
|
500
|
+
```sql
|
|
501
|
+
CREATE OR REPLACE DYNAMIC TABLE gold.team_home_away_split
|
|
502
|
+
REFRESH INTERVAL 1 DAY vcluster DEFAULT
|
|
503
|
+
COMMENT 'Gold team home vs. away performance — home win% vs away win%'
|
|
504
|
+
AS
|
|
505
|
+
SELECT
|
|
506
|
+
g.season, g.team_id,
|
|
507
|
+
t.teamname AS team_name,
|
|
508
|
+
t.abbreviation AS team_abbr,
|
|
509
|
+
COUNT(CASE WHEN g.side = 'home' THEN 1 END) AS home_games,
|
|
510
|
+
COUNT(CASE WHEN g.side = 'home' AND g.outcome LIKE 'home win%' THEN 1 END) AS home_wins,
|
|
511
|
+
COUNT(CASE WHEN g.side = 'away' THEN 1 END) AS away_games,
|
|
512
|
+
COUNT(CASE WHEN g.side = 'away' AND g.outcome LIKE 'away win%' THEN 1 END) AS away_wins,
|
|
513
|
+
ROUND(
|
|
514
|
+
COUNT(CASE WHEN g.side = 'home' AND g.outcome LIKE 'home win%' THEN 1 END) * 1.0 /
|
|
515
|
+
NULLIF(COUNT(CASE WHEN g.side = 'home' THEN 1 END), 0), 3
|
|
516
|
+
) AS home_win_pct,
|
|
517
|
+
ROUND(
|
|
518
|
+
COUNT(CASE WHEN g.side = 'away' AND g.outcome LIKE 'away win%' THEN 1 END) * 1.0 /
|
|
519
|
+
NULLIF(COUNT(CASE WHEN g.side = 'away' THEN 1 END), 0), 3
|
|
520
|
+
) AS away_win_pct
|
|
521
|
+
FROM (
|
|
522
|
+
SELECT season, home_team_id AS team_id, outcome, 'home' AS side
|
|
523
|
+
FROM nhl_game_data.game
|
|
524
|
+
UNION ALL
|
|
525
|
+
SELECT season, away_team_id AS team_id, outcome, 'away' AS side
|
|
526
|
+
FROM nhl_game_data.game
|
|
527
|
+
) g
|
|
528
|
+
LEFT JOIN silver.dim_team t ON g.team_id = t.team_id
|
|
529
|
+
GROUP BY g.season, g.team_id, t.teamname, t.abbreviation;
|
|
530
|
+
```
|
|
531
|
+
|
|
532
|
+
---
|
|
533
|
+
|
|
534
|
+
## Related Documents
|
|
535
|
+
|
|
536
|
+
The complete data lake acceleration pipeline is: Volume mount → Pipe ingestion → Dynamic Table modeling. The following documents cover each stage:
|
|
537
|
+
|
|
538
|
+
- [Volume + Pipe Data Lake Acceleration](lakehouse-volume-pipe-acceleration-guide.md) — File auto-ingestion, the upstream step for this article
|
|
539
|
+
- [Multi-Cloud Unified Data Lake Acceleration](lakehouse-multi-cloud-acceleration.md) — Same SQL runs on Alibaba Cloud/Tencent Cloud/AWS
|
|
540
|
+
- [Dynamic Table Introduction](dynamic-table-introduce.md) — Incremental computation mechanism and scheduling principles
|
|
541
|
+
- [CREATE DYNAMIC TABLE](create-dynamic-table.md) — Complete DDL syntax
|
|
542
|
+
- [Incremental Computing Overview](incremental-computing.md) — DT incremental refresh support matrix
|
|
543
|
+
- [Medallion from Scratch (ZettaPark Approach)](medallion-lakehouse-from-scratch.md) — Python API version covering the same topic
|