@clickzetta/cz-cli-darwin-x64 0.3.92 → 0.3.93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-ai-function/SKILL.md +109 -0
- package/bin/skills/clickzetta-ai-function/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-ai-function/references/ai-function-ddl.md +106 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +124 -124
- package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +5 -5
- package/bin/skills/clickzetta-bi-connect/SKILL.md +79 -78
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +56 -56
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +386 -382
- package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -5
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +73 -212
- package/bin/skills/clickzetta-data-science/SKILL.md +57 -56
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +38 -38
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +16 -16
- package/bin/skills/clickzetta-data-science/references/setup.md +28 -28
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +44 -44
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +22 -22
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +32 -32
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +1 -1
- package/bin/skills/clickzetta-external-function/SKILL.md +51 -109
- package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -4
- package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +39 -77
- package/bin/skills/clickzetta-java-sdk/SKILL.md +49 -48
- package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -12
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +34 -34
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +44 -44
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +273 -507
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +197 -231
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +231 -304
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +180 -179
- package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -5
- package/bin/skills/clickzetta-semantic-view/SKILL.md +74 -72
- package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -12
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +75 -75
- package/bin/skills/clickzetta-sql-migration/SKILL.md +128 -0
- package/bin/skills/clickzetta-sql-migration/eval_cases.jsonl +10 -0
- package/bin/skills/clickzetta-sql-migration/references/ddl-reference.md +350 -0
- package/bin/skills/clickzetta-sql-migration/references/dml-differences.md +192 -0
- package/bin/skills/clickzetta-sql-migration/references/dml-reference.md +279 -0
- package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/dql-reference.md +128 -128
- package/bin/skills/clickzetta-sql-migration/references/function-mapping.md +194 -0
- package/bin/skills/clickzetta-sql-migration/references/functions-reference.md +372 -0
- package/bin/skills/clickzetta-sql-migration/references/implicit-type-conversion.md +143 -0
- package/bin/skills/clickzetta-sql-migration/references/migration-databricks.md +260 -0
- package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/migration-snowflake.md +112 -112
- package/bin/skills/clickzetta-sql-migration/references/vs-snowflake.md +346 -0
- package/bin/skills/clickzetta-sql-migration/references/vs-spark.md +229 -0
- package/bin/skills/clickzetta-studio-task-manager/SKILL.md +326 -329
- package/bin/skills/clickzetta-table-lineage/SKILL.md +57 -55
- package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -1
- package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +5 -5
- package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +6 -6
- package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +2 -2
- package/bin/skills/clickzetta-volume-manager/SKILL.md +186 -100
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +153 -52
- package/package.json +1 -1
- package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +0 -135
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -260
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -191
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -249
- package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +0 -3
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
- /package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/LICENSE +0 -0
|
@@ -1,57 +1,57 @@
|
|
|
1
|
-
# BITMAP
|
|
1
|
+
# BITMAP User Profiling Reference
|
|
2
2
|
|
|
3
|
-
>
|
|
3
|
+
> Source: https://www.yunqi.tech/documents/bitmap-type
|
|
4
4
|
|
|
5
|
-
BITMAP
|
|
5
|
+
BITMAP is a data type in ClickZetta for efficiently storing and processing integer sets, based on the Roaring Bitmap compression algorithm. It is particularly suited for data science use cases such as user profiling, audience segmentation, and UV counting.
|
|
6
6
|
|
|
7
7
|
---
|
|
8
8
|
|
|
9
|
-
##
|
|
9
|
+
## Core Limitations
|
|
10
10
|
|
|
11
|
-
-
|
|
12
|
-
-
|
|
13
|
-
-
|
|
14
|
-
-
|
|
11
|
+
- Supports **64-bit unsigned integers** (0 to 2^64-1)
|
|
12
|
+
- **Does not support** comparison operators (<, >, =)
|
|
13
|
+
- **Does not support** ORDER BY, GROUP BY, DISTINCT
|
|
14
|
+
- **Cannot** be used as PRIMARY KEY, PARTITION KEY, or CLUSTER KEY
|
|
15
15
|
|
|
16
16
|
---
|
|
17
17
|
|
|
18
|
-
##
|
|
18
|
+
## Building User Tag BITMAPs
|
|
19
19
|
|
|
20
20
|
```sql
|
|
21
|
-
--
|
|
21
|
+
-- Option 1: Aggregate from row data (most common)
|
|
22
22
|
CREATE TABLE ds_workspace.user_tags AS
|
|
23
23
|
SELECT
|
|
24
24
|
tag_name,
|
|
25
25
|
group_bitmap_state(user_id) AS user_bitmap
|
|
26
26
|
FROM (
|
|
27
|
-
--
|
|
27
|
+
-- High-value users
|
|
28
28
|
SELECT 'high_value' AS tag_name, user_id
|
|
29
29
|
FROM my_schema.orders
|
|
30
30
|
WHERE total_amount_30d > 1000
|
|
31
31
|
UNION ALL
|
|
32
|
-
--
|
|
32
|
+
-- Active in last 30 days
|
|
33
33
|
SELECT 'active_30d' AS tag_name, user_id
|
|
34
34
|
FROM my_schema.events
|
|
35
35
|
WHERE event_date >= CURRENT_DATE - INTERVAL 30 DAY
|
|
36
36
|
UNION ALL
|
|
37
|
-
--
|
|
37
|
+
-- Churned users (inactive for 90 days)
|
|
38
38
|
SELECT 'churned' AS tag_name, user_id
|
|
39
39
|
FROM my_schema.users
|
|
40
40
|
WHERE last_active_date < CURRENT_DATE - INTERVAL 90 DAY
|
|
41
41
|
) t
|
|
42
42
|
GROUP BY tag_name;
|
|
43
43
|
|
|
44
|
-
--
|
|
44
|
+
-- Option 2: Build from an array
|
|
45
45
|
INSERT INTO ds_workspace.user_tags VALUES
|
|
46
46
|
('vip', bitmap_build(ARRAY(1001, 1002, 1003, 1004)));
|
|
47
47
|
```
|
|
48
48
|
|
|
49
49
|
---
|
|
50
50
|
|
|
51
|
-
##
|
|
51
|
+
## Audience Segmentation Operations
|
|
52
52
|
|
|
53
53
|
```sql
|
|
54
|
-
--
|
|
54
|
+
-- Intersection: users matching all tags (AND)
|
|
55
55
|
SELECT bitmap_count(
|
|
56
56
|
bitmap_and(
|
|
57
57
|
(SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
|
|
@@ -59,7 +59,7 @@ SELECT bitmap_count(
|
|
|
59
59
|
)
|
|
60
60
|
) AS target_count;
|
|
61
61
|
|
|
62
|
-
--
|
|
62
|
+
-- Union: users matching any tag (OR)
|
|
63
63
|
SELECT bitmap_count(
|
|
64
64
|
bitmap_or(
|
|
65
65
|
(SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
|
|
@@ -67,7 +67,7 @@ SELECT bitmap_count(
|
|
|
67
67
|
)
|
|
68
68
|
) AS reach_count;
|
|
69
69
|
|
|
70
|
-
--
|
|
70
|
+
-- Difference: exclude a group (ANDNOT)
|
|
71
71
|
SELECT bitmap_count(
|
|
72
72
|
bitmap_andnot(
|
|
73
73
|
(SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
|
|
@@ -75,7 +75,7 @@ SELECT bitmap_count(
|
|
|
75
75
|
)
|
|
76
76
|
) AS targetable_count;
|
|
77
77
|
|
|
78
|
-
--
|
|
78
|
+
-- Get target user ID list
|
|
79
79
|
SELECT bitmap_to_array(
|
|
80
80
|
bitmap_andnot(
|
|
81
81
|
(SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
|
|
@@ -86,10 +86,10 @@ SELECT bitmap_to_array(
|
|
|
86
86
|
|
|
87
87
|
---
|
|
88
88
|
|
|
89
|
-
## UV
|
|
89
|
+
## UV Counting (Distinct Count)
|
|
90
90
|
|
|
91
91
|
```sql
|
|
92
|
-
--
|
|
92
|
+
-- Daily active users (DAU)
|
|
93
93
|
SELECT
|
|
94
94
|
event_date,
|
|
95
95
|
bitmap_count(group_bitmap_state(user_id)) AS dau
|
|
@@ -97,11 +97,11 @@ FROM my_schema.events
|
|
|
97
97
|
GROUP BY event_date
|
|
98
98
|
ORDER BY event_date;
|
|
99
99
|
|
|
100
|
-
--
|
|
100
|
+
-- Weekly active users (WAU) — deduplicated across days
|
|
101
101
|
SELECT
|
|
102
102
|
DATE_TRUNC('week', event_date) AS week_start,
|
|
103
103
|
bitmap_count(
|
|
104
|
-
bitmap_or_agg(daily_bitmap) --
|
|
104
|
+
bitmap_or_agg(daily_bitmap) -- merge multiple days' bitmaps
|
|
105
105
|
) AS wau
|
|
106
106
|
FROM (
|
|
107
107
|
SELECT event_date,
|
|
@@ -111,7 +111,7 @@ FROM (
|
|
|
111
111
|
) t
|
|
112
112
|
GROUP BY 1;
|
|
113
113
|
|
|
114
|
-
--
|
|
114
|
+
-- User retention analysis (new vs. returning users)
|
|
115
115
|
SELECT
|
|
116
116
|
bitmap_count(
|
|
117
117
|
bitmap_and(new_users.user_bitmap, return_users.user_bitmap)
|
|
@@ -128,19 +128,19 @@ FROM
|
|
|
128
128
|
|
|
129
129
|
---
|
|
130
130
|
|
|
131
|
-
##
|
|
131
|
+
## BITMAP Function Quick Reference
|
|
132
132
|
|
|
133
|
-
|
|
|
133
|
+
| Function | Description | Example |
|
|
134
134
|
|---|---|---|
|
|
135
|
-
| `group_bitmap_state(col)` |
|
|
136
|
-
| `bitmap_count(bm)` |
|
|
137
|
-
| `bitmap_and(a, b)` |
|
|
138
|
-
| `bitmap_or(a, b)` |
|
|
139
|
-
| `bitmap_andnot(a, b)` |
|
|
140
|
-
| `bitmap_xor(a, b)` |
|
|
141
|
-
| `bitmap_to_array(bm)` |
|
|
142
|
-
| `bitmap_build(arr)` |
|
|
143
|
-
| `bitmap_contains(bm, val)` |
|
|
144
|
-
| `bitmap_min(bm)` |
|
|
145
|
-
| `bitmap_max(bm)` |
|
|
146
|
-
| `to_bitmap(val)` |
|
|
135
|
+
| `group_bitmap_state(col)` | Aggregate to build a BITMAP | `GROUP BY tag` |
|
|
136
|
+
| `bitmap_count(bm)` | Count elements (UV) | `bitmap_count(user_bm)` |
|
|
137
|
+
| `bitmap_and(a, b)` | Intersection | Users in both A and B |
|
|
138
|
+
| `bitmap_or(a, b)` | Union | Users in A or B |
|
|
139
|
+
| `bitmap_andnot(a, b)` | Difference | In A but not in B |
|
|
140
|
+
| `bitmap_xor(a, b)` | Symmetric difference | Exclusive to either A or B |
|
|
141
|
+
| `bitmap_to_array(bm)` | Convert to integer array | Get user ID list |
|
|
142
|
+
| `bitmap_build(arr)` | Build from array | `bitmap_build(ARRAY(1,2,3))` |
|
|
143
|
+
| `bitmap_contains(bm, val)` | Check if value is present | `bitmap_contains(bm, user_id)` |
|
|
144
|
+
| `bitmap_min(bm)` | Minimum element | — |
|
|
145
|
+
| `bitmap_max(bm)` | Maximum element | — |
|
|
146
|
+
| `to_bitmap(val)` | Convert single value to BITMAP | `to_bitmap(user_id)` |
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
#
|
|
1
|
+
# Data Discovery, Quality Assessment, Cleaning, and EDA Examples
|
|
2
2
|
|
|
3
|
-
##
|
|
3
|
+
## Data Discovery
|
|
4
4
|
|
|
5
5
|
```python
|
|
6
6
|
from src.config import get_session
|
|
@@ -20,10 +20,10 @@ session.sql("""
|
|
|
20
20
|
|
|
21
21
|
---
|
|
22
22
|
|
|
23
|
-
##
|
|
23
|
+
## Data Quality Assessment
|
|
24
24
|
|
|
25
25
|
```sql
|
|
26
|
-
--
|
|
26
|
+
-- Basic statistics
|
|
27
27
|
SELECT
|
|
28
28
|
COUNT(*) AS total_rows,
|
|
29
29
|
COUNT(DISTINCT user_id) AS unique_users,
|
|
@@ -32,11 +32,11 @@ SELECT
|
|
|
32
32
|
ROUND(100.0 * SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END) / COUNT(*), 2) AS amount_null_pct
|
|
33
33
|
FROM my_schema.orders;
|
|
34
34
|
|
|
35
|
-
--
|
|
35
|
+
-- Duplicate primary key check
|
|
36
36
|
SELECT order_id, COUNT(*) AS cnt
|
|
37
37
|
FROM my_schema.orders GROUP BY order_id HAVING cnt > 1 LIMIT 10;
|
|
38
38
|
|
|
39
|
-
--
|
|
39
|
+
-- Numeric distribution (efficient for large tables)
|
|
40
40
|
SELECT
|
|
41
41
|
approx_percentile(amount, 0.25) AS p25,
|
|
42
42
|
approx_percentile(amount, 0.50) AS median,
|
|
@@ -45,25 +45,25 @@ SELECT
|
|
|
45
45
|
MIN(amount) AS min_val, MAX(amount) AS max_val
|
|
46
46
|
FROM my_schema.orders;
|
|
47
47
|
|
|
48
|
-
--
|
|
48
|
+
-- Top-K high-frequency values
|
|
49
49
|
SELECT approx_top_k(status, 10) AS top_statuses FROM my_schema.orders;
|
|
50
50
|
|
|
51
|
-
--
|
|
51
|
+
-- Approximate distinct count
|
|
52
52
|
SELECT approx_count_distinct(user_id) AS approx_uv FROM my_schema.events;
|
|
53
53
|
```
|
|
54
54
|
|
|
55
55
|
---
|
|
56
56
|
|
|
57
|
-
##
|
|
57
|
+
## Data Cleaning
|
|
58
58
|
|
|
59
59
|
```sql
|
|
60
|
-
--
|
|
60
|
+
-- Deduplication (keep the latest record)
|
|
61
61
|
SELECT * FROM (
|
|
62
62
|
SELECT *, ROW_NUMBER() OVER (PARTITION BY order_id ORDER BY update_time DESC) AS rn
|
|
63
63
|
FROM my_schema.orders_raw
|
|
64
64
|
) WHERE rn = 1;
|
|
65
65
|
|
|
66
|
-
--
|
|
66
|
+
-- Null handling + type casting
|
|
67
67
|
SELECT
|
|
68
68
|
order_id, user_id,
|
|
69
69
|
COALESCE(amount, 0.0) AS amount,
|
|
@@ -72,7 +72,7 @@ SELECT
|
|
|
72
72
|
FROM my_schema.orders_raw
|
|
73
73
|
WHERE user_id IS NOT NULL;
|
|
74
74
|
|
|
75
|
-
--
|
|
75
|
+
-- Multi-table integration
|
|
76
76
|
SELECT o.order_id, o.user_id, o.amount, o.order_date,
|
|
77
77
|
u.age_group, u.city, p.category, p.brand
|
|
78
78
|
FROM my_schema.orders o
|
|
@@ -85,16 +85,16 @@ LEFT JOIN my_schema.products p ON o.product_id = p.product_id;
|
|
|
85
85
|
## EDA
|
|
86
86
|
|
|
87
87
|
```python
|
|
88
|
-
#
|
|
88
|
+
# Sampling strategies
|
|
89
89
|
df_quick = session.sql("""
|
|
90
90
|
SELECT * FROM my_schema.events TABLESAMPLE SYSTEM (0.1) LIMIT 50000
|
|
91
|
-
""").to_pandas() # SYSTEM
|
|
91
|
+
""").to_pandas() # SYSTEM: file-level, very fast, good for >1M row previews
|
|
92
92
|
|
|
93
93
|
df_ml = session.sql("""
|
|
94
94
|
SELECT * FROM my_schema.events TABLESAMPLE ROW (10)
|
|
95
|
-
""").to_pandas() # ROW
|
|
95
|
+
""").to_pandas() # ROW: exact row-level, good for ML training sets
|
|
96
96
|
|
|
97
|
-
#
|
|
97
|
+
# Time series analysis
|
|
98
98
|
session.sql("""
|
|
99
99
|
SELECT
|
|
100
100
|
DATE_TRUNC('day', order_time) AS dt,
|
|
@@ -1,62 +1,62 @@
|
|
|
1
|
-
#
|
|
1
|
+
# Environment Setup & Project Configuration
|
|
2
2
|
|
|
3
|
-
##
|
|
3
|
+
## Environment Setup
|
|
4
4
|
|
|
5
5
|
```bash
|
|
6
|
-
#
|
|
6
|
+
# Option 1: venv (recommended)
|
|
7
7
|
python3.12 -m venv .venv
|
|
8
8
|
source .venv/bin/activate # macOS/Linux
|
|
9
9
|
pip install clickzetta_zettapark_python clickzetta-connector-python \
|
|
10
10
|
python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
|
|
11
11
|
-i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
12
12
|
|
|
13
|
-
#
|
|
13
|
+
# Option 2: pyenv (when you need to switch Python versions)
|
|
14
14
|
pyenv install 3.12.9 && pyenv local 3.12.9
|
|
15
15
|
python -m venv .venv && source .venv/bin/activate
|
|
16
16
|
pip install clickzetta_zettapark_python clickzetta-connector-python \
|
|
17
17
|
python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
|
|
18
18
|
-i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
19
19
|
|
|
20
|
-
#
|
|
20
|
+
# Option 3: conda
|
|
21
21
|
conda create -n lakehouse-ds python=3.12 -y && conda activate lakehouse-ds
|
|
22
22
|
pip install clickzetta_zettapark_python clickzetta-connector-python \
|
|
23
23
|
python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
|
|
24
24
|
-i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
25
25
|
```
|
|
26
26
|
|
|
27
|
-
|
|
|
27
|
+
| Issue | Fix |
|
|
28
28
|
|------|------|
|
|
29
|
-
| Python 3.8/3.9 | `pyenv install 3.12.9`
|
|
30
|
-
| `pyarrow`
|
|
31
|
-
| M1/M2 Mac
|
|
32
|
-
|
|
|
29
|
+
| Python 3.8/3.9 | `pyenv install 3.12.9` or `python3.12 -m venv .venv` |
|
|
30
|
+
| `pyarrow` version conflict | `pip install pyarrow==14.0.0` |
|
|
31
|
+
| M1/M2 Mac error | `pip install --no-binary :all: <package>` (build the failing package from source) or use conda |
|
|
32
|
+
| Connection timeout | VCluster not started — start it manually in Studio |
|
|
33
33
|
|
|
34
34
|
---
|
|
35
35
|
|
|
36
|
-
## Jupyter Kernel
|
|
36
|
+
## Jupyter Kernel Configuration
|
|
37
37
|
|
|
38
38
|
```bash
|
|
39
|
-
#
|
|
39
|
+
# Register the venv as a Jupyter kernel (critical — otherwise the notebook uses the system Python)
|
|
40
40
|
source .venv/bin/activate
|
|
41
41
|
pip install ipykernel jupyterlab
|
|
42
42
|
python -m ipykernel install --user --name lakehouse-ds --display-name "Python (lakehouse-ds)"
|
|
43
43
|
|
|
44
|
-
#
|
|
44
|
+
# Start JupyterLab
|
|
45
45
|
jupyter lab --port=8888
|
|
46
46
|
```
|
|
47
47
|
|
|
48
|
-
VS Code / Cursor
|
|
48
|
+
VS Code / Cursor: open `.ipynb` → top-right "Select Kernel" → choose "Python (lakehouse-ds)"
|
|
49
49
|
|
|
50
|
-
|
|
|
50
|
+
| Issue | Fix |
|
|
51
51
|
|------|------|
|
|
52
|
-
| `ModuleNotFoundError: clickzetta` | kernel
|
|
53
|
-
| `.env`
|
|
54
|
-
| `to_pandas()` OOM |
|
|
55
|
-
|
|
|
52
|
+
| `ModuleNotFoundError: clickzetta` | Wrong kernel selected — switch to the registered venv kernel |
|
|
53
|
+
| `.env` not loading | Use `load_dotenv(dotenv_path='../.env')` with an explicit path |
|
|
54
|
+
| `to_pandas()` OOM | Add `TABLESAMPLE ROW(1)` or `LIMIT` |
|
|
55
|
+
| Charts not showing | Add `%matplotlib inline` at the top of the notebook |
|
|
56
56
|
|
|
57
57
|
---
|
|
58
58
|
|
|
59
|
-
## src/config.py
|
|
59
|
+
## src/config.py Template
|
|
60
60
|
|
|
61
61
|
```python
|
|
62
62
|
import os, sys
|
|
@@ -65,7 +65,7 @@ from dotenv import load_dotenv
|
|
|
65
65
|
from clickzetta.zettapark.session import Session
|
|
66
66
|
import clickzetta
|
|
67
67
|
|
|
68
|
-
#
|
|
68
|
+
# Search for .env in multiple locations
|
|
69
69
|
for _p in [
|
|
70
70
|
Path(__file__).parent.parent / ".env",
|
|
71
71
|
Path.home() / ".config" / "kilo" / ".env",
|
|
@@ -77,12 +77,12 @@ for _p in [
|
|
|
77
77
|
break
|
|
78
78
|
|
|
79
79
|
def check_environment():
|
|
80
|
-
"""
|
|
80
|
+
"""Call from 00-env-check.ipynb to print environment diagnostics."""
|
|
81
81
|
ver = sys.version_info
|
|
82
82
|
if ver < (3, 10):
|
|
83
83
|
raise RuntimeError(
|
|
84
|
-
f"Python {ver.major}.{ver.minor}
|
|
85
|
-
"
|
|
84
|
+
f"Python {ver.major}.{ver.minor} does not meet requirements. ZettaPark requires Python 3.10+.\n"
|
|
85
|
+
"Upgrade: brew install pyenv && pyenv install 3.12.9 && pyenv local 3.12.9"
|
|
86
86
|
)
|
|
87
87
|
print(f"✅ Python {ver.major}.{ver.minor}.{ver.micro}")
|
|
88
88
|
for pkg, mod in [
|
|
@@ -94,12 +94,12 @@ def check_environment():
|
|
|
94
94
|
m = __import__(mod.split(".")[0])
|
|
95
95
|
print(f"✅ {pkg}: {getattr(m, '__version__', 'ok')}")
|
|
96
96
|
except ImportError:
|
|
97
|
-
print(f"❌ {pkg}:
|
|
97
|
+
print(f"❌ {pkg}: not installed → pip install {pkg}")
|
|
98
98
|
try:
|
|
99
99
|
s = get_session()
|
|
100
100
|
print(f"✅ Lakehouse: {s.sql('SELECT current_workspace(), current_user()').collect()}")
|
|
101
101
|
except Exception as e:
|
|
102
|
-
print(f"❌ Lakehouse
|
|
102
|
+
print(f"❌ Lakehouse connection failed: {e}")
|
|
103
103
|
|
|
104
104
|
def get_session() -> Session:
|
|
105
105
|
return Session.builder.configs({
|
|
@@ -113,7 +113,7 @@ def get_session() -> Session:
|
|
|
113
113
|
}).create()
|
|
114
114
|
|
|
115
115
|
def get_connector_connection():
|
|
116
|
-
"""
|
|
116
|
+
"""For pd.read_sql only. Do NOT use with df.to_sql()."""
|
|
117
117
|
return clickzetta.connect(
|
|
118
118
|
service=os.environ["CLICKZETTA_SERVICE"],
|
|
119
119
|
instance=os.environ["CLICKZETTA_INSTANCE"],
|
|
@@ -127,7 +127,7 @@ def get_connector_connection():
|
|
|
127
127
|
|
|
128
128
|
---
|
|
129
129
|
|
|
130
|
-
## .env
|
|
130
|
+
## .env Template
|
|
131
131
|
|
|
132
132
|
```bash
|
|
133
133
|
CLICKZETTA_SERVICE=cn-shanghai-alicloud.api.clickzetta.com
|
|
@@ -1,17 +1,17 @@
|
|
|
1
|
-
#
|
|
1
|
+
# Statistical Analysis Functions Reference for Data Science
|
|
2
2
|
|
|
3
3
|
---
|
|
4
4
|
|
|
5
|
-
##
|
|
5
|
+
## Approximate Aggregate Functions (Efficient for Large Tables)
|
|
6
6
|
|
|
7
|
-
### approx_count_distinct —
|
|
7
|
+
### approx_count_distinct — Approximate Distinct Count
|
|
8
8
|
|
|
9
9
|
```sql
|
|
10
|
-
--
|
|
10
|
+
-- Uses HyperLogLog algorithm, ~2% error, 10x+ faster than COUNT(DISTINCT)
|
|
11
11
|
SELECT approx_count_distinct(user_id) AS approx_uv
|
|
12
12
|
FROM my_schema.events;
|
|
13
13
|
|
|
14
|
-
--
|
|
14
|
+
-- Daily active users (DAU)
|
|
15
15
|
SELECT
|
|
16
16
|
DATE(event_time) AS dt,
|
|
17
17
|
approx_count_distinct(user_id) AS dau
|
|
@@ -20,10 +20,10 @@ GROUP BY 1
|
|
|
20
20
|
ORDER BY 1;
|
|
21
21
|
```
|
|
22
22
|
|
|
23
|
-
### approx_percentile —
|
|
23
|
+
### approx_percentile — Approximate Percentiles
|
|
24
24
|
|
|
25
25
|
```sql
|
|
26
|
-
--
|
|
26
|
+
-- Median, quartiles, P95, P99
|
|
27
27
|
SELECT
|
|
28
28
|
approx_percentile(amount, 0.25) AS p25,
|
|
29
29
|
approx_percentile(amount, 0.50) AS median,
|
|
@@ -32,7 +32,7 @@ SELECT
|
|
|
32
32
|
approx_percentile(amount, 0.99) AS p99
|
|
33
33
|
FROM my_schema.orders;
|
|
34
34
|
|
|
35
|
-
--
|
|
35
|
+
-- Grouped percentiles
|
|
36
36
|
SELECT
|
|
37
37
|
category,
|
|
38
38
|
approx_percentile(price, 0.5) AS median_price
|
|
@@ -40,14 +40,14 @@ FROM my_schema.products
|
|
|
40
40
|
GROUP BY category;
|
|
41
41
|
```
|
|
42
42
|
|
|
43
|
-
### approx_histogram —
|
|
43
|
+
### approx_histogram — Approximate Histogram
|
|
44
44
|
|
|
45
45
|
```sql
|
|
46
|
-
--
|
|
46
|
+
-- Returns a struct array: [{min, max, count}, ...]
|
|
47
47
|
SELECT approx_histogram(amount, 10) AS hist
|
|
48
48
|
FROM my_schema.orders;
|
|
49
49
|
|
|
50
|
-
--
|
|
50
|
+
-- Parse histogram (expand to rows)
|
|
51
51
|
SELECT
|
|
52
52
|
bucket.min AS bucket_min,
|
|
53
53
|
bucket.max AS bucket_max,
|
|
@@ -58,15 +58,15 @@ FROM (
|
|
|
58
58
|
);
|
|
59
59
|
```
|
|
60
60
|
|
|
61
|
-
### approx_top_k —
|
|
61
|
+
### approx_top_k — Approximate Top-K High-Frequency Values
|
|
62
62
|
|
|
63
63
|
```sql
|
|
64
|
-
--
|
|
64
|
+
-- Find the top 10 most frequent cities
|
|
65
65
|
SELECT approx_top_k(city, 10) AS top_cities
|
|
66
66
|
FROM my_schema.orders;
|
|
67
67
|
|
|
68
|
-
--
|
|
69
|
-
--
|
|
68
|
+
-- Returns a struct array: [{value, count}, ...]
|
|
69
|
+
-- Expand to rows (fields are value and count)
|
|
70
70
|
SELECT item.value AS city, item.count AS cnt
|
|
71
71
|
FROM (
|
|
72
72
|
SELECT EXPLODE(approx_top_k(city, 10)) AS item
|
|
@@ -77,59 +77,59 @@ ORDER BY cnt DESC;
|
|
|
77
77
|
|
|
78
78
|
---
|
|
79
79
|
|
|
80
|
-
##
|
|
80
|
+
## Exact Statistical Functions
|
|
81
81
|
|
|
82
82
|
### percentile / median
|
|
83
83
|
|
|
84
84
|
```sql
|
|
85
|
-
--
|
|
85
|
+
-- Exact median (use for small tables; use approx_percentile for large tables)
|
|
86
86
|
SELECT
|
|
87
87
|
percentile(amount, 0.5) AS exact_median,
|
|
88
|
-
median(amount) AS median_alias --
|
|
88
|
+
median(amount) AS median_alias -- equivalent
|
|
89
89
|
FROM my_schema.orders;
|
|
90
90
|
|
|
91
|
-
--
|
|
91
|
+
-- Multiple percentiles
|
|
92
92
|
SELECT percentile(amount, ARRAY(0.25, 0.5, 0.75, 0.9, 0.99))
|
|
93
93
|
FROM my_schema.orders;
|
|
94
94
|
```
|
|
95
95
|
|
|
96
96
|
---
|
|
97
97
|
|
|
98
|
-
## TABLESAMPLE
|
|
98
|
+
## TABLESAMPLE Sampling
|
|
99
99
|
|
|
100
100
|
```sql
|
|
101
|
-
-- ROW
|
|
102
|
-
SELECT * FROM my_schema.events TABLESAMPLE ROW (10); --
|
|
103
|
-
SELECT * FROM my_schema.events TABLESAMPLE ROW (5 ROWS); --
|
|
101
|
+
-- ROW mode: exact row-level sampling (good for ML training sets, <10M rows)
|
|
102
|
+
SELECT * FROM my_schema.events TABLESAMPLE ROW (10); -- exact 10%
|
|
103
|
+
SELECT * FROM my_schema.events TABLESAMPLE ROW (5 ROWS); -- exact 5 rows
|
|
104
104
|
|
|
105
|
-
-- SYSTEM
|
|
106
|
-
SELECT * FROM my_schema.events TABLESAMPLE SYSTEM (0.1) LIMIT 50000; --
|
|
105
|
+
-- SYSTEM mode: file-level sampling (good for large table quick preview, >10M rows)
|
|
106
|
+
SELECT * FROM my_schema.events TABLESAMPLE SYSTEM (0.1) LIMIT 50000; -- ~0.1%
|
|
107
107
|
|
|
108
|
-
--
|
|
108
|
+
-- Stratified sampling (proportional by category)
|
|
109
109
|
SELECT * FROM (
|
|
110
110
|
SELECT *,
|
|
111
111
|
ROW_NUMBER() OVER (PARTITION BY category ORDER BY RAND()) AS rn,
|
|
112
112
|
COUNT(*) OVER (PARTITION BY category) AS cat_total
|
|
113
113
|
FROM my_schema.products
|
|
114
114
|
)
|
|
115
|
-
WHERE rn <= CEIL(cat_total * 0.1); --
|
|
115
|
+
WHERE rn <= CEIL(cat_total * 0.1); -- 10% per category
|
|
116
116
|
```
|
|
117
117
|
|
|
118
|
-
|
|
|
118
|
+
| Use Case | Recommended Mode | Notes |
|
|
119
119
|
|---|---|---|
|
|
120
|
-
|
|
|
121
|
-
| ML
|
|
122
|
-
|
|
|
123
|
-
|
|
|
120
|
+
| Quick data preview | SYSTEM | Very fast, good for >1M rows |
|
|
121
|
+
| ML training set | ROW | Exact random, ensures representativeness |
|
|
122
|
+
| Data quality spot check | SYSTEM | Fast sampling for validation |
|
|
123
|
+
| Statistical analysis | ROW | Exact probability sampling |
|
|
124
124
|
|
|
125
|
-
> ⚠️
|
|
125
|
+
> ⚠️ **Note**: On small tables (fewer than a few tens of thousands of rows), TABLESAMPLE may return all rows, because percentage sampling is not precise at that size. For small tables, use `LIMIT` directly.
|
|
126
126
|
|
|
127
127
|
---
|
|
128
128
|
|
|
129
|
-
##
|
|
129
|
+
## Window Functions (Time Series / Ranking Features)
|
|
130
130
|
|
|
131
131
|
```sql
|
|
132
|
-
--
|
|
132
|
+
-- 7-day moving average
|
|
133
133
|
SELECT
|
|
134
134
|
dt,
|
|
135
135
|
revenue,
|
|
@@ -139,7 +139,7 @@ SELECT
|
|
|
139
139
|
) AS revenue_7d_ma
|
|
140
140
|
FROM daily_stats;
|
|
141
141
|
|
|
142
|
-
--
|
|
142
|
+
-- Period-over-period growth rate (each row vs. the previous dt; day-over-day on daily_stats)
|
|
143
143
|
SELECT
|
|
144
144
|
dt,
|
|
145
145
|
revenue,
|
|
@@ -148,7 +148,7 @@ SELECT
|
|
|
148
148
|
/ NULLIF(LAG(revenue, 1) OVER (ORDER BY dt), 0), 2) AS mom_growth_pct
|
|
149
149
|
FROM daily_stats;
|
|
150
150
|
|
|
151
|
-
--
|
|
151
|
+
-- User behavior ranking (RFM analysis)
|
|
152
152
|
SELECT
|
|
153
153
|
user_id,
|
|
154
154
|
total_amount,
|
|
@@ -157,7 +157,7 @@ SELECT
|
|
|
157
157
|
NTILE(5) OVER (ORDER BY last_order_date DESC) AS recency_quintile
|
|
158
158
|
FROM user_rfm;
|
|
159
159
|
|
|
160
|
-
--
|
|
160
|
+
-- Deduplication keeping latest (common in data cleaning)
|
|
161
161
|
SELECT * FROM (
|
|
162
162
|
SELECT *,
|
|
163
163
|
ROW_NUMBER() OVER (
|
|
@@ -170,25 +170,25 @@ SELECT * FROM (
|
|
|
170
170
|
|
|
171
171
|
---
|
|
172
172
|
|
|
173
|
-
##
|
|
173
|
+
## Data Quality Check Template
|
|
174
174
|
|
|
175
175
|
```sql
|
|
176
|
-
--
|
|
176
|
+
-- Output all key quality metrics in one query
|
|
177
177
|
SELECT
|
|
178
178
|
COUNT(*) AS total_rows,
|
|
179
179
|
COUNT(DISTINCT user_id) AS unique_users,
|
|
180
|
-
--
|
|
180
|
+
-- Null rates
|
|
181
181
|
ROUND(100.0 * COUNT(*) FILTER (WHERE user_id IS NULL)
|
|
182
182
|
/ COUNT(*), 2) AS user_id_null_pct,
|
|
183
183
|
ROUND(100.0 * COUNT(*) FILTER (WHERE amount IS NULL)
|
|
184
184
|
/ COUNT(*), 2) AS amount_null_pct,
|
|
185
|
-
--
|
|
185
|
+
-- Anomalies
|
|
186
186
|
SUM(CASE WHEN amount < 0 THEN 1 ELSE 0 END) AS negative_amount_cnt,
|
|
187
187
|
SUM(CASE WHEN amount > 1000000 THEN 1 ELSE 0 END) AS extreme_amount_cnt,
|
|
188
|
-
--
|
|
188
|
+
-- Time range
|
|
189
189
|
MIN(order_date) AS earliest_date,
|
|
190
190
|
MAX(order_date) AS latest_date,
|
|
191
|
-
--
|
|
191
|
+
-- Distribution
|
|
192
192
|
approx_percentile(amount, 0.5) AS median_amount,
|
|
193
193
|
approx_percentile(amount, 0.99) AS p99_amount
|
|
194
194
|
FROM my_schema.orders;
|