cudf_polars_cu13-25.10.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -0
- cudf_polars/VERSION +1 -0
- cudf_polars/__init__.py +28 -0
- cudf_polars/_version.py +21 -0
- cudf_polars/callback.py +318 -0
- cudf_polars/containers/__init__.py +13 -0
- cudf_polars/containers/column.py +495 -0
- cudf_polars/containers/dataframe.py +361 -0
- cudf_polars/containers/datatype.py +137 -0
- cudf_polars/dsl/__init__.py +8 -0
- cudf_polars/dsl/expr.py +66 -0
- cudf_polars/dsl/expressions/__init__.py +8 -0
- cudf_polars/dsl/expressions/aggregation.py +226 -0
- cudf_polars/dsl/expressions/base.py +272 -0
- cudf_polars/dsl/expressions/binaryop.py +120 -0
- cudf_polars/dsl/expressions/boolean.py +326 -0
- cudf_polars/dsl/expressions/datetime.py +271 -0
- cudf_polars/dsl/expressions/literal.py +97 -0
- cudf_polars/dsl/expressions/rolling.py +643 -0
- cudf_polars/dsl/expressions/selection.py +74 -0
- cudf_polars/dsl/expressions/slicing.py +46 -0
- cudf_polars/dsl/expressions/sorting.py +85 -0
- cudf_polars/dsl/expressions/string.py +1002 -0
- cudf_polars/dsl/expressions/struct.py +137 -0
- cudf_polars/dsl/expressions/ternary.py +49 -0
- cudf_polars/dsl/expressions/unary.py +517 -0
- cudf_polars/dsl/ir.py +2607 -0
- cudf_polars/dsl/nodebase.py +164 -0
- cudf_polars/dsl/to_ast.py +359 -0
- cudf_polars/dsl/tracing.py +16 -0
- cudf_polars/dsl/translate.py +939 -0
- cudf_polars/dsl/traversal.py +224 -0
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +481 -0
- cudf_polars/dsl/utils/groupby.py +98 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +61 -0
- cudf_polars/dsl/utils/reshape.py +74 -0
- cudf_polars/dsl/utils/rolling.py +121 -0
- cudf_polars/dsl/utils/windows.py +192 -0
- cudf_polars/experimental/__init__.py +8 -0
- cudf_polars/experimental/base.py +386 -0
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds.py +220 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
- cudf_polars/experimental/benchmarks/pdsh.py +814 -0
- cudf_polars/experimental/benchmarks/utils.py +832 -0
- cudf_polars/experimental/dask_registers.py +200 -0
- cudf_polars/experimental/dispatch.py +156 -0
- cudf_polars/experimental/distinct.py +197 -0
- cudf_polars/experimental/explain.py +157 -0
- cudf_polars/experimental/expressions.py +590 -0
- cudf_polars/experimental/groupby.py +327 -0
- cudf_polars/experimental/io.py +943 -0
- cudf_polars/experimental/join.py +391 -0
- cudf_polars/experimental/parallel.py +423 -0
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +188 -0
- cudf_polars/experimental/shuffle.py +354 -0
- cudf_polars/experimental/sort.py +609 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/statistics.py +795 -0
- cudf_polars/experimental/utils.py +169 -0
- cudf_polars/py.typed +0 -0
- cudf_polars/testing/__init__.py +8 -0
- cudf_polars/testing/asserts.py +448 -0
- cudf_polars/testing/io.py +122 -0
- cudf_polars/testing/plugin.py +236 -0
- cudf_polars/typing/__init__.py +219 -0
- cudf_polars/utils/__init__.py +8 -0
- cudf_polars/utils/config.py +741 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +118 -0
- cudf_polars/utils/sorting.py +53 -0
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +27 -0
- cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
- cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
- cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
- cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
- cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
cudf_polars/experimental/benchmarks/pdsds_queries/q10.py
@@ -0,0 +1,225 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Query 10."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import polars as pl
+
+from cudf_polars.experimental.benchmarks.utils import get_data
+
+if TYPE_CHECKING:
+    from cudf_polars.experimental.benchmarks.utils import RunConfig
+
+
+def duckdb_impl(run_config: RunConfig) -> str:
+    """Query 10."""
+    return """
+    SELECT cd_gender,
+           cd_marital_status,
+           cd_education_status,
+           Count(*) cnt1,
+           cd_purchase_estimate,
+           Count(*) cnt2,
+           cd_credit_rating,
+           Count(*) cnt3,
+           cd_dep_count,
+           Count(*) cnt4,
+           cd_dep_employed_count,
+           Count(*) cnt5,
+           cd_dep_college_count,
+           Count(*) cnt6
+    FROM   customer c,
+           customer_address ca,
+           customer_demographics
+    WHERE  c.c_current_addr_sk = ca.ca_address_sk
+           AND ca_county IN ( 'Lycoming County', 'Sheridan County',
+                              'Kandiyohi County',
+                              'Pike County',
+                              'Greene County' )
+           AND cd_demo_sk = c.c_current_cdemo_sk
+           AND EXISTS (SELECT *
+                       FROM   store_sales,
+                              date_dim
+                       WHERE  c.c_customer_sk = ss_customer_sk
+                              AND ss_sold_date_sk = d_date_sk
+                              AND d_year = 2002
+                              AND d_moy BETWEEN 4 AND 4 + 3)
+           AND ( EXISTS (SELECT *
+                         FROM   web_sales,
+                                date_dim
+                         WHERE  c.c_customer_sk = ws_bill_customer_sk
+                                AND ws_sold_date_sk = d_date_sk
+                                AND d_year = 2002
+                                AND d_moy BETWEEN 4 AND 4 + 3)
+                 OR EXISTS (SELECT *
+                            FROM   catalog_sales,
+                                   date_dim
+                            WHERE  c.c_customer_sk = cs_ship_customer_sk
+                                   AND cs_sold_date_sk = d_date_sk
+                                   AND d_year = 2002
+                                   AND d_moy BETWEEN 4 AND 4 + 3) )
+    GROUP  BY cd_gender,
+              cd_marital_status,
+              cd_education_status,
+              cd_purchase_estimate,
+              cd_credit_rating,
+              cd_dep_count,
+              cd_dep_employed_count,
+              cd_dep_college_count
+    ORDER  BY cd_gender,
+              cd_marital_status,
+              cd_education_status,
+              cd_purchase_estimate,
+              cd_credit_rating,
+              cd_dep_count,
+              cd_dep_employed_count,
+              cd_dep_college_count
+    LIMIT 100;
+    """
+
+
+def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
+    """Query 10."""
+    # Load required tables
+    customer = get_data(run_config.dataset_path, "customer", run_config.suffix)
+    customer_address = get_data(
+        run_config.dataset_path, "customer_address", run_config.suffix
+    )
+    customer_demographics = get_data(
+        run_config.dataset_path, "customer_demographics", run_config.suffix
+    )
+    store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix)
+    web_sales = get_data(run_config.dataset_path, "web_sales", run_config.suffix)
+    catalog_sales = get_data(
+        run_config.dataset_path, "catalog_sales", run_config.suffix
+    )
+    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)
+
+    # Target counties and date range
+    target_counties = [
+        "Lycoming County",
+        "Sheridan County",
+        "Kandiyohi County",
+        "Pike County",
+        "Greene County",
+    ]
+
+    # Get customers with store sales in the target period (EXISTS condition 1)
+    store_customers = (
+        store_sales.join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk")
+        .filter(
+            (pl.col("d_year") == 2002)
+            & (pl.col("d_moy").is_between(4, 7, closed="both"))
+        )
+        .select("ss_customer_sk")
+        .unique()
+    )
+
+    # Get customers with web sales in the target period (EXISTS condition 2a)
+    web_customers = (
+        web_sales.join(date_dim, left_on="ws_sold_date_sk", right_on="d_date_sk")
+        .filter(
+            (pl.col("d_year") == 2002)
+            & (pl.col("d_moy").is_between(4, 7, closed="both"))
+        )
+        .select(pl.col("ws_bill_customer_sk").alias("customer_sk"))
+        .unique()
+    )
+
+    # Get customers with catalog sales in the target period (EXISTS condition 2b)
+    catalog_customers = (
+        catalog_sales.join(date_dim, left_on="cs_sold_date_sk", right_on="d_date_sk")
+        .filter(
+            (pl.col("d_year") == 2002)
+            & (pl.col("d_moy").is_between(4, 7, closed="both"))
+        )
+        .select(pl.col("cs_ship_customer_sk").alias("customer_sk"))
+        .unique()
+    )
+
+    # Combine web and catalog customers (OR condition)
+    web_or_catalog_customers = pl.concat([web_customers, catalog_customers]).unique()
+
+    # Main query: join customer tables and apply filters
+    return (
+        customer.join(
+            customer_address, left_on="c_current_addr_sk", right_on="ca_address_sk"
+        )
+        .join(
+            customer_demographics, left_on="c_current_cdemo_sk", right_on="cd_demo_sk"
+        )
+        .filter(pl.col("ca_county").is_in(target_counties))
+        # Apply EXISTS conditions through joins
+        .join(
+            store_customers,
+            left_on="c_customer_sk",
+            right_on="ss_customer_sk",
+            how="inner",
+        )
+        .join(
+            web_or_catalog_customers,
+            left_on="c_customer_sk",
+            right_on="customer_sk",
+            how="inner",
+        )
+        .group_by(
+            [
+                "cd_gender",
+                "cd_marital_status",
+                "cd_education_status",
+                "cd_purchase_estimate",
+                "cd_credit_rating",
+                "cd_dep_count",
+                "cd_dep_employed_count",
+                "cd_dep_college_count",
+            ]
+        )
+        .agg(
+            [
+                # Cast -> Int64 to match DuckDB
+                # TODO: We should plan to make these optional
+                pl.len().alias("cnt1").cast(pl.Int64),
+                pl.len().alias("cnt2").cast(pl.Int64),
+                pl.len().alias("cnt3").cast(pl.Int64),
+                pl.len().alias("cnt4").cast(pl.Int64),
+                pl.len().alias("cnt5").cast(pl.Int64),
+                pl.len().alias("cnt6").cast(pl.Int64),
+            ]
+        )
+        .sort(
+            [
+                "cd_gender",
+                "cd_marital_status",
+                "cd_education_status",
+                "cd_purchase_estimate",
+                "cd_credit_rating",
+                "cd_dep_count",
+                "cd_dep_employed_count",
+                "cd_dep_college_count",
+            ],
+            nulls_last=True,
+        )
+        .limit(100)
+        .select(
+            [
+                "cd_gender",
+                "cd_marital_status",
+                "cd_education_status",
+                "cnt1",
+                "cd_purchase_estimate",
+                "cnt2",
+                "cd_credit_rating",
+                "cnt3",
+                "cd_dep_count",
+                "cnt4",
+                "cd_dep_employed_count",
+                "cnt5",
+                "cd_dep_college_count",
+                "cnt6",
+            ]
+        )
+    )
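A note on the pattern above: polars_impl emulates the SQL EXISTS predicates by deduplicating the sale-side keys with .unique() and then inner-joining, which keeps each qualifying customer at most once. Polars also offers a semi join that expresses the same predicate directly. A minimal sketch with made-up data (not part of the package):

import polars as pl

# Toy frames standing in for customer and store_sales (assumed data).
customers = pl.LazyFrame({"c_customer_sk": [1, 2, 3]})
sales = pl.LazyFrame({"ss_customer_sk": [1, 1, 3]})

# A semi join keeps customers with at least one matching sale, without
# duplicating rows for repeated matches -- the same effect q10 gets from
# .unique() followed by an inner join.
exists_via_semi = customers.join(
    sales, left_on="c_customer_sk", right_on="ss_customer_sk", how="semi"
)
print(exists_via_semi.collect())  # rows for customers 1 and 3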
cudf_polars/experimental/benchmarks/pdsds_queries/q2.py
@@ -0,0 +1,244 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Query 2."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import polars as pl
+
+from cudf_polars.experimental.benchmarks.utils import get_data
+
+if TYPE_CHECKING:
+    from cudf_polars.experimental.benchmarks.utils import RunConfig
+
+
+def duckdb_impl(run_config: RunConfig) -> str:
+    """Query 2."""
+    return """
+    WITH wscs
+         AS (SELECT sold_date_sk,
+                    sales_price
+             FROM   (SELECT ws_sold_date_sk sold_date_sk,
+                            ws_ext_sales_price sales_price
+                     FROM   web_sales)
+             UNION ALL
+             (SELECT cs_sold_date_sk sold_date_sk,
+                     cs_ext_sales_price sales_price
+              FROM   catalog_sales)),
+         wswscs
+         AS (SELECT d_week_seq,
+                    Sum(CASE
+                          WHEN ( d_day_name = 'Sunday' ) THEN sales_price
+                          ELSE NULL
+                        END) sun_sales,
+                    Sum(CASE
+                          WHEN ( d_day_name = 'Monday' ) THEN sales_price
+                          ELSE NULL
+                        END) mon_sales,
+                    Sum(CASE
+                          WHEN ( d_day_name = 'Tuesday' ) THEN sales_price
+                          ELSE NULL
+                        END) tue_sales,
+                    Sum(CASE
+                          WHEN ( d_day_name = 'Wednesday' ) THEN sales_price
+                          ELSE NULL
+                        END) wed_sales,
+                    Sum(CASE
+                          WHEN ( d_day_name = 'Thursday' ) THEN sales_price
+                          ELSE NULL
+                        END) thu_sales,
+                    Sum(CASE
+                          WHEN ( d_day_name = 'Friday' ) THEN sales_price
+                          ELSE NULL
+                        END) fri_sales,
+                    Sum(CASE
+                          WHEN ( d_day_name = 'Saturday' ) THEN sales_price
+                          ELSE NULL
+                        END) sat_sales
+             FROM   wscs,
+                    date_dim
+             WHERE  d_date_sk = sold_date_sk
+             GROUP  BY d_week_seq)
+    SELECT d_week_seq1,
+           Round(sun_sales1 / sun_sales2, 2),
+           Round(mon_sales1 / mon_sales2, 2),
+           Round(tue_sales1 / tue_sales2, 2),
+           Round(wed_sales1 / wed_sales2, 2),
+           Round(thu_sales1 / thu_sales2, 2),
+           Round(fri_sales1 / fri_sales2, 2),
+           Round(sat_sales1 / sat_sales2, 2)
+    FROM   (SELECT wswscs.d_week_seq d_week_seq1,
+                   sun_sales sun_sales1,
+                   mon_sales mon_sales1,
+                   tue_sales tue_sales1,
+                   wed_sales wed_sales1,
+                   thu_sales thu_sales1,
+                   fri_sales fri_sales1,
+                   sat_sales sat_sales1
+            FROM   wswscs,
+                   date_dim
+            WHERE  date_dim.d_week_seq = wswscs.d_week_seq
+                   AND d_year = 1998) y,
+           (SELECT wswscs.d_week_seq d_week_seq2,
+                   sun_sales sun_sales2,
+                   mon_sales mon_sales2,
+                   tue_sales tue_sales2,
+                   wed_sales wed_sales2,
+                   thu_sales thu_sales2,
+                   fri_sales fri_sales2,
+                   sat_sales sat_sales2
+            FROM   wswscs,
+                   date_dim
+            WHERE  date_dim.d_week_seq = wswscs.d_week_seq
+                   AND d_year = 1998 + 1) z
+    WHERE  d_week_seq1 = d_week_seq2 - 53
+    ORDER  BY d_week_seq1;
+    """
+
+
+def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
+    """Query 2."""
+    # Load required tables
+    web_sales = get_data(run_config.dataset_path, "web_sales", run_config.suffix)
+    catalog_sales = get_data(
+        run_config.dataset_path, "catalog_sales", run_config.suffix
+    )
+    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)
+    # Step 1: Create wscs CTE equivalent (union of web and catalog sales)
+    wscs = pl.concat(
+        [
+            web_sales.select(
+                [
+                    pl.col("ws_sold_date_sk").alias("sold_date_sk"),
+                    pl.col("ws_ext_sales_price").alias("sales_price"),
+                ]
+            ),
+            catalog_sales.select(
+                [
+                    pl.col("cs_sold_date_sk").alias("sold_date_sk"),
+                    pl.col("cs_ext_sales_price").alias("sales_price"),
+                ]
+            ),
+        ]
+    )
+    # Step 2: Create wswscs CTE equivalent (aggregate by week and day of week)
+    # First join with date_dim to get day names
+    wscs_with_dates = wscs.join(date_dim, left_on="sold_date_sk", right_on="d_date_sk")
+    # Create separate aggregations for each day to better control null handling
+    days = (
+        "Sunday",
+        "Monday",
+        "Tuesday",
+        "Wednesday",
+        "Thursday",
+        "Friday",
+        "Saturday",
+    )
+    day_cols = (
+        "sun_sales",
+        "mon_sales",
+        "tue_sales",
+        "wed_sales",
+        "thu_sales",
+        "fri_sales",
+        "sat_sales",
+    )
+    # Start with all week sequences
+    all_weeks = wscs_with_dates.select("d_week_seq").unique()
+    wswscs = all_weeks
+
+    wswscs = (
+        wscs_with_dates.with_columns(
+            [
+                pl.when(pl.col("d_day_name") == day)
+                .then(pl.col("sales_price"))
+                .otherwise(None)
+                .alias(name)
+                for day, name in zip(days, day_cols, strict=True)
+            ]
+        )
+        .group_by("d_week_seq")
+        .agg(
+            *(pl.col(name).sum().alias(name) for name in day_cols),
+            *(pl.col(name).count().alias(f"{name}_count") for name in day_cols),
+        )
+        .with_columns(
+            [
+                pl.when(pl.col(f"{name}_count") > 0)
+                .then(pl.col(name))
+                .otherwise(None)
+                .alias(name)
+                for name in day_cols
+            ]
+        )
+        .select(["d_week_seq", *day_cols])
+    )
+
+    # Step 3: Create year 1998 data (y subquery equivalent)
+    y_1998 = (
+        wswscs.join(date_dim, left_on="d_week_seq", right_on="d_week_seq")
+        .filter(pl.col("d_year") == 1998)
+        .select(
+            [
+                pl.col("d_week_seq").alias("d_week_seq1"),
+                pl.col("sun_sales").alias("sun_sales1"),
+                pl.col("mon_sales").alias("mon_sales1"),
+                pl.col("tue_sales").alias("tue_sales1"),
+                pl.col("wed_sales").alias("wed_sales1"),
+                pl.col("thu_sales").alias("thu_sales1"),
+                pl.col("fri_sales").alias("fri_sales1"),
+                pl.col("sat_sales").alias("sat_sales1"),
+            ]
+        )
+    )
+    # Step 4: Create year 1999 data (z subquery equivalent)
+    z_1999 = (
+        wswscs.join(date_dim, left_on="d_week_seq", right_on="d_week_seq")
+        .filter(pl.col("d_year") == 1999)
+        .select(
+            [
+                pl.col("d_week_seq").alias("d_week_seq2"),
+                pl.col("sun_sales").alias("sun_sales2"),
+                pl.col("mon_sales").alias("mon_sales2"),
+                pl.col("tue_sales").alias("tue_sales2"),
+                pl.col("wed_sales").alias("wed_sales2"),
+                pl.col("thu_sales").alias("thu_sales2"),
+                pl.col("fri_sales").alias("fri_sales2"),
+                pl.col("sat_sales").alias("sat_sales2"),
+            ]
+        )
+    )
+    # Step 5: Join the two years and calculate ratios
+    return (
+        y_1998.join(z_1999, left_on="d_week_seq1", right_on=pl.col("d_week_seq2") - 53)
+        .select(
+            [
+                pl.col("d_week_seq1"),
+                (pl.col("sun_sales1") / pl.col("sun_sales2"))
+                .round(2)
+                .alias("round((sun_sales1 / sun_sales2), 2)"),
+                (pl.col("mon_sales1") / pl.col("mon_sales2"))
+                .round(2)
+                .alias("round((mon_sales1 / mon_sales2), 2)"),
+                (pl.col("tue_sales1") / pl.col("tue_sales2"))
+                .round(2)
+                .alias("round((tue_sales1 / tue_sales2), 2)"),
+                (pl.col("wed_sales1") / pl.col("wed_sales2"))
+                .round(2)
+                .alias("round((wed_sales1 / wed_sales2), 2)"),
+                (pl.col("thu_sales1") / pl.col("thu_sales2"))
+                .round(2)
+                .alias("round((thu_sales1 / thu_sales2), 2)"),
+                (pl.col("fri_sales1") / pl.col("fri_sales2"))
+                .round(2)
+                .alias("round((fri_sales1 / fri_sales2), 2)"),
+                (pl.col("sat_sales1") / pl.col("sat_sales2"))
+                .round(2)
+                .alias("round((sat_sales1 / sat_sales2), 2)"),
+            ]
+        )
+        .sort("d_week_seq1")
+    )
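Step 5 above aligns week N of 1998 with week N + 53 of the following year by passing an expression, pl.col("d_week_seq2") - 53, as the right-hand join key, so the shift happens inside the join itself. A minimal sketch of that computed-key join with made-up week numbers (not from the benchmark data):

import polars as pl

# Assumed sample week sequences: 5263 - 53 == 5210 and 5264 - 53 == 5211.
y = pl.LazyFrame({"d_week_seq1": [5210, 5211], "sun_sales1": [100.0, 110.0]})
z = pl.LazyFrame({"d_week_seq2": [5263, 5264], "sun_sales2": [80.0, 99.0]})

# right_on accepts an expression, so no intermediate shifted column is needed.
ratios = y.join(z, left_on="d_week_seq1", right_on=pl.col("d_week_seq2") - 53).select(
    "d_week_seq1",
    (pl.col("sun_sales1") / pl.col("sun_sales2")).round(2).alias("ratio"),
)
print(ratios.collect())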
cudf_polars/experimental/benchmarks/pdsds_queries/q3.py
@@ -0,0 +1,65 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Query 3."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import polars as pl
+
+from cudf_polars.experimental.benchmarks.utils import get_data
+
+if TYPE_CHECKING:
+    from cudf_polars.experimental.benchmarks.utils import RunConfig
+
+
+def duckdb_impl(run_config: RunConfig) -> str:
+    """Query 3."""
+    return """
+    SELECT dt.d_year,
+           item.i_brand_id brand_id,
+           item.i_brand brand,
+           Sum(ss_ext_discount_amt) sum_agg
+    FROM   date_dim dt,
+           store_sales,
+           item
+    WHERE  dt.d_date_sk = store_sales.ss_sold_date_sk
+           AND store_sales.ss_item_sk = item.i_item_sk
+           AND item.i_manufact_id = 427
+           AND dt.d_moy = 11
+    GROUP  BY dt.d_year,
+              item.i_brand,
+              item.i_brand_id
+    ORDER  BY dt.d_year,
+              sum_agg DESC,
+              brand_id
+    LIMIT 100;
+    """
+
+
+def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
+    """Query 3."""
+    # Load required tables
+    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)
+    store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix)
+    item = get_data(run_config.dataset_path, "item", run_config.suffix)
+    # Execute the query following the SQL logic
+    return (
+        date_dim.join(store_sales, left_on="d_date_sk", right_on="ss_sold_date_sk")
+        .join(item, left_on="ss_item_sk", right_on="i_item_sk")
+        .filter((pl.col("i_manufact_id") == 427) & (pl.col("d_moy") == 11))
+        .group_by(["d_year", "i_brand", "i_brand_id"])
+        .agg([pl.col("ss_ext_discount_amt").sum().alias("sum_agg")])
+        .select(
+            [
+                pl.col("d_year"),
+                pl.col("i_brand_id").alias("brand_id"),
+                pl.col("i_brand").alias("brand"),
+                pl.col("sum_agg"),
+            ]
+        )
+        .sort(["d_year", "sum_agg", "brand_id"], descending=[False, True, False])
+        .limit(100)
+    )
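Each q*.py module in pdsds_queries follows the same shape: duckdb_impl(run_config) returns the reference SQL string, and polars_impl(run_config) returns a pl.LazyFrame plan built from the same tables. A hypothetical driver loop, for illustration only (the actual entry point is cudf_polars/experimental/benchmarks/pdsds.py, which this diff does not show):

# Hypothetical sketch; run_config is a RunConfig from benchmarks.utils.
from cudf_polars.experimental.benchmarks.pdsds_queries import q2, q3, q10

def run_queries(run_config):
    """Build and collect each lazy query plan."""
    for module in (q2, q3, q10):
        lf = module.polars_impl(run_config)
        # engine="gpu" routes execution through cudf-polars; a plain
        # .collect() would use the default Polars CPU engine instead.
        print(lf.collect(engine="gpu"))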