cudf-polars-cu13 25.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -0
- cudf_polars/VERSION +1 -0
- cudf_polars/__init__.py +28 -0
- cudf_polars/_version.py +21 -0
- cudf_polars/callback.py +318 -0
- cudf_polars/containers/__init__.py +13 -0
- cudf_polars/containers/column.py +495 -0
- cudf_polars/containers/dataframe.py +361 -0
- cudf_polars/containers/datatype.py +137 -0
- cudf_polars/dsl/__init__.py +8 -0
- cudf_polars/dsl/expr.py +66 -0
- cudf_polars/dsl/expressions/__init__.py +8 -0
- cudf_polars/dsl/expressions/aggregation.py +226 -0
- cudf_polars/dsl/expressions/base.py +272 -0
- cudf_polars/dsl/expressions/binaryop.py +120 -0
- cudf_polars/dsl/expressions/boolean.py +326 -0
- cudf_polars/dsl/expressions/datetime.py +271 -0
- cudf_polars/dsl/expressions/literal.py +97 -0
- cudf_polars/dsl/expressions/rolling.py +643 -0
- cudf_polars/dsl/expressions/selection.py +74 -0
- cudf_polars/dsl/expressions/slicing.py +46 -0
- cudf_polars/dsl/expressions/sorting.py +85 -0
- cudf_polars/dsl/expressions/string.py +1002 -0
- cudf_polars/dsl/expressions/struct.py +137 -0
- cudf_polars/dsl/expressions/ternary.py +49 -0
- cudf_polars/dsl/expressions/unary.py +517 -0
- cudf_polars/dsl/ir.py +2607 -0
- cudf_polars/dsl/nodebase.py +164 -0
- cudf_polars/dsl/to_ast.py +359 -0
- cudf_polars/dsl/tracing.py +16 -0
- cudf_polars/dsl/translate.py +939 -0
- cudf_polars/dsl/traversal.py +224 -0
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +481 -0
- cudf_polars/dsl/utils/groupby.py +98 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +61 -0
- cudf_polars/dsl/utils/reshape.py +74 -0
- cudf_polars/dsl/utils/rolling.py +121 -0
- cudf_polars/dsl/utils/windows.py +192 -0
- cudf_polars/experimental/__init__.py +8 -0
- cudf_polars/experimental/base.py +386 -0
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds.py +220 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
- cudf_polars/experimental/benchmarks/pdsh.py +814 -0
- cudf_polars/experimental/benchmarks/utils.py +832 -0
- cudf_polars/experimental/dask_registers.py +200 -0
- cudf_polars/experimental/dispatch.py +156 -0
- cudf_polars/experimental/distinct.py +197 -0
- cudf_polars/experimental/explain.py +157 -0
- cudf_polars/experimental/expressions.py +590 -0
- cudf_polars/experimental/groupby.py +327 -0
- cudf_polars/experimental/io.py +943 -0
- cudf_polars/experimental/join.py +391 -0
- cudf_polars/experimental/parallel.py +423 -0
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +188 -0
- cudf_polars/experimental/shuffle.py +354 -0
- cudf_polars/experimental/sort.py +609 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/statistics.py +795 -0
- cudf_polars/experimental/utils.py +169 -0
- cudf_polars/py.typed +0 -0
- cudf_polars/testing/__init__.py +8 -0
- cudf_polars/testing/asserts.py +448 -0
- cudf_polars/testing/io.py +122 -0
- cudf_polars/testing/plugin.py +236 -0
- cudf_polars/typing/__init__.py +219 -0
- cudf_polars/utils/__init__.py +8 -0
- cudf_polars/utils/config.py +741 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +118 -0
- cudf_polars/utils/sorting.py +53 -0
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +27 -0
- cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
- cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
- cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
- cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
- cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,462 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""Query 5."""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from datetime import date, timedelta
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
10
|
+
|
|
11
|
+
import polars as pl
|
|
12
|
+
|
|
13
|
+
from cudf_polars.experimental.benchmarks.utils import get_data
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from cudf_polars.experimental.benchmarks.utils import RunConfig
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def duckdb_impl(run_config: RunConfig) -> str:
    """Query 5.

    Return the raw SQL text of TPC-DS query 5 for the DuckDB reference
    implementation: per-channel (store / catalog / web) sales, returns and
    net profit over a fixed 14-day window starting 2002-08-22, rolled up
    by channel and outlet id.

    NOTE(review): ``run_config`` is accepted for interface parity with
    ``polars_impl`` and does not appear to be consulted here.
    """
    return """
    WITH ssr AS
    (
        SELECT s_store_id,
               Sum(sales_price) AS sales,
               Sum(profit) AS profit,
               Sum(return_amt) AS returns1,
               Sum(net_loss) AS profit_loss
        FROM (
              SELECT ss_store_sk AS store_sk,
                     ss_sold_date_sk AS date_sk,
                     ss_ext_sales_price AS sales_price,
                     ss_net_profit AS profit,
                     Cast(0 AS DECIMAL(7,2)) AS return_amt,
                     Cast(0 AS DECIMAL(7,2)) AS net_loss
              FROM store_sales
              UNION ALL
              SELECT sr_store_sk AS store_sk,
                     sr_returned_date_sk AS date_sk,
                     Cast(0 AS DECIMAL(7,2)) AS sales_price,
                     Cast(0 AS DECIMAL(7,2)) AS profit,
                     sr_return_amt AS return_amt,
                     sr_net_loss AS net_loss
              FROM store_returns ) salesreturns,
             date_dim,
             store
        WHERE date_sk = d_date_sk
          AND d_date BETWEEN Cast('2002-08-22' AS DATE) AND (
              Cast('2002-08-22' AS DATE) + INTERVAL '14' day)
          AND store_sk = s_store_sk
        GROUP BY s_store_id) , csr AS
    (
        SELECT cp_catalog_page_id,
               sum(sales_price) AS sales,
               sum(profit) AS profit,
               sum(return_amt) AS returns1,
               sum(net_loss) AS profit_loss
        FROM (
              SELECT cs_catalog_page_sk AS page_sk,
                     cs_sold_date_sk AS date_sk,
                     cs_ext_sales_price AS sales_price,
                     cs_net_profit AS profit,
                     cast(0 AS decimal(7,2)) AS return_amt,
                     cast(0 AS decimal(7,2)) AS net_loss
              FROM catalog_sales
              UNION ALL
              SELECT cr_catalog_page_sk AS page_sk,
                     cr_returned_date_sk AS date_sk,
                     cast(0 AS decimal(7,2)) AS sales_price,
                     cast(0 AS decimal(7,2)) AS profit,
                     cr_return_amount AS return_amt,
                     cr_net_loss AS net_loss
              FROM catalog_returns ) salesreturns,
             date_dim,
             catalog_page
        WHERE date_sk = d_date_sk
          AND d_date BETWEEN cast('2002-08-22' AS date) AND (
              cast('2002-08-22' AS date) + INTERVAL '14' day)
          AND page_sk = cp_catalog_page_sk
        GROUP BY cp_catalog_page_id) , wsr AS
    (
        SELECT web_site_id,
               sum(sales_price) AS sales,
               sum(profit) AS profit,
               sum(return_amt) AS returns1,
               sum(net_loss) AS profit_loss
        FROM (
              SELECT ws_web_site_sk AS wsr_web_site_sk,
                     ws_sold_date_sk AS date_sk,
                     ws_ext_sales_price AS sales_price,
                     ws_net_profit AS profit,
                     cast(0 AS decimal(7,2)) AS return_amt,
                     cast(0 AS decimal(7,2)) AS net_loss
              FROM web_sales
              UNION ALL
              SELECT ws_web_site_sk AS wsr_web_site_sk,
                     wr_returned_date_sk AS date_sk,
                     cast(0 AS decimal(7,2)) AS sales_price,
                     cast(0 AS decimal(7,2)) AS profit,
                     wr_return_amt AS return_amt,
                     wr_net_loss AS net_loss
              FROM web_returns
              LEFT OUTER JOIN web_sales
              ON (
                  wr_item_sk = ws_item_sk
                  AND wr_order_number = ws_order_number) ) salesreturns,
             date_dim,
             web_site
        WHERE date_sk = d_date_sk
          AND d_date BETWEEN cast('2002-08-22' AS date) AND (
              cast('2002-08-22' AS date) + INTERVAL '14' day)
          AND wsr_web_site_sk = web_site_sk
        GROUP BY web_site_id)
    SELECT
        channel ,
        id ,
        sum(sales) AS sales ,
        sum(returns1) AS returns1 ,
        sum(profit) AS profit
    FROM (
        SELECT 'store channel' AS channel ,
               'store'
               || s_store_id AS id ,
               sales ,
               returns1 ,
               (profit - profit_loss) AS profit
        FROM ssr
        UNION ALL
        SELECT 'catalog channel' AS channel ,
               'catalog_page'
               || cp_catalog_page_id AS id ,
               sales ,
               returns1 ,
               (profit - profit_loss) AS profit
        FROM csr
        UNION ALL
        SELECT 'web channel' AS channel ,
               'web_site'
               || web_site_id AS id ,
               sales ,
               returns1 ,
               (profit - profit_loss) AS profit
        FROM wsr ) x
    GROUP BY rollup (channel, id)
    ORDER BY channel ,
             id
    LIMIT 100;
    """
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _null_preserving_sum_exprs(
    columns: dict[str, str],
) -> tuple[list[pl.Expr], list[pl.Expr], list[str]]:
    """Build expressions emulating SQL ``SUM`` null semantics.

    SQL ``SUM`` yields NULL for a group with no non-NULL values, whereas
    Polars ``sum`` yields 0.  For each ``source -> alias`` pair, return:

    * aggregation expressions: the sum plus a non-null count per column,
    * post-aggregation expressions that null out sums whose count is 0,
    * the helper count-column names to drop afterwards.
    """
    aggs: list[pl.Expr] = []
    fixes: list[pl.Expr] = []
    drops: list[str] = []
    for source, alias in columns.items():
        count_name = f"{alias}_count"
        aggs.append(pl.col(source).sum().alias(alias))
        aggs.append(pl.col(source).count().alias(count_name))
        fixes.append(
            pl.when(pl.col(count_name) > 0)
            .then(pl.col(alias))
            .otherwise(None)
            .alias(alias)
        )
        drops.append(count_name)
    return aggs, fixes, drops


def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
    """Query 5.

    Per-channel (store / catalog / web) sales, returns and net profit over
    a fixed 14-day window, mirroring :func:`duckdb_impl`.  The previously
    quadruplicated "sum + count, null-out empty groups" boilerplate is
    factored into :func:`_null_preserving_sum_exprs`.
    """
    # Load required tables
    store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix)
    store_returns = get_data(
        run_config.dataset_path, "store_returns", run_config.suffix
    )
    catalog_sales = get_data(
        run_config.dataset_path, "catalog_sales", run_config.suffix
    )
    catalog_returns = get_data(
        run_config.dataset_path, "catalog_returns", run_config.suffix
    )
    web_sales = get_data(run_config.dataset_path, "web_sales", run_config.suffix)
    web_returns = get_data(run_config.dataset_path, "web_returns", run_config.suffix)
    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)
    store = get_data(run_config.dataset_path, "store", run_config.suffix)
    catalog_page = get_data(run_config.dataset_path, "catalog_page", run_config.suffix)
    web_site = get_data(run_config.dataset_path, "web_site", run_config.suffix)

    # Date range filter - use actual date values
    start_date = date(2002, 8, 22)
    end_date = start_date + timedelta(days=14)

    # Shared SUM-with-null-semantics expressions for the three channel CTEs.
    cte_aggs, cte_fixes, cte_drops = _null_preserving_sum_exprs(
        {
            "sales_price": "sales",
            "profit": "profit",
            "return_amt": "returns1",
            "net_loss": "profit_loss",
        }
    )

    # Step 1: Create ssr CTE (Store Sales and Returns)
    # Filter sales and returns by date first, then transform
    store_sales_data = (
        store_sales.join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk")
        .filter(pl.col("d_date").is_between(start_date, end_date, closed="both"))
        .select(
            [
                pl.col("ss_store_sk").alias("store_sk"),
                pl.col("ss_sold_date_sk").alias("date_sk"),
                pl.col("ss_ext_sales_price").alias("sales_price"),
                pl.col("ss_net_profit").alias("profit"),
                pl.lit(0.0).alias("return_amt"),
                pl.lit(0.0).alias("net_loss"),
            ]
        )
    )
    store_returns_data = (
        store_returns.join(
            date_dim, left_on="sr_returned_date_sk", right_on="d_date_sk"
        )
        .filter(pl.col("d_date").is_between(start_date, end_date, closed="both"))
        .select(
            [
                pl.col("sr_store_sk").alias("store_sk"),
                pl.col("sr_returned_date_sk").alias("date_sk"),
                pl.lit(0.0).alias("sales_price"),
                pl.lit(0.0).alias("profit"),
                pl.col("sr_return_amt").alias("return_amt"),
                pl.col("sr_net_loss").alias("net_loss"),
            ]
        )
    )
    ssr = (
        pl.concat([store_sales_data, store_returns_data])
        .join(store, left_on="store_sk", right_on="s_store_sk")
        .group_by("s_store_id")
        .agg(cte_aggs)
        .with_columns(cte_fixes)
        .drop(cte_drops)
    )

    # Step 2: Create csr CTE (Catalog Sales and Returns)
    # Filter sales and returns by date first, then transform
    catalog_sales_data = (
        catalog_sales.join(date_dim, left_on="cs_sold_date_sk", right_on="d_date_sk")
        .filter(pl.col("d_date").is_between(start_date, end_date, closed="both"))
        .select(
            [
                pl.col("cs_catalog_page_sk").alias("page_sk"),
                pl.col("cs_sold_date_sk").alias("date_sk"),
                pl.col("cs_ext_sales_price").alias("sales_price"),
                pl.col("cs_net_profit").alias("profit"),
                pl.lit(0.0).alias("return_amt"),
                pl.lit(0.0).alias("net_loss"),
            ]
        )
    )
    catalog_returns_data = (
        catalog_returns.join(
            date_dim, left_on="cr_returned_date_sk", right_on="d_date_sk"
        )
        .filter(pl.col("d_date").is_between(start_date, end_date, closed="both"))
        .select(
            [
                pl.col("cr_catalog_page_sk").alias("page_sk"),
                pl.col("cr_returned_date_sk").alias("date_sk"),
                pl.lit(0.0).alias("sales_price"),
                pl.lit(0.0).alias("profit"),
                pl.col("cr_return_amount").alias("return_amt"),
                pl.col("cr_net_loss").alias("net_loss"),
            ]
        )
    )
    csr = (
        pl.concat([catalog_sales_data, catalog_returns_data])
        .join(catalog_page, left_on="page_sk", right_on="cp_catalog_page_sk")
        .group_by("cp_catalog_page_id")
        .agg(cte_aggs)
        .with_columns(cte_fixes)
        .drop(cte_drops)
    )

    # Step 3: Create wsr CTE (Web Sales and Returns)
    # Filter sales and returns by date first, then transform
    web_sales_data = (
        web_sales.join(date_dim, left_on="ws_sold_date_sk", right_on="d_date_sk")
        .filter(pl.col("d_date").is_between(start_date, end_date, closed="both"))
        .select(
            [
                pl.col("ws_web_site_sk").alias("wsr_web_site_sk"),
                pl.col("ws_sold_date_sk").alias("date_sk"),
                pl.col("ws_ext_sales_price").alias("sales_price"),
                pl.col("ws_net_profit").alias("profit"),
                pl.lit(0.0).alias("return_amt"),
                pl.lit(0.0).alias("net_loss"),
            ]
        )
    )
    # For web returns, we need the LEFT OUTER JOIN with web_sales to recover
    # the selling web site, then filter by the return date.
    web_returns_data = (
        web_returns.join(date_dim, left_on="wr_returned_date_sk", right_on="d_date_sk")
        .filter(pl.col("d_date").is_between(start_date, end_date, closed="both"))
        .join(
            web_sales.select(["ws_item_sk", "ws_order_number", "ws_web_site_sk"]),
            left_on=["wr_item_sk", "wr_order_number"],
            right_on=["ws_item_sk", "ws_order_number"],
            how="left",
        )
        .select(
            [
                pl.col("ws_web_site_sk").alias("wsr_web_site_sk"),
                pl.col("wr_returned_date_sk").alias("date_sk"),
                pl.lit(0.0).alias("sales_price"),
                pl.lit(0.0).alias("profit"),
                pl.col("wr_return_amt").alias("return_amt"),
                pl.col("wr_net_loss").alias("net_loss"),
            ]
        )
    )
    wsr = (
        pl.concat([web_sales_data, web_returns_data])
        .join(web_site, left_on="wsr_web_site_sk", right_on="web_site_sk")
        .group_by("web_site_id")
        .agg(cte_aggs)
        .with_columns(cte_fixes)
        .drop(cte_drops)
    )

    # Step 4: Create the union of all channels
    store_channel = ssr.select(
        [
            pl.lit("store channel").alias("channel"),
            (pl.lit("store") + pl.col("s_store_id").cast(pl.Utf8)).alias("id"),
            pl.col("sales"),
            pl.col("returns1"),
            (pl.col("profit") - pl.col("profit_loss")).alias("profit"),
        ]
    )
    catalog_channel = csr.select(
        [
            pl.lit("catalog channel").alias("channel"),
            (pl.lit("catalog_page") + pl.col("cp_catalog_page_id").cast(pl.Utf8)).alias(
                "id"
            ),
            pl.col("sales"),
            pl.col("returns1"),
            (pl.col("profit") - pl.col("profit_loss")).alias("profit"),
        ]
    )
    web_channel = wsr.select(
        [
            pl.lit("web channel").alias("channel"),
            (pl.lit("web_site") + pl.col("web_site_id").cast(pl.Utf8)).alias("id"),
            pl.col("sales"),
            pl.col("returns1"),
            (pl.col("profit") - pl.col("profit_loss")).alias("profit"),
        ]
    )
    all_channels = pl.concat([store_channel, catalog_channel, web_channel])

    # Step 5: Group by channel and id (filter out NULL rollup rows)
    final_aggs, final_fixes, final_drops = _null_preserving_sum_exprs(
        {"sales": "sales", "returns1": "returns1", "profit": "profit"}
    )
    return (
        all_channels.group_by(["channel", "id"])
        .agg(final_aggs)
        .with_columns(final_fixes)
        .drop(final_drops)
        .filter(pl.col("channel").is_not_null() & pl.col("id").is_not_null())
        .sort(["channel", "id"])
        .limit(100)
    )
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""Query 6."""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
9
|
+
|
|
10
|
+
import polars as pl
|
|
11
|
+
|
|
12
|
+
from cudf_polars.experimental.benchmarks.utils import get_data
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from cudf_polars.experimental.benchmarks.utils import RunConfig
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def duckdb_impl(run_config: RunConfig) -> str:
    """Query 6.

    Return the raw SQL text of TPC-DS query 6 for the DuckDB reference
    implementation: count customers per state buying items priced more than
    20% above their category average, in the month of July 1998, keeping
    states with at least 10 such customers.

    NOTE(review): ``run_config`` is accepted for interface parity with
    ``polars_impl`` and does not appear to be consulted here.
    """
    return """
    SELECT a.ca_state state,
           Count(*) cnt
    FROM customer_address a,
         customer c,
         store_sales s,
         date_dim d,
         item i
    WHERE a.ca_address_sk = c.c_current_addr_sk
      AND c.c_customer_sk = s.ss_customer_sk
      AND s.ss_sold_date_sk = d.d_date_sk
      AND s.ss_item_sk = i.i_item_sk
      AND d.d_month_seq = (SELECT DISTINCT ( d_month_seq )
                           FROM date_dim
                           WHERE d_year = 1998
                             AND d_moy = 7)
      AND i.i_current_price > 1.2 * (SELECT Avg(j.i_current_price)
                                     FROM item j
                                     WHERE j.i_category = i.i_category)
    GROUP BY a.ca_state
    HAVING Count(*) >= 10
    --ORDER BY cnt
    ORDER BY cnt, state
    LIMIT 100;
    """
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
    """Query 6.

    Count customers per state who bought items priced more than 20% above
    their category's average price during July 1998; keep states with at
    least 10 such purchases.
    """
    # Load required tables
    customer_address = get_data(
        run_config.dataset_path, "customer_address", run_config.suffix
    )
    customer = get_data(run_config.dataset_path, "customer", run_config.suffix)
    store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix)
    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)
    item = get_data(run_config.dataset_path, "item", run_config.suffix)

    # Scalar subquery 1: the distinct d_month_seq value(s) for July 1998.
    july_1998_month_seq = (
        date_dim.filter((pl.col("d_year") == 1998) & (pl.col("d_moy") == 7))
        .select("d_month_seq")
        .unique()
    )

    # Correlated subquery 2: mean current price within each item category.
    category_avg_price = item.group_by("i_category").agg(
        pl.col("i_current_price").mean().alias("avg_price")
    )

    # Re-create the star join from the WHERE clause; the semi join keeps
    # only sales falling in the target month sequence.
    joined = (
        customer_address.join(
            customer, left_on="ca_address_sk", right_on="c_current_addr_sk"
        )
        .join(store_sales, left_on="c_customer_sk", right_on="ss_customer_sk")
        .join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk")
        .join(item, left_on="ss_item_sk", right_on="i_item_sk")
        .join(category_avg_price, on="i_category")
        .join(july_1998_month_seq, on="d_month_seq", how="semi")
    )

    # Price predicate, per-state count, and the HAVING threshold.
    state_counts = (
        joined.filter(pl.col("i_current_price") > 1.2 * pl.col("avg_price"))
        .group_by("ca_state")
        .agg(pl.len().alias("cnt"))
        .filter(pl.col("cnt") >= 10)
    )

    return (
        state_counts.sort(["cnt", "ca_state"], nulls_last=True)
        .limit(100)
        .select(
            [
                pl.col("ca_state").alias("state"),
                # Cast -> Int64 to match DuckDB
                pl.col("cnt").cast(pl.Int64),
            ]
        )
    )
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""Query 7."""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
9
|
+
|
|
10
|
+
import polars as pl
|
|
11
|
+
|
|
12
|
+
from cudf_polars.experimental.benchmarks.utils import get_data
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from cudf_polars.experimental.benchmarks.utils import RunConfig
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def duckdb_impl(run_config: RunConfig) -> str:
    """Query 7.

    Return the raw SQL text of TPC-DS query 7 for the DuckDB reference
    implementation: average quantity, list price, coupon amount and sales
    price per item for 1998 store sales to a specific customer demographic,
    restricted to promotions not sent by both email and event.

    NOTE(review): ``run_config`` is accepted for interface parity with
    ``polars_impl`` and does not appear to be consulted here.
    """
    return """
    SELECT i_item_id,
           Avg(ss_quantity) agg1,
           Avg(ss_list_price) agg2,
           Avg(ss_coupon_amt) agg3,
           Avg(ss_sales_price) agg4
    FROM store_sales,
         customer_demographics,
         date_dim,
         item,
         promotion
    WHERE ss_sold_date_sk = d_date_sk
      AND ss_item_sk = i_item_sk
      AND ss_cdemo_sk = cd_demo_sk
      AND ss_promo_sk = p_promo_sk
      AND cd_gender = 'F'
      AND cd_marital_status = 'W'
      AND cd_education_status = '2 yr Degree'
      AND ( p_channel_email = 'N'
            OR p_channel_event = 'N' )
      AND d_year = 1998
    GROUP BY i_item_id
    ORDER BY i_item_id
    LIMIT 100;
    """
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
    """Query 7.

    Average quantity, list price, coupon amount and sales price per item
    for 1998 store sales to female, widowed, 2-yr-degree customers under
    promotions lacking either email or event channel.
    """
    # Load required tables
    store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix)
    customer_demographics = get_data(
        run_config.dataset_path, "customer_demographics", run_config.suffix
    )
    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)
    item = get_data(run_config.dataset_path, "item", run_config.suffix)
    promotion = get_data(run_config.dataset_path, "promotion", run_config.suffix)

    # Demographic / promotion / year predicates from the WHERE clause;
    # supplying them all to one filter() ANDs them together.
    predicates = [
        pl.col("cd_gender") == "F",
        pl.col("cd_marital_status") == "W",
        pl.col("cd_education_status") == "2 yr Degree",
        (pl.col("p_channel_email") == "N") | (pl.col("p_channel_event") == "N"),
        pl.col("d_year") == 1998,
    ]

    # Output alias -> store_sales column to average.
    averages = {
        "agg1": "ss_quantity",
        "agg2": "ss_list_price",
        "agg3": "ss_coupon_amt",
        "agg4": "ss_sales_price",
    }

    joined = (
        store_sales.join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk")
        .join(item, left_on="ss_item_sk", right_on="i_item_sk")
        .join(customer_demographics, left_on="ss_cdemo_sk", right_on="cd_demo_sk")
        .join(promotion, left_on="ss_promo_sk", right_on="p_promo_sk")
    )

    return (
        joined.filter(*predicates)
        .group_by("i_item_id")
        .agg([pl.col(column).mean().alias(name) for name, column in averages.items()])
        .sort("i_item_id", nulls_last=True)
        .limit(100)
    )
|