cudf_polars_cu13-25.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. cudf_polars/GIT_COMMIT +1 -0
  2. cudf_polars/VERSION +1 -0
  3. cudf_polars/__init__.py +28 -0
  4. cudf_polars/_version.py +21 -0
  5. cudf_polars/callback.py +318 -0
  6. cudf_polars/containers/__init__.py +13 -0
  7. cudf_polars/containers/column.py +495 -0
  8. cudf_polars/containers/dataframe.py +361 -0
  9. cudf_polars/containers/datatype.py +137 -0
  10. cudf_polars/dsl/__init__.py +8 -0
  11. cudf_polars/dsl/expr.py +66 -0
  12. cudf_polars/dsl/expressions/__init__.py +8 -0
  13. cudf_polars/dsl/expressions/aggregation.py +226 -0
  14. cudf_polars/dsl/expressions/base.py +272 -0
  15. cudf_polars/dsl/expressions/binaryop.py +120 -0
  16. cudf_polars/dsl/expressions/boolean.py +326 -0
  17. cudf_polars/dsl/expressions/datetime.py +271 -0
  18. cudf_polars/dsl/expressions/literal.py +97 -0
  19. cudf_polars/dsl/expressions/rolling.py +643 -0
  20. cudf_polars/dsl/expressions/selection.py +74 -0
  21. cudf_polars/dsl/expressions/slicing.py +46 -0
  22. cudf_polars/dsl/expressions/sorting.py +85 -0
  23. cudf_polars/dsl/expressions/string.py +1002 -0
  24. cudf_polars/dsl/expressions/struct.py +137 -0
  25. cudf_polars/dsl/expressions/ternary.py +49 -0
  26. cudf_polars/dsl/expressions/unary.py +517 -0
  27. cudf_polars/dsl/ir.py +2607 -0
  28. cudf_polars/dsl/nodebase.py +164 -0
  29. cudf_polars/dsl/to_ast.py +359 -0
  30. cudf_polars/dsl/tracing.py +16 -0
  31. cudf_polars/dsl/translate.py +939 -0
  32. cudf_polars/dsl/traversal.py +224 -0
  33. cudf_polars/dsl/utils/__init__.py +8 -0
  34. cudf_polars/dsl/utils/aggregations.py +481 -0
  35. cudf_polars/dsl/utils/groupby.py +98 -0
  36. cudf_polars/dsl/utils/naming.py +34 -0
  37. cudf_polars/dsl/utils/replace.py +61 -0
  38. cudf_polars/dsl/utils/reshape.py +74 -0
  39. cudf_polars/dsl/utils/rolling.py +121 -0
  40. cudf_polars/dsl/utils/windows.py +192 -0
  41. cudf_polars/experimental/__init__.py +8 -0
  42. cudf_polars/experimental/base.py +386 -0
  43. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  44. cudf_polars/experimental/benchmarks/pdsds.py +220 -0
  45. cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
  46. cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
  47. cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
  48. cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
  49. cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
  50. cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
  51. cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
  52. cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
  53. cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
  54. cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
  55. cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
  56. cudf_polars/experimental/benchmarks/pdsh.py +814 -0
  57. cudf_polars/experimental/benchmarks/utils.py +832 -0
  58. cudf_polars/experimental/dask_registers.py +200 -0
  59. cudf_polars/experimental/dispatch.py +156 -0
  60. cudf_polars/experimental/distinct.py +197 -0
  61. cudf_polars/experimental/explain.py +157 -0
  62. cudf_polars/experimental/expressions.py +590 -0
  63. cudf_polars/experimental/groupby.py +327 -0
  64. cudf_polars/experimental/io.py +943 -0
  65. cudf_polars/experimental/join.py +391 -0
  66. cudf_polars/experimental/parallel.py +423 -0
  67. cudf_polars/experimental/repartition.py +69 -0
  68. cudf_polars/experimental/scheduler.py +155 -0
  69. cudf_polars/experimental/select.py +188 -0
  70. cudf_polars/experimental/shuffle.py +354 -0
  71. cudf_polars/experimental/sort.py +609 -0
  72. cudf_polars/experimental/spilling.py +151 -0
  73. cudf_polars/experimental/statistics.py +795 -0
  74. cudf_polars/experimental/utils.py +169 -0
  75. cudf_polars/py.typed +0 -0
  76. cudf_polars/testing/__init__.py +8 -0
  77. cudf_polars/testing/asserts.py +448 -0
  78. cudf_polars/testing/io.py +122 -0
  79. cudf_polars/testing/plugin.py +236 -0
  80. cudf_polars/typing/__init__.py +219 -0
  81. cudf_polars/utils/__init__.py +8 -0
  82. cudf_polars/utils/config.py +741 -0
  83. cudf_polars/utils/conversion.py +40 -0
  84. cudf_polars/utils/dtypes.py +118 -0
  85. cudf_polars/utils/sorting.py +53 -0
  86. cudf_polars/utils/timer.py +39 -0
  87. cudf_polars/utils/versions.py +27 -0
  88. cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
  89. cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
  90. cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
  91. cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
  92. cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
cudf_polars/experimental/benchmarks/pdsds_queries/q5.py
@@ -0,0 +1,462 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Query 5."""
+
+from __future__ import annotations
+
+from datetime import date, timedelta
+from typing import TYPE_CHECKING
+
+import polars as pl
+
+from cudf_polars.experimental.benchmarks.utils import get_data
+
+if TYPE_CHECKING:
+    from cudf_polars.experimental.benchmarks.utils import RunConfig
+
+
+def duckdb_impl(run_config: RunConfig) -> str:
+    """Query 5."""
+    return """
+    WITH ssr AS
+    (
+        SELECT s_store_id,
+               Sum(sales_price) AS sales,
+               Sum(profit) AS profit,
+               Sum(return_amt) AS returns1,
+               Sum(net_loss) AS profit_loss
+        FROM (
+            SELECT ss_store_sk AS store_sk,
+                   ss_sold_date_sk AS date_sk,
+                   ss_ext_sales_price AS sales_price,
+                   ss_net_profit AS profit,
+                   Cast(0 AS DECIMAL(7,2)) AS return_amt,
+                   Cast(0 AS DECIMAL(7,2)) AS net_loss
+            FROM store_sales
+            UNION ALL
+            SELECT sr_store_sk AS store_sk,
+                   sr_returned_date_sk AS date_sk,
+                   Cast(0 AS DECIMAL(7,2)) AS sales_price,
+                   Cast(0 AS DECIMAL(7,2)) AS profit,
+                   sr_return_amt AS return_amt,
+                   sr_net_loss AS net_loss
+            FROM store_returns ) salesreturns,
+            date_dim,
+            store
+        WHERE date_sk = d_date_sk
+        AND d_date BETWEEN Cast('2002-08-22' AS DATE) AND (
+            Cast('2002-08-22' AS DATE) + INTERVAL '14' day)
+        AND store_sk = s_store_sk
+        GROUP BY s_store_id) , csr AS
+    (
+        SELECT cp_catalog_page_id,
+               sum(sales_price) AS sales,
+               sum(profit) AS profit,
+               sum(return_amt) AS returns1,
+               sum(net_loss) AS profit_loss
+        FROM (
+            SELECT cs_catalog_page_sk AS page_sk,
+                   cs_sold_date_sk AS date_sk,
+                   cs_ext_sales_price AS sales_price,
+                   cs_net_profit AS profit,
+                   cast(0 AS decimal(7,2)) AS return_amt,
+                   cast(0 AS decimal(7,2)) AS net_loss
+            FROM catalog_sales
+            UNION ALL
+            SELECT cr_catalog_page_sk AS page_sk,
+                   cr_returned_date_sk AS date_sk,
+                   cast(0 AS decimal(7,2)) AS sales_price,
+                   cast(0 AS decimal(7,2)) AS profit,
+                   cr_return_amount AS return_amt,
+                   cr_net_loss AS net_loss
+            FROM catalog_returns ) salesreturns,
+            date_dim,
+            catalog_page
+        WHERE date_sk = d_date_sk
+        AND d_date BETWEEN cast('2002-08-22' AS date) AND (
+            cast('2002-08-22' AS date) + INTERVAL '14' day)
+        AND page_sk = cp_catalog_page_sk
+        GROUP BY cp_catalog_page_id) , wsr AS
+    (
+        SELECT web_site_id,
+               sum(sales_price) AS sales,
+               sum(profit) AS profit,
+               sum(return_amt) AS returns1,
+               sum(net_loss) AS profit_loss
+        FROM (
+            SELECT ws_web_site_sk AS wsr_web_site_sk,
+                   ws_sold_date_sk AS date_sk,
+                   ws_ext_sales_price AS sales_price,
+                   ws_net_profit AS profit,
+                   cast(0 AS decimal(7,2)) AS return_amt,
+                   cast(0 AS decimal(7,2)) AS net_loss
+            FROM web_sales
+            UNION ALL
+            SELECT ws_web_site_sk AS wsr_web_site_sk,
+                   wr_returned_date_sk AS date_sk,
+                   cast(0 AS decimal(7,2)) AS sales_price,
+                   cast(0 AS decimal(7,2)) AS profit,
+                   wr_return_amt AS return_amt,
+                   wr_net_loss AS net_loss
+            FROM web_returns
+            LEFT OUTER JOIN web_sales
+            ON (
+                wr_item_sk = ws_item_sk
+                AND wr_order_number = ws_order_number) ) salesreturns,
+            date_dim,
+            web_site
+        WHERE date_sk = d_date_sk
+        AND d_date BETWEEN cast('2002-08-22' AS date) AND (
+            cast('2002-08-22' AS date) + INTERVAL '14' day)
+        AND wsr_web_site_sk = web_site_sk
+        GROUP BY web_site_id)
+    SELECT
+        channel ,
+        id ,
+        sum(sales) AS sales ,
+        sum(returns1) AS returns1 ,
+        sum(profit) AS profit
+    FROM (
+        SELECT 'store channel' AS channel ,
+               'store'
+               || s_store_id AS id ,
+               sales ,
+               returns1 ,
+               (profit - profit_loss) AS profit
+        FROM ssr
+        UNION ALL
+        SELECT 'catalog channel' AS channel ,
+               'catalog_page'
+               || cp_catalog_page_id AS id ,
+               sales ,
+               returns1 ,
+               (profit - profit_loss) AS profit
+        FROM csr
+        UNION ALL
+        SELECT 'web channel' AS channel ,
+               'web_site'
+               || web_site_id AS id ,
+               sales ,
+               returns1 ,
+               (profit - profit_loss) AS profit
+        FROM wsr ) x
+    GROUP BY rollup (channel, id)
+    ORDER BY channel ,
+             id
+    LIMIT 100;
+    """
+
+
+def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
+    """Query 5."""
+    # Load required tables
+    store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix)
+    store_returns = get_data(
+        run_config.dataset_path, "store_returns", run_config.suffix
+    )
+    catalog_sales = get_data(
+        run_config.dataset_path, "catalog_sales", run_config.suffix
+    )
+    catalog_returns = get_data(
+        run_config.dataset_path, "catalog_returns", run_config.suffix
+    )
+    web_sales = get_data(run_config.dataset_path, "web_sales", run_config.suffix)
+    web_returns = get_data(run_config.dataset_path, "web_returns", run_config.suffix)
+    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)
+    store = get_data(run_config.dataset_path, "store", run_config.suffix)
+    catalog_page = get_data(run_config.dataset_path, "catalog_page", run_config.suffix)
+    web_site = get_data(run_config.dataset_path, "web_site", run_config.suffix)
+
+    # Date range filter - use actual date values
+    start_date = date(2002, 8, 22)
+    end_date = start_date + timedelta(days=14)
+
+    # Step 1: Create ssr CTE (Store Sales and Returns)
+    # Filter sales and returns by date first, then transform
+    store_sales_data = (
+        store_sales.join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk")
+        .filter(pl.col("d_date").is_between(start_date, end_date, closed="both"))
+        .select(
+            [
+                pl.col("ss_store_sk").alias("store_sk"),
+                pl.col("ss_sold_date_sk").alias("date_sk"),
+                pl.col("ss_ext_sales_price").alias("sales_price"),
+                pl.col("ss_net_profit").alias("profit"),
+                pl.lit(0.0).alias("return_amt"),
+                pl.lit(0.0).alias("net_loss"),
+            ]
+        )
+    )
+    store_returns_data = (
+        store_returns.join(
+            date_dim, left_on="sr_returned_date_sk", right_on="d_date_sk"
+        )
+        .filter(pl.col("d_date").is_between(start_date, end_date, closed="both"))
+        .select(
+            [
+                pl.col("sr_store_sk").alias("store_sk"),
+                pl.col("sr_returned_date_sk").alias("date_sk"),
+                pl.lit(0.0).alias("sales_price"),
+                pl.lit(0.0).alias("profit"),
+                pl.col("sr_return_amt").alias("return_amt"),
+                pl.col("sr_net_loss").alias("net_loss"),
+            ]
+        )
+    )
+    store_salesreturns = pl.concat([store_sales_data, store_returns_data])
+    ssr = (
+        store_salesreturns.join(store, left_on="store_sk", right_on="s_store_sk")
+        .group_by("s_store_id")
+        .agg(
+            [
+                pl.col("sales_price").sum().alias("sales"),
+                pl.col("sales_price").count().alias("sales_count"),
+                pl.col("profit").sum().alias("profit"),
+                pl.col("profit").count().alias("profit_count"),
+                pl.col("return_amt").sum().alias("returns1"),
+                pl.col("return_amt").count().alias("returns1_count"),
+                pl.col("net_loss").sum().alias("profit_loss"),
+                pl.col("net_loss").count().alias("profit_loss_count"),
+            ]
+        )
+        .with_columns(
+            [
+                pl.when(pl.col("sales_count") > 0)
+                .then(pl.col("sales"))
+                .otherwise(None)
+                .alias("sales"),
+                pl.when(pl.col("profit_count") > 0)
+                .then(pl.col("profit"))
+                .otherwise(None)
+                .alias("profit"),
+                pl.when(pl.col("returns1_count") > 0)
+                .then(pl.col("returns1"))
+                .otherwise(None)
+                .alias("returns1"),
+                pl.when(pl.col("profit_loss_count") > 0)
+                .then(pl.col("profit_loss"))
+                .otherwise(None)
+                .alias("profit_loss"),
+            ]
+        )
+        .drop(["sales_count", "profit_count", "returns1_count", "profit_loss_count"])
+    )
+
+    # Step 2: Create csr CTE (Catalog Sales and Returns)
+    # Filter sales and returns by date first, then transform
+    catalog_sales_data = (
+        catalog_sales.join(date_dim, left_on="cs_sold_date_sk", right_on="d_date_sk")
+        .filter(pl.col("d_date").is_between(start_date, end_date, closed="both"))
+        .select(
+            [
+                pl.col("cs_catalog_page_sk").alias("page_sk"),
+                pl.col("cs_sold_date_sk").alias("date_sk"),
+                pl.col("cs_ext_sales_price").alias("sales_price"),
+                pl.col("cs_net_profit").alias("profit"),
+                pl.lit(0.0).alias("return_amt"),
+                pl.lit(0.0).alias("net_loss"),
+            ]
+        )
+    )
+    catalog_returns_data = (
+        catalog_returns.join(
+            date_dim, left_on="cr_returned_date_sk", right_on="d_date_sk"
+        )
+        .filter(pl.col("d_date").is_between(start_date, end_date, closed="both"))
+        .select(
+            [
+                pl.col("cr_catalog_page_sk").alias("page_sk"),
+                pl.col("cr_returned_date_sk").alias("date_sk"),
+                pl.lit(0.0).alias("sales_price"),
+                pl.lit(0.0).alias("profit"),
+                pl.col("cr_return_amount").alias("return_amt"),
+                pl.col("cr_net_loss").alias("net_loss"),
+            ]
+        )
+    )
+    catalog_salesreturns = pl.concat([catalog_sales_data, catalog_returns_data])
+    csr = (
+        catalog_salesreturns.join(
+            catalog_page, left_on="page_sk", right_on="cp_catalog_page_sk"
+        )
+        .group_by("cp_catalog_page_id")
+        .agg(
+            [
+                pl.col("sales_price").sum().alias("sales"),
+                pl.col("sales_price").count().alias("sales_count"),
+                pl.col("profit").sum().alias("profit"),
+                pl.col("profit").count().alias("profit_count"),
+                pl.col("return_amt").sum().alias("returns1"),
+                pl.col("return_amt").count().alias("returns1_count"),
+                pl.col("net_loss").sum().alias("profit_loss"),
+                pl.col("net_loss").count().alias("profit_loss_count"),
+            ]
+        )
+        .with_columns(
+            [
+                pl.when(pl.col("sales_count") > 0)
+                .then(pl.col("sales"))
+                .otherwise(None)
+                .alias("sales"),
+                pl.when(pl.col("profit_count") > 0)
+                .then(pl.col("profit"))
+                .otherwise(None)
+                .alias("profit"),
+                pl.when(pl.col("returns1_count") > 0)
+                .then(pl.col("returns1"))
+                .otherwise(None)
+                .alias("returns1"),
+                pl.when(pl.col("profit_loss_count") > 0)
+                .then(pl.col("profit_loss"))
+                .otherwise(None)
+                .alias("profit_loss"),
+            ]
+        )
+        .drop(["sales_count", "profit_count", "returns1_count", "profit_loss_count"])
+    )
+
+    # Step 3: Create wsr CTE (Web Sales and Returns)
+    # Filter sales and returns by date first, then transform
+    web_sales_data = (
+        web_sales.join(date_dim, left_on="ws_sold_date_sk", right_on="d_date_sk")
+        .filter(pl.col("d_date").is_between(start_date, end_date, closed="both"))
+        .select(
+            [
+                pl.col("ws_web_site_sk").alias("wsr_web_site_sk"),
+                pl.col("ws_sold_date_sk").alias("date_sk"),
+                pl.col("ws_ext_sales_price").alias("sales_price"),
+                pl.col("ws_net_profit").alias("profit"),
+                pl.lit(0.0).alias("return_amt"),
+                pl.lit(0.0).alias("net_loss"),
+            ]
+        )
+    )
+    # For web returns, we need the LEFT OUTER JOIN with web_sales, then filter by date
+    web_returns_data = (
+        web_returns.join(date_dim, left_on="wr_returned_date_sk", right_on="d_date_sk")
+        .filter(pl.col("d_date").is_between(start_date, end_date, closed="both"))
+        .join(
+            web_sales.select(["ws_item_sk", "ws_order_number", "ws_web_site_sk"]),
+            left_on=["wr_item_sk", "wr_order_number"],
+            right_on=["ws_item_sk", "ws_order_number"],
+            how="left",
+        )
+        .select(
+            [
+                pl.col("ws_web_site_sk").alias("wsr_web_site_sk"),
+                pl.col("wr_returned_date_sk").alias("date_sk"),
+                pl.lit(0.0).alias("sales_price"),
+                pl.lit(0.0).alias("profit"),
+                pl.col("wr_return_amt").alias("return_amt"),
+                pl.col("wr_net_loss").alias("net_loss"),
+            ]
+        )
+    )
+    web_salesreturns = pl.concat([web_sales_data, web_returns_data])
+    wsr = (
+        web_salesreturns.join(
+            web_site, left_on="wsr_web_site_sk", right_on="web_site_sk"
+        )
+        .group_by("web_site_id")
+        .agg(
+            [
+                pl.col("sales_price").sum().alias("sales"),
+                pl.col("sales_price").count().alias("sales_count"),
+                pl.col("profit").sum().alias("profit"),
+                pl.col("profit").count().alias("profit_count"),
+                pl.col("return_amt").sum().alias("returns1"),
+                pl.col("return_amt").count().alias("returns1_count"),
+                pl.col("net_loss").sum().alias("profit_loss"),
+                pl.col("net_loss").count().alias("profit_loss_count"),
+            ]
+        )
+        .with_columns(
+            [
+                pl.when(pl.col("sales_count") > 0)
+                .then(pl.col("sales"))
+                .otherwise(None)
+                .alias("sales"),
+                pl.when(pl.col("profit_count") > 0)
+                .then(pl.col("profit"))
+                .otherwise(None)
+                .alias("profit"),
+                pl.when(pl.col("returns1_count") > 0)
+                .then(pl.col("returns1"))
+                .otherwise(None)
+                .alias("returns1"),
+                pl.when(pl.col("profit_loss_count") > 0)
+                .then(pl.col("profit_loss"))
+                .otherwise(None)
+                .alias("profit_loss"),
+            ]
+        )
+        .drop(["sales_count", "profit_count", "returns1_count", "profit_loss_count"])
+    )
+
+    # Step 4: Create the union of all channels
+    store_channel = ssr.select(
+        [
+            pl.lit("store channel").alias("channel"),
+            (pl.lit("store") + pl.col("s_store_id").cast(pl.Utf8)).alias("id"),
+            pl.col("sales"),
+            pl.col("returns1"),
+            (pl.col("profit") - pl.col("profit_loss")).alias("profit"),
+        ]
+    )
+    catalog_channel = csr.select(
+        [
+            pl.lit("catalog channel").alias("channel"),
+            (pl.lit("catalog_page") + pl.col("cp_catalog_page_id").cast(pl.Utf8)).alias(
+                "id"
+            ),
+            pl.col("sales"),
+            pl.col("returns1"),
+            (pl.col("profit") - pl.col("profit_loss")).alias("profit"),
+        ]
+    )
+    web_channel = wsr.select(
+        [
+            pl.lit("web channel").alias("channel"),
+            (pl.lit("web_site") + pl.col("web_site_id").cast(pl.Utf8)).alias("id"),
+            pl.col("sales"),
+            pl.col("returns1"),
+            (pl.col("profit") - pl.col("profit_loss")).alias("profit"),
+        ]
+    )
+    all_channels = pl.concat([store_channel, catalog_channel, web_channel])
+
+    # Step 5: Group by channel and id (filter out NULL rollup rows)
+    return (
+        all_channels.group_by(["channel", "id"])
+        .agg(
+            [
+                pl.col("sales").sum().alias("sales"),
+                pl.col("sales").count().alias("sales_count"),
+                pl.col("returns1").sum().alias("returns1"),
+                pl.col("returns1").count().alias("returns1_count"),
+                pl.col("profit").sum().alias("profit"),
+                pl.col("profit").count().alias("profit_count"),
+            ]
+        )
+        .with_columns(
+            [
+                pl.when(pl.col("sales_count") > 0)
+                .then(pl.col("sales"))
+                .otherwise(None)
+                .alias("sales"),
+                pl.when(pl.col("returns1_count") > 0)
+                .then(pl.col("returns1"))
+                .otherwise(None)
+                .alias("returns1"),
+                pl.when(pl.col("profit_count") > 0)
+                .then(pl.col("profit"))
+                .otherwise(None)
+                .alias("profit"),
+            ]
+        )
+        .drop(["sales_count", "returns1_count", "profit_count"])
+        .filter(pl.col("channel").is_not_null() & pl.col("id").is_not_null())
+        .sort(["channel", "id"])
+        .limit(100)
+    )
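Note: the sum-plus-count pattern that recurs throughout polars_impl above reproduces SQL aggregate semantics. SQL's Sum yields NULL for a group whose inputs are all NULL, whereas Polars' sum() yields 0; since count() only counts non-null values, masking the sum wherever the count is 0 restores the SQL behaviour. A minimal standalone sketch of the pattern (the frame and column names here are illustrative, not part of the package):

import polars as pl

# Hypothetical data: group "b" contains only a null value.
df = pl.LazyFrame({"key": ["a", "a", "b"], "amount": [1.0, 2.0, None]})

out = (
    df.group_by("key")
    .agg(
        pl.col("amount").sum().alias("total"),
        # count() ignores nulls, so an all-null group has count 0
        pl.col("amount").count().alias("n"),
    )
    .with_columns(
        # map Polars' "sum of all-null is 0" back to SQL's NULL
        pl.when(pl.col("n") > 0).then(pl.col("total")).otherwise(None).alias("total")
    )
    .drop("n")
    .collect()
)
# group "b" now reports total=null rather than 0.0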
cudf_polars/experimental/benchmarks/pdsds_queries/q6.py
@@ -0,0 +1,92 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Query 6."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import polars as pl
+
+from cudf_polars.experimental.benchmarks.utils import get_data
+
+if TYPE_CHECKING:
+    from cudf_polars.experimental.benchmarks.utils import RunConfig
+
+
+def duckdb_impl(run_config: RunConfig) -> str:
+    """Query 6."""
+    return """
+    SELECT a.ca_state state,
+           Count(*) cnt
+    FROM customer_address a,
+         customer c,
+         store_sales s,
+         date_dim d,
+         item i
+    WHERE a.ca_address_sk = c.c_current_addr_sk
+    AND c.c_customer_sk = s.ss_customer_sk
+    AND s.ss_sold_date_sk = d.d_date_sk
+    AND s.ss_item_sk = i.i_item_sk
+    AND d.d_month_seq = (SELECT DISTINCT ( d_month_seq )
+                         FROM date_dim
+                         WHERE d_year = 1998
+                         AND d_moy = 7)
+    AND i.i_current_price > 1.2 * (SELECT Avg(j.i_current_price)
+                                   FROM item j
+                                   WHERE j.i_category = i.i_category)
+    GROUP BY a.ca_state
+    HAVING Count(*) >= 10
+    --ORDER BY cnt
+    ORDER BY cnt, state
+    LIMIT 100;
+    """
+
+
+def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
+    """Query 6."""
+    # Load required tables
+    customer_address = get_data(
+        run_config.dataset_path, "customer_address", run_config.suffix
+    )
+    customer = get_data(run_config.dataset_path, "customer", run_config.suffix)
+    store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix)
+    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)
+    item = get_data(run_config.dataset_path, "item", run_config.suffix)
+
+    # Subquery 1: d_month_seq values for July 1998
+    target_month_seq_table = (
+        date_dim.filter((pl.col("d_year") == 1998) & (pl.col("d_moy") == 7))
+        .select("d_month_seq")
+        .unique()
+    )
+
+    # Subquery 2: Calculate average price per category
+    avg_price_per_category = item.group_by("i_category").agg(
+        pl.col("i_current_price").mean().alias("avg_price")
+    )
+
+    return (
+        customer_address.join(
+            customer, left_on="ca_address_sk", right_on="c_current_addr_sk"
+        )
+        .join(store_sales, left_on="c_customer_sk", right_on="ss_customer_sk")
+        .join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk")
+        .join(item, left_on="ss_item_sk", right_on="i_item_sk")
+        .join(avg_price_per_category, on="i_category")
+        .join(target_month_seq_table, on="d_month_seq", how="semi")
+        .filter(pl.col("i_current_price") > 1.2 * pl.col("avg_price"))
+        .group_by("ca_state")
+        .agg(pl.len().alias("cnt"))
+        .filter(pl.col("cnt") >= 10)
+        .sort(["cnt", "ca_state"], nulls_last=True)
+        .limit(100)
+        .select(
+            [
+                pl.col("ca_state").alias("state"),
+                # Cast -> Int64 to match DuckDB
+                pl.col("cnt").cast(pl.Int64),
+            ]
+        )
+    )
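Note: the interesting step in polars_impl above is the decorrelation of the SQL query's correlated subquery. The predicate i_current_price > 1.2 * (SELECT Avg(j.i_current_price) FROM item j WHERE j.i_category = i.i_category) has no direct lazy-expression equivalent, so the per-category average is materialised once with a group_by and joined back, turning the correlated predicate into an ordinary filter. A self-contained sketch of that rewrite, using made-up data in place of the TPC-DS item table:

import polars as pl

item = pl.LazyFrame(
    {
        "i_item_sk": [1, 2, 3, 4],
        "i_category": ["Books", "Books", "Music", "Music"],
        "i_current_price": [10.0, 20.0, 5.0, 50.0],
    }
)

# Decorrelate: compute each category's average exactly once ...
avg_price = item.group_by("i_category").agg(
    pl.col("i_current_price").mean().alias("avg_price")
)

# ... then join it back so the correlated predicate becomes a plain filter.
expensive = item.join(avg_price, on="i_category").filter(
    pl.col("i_current_price") > 1.2 * pl.col("avg_price")
)
print(expensive.collect())  # items priced >20% above their category average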
cudf_polars/experimental/benchmarks/pdsds_queries/q7.py
@@ -0,0 +1,79 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Query 7."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import polars as pl
+
+from cudf_polars.experimental.benchmarks.utils import get_data
+
+if TYPE_CHECKING:
+    from cudf_polars.experimental.benchmarks.utils import RunConfig
+
+
+def duckdb_impl(run_config: RunConfig) -> str:
+    """Query 7."""
+    return """
+    SELECT i_item_id,
+           Avg(ss_quantity) agg1,
+           Avg(ss_list_price) agg2,
+           Avg(ss_coupon_amt) agg3,
+           Avg(ss_sales_price) agg4
+    FROM store_sales,
+         customer_demographics,
+         date_dim,
+         item,
+         promotion
+    WHERE ss_sold_date_sk = d_date_sk
+    AND ss_item_sk = i_item_sk
+    AND ss_cdemo_sk = cd_demo_sk
+    AND ss_promo_sk = p_promo_sk
+    AND cd_gender = 'F'
+    AND cd_marital_status = 'W'
+    AND cd_education_status = '2 yr Degree'
+    AND ( p_channel_email = 'N'
+          OR p_channel_event = 'N' )
+    AND d_year = 1998
+    GROUP BY i_item_id
+    ORDER BY i_item_id
+    LIMIT 100;
+    """
+
+
+def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
+    """Query 7."""
+    # Load required tables
+    store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix)
+    customer_demographics = get_data(
+        run_config.dataset_path, "customer_demographics", run_config.suffix
+    )
+    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)
+    item = get_data(run_config.dataset_path, "item", run_config.suffix)
+    promotion = get_data(run_config.dataset_path, "promotion", run_config.suffix)
+
+    return (
+        store_sales.join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk")
+        .join(item, left_on="ss_item_sk", right_on="i_item_sk")
+        .join(customer_demographics, left_on="ss_cdemo_sk", right_on="cd_demo_sk")
+        .join(promotion, left_on="ss_promo_sk", right_on="p_promo_sk")
+        .filter(pl.col("cd_gender") == "F")
+        .filter(pl.col("cd_marital_status") == "W")
+        .filter(pl.col("cd_education_status") == "2 yr Degree")
+        .filter((pl.col("p_channel_email") == "N") | (pl.col("p_channel_event") == "N"))
+        .filter(pl.col("d_year") == 1998)
+        .group_by("i_item_id")
+        .agg(
+            [
+                pl.col("ss_quantity").mean().alias("agg1"),
+                pl.col("ss_list_price").mean().alias("agg2"),
+                pl.col("ss_coupon_amt").mean().alias("agg3"),
+                pl.col("ss_sales_price").mean().alias("agg4"),
+            ]
+        )
+        .sort("i_item_id", nulls_last=True)
+        .limit(100)
+    )
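Note: each query module in pdsds_queries follows the same two-function contract seen above: duckdb_impl returns the SQL text and polars_impl returns an uncollected pl.LazyFrame. The benchmark harness that drives them lives in benchmarks/utils.py (not shown in this diff), but as a rough sketch, assuming an already-constructed RunConfig, the lazy plan could be executed on either engine:

# Hedged sketch; `run_config` construction is omitted and assumed to come
# from cudf_polars.experimental.benchmarks.utils.RunConfig.
lf = polars_impl(run_config)

result_cpu = lf.collect()              # default Polars CPU engine
result_gpu = lf.collect(engine="gpu")  # route through cudf-polars' GPU engine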