cudf-polars-cu13 25.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. cudf_polars/GIT_COMMIT +1 -0
  2. cudf_polars/VERSION +1 -0
  3. cudf_polars/__init__.py +28 -0
  4. cudf_polars/_version.py +21 -0
  5. cudf_polars/callback.py +318 -0
  6. cudf_polars/containers/__init__.py +13 -0
  7. cudf_polars/containers/column.py +495 -0
  8. cudf_polars/containers/dataframe.py +361 -0
  9. cudf_polars/containers/datatype.py +137 -0
  10. cudf_polars/dsl/__init__.py +8 -0
  11. cudf_polars/dsl/expr.py +66 -0
  12. cudf_polars/dsl/expressions/__init__.py +8 -0
  13. cudf_polars/dsl/expressions/aggregation.py +226 -0
  14. cudf_polars/dsl/expressions/base.py +272 -0
  15. cudf_polars/dsl/expressions/binaryop.py +120 -0
  16. cudf_polars/dsl/expressions/boolean.py +326 -0
  17. cudf_polars/dsl/expressions/datetime.py +271 -0
  18. cudf_polars/dsl/expressions/literal.py +97 -0
  19. cudf_polars/dsl/expressions/rolling.py +643 -0
  20. cudf_polars/dsl/expressions/selection.py +74 -0
  21. cudf_polars/dsl/expressions/slicing.py +46 -0
  22. cudf_polars/dsl/expressions/sorting.py +85 -0
  23. cudf_polars/dsl/expressions/string.py +1002 -0
  24. cudf_polars/dsl/expressions/struct.py +137 -0
  25. cudf_polars/dsl/expressions/ternary.py +49 -0
  26. cudf_polars/dsl/expressions/unary.py +517 -0
  27. cudf_polars/dsl/ir.py +2607 -0
  28. cudf_polars/dsl/nodebase.py +164 -0
  29. cudf_polars/dsl/to_ast.py +359 -0
  30. cudf_polars/dsl/tracing.py +16 -0
  31. cudf_polars/dsl/translate.py +939 -0
  32. cudf_polars/dsl/traversal.py +224 -0
  33. cudf_polars/dsl/utils/__init__.py +8 -0
  34. cudf_polars/dsl/utils/aggregations.py +481 -0
  35. cudf_polars/dsl/utils/groupby.py +98 -0
  36. cudf_polars/dsl/utils/naming.py +34 -0
  37. cudf_polars/dsl/utils/replace.py +61 -0
  38. cudf_polars/dsl/utils/reshape.py +74 -0
  39. cudf_polars/dsl/utils/rolling.py +121 -0
  40. cudf_polars/dsl/utils/windows.py +192 -0
  41. cudf_polars/experimental/__init__.py +8 -0
  42. cudf_polars/experimental/base.py +386 -0
  43. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  44. cudf_polars/experimental/benchmarks/pdsds.py +220 -0
  45. cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
  46. cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
  47. cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
  48. cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
  49. cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
  50. cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
  51. cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
  52. cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
  53. cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
  54. cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
  55. cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
  56. cudf_polars/experimental/benchmarks/pdsh.py +814 -0
  57. cudf_polars/experimental/benchmarks/utils.py +832 -0
  58. cudf_polars/experimental/dask_registers.py +200 -0
  59. cudf_polars/experimental/dispatch.py +156 -0
  60. cudf_polars/experimental/distinct.py +197 -0
  61. cudf_polars/experimental/explain.py +157 -0
  62. cudf_polars/experimental/expressions.py +590 -0
  63. cudf_polars/experimental/groupby.py +327 -0
  64. cudf_polars/experimental/io.py +943 -0
  65. cudf_polars/experimental/join.py +391 -0
  66. cudf_polars/experimental/parallel.py +423 -0
  67. cudf_polars/experimental/repartition.py +69 -0
  68. cudf_polars/experimental/scheduler.py +155 -0
  69. cudf_polars/experimental/select.py +188 -0
  70. cudf_polars/experimental/shuffle.py +354 -0
  71. cudf_polars/experimental/sort.py +609 -0
  72. cudf_polars/experimental/spilling.py +151 -0
  73. cudf_polars/experimental/statistics.py +795 -0
  74. cudf_polars/experimental/utils.py +169 -0
  75. cudf_polars/py.typed +0 -0
  76. cudf_polars/testing/__init__.py +8 -0
  77. cudf_polars/testing/asserts.py +448 -0
  78. cudf_polars/testing/io.py +122 -0
  79. cudf_polars/testing/plugin.py +236 -0
  80. cudf_polars/typing/__init__.py +219 -0
  81. cudf_polars/utils/__init__.py +8 -0
  82. cudf_polars/utils/config.py +741 -0
  83. cudf_polars/utils/conversion.py +40 -0
  84. cudf_polars/utils/dtypes.py +118 -0
  85. cudf_polars/utils/sorting.py +53 -0
  86. cudf_polars/utils/timer.py +39 -0
  87. cudf_polars/utils/versions.py +27 -0
  88. cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
  89. cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
  90. cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
  91. cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
  92. cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,359 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Query 4."""
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import TYPE_CHECKING
9
+
10
+ import polars as pl
11
+
12
+ from cudf_polars.experimental.benchmarks.utils import get_data
13
+
14
+ if TYPE_CHECKING:
15
+ from cudf_polars.experimental.benchmarks.utils import RunConfig
16
+
17
+
18
+ def duckdb_impl(run_config: RunConfig) -> str:
19
+ """Return the Query 4 SQL text as a string; ``run_config`` is accepted but not read here."""
+ # The SQL keeps customers whose catalog year_total ratio (dyear 2002 over 2001) is greater than both their store and web ratios.
20
+ return """
21
+ WITH year_total
22
+ AS (SELECT c_customer_id customer_id,
23
+ c_first_name customer_first_name,
24
+ c_last_name customer_last_name,
25
+ c_preferred_cust_flag customer_preferred_cust_flag
26
+ ,
27
+ c_birth_country
28
+ customer_birth_country,
29
+ c_login customer_login,
30
+ c_email_address customer_email_address,
31
+ d_year dyear,
32
+ Sum(( ( ss_ext_list_price - ss_ext_wholesale_cost
33
+ - ss_ext_discount_amt
34
+ )
35
+ +
36
+ ss_ext_sales_price ) / 2) year_total,
37
+ 's' sale_type
38
+ FROM customer,
39
+ store_sales,
40
+ date_dim
41
+ WHERE c_customer_sk = ss_customer_sk
42
+ AND ss_sold_date_sk = d_date_sk
43
+ GROUP BY c_customer_id,
44
+ c_first_name,
45
+ c_last_name,
46
+ c_preferred_cust_flag,
47
+ c_birth_country,
48
+ c_login,
49
+ c_email_address,
50
+ d_year
51
+ UNION ALL
52
+ SELECT c_customer_id customer_id,
53
+ c_first_name customer_first_name,
54
+ c_last_name customer_last_name,
55
+ c_preferred_cust_flag
56
+ customer_preferred_cust_flag,
57
+ c_birth_country customer_birth_country
58
+ ,
59
+ c_login
60
+ customer_login,
61
+ c_email_address customer_email_address
62
+ ,
63
+ d_year dyear
64
+ ,
65
+ Sum(( ( ( cs_ext_list_price
66
+ - cs_ext_wholesale_cost
67
+ - cs_ext_discount_amt
68
+ ) +
69
+ cs_ext_sales_price ) / 2 )) year_total,
70
+ 'c' sale_type
71
+ FROM customer,
72
+ catalog_sales,
73
+ date_dim
74
+ WHERE c_customer_sk = cs_bill_customer_sk
75
+ AND cs_sold_date_sk = d_date_sk
76
+ GROUP BY c_customer_id,
77
+ c_first_name,
78
+ c_last_name,
79
+ c_preferred_cust_flag,
80
+ c_birth_country,
81
+ c_login,
82
+ c_email_address,
83
+ d_year
84
+ UNION ALL
85
+ SELECT c_customer_id customer_id,
86
+ c_first_name customer_first_name,
87
+ c_last_name customer_last_name,
88
+ c_preferred_cust_flag
89
+ customer_preferred_cust_flag,
90
+ c_birth_country customer_birth_country
91
+ ,
92
+ c_login
93
+ customer_login,
94
+ c_email_address customer_email_address
95
+ ,
96
+ d_year dyear
97
+ ,
98
+ Sum(( ( ( ws_ext_list_price
99
+ - ws_ext_wholesale_cost
100
+ - ws_ext_discount_amt
101
+ ) +
102
+ ws_ext_sales_price ) / 2 )) year_total,
103
+ 'w' sale_type
104
+ FROM customer,
105
+ web_sales,
106
+ date_dim
107
+ WHERE c_customer_sk = ws_bill_customer_sk
108
+ AND ws_sold_date_sk = d_date_sk
109
+ GROUP BY c_customer_id,
110
+ c_first_name,
111
+ c_last_name,
112
+ c_preferred_cust_flag,
113
+ c_birth_country,
114
+ c_login,
115
+ c_email_address,
116
+ d_year)
117
+ SELECT t_s_secyear.customer_id,
118
+ t_s_secyear.customer_first_name,
119
+ t_s_secyear.customer_last_name,
120
+ t_s_secyear.customer_preferred_cust_flag
121
+ FROM year_total t_s_firstyear,
122
+ year_total t_s_secyear,
123
+ year_total t_c_firstyear,
124
+ year_total t_c_secyear,
125
+ year_total t_w_firstyear,
126
+ year_total t_w_secyear
127
+ WHERE t_s_secyear.customer_id = t_s_firstyear.customer_id
128
+ AND t_s_firstyear.customer_id = t_c_secyear.customer_id
129
+ AND t_s_firstyear.customer_id = t_c_firstyear.customer_id
130
+ AND t_s_firstyear.customer_id = t_w_firstyear.customer_id
131
+ AND t_s_firstyear.customer_id = t_w_secyear.customer_id
132
+ AND t_s_firstyear.sale_type = 's'
133
+ AND t_c_firstyear.sale_type = 'c'
134
+ AND t_w_firstyear.sale_type = 'w'
135
+ AND t_s_secyear.sale_type = 's'
136
+ AND t_c_secyear.sale_type = 'c'
137
+ AND t_w_secyear.sale_type = 'w'
138
+ AND t_s_firstyear.dyear = 2001
139
+ AND t_s_secyear.dyear = 2001 + 1
140
+ AND t_c_firstyear.dyear = 2001
141
+ AND t_c_secyear.dyear = 2001 + 1
142
+ AND t_w_firstyear.dyear = 2001
143
+ AND t_w_secyear.dyear = 2001 + 1
144
+ AND t_s_firstyear.year_total > 0
145
+ AND t_c_firstyear.year_total > 0
146
+ AND t_w_firstyear.year_total > 0
147
+ AND CASE
148
+ WHEN t_c_firstyear.year_total > 0 THEN t_c_secyear.year_total /
149
+ t_c_firstyear.year_total
150
+ ELSE NULL
151
+ END > CASE
152
+ WHEN t_s_firstyear.year_total > 0 THEN
153
+ t_s_secyear.year_total /
154
+ t_s_firstyear.year_total
155
+ ELSE NULL
156
+ END
157
+ AND CASE
158
+ WHEN t_c_firstyear.year_total > 0 THEN t_c_secyear.year_total /
159
+ t_c_firstyear.year_total
160
+ ELSE NULL
161
+ END > CASE
162
+ WHEN t_w_firstyear.year_total > 0 THEN
163
+ t_w_secyear.year_total /
164
+ t_w_firstyear.year_total
165
+ ELSE NULL
166
+ END
167
+ ORDER BY t_s_secyear.customer_id,
168
+ t_s_secyear.customer_first_name,
169
+ t_s_secyear.customer_last_name,
170
+ t_s_secyear.customer_preferred_cust_flag
171
+ LIMIT 100;
172
+ """
173
+
174
+
175
def build_sales_subquery(
    sales_df: pl.LazyFrame,
    date_df: pl.LazyFrame,
    customer_df: pl.LazyFrame,
    sold_date_key: str,
    customer_key: str,
    col_prefix: str,
    *,
    year_filter: bool = False,
    include_customer_info: bool = False,
) -> pl.LazyFrame:
    """
    Aggregate one sales table into a per-customer ``year_total``.

    The sales rows are joined to ``date_df`` and ``customer_df``, grouped by
    the customer attributes and ``d_year``, and summed.

    Parameters
    ----------
    sales_df
        Sales table holding the ``{col_prefix}ext_*`` amount columns.
    date_df
        Date table; joined on ``d_date_sk``.
    customer_df
        Customer table; joined on ``c_customer_sk``.
    sold_date_key
        Column of ``sales_df`` matched against ``d_date_sk``.
    customer_key
        Column of ``sales_df`` matched against ``c_customer_sk``.
    col_prefix
        Prefix of the amount columns in ``sales_df`` (e.g. ``"ss_"``).
    year_filter
        If True, keep only groups whose ``year_total`` is greater than zero.
    include_customer_info
        If True, the result also carries the customer's first name, last
        name and preferred-customer flag.

    Returns
    -------
    LazyFrame with ``customer_id``, optionally the customer name/flag
    columns, and ``year_total``.
    """
    list_price = pl.col(f"{col_prefix}ext_list_price")
    wholesale_cost = pl.col(f"{col_prefix}ext_wholesale_cost")
    discount_amt = pl.col(f"{col_prefix}ext_discount_amt")
    sales_price = pl.col(f"{col_prefix}ext_sales_price")
    # Same arithmetic as the SQL: ((list - wholesale - discount) + sales) / 2
    per_row_amount = ((list_price - wholesale_cost - discount_amt) + sales_price) / 2

    group_keys = [
        "c_customer_id",
        "c_first_name",
        "c_last_name",
        "c_preferred_cust_flag",
        "c_birth_country",
        "c_login",
        "c_email_address",
        "d_year",
    ]

    with_dates = sales_df.join(date_df, left_on=sold_date_key, right_on="d_date_sk")
    with_customers = with_dates.join(
        customer_df, left_on=customer_key, right_on="c_customer_sk"
    )
    totals = with_customers.group_by(group_keys).agg(
        per_row_amount.sum().alias("year_total")
    )

    if year_filter:
        totals = totals.filter(pl.col("year_total") > 0)

    # customer_id always comes first and year_total last; the descriptive
    # customer columns sit in between only when requested.
    output = [pl.col("c_customer_id").alias("customer_id")]
    if include_customer_info:
        output += [
            pl.col("c_first_name").alias("customer_first_name"),
            pl.col("c_last_name").alias("customer_last_name"),
            pl.col("c_preferred_cust_flag").alias("customer_preferred_cust_flag"),
        ]
    output.append(pl.col("year_total"))
    return totals.select(output)
230
+
231
+
232
def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
    """
    Build the Polars lazy query for Query 4.

    Six per-customer ``year_total`` frames are built (store, catalog and web
    sales, each for 2001 and 2002), joined on ``customer_id``, and filtered to
    customers whose catalog ratio (2002 total over 2001 total) is greater
    than both the store ratio and the web ratio.

    Parameters
    ----------
    run_config
        Supplies ``dataset_path`` and ``suffix`` used to load each table.

    Returns
    -------
    LazyFrame with the customer id, first name, last name and
    preferred-customer flag, sorted by those columns and limited to 100 rows.
    """

    def load(table: str) -> pl.LazyFrame:
        return get_data(run_config.dataset_path, table, run_config.suffix)

    # Load required tables
    customer = load("customer")
    store_sales = load("store_sales")
    catalog_sales = load("catalog_sales")
    web_sales = load("web_sales")
    date_dim = load("date_dim")

    first_year_dates = date_dim.filter(pl.col("d_year") == 2001)
    second_year_dates = date_dim.filter(pl.col("d_year") == 2002)

    # Per-channel join keys and column prefix
    store_keys = {
        "sold_date_key": "ss_sold_date_sk",
        "customer_key": "ss_customer_sk",
        "col_prefix": "ss_",
    }
    catalog_keys = {
        "sold_date_key": "cs_sold_date_sk",
        "customer_key": "cs_bill_customer_sk",
        "col_prefix": "cs_",
    }
    web_keys = {
        "sold_date_key": "ws_sold_date_sk",
        "customer_key": "ws_bill_customer_sk",
        "col_prefix": "ws_",
    }

    # Store sales - first year (2001)
    t_s_firstyear = build_sales_subquery(
        store_sales,
        first_year_dates,
        customer,
        **store_keys,
        year_filter=True,
        include_customer_info=True,
    )
    # Store sales - second year (2002)
    t_s_secyear = build_sales_subquery(
        store_sales,
        second_year_dates,
        customer,
        **store_keys,
        year_filter=False,
        include_customer_info=True,
    )
    # Catalog sales - first year (2001)
    t_c_firstyear = build_sales_subquery(
        catalog_sales,
        first_year_dates,
        customer,
        **catalog_keys,
        year_filter=True,
        include_customer_info=False,
    )
    # Catalog sales - second year (2002)
    t_c_secyear = build_sales_subquery(
        catalog_sales,
        second_year_dates,
        customer,
        **catalog_keys,
        year_filter=False,
        include_customer_info=False,
    )
    # Web sales - first year (2001)
    t_w_firstyear = build_sales_subquery(
        web_sales,
        first_year_dates,
        customer,
        **web_keys,
        year_filter=True,
        include_customer_info=False,
    )
    # Web sales - second year (2002)
    t_w_secyear = build_sales_subquery(
        web_sales,
        second_year_dates,
        customer,
        **web_keys,
        year_filter=False,
        include_customer_info=False,
    )

    # The left-most frame (store, second year) keeps the unsuffixed
    # ``year_total``; every later frame's ``year_total`` gets its suffix.
    joined = t_s_secyear
    for frame, suffix in (
        (t_s_firstyear, "_sf"),
        (t_c_firstyear, "_cf"),
        (t_c_secyear, "_cs"),
        (t_w_firstyear, "_wf"),
        (t_w_secyear, "_ws"),
    ):
        joined = joined.join(frame, on="customer_id", suffix=suffix, how="inner")

    store_first = pl.col("year_total_sf")
    catalog_first = pl.col("year_total_cf")
    web_first = pl.col("year_total_wf")

    def growth(second: pl.Expr, first: pl.Expr) -> pl.Expr:
        # Mirrors the SQL CASE: ratio when the first-year total is positive,
        # otherwise null.
        return pl.when(first > 0).then(second / first).otherwise(None)

    catalog_growth = growth(pl.col("year_total_cs"), catalog_first)
    store_growth = growth(pl.col("year_total"), store_first)
    web_growth = growth(pl.col("year_total_ws"), web_first)

    keep = (
        # All first year totals must be > 0
        (store_first > 0)
        & (catalog_first > 0)
        & (web_first > 0)
        # Catalog growth rate > Store growth rate
        & (catalog_growth > store_growth)
        # Catalog growth rate > Web growth rate
        & (catalog_growth > web_growth)
    )

    sort_cols = [
        "customer_id",
        "customer_first_name",
        "customer_last_name",
        "customer_preferred_cust_flag",
    ]
    return joined.filter(keep).select(sort_cols).sort(sort_cols).limit(100)