cudf-polars-cu13 25.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. cudf_polars/GIT_COMMIT +1 -0
  2. cudf_polars/VERSION +1 -0
  3. cudf_polars/__init__.py +28 -0
  4. cudf_polars/_version.py +21 -0
  5. cudf_polars/callback.py +318 -0
  6. cudf_polars/containers/__init__.py +13 -0
  7. cudf_polars/containers/column.py +495 -0
  8. cudf_polars/containers/dataframe.py +361 -0
  9. cudf_polars/containers/datatype.py +137 -0
  10. cudf_polars/dsl/__init__.py +8 -0
  11. cudf_polars/dsl/expr.py +66 -0
  12. cudf_polars/dsl/expressions/__init__.py +8 -0
  13. cudf_polars/dsl/expressions/aggregation.py +226 -0
  14. cudf_polars/dsl/expressions/base.py +272 -0
  15. cudf_polars/dsl/expressions/binaryop.py +120 -0
  16. cudf_polars/dsl/expressions/boolean.py +326 -0
  17. cudf_polars/dsl/expressions/datetime.py +271 -0
  18. cudf_polars/dsl/expressions/literal.py +97 -0
  19. cudf_polars/dsl/expressions/rolling.py +643 -0
  20. cudf_polars/dsl/expressions/selection.py +74 -0
  21. cudf_polars/dsl/expressions/slicing.py +46 -0
  22. cudf_polars/dsl/expressions/sorting.py +85 -0
  23. cudf_polars/dsl/expressions/string.py +1002 -0
  24. cudf_polars/dsl/expressions/struct.py +137 -0
  25. cudf_polars/dsl/expressions/ternary.py +49 -0
  26. cudf_polars/dsl/expressions/unary.py +517 -0
  27. cudf_polars/dsl/ir.py +2607 -0
  28. cudf_polars/dsl/nodebase.py +164 -0
  29. cudf_polars/dsl/to_ast.py +359 -0
  30. cudf_polars/dsl/tracing.py +16 -0
  31. cudf_polars/dsl/translate.py +939 -0
  32. cudf_polars/dsl/traversal.py +224 -0
  33. cudf_polars/dsl/utils/__init__.py +8 -0
  34. cudf_polars/dsl/utils/aggregations.py +481 -0
  35. cudf_polars/dsl/utils/groupby.py +98 -0
  36. cudf_polars/dsl/utils/naming.py +34 -0
  37. cudf_polars/dsl/utils/replace.py +61 -0
  38. cudf_polars/dsl/utils/reshape.py +74 -0
  39. cudf_polars/dsl/utils/rolling.py +121 -0
  40. cudf_polars/dsl/utils/windows.py +192 -0
  41. cudf_polars/experimental/__init__.py +8 -0
  42. cudf_polars/experimental/base.py +386 -0
  43. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  44. cudf_polars/experimental/benchmarks/pdsds.py +220 -0
  45. cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
  46. cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
  47. cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
  48. cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
  49. cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
  50. cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
  51. cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
  52. cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
  53. cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
  54. cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
  55. cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
  56. cudf_polars/experimental/benchmarks/pdsh.py +814 -0
  57. cudf_polars/experimental/benchmarks/utils.py +832 -0
  58. cudf_polars/experimental/dask_registers.py +200 -0
  59. cudf_polars/experimental/dispatch.py +156 -0
  60. cudf_polars/experimental/distinct.py +197 -0
  61. cudf_polars/experimental/explain.py +157 -0
  62. cudf_polars/experimental/expressions.py +590 -0
  63. cudf_polars/experimental/groupby.py +327 -0
  64. cudf_polars/experimental/io.py +943 -0
  65. cudf_polars/experimental/join.py +391 -0
  66. cudf_polars/experimental/parallel.py +423 -0
  67. cudf_polars/experimental/repartition.py +69 -0
  68. cudf_polars/experimental/scheduler.py +155 -0
  69. cudf_polars/experimental/select.py +188 -0
  70. cudf_polars/experimental/shuffle.py +354 -0
  71. cudf_polars/experimental/sort.py +609 -0
  72. cudf_polars/experimental/spilling.py +151 -0
  73. cudf_polars/experimental/statistics.py +795 -0
  74. cudf_polars/experimental/utils.py +169 -0
  75. cudf_polars/py.typed +0 -0
  76. cudf_polars/testing/__init__.py +8 -0
  77. cudf_polars/testing/asserts.py +448 -0
  78. cudf_polars/testing/io.py +122 -0
  79. cudf_polars/testing/plugin.py +236 -0
  80. cudf_polars/typing/__init__.py +219 -0
  81. cudf_polars/utils/__init__.py +8 -0
  82. cudf_polars/utils/config.py +741 -0
  83. cudf_polars/utils/conversion.py +40 -0
  84. cudf_polars/utils/dtypes.py +118 -0
  85. cudf_polars/utils/sorting.py +53 -0
  86. cudf_polars/utils/timer.py +39 -0
  87. cudf_polars/utils/versions.py +27 -0
  88. cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
  89. cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
  90. cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
  91. cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
  92. cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,814 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """
5
+ Experimental PDS-H benchmarks.
6
+
7
+ Based on https://github.com/pola-rs/polars-benchmark.
8
+
9
+ WARNING: This is an experimental (and unofficial)
10
+ benchmark script. It is not intended for public use
11
+ and may be modified or removed at any time.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import contextlib
17
+ import os
18
+ from datetime import date
19
+ from typing import TYPE_CHECKING
20
+
21
+ import polars as pl
22
+
23
+ with contextlib.suppress(ImportError):
24
+ from cudf_polars.experimental.benchmarks.utils import (
25
+ get_data,
26
+ run_polars,
27
+ )
28
+
29
+
30
+ if TYPE_CHECKING:
31
+ from cudf_polars.experimental.benchmarks.utils import RunConfig
32
+
33
# Provide KvikIO defaults while respecting values the caller already exported.
# Without these settings, the first IO task to run on each worker
# takes ~15 sec extra.
os.environ.setdefault("KVIKIO_COMPAT_MODE", "on")
os.environ.setdefault("KVIKIO_NTHREADS", "8")
37
+
38
+
39
class PDSHQueries:
    """
    PDS-H query definitions.

    Each ``qN`` static method takes a ``RunConfig`` and returns an
    unevaluated ``pl.LazyFrame`` describing query ``N``. Input tables are
    loaded with ``get_data(dataset_path, table_name, suffix)``.
    """

    # Identifier of this query collection.
    name: str = "pdsh"

    @staticmethod
    def q0(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 0.

        Return an empty LazyFrame; ``run_config`` is not used.
        """
        return pl.LazyFrame()

    @staticmethod
    def q1(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 1.

        Aggregate ``lineitem`` rows shipped on or before 1998-09-02 per
        (``l_returnflag``, ``l_linestatus``): sums, means and a row count,
        sorted by the group keys.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        lineitem = get_data(path, "lineitem", suffix)

        var1 = date(1998, 9, 2)

        return (
            lineitem.filter(pl.col("l_shipdate") <= var1)
            .group_by("l_returnflag", "l_linestatus")
            .agg(
                pl.sum("l_quantity").alias("sum_qty"),
                pl.sum("l_extendedprice").alias("sum_base_price"),
                (pl.col("l_extendedprice") * (1.0 - pl.col("l_discount")))
                .sum()
                .alias("sum_disc_price"),
                (
                    pl.col("l_extendedprice")
                    * (1.0 - pl.col("l_discount"))
                    * (1.0 + pl.col("l_tax"))
                )
                .sum()
                .alias("sum_charge"),
                pl.mean("l_quantity").alias("avg_qty"),
                pl.mean("l_extendedprice").alias("avg_price"),
                pl.mean("l_discount").alias("avg_disc"),
                pl.len().alias("count_order"),
            )
            .sort("l_returnflag", "l_linestatus")
        )

    @staticmethod
    def q2(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 2.

        For parts of size 15 whose type ends with "BRASS" and whose supplier
        is in region "EUROPE", keep the supplier rows having the minimum
        ``ps_supplycost`` per part. Returns the top 100 rows ordered by
        ``s_acctbal`` (descending), ``n_name``, ``s_name``, ``p_partkey``.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        nation = get_data(path, "nation", suffix)
        part = get_data(path, "part", suffix)
        partsupp = get_data(path, "partsupp", suffix)
        region = get_data(path, "region", suffix)
        supplier = get_data(path, "supplier", suffix)

        var1 = 15
        var2 = "BRASS"
        var3 = "EUROPE"

        q1 = (
            part.join(partsupp, left_on="p_partkey", right_on="ps_partkey")
            .join(supplier, left_on="ps_suppkey", right_on="s_suppkey")
            .join(nation, left_on="s_nationkey", right_on="n_nationkey")
            .join(region, left_on="n_regionkey", right_on="r_regionkey")
            .filter(pl.col("p_size") == var1)
            .filter(pl.col("p_type").str.ends_with(var2))
            .filter(pl.col("r_name") == var3)
        )

        return (
            # Joining the per-part minimum cost back onto q1 keeps only the
            # rows that attain that minimum.
            q1.group_by("p_partkey")
            .agg(pl.min("ps_supplycost"))
            .join(q1, on=["p_partkey", "ps_supplycost"])
            .select(
                "s_acctbal",
                "s_name",
                "n_name",
                "p_partkey",
                "p_mfgr",
                "s_address",
                "s_phone",
                "s_comment",
            )
            .sort(
                by=["s_acctbal", "n_name", "s_name", "p_partkey"],
                descending=[True, False, False, False],
            )
            .head(100)
        )

    @staticmethod
    def q3(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 3.

        Revenue per order for "BUILDING" customers whose order was placed
        before 1995-03-15 and shipped after it. Returns the top 10 rows by
        revenue (descending), then order date.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        customer = get_data(path, "customer", suffix)
        lineitem = get_data(path, "lineitem", suffix)
        orders = get_data(path, "orders", suffix)

        var1 = "BUILDING"
        var2 = date(1995, 3, 15)

        return (
            customer.filter(pl.col("c_mktsegment") == var1)
            .join(orders, left_on="c_custkey", right_on="o_custkey")
            .join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
            .filter(pl.col("o_orderdate") < var2)
            .filter(pl.col("l_shipdate") > var2)
            .with_columns(
                (pl.col("l_extendedprice") * (1 - pl.col("l_discount"))).alias(
                    "revenue"
                )
            )
            .group_by("o_orderkey", "o_orderdate", "o_shippriority")
            .agg(pl.sum("revenue"))
            .select(
                pl.col("o_orderkey").alias("l_orderkey"),
                "revenue",
                "o_orderdate",
                "o_shippriority",
            )
            .sort(by=["revenue", "o_orderdate"], descending=[True, False])
            .head(10)
        )

    @staticmethod
    def q4(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 4.

        Count orders placed in [1993-07-01, 1993-10-01) that have at least
        one line item received after its commit date, per order priority.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        lineitem = get_data(path, "lineitem", suffix)
        orders = get_data(path, "orders", suffix)

        var1 = date(1993, 7, 1)
        var2 = date(1993, 10, 1)

        return (
            # SQL exists translates to semi join in Polars API
            orders.join(
                (lineitem.filter(pl.col("l_commitdate") < pl.col("l_receiptdate"))),
                left_on="o_orderkey",
                right_on="l_orderkey",
                how="semi",
            )
            .filter(pl.col("o_orderdate").is_between(var1, var2, closed="left"))
            .group_by("o_orderpriority")
            .agg(pl.len().alias("order_count"))
            .sort("o_orderpriority")
        )

    @staticmethod
    def q5(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 5.

        Revenue per nation in region "ASIA" for orders placed in
        [1994-01-01, 1995-01-01), restricted to line items whose supplier is
        in the same nation as the customer. Sorted by revenue, descending.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        customer = get_data(path, "customer", suffix)
        lineitem = get_data(path, "lineitem", suffix)
        nation = get_data(path, "nation", suffix)
        orders = get_data(path, "orders", suffix)
        region = get_data(path, "region", suffix)
        supplier = get_data(path, "supplier", suffix)

        var1 = "ASIA"
        var2 = date(1994, 1, 1)
        var3 = date(1995, 1, 1)

        return (
            region.join(nation, left_on="r_regionkey", right_on="n_regionkey")
            .join(customer, left_on="n_nationkey", right_on="c_nationkey")
            .join(orders, left_on="c_custkey", right_on="o_custkey")
            .join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
            .join(
                # The second key pair forces supplier and customer to share
                # a nation.
                supplier,
                left_on=["l_suppkey", "n_nationkey"],
                right_on=["s_suppkey", "s_nationkey"],
            )
            .filter(pl.col("r_name") == var1)
            .filter(pl.col("o_orderdate").is_between(var2, var3, closed="left"))
            .with_columns(
                (pl.col("l_extendedprice") * (1 - pl.col("l_discount"))).alias(
                    "revenue"
                )
            )
            .group_by("n_name")
            .agg(pl.sum("revenue"))
            .sort(by="revenue", descending=True)
        )

    @staticmethod
    def q6(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 6.

        Sum ``l_extendedprice * l_discount`` over line items shipped in
        [1994-01-01, 1995-01-01) with discount in [0.05, 0.07] and quantity
        below 24.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        lineitem = get_data(path, "lineitem", suffix)

        var1 = date(1994, 1, 1)
        var2 = date(1995, 1, 1)
        var3 = 0.05
        var4 = 0.07
        var5 = 24

        return (
            lineitem.filter(pl.col("l_shipdate").is_between(var1, var2, closed="left"))
            .filter(pl.col("l_discount").is_between(var3, var4))
            .filter(pl.col("l_quantity") < var5)
            .with_columns(
                (pl.col("l_extendedprice") * pl.col("l_discount")).alias("revenue")
            )
            .select(pl.sum("revenue"))
        )

    @staticmethod
    def q7(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 7.

        Revenue per (supplier nation, customer nation, ship year) for line
        items shipped between 1995-01-01 and 1996-12-31 (inclusive) where one
        side is "FRANCE" and the other is "GERMANY".
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        customer = get_data(path, "customer", suffix)
        lineitem = get_data(path, "lineitem", suffix)
        nation = get_data(path, "nation", suffix)
        orders = get_data(path, "orders", suffix)
        supplier = get_data(path, "supplier", suffix)

        var1 = "FRANCE"
        var2 = "GERMANY"
        var3 = date(1995, 1, 1)
        var4 = date(1996, 12, 31)

        n1 = nation.filter(pl.col("n_name") == var1)
        n2 = nation.filter(pl.col("n_name") == var2)

        # Customers in var1, suppliers in var2.
        q1 = (
            customer.join(n1, left_on="c_nationkey", right_on="n_nationkey")
            .join(orders, left_on="c_custkey", right_on="o_custkey")
            .rename({"n_name": "cust_nation"})
            .join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
            .join(supplier, left_on="l_suppkey", right_on="s_suppkey")
            .join(n2, left_on="s_nationkey", right_on="n_nationkey")
            .rename({"n_name": "supp_nation"})
        )

        # The mirrored direction: customers in var2, suppliers in var1.
        q2 = (
            customer.join(n2, left_on="c_nationkey", right_on="n_nationkey")
            .join(orders, left_on="c_custkey", right_on="o_custkey")
            .rename({"n_name": "cust_nation"})
            .join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
            .join(supplier, left_on="l_suppkey", right_on="s_suppkey")
            .join(n1, left_on="s_nationkey", right_on="n_nationkey")
            .rename({"n_name": "supp_nation"})
        )

        return (
            pl.concat([q1, q2])
            .filter(pl.col("l_shipdate").is_between(var3, var4))
            .with_columns(
                (pl.col("l_extendedprice") * (1 - pl.col("l_discount"))).alias(
                    "volume"
                ),
                pl.col("l_shipdate").dt.year().alias("l_year"),
            )
            .group_by("supp_nation", "cust_nation", "l_year")
            .agg(pl.sum("volume").alias("revenue"))
            .sort(by=["supp_nation", "cust_nation", "l_year"])
        )

    @staticmethod
    def q8(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 8.

        Per order year, the share of volume supplied from "BRAZIL" among
        "ECONOMY ANODIZED STEEL" parts ordered by customers in region
        "AMERICA" between 1995-01-01 and 1996-12-31 (inclusive), rounded to
        two decimals.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        customer = get_data(path, "customer", suffix)
        lineitem = get_data(path, "lineitem", suffix)
        nation = get_data(path, "nation", suffix)
        orders = get_data(path, "orders", suffix)
        part = get_data(path, "part", suffix)
        region = get_data(path, "region", suffix)
        supplier = get_data(path, "supplier", suffix)

        var1 = "BRAZIL"
        var2 = "AMERICA"
        var3 = "ECONOMY ANODIZED STEEL"
        var4 = date(1995, 1, 1)
        var5 = date(1996, 12, 31)

        # nation is joined twice: n1 resolves the customer's region,
        # n2 resolves the supplier's nation name.
        n1 = nation.select("n_nationkey", "n_regionkey")
        n2 = nation.select("n_nationkey", "n_name")

        return (
            part.join(lineitem, left_on="p_partkey", right_on="l_partkey")
            .join(supplier, left_on="l_suppkey", right_on="s_suppkey")
            .join(orders, left_on="l_orderkey", right_on="o_orderkey")
            .join(customer, left_on="o_custkey", right_on="c_custkey")
            .join(n1, left_on="c_nationkey", right_on="n_nationkey")
            .join(region, left_on="n_regionkey", right_on="r_regionkey")
            .filter(pl.col("r_name") == var2)
            .join(n2, left_on="s_nationkey", right_on="n_nationkey")
            .filter(pl.col("o_orderdate").is_between(var4, var5))
            .filter(pl.col("p_type") == var3)
            .select(
                pl.col("o_orderdate").dt.year().alias("o_year"),
                (pl.col("l_extendedprice") * (1 - pl.col("l_discount"))).alias(
                    "volume"
                ),
                pl.col("n_name").alias("nation"),
            )
            .with_columns(
                # Volume counted only when the supplier nation is var1.
                pl.when(pl.col("nation") == var1)
                .then(pl.col("volume"))
                .otherwise(0)
                .alias("_tmp")
            )
            .group_by("o_year")
            .agg((pl.sum("_tmp") / pl.sum("volume")).round(2).alias("mkt_share"))
            .sort("o_year")
        )

    @staticmethod
    def q9(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 9.

        Profit per (supplier nation, order year) for parts whose name
        contains "green", sorted by nation ascending and year descending.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        lineitem = get_data(path, "lineitem", suffix)
        nation = get_data(path, "nation", suffix)
        orders = get_data(path, "orders", suffix)
        part = get_data(path, "part", suffix)
        partsupp = get_data(path, "partsupp", suffix)
        supplier = get_data(path, "supplier", suffix)

        return (
            part.join(partsupp, left_on="p_partkey", right_on="ps_partkey")
            .join(supplier, left_on="ps_suppkey", right_on="s_suppkey")
            .join(
                lineitem,
                left_on=["p_partkey", "ps_suppkey"],
                right_on=["l_partkey", "l_suppkey"],
            )
            .join(orders, left_on="l_orderkey", right_on="o_orderkey")
            .join(nation, left_on="s_nationkey", right_on="n_nationkey")
            .filter(pl.col("p_name").str.contains("green"))
            .select(
                pl.col("n_name").alias("nation"),
                pl.col("o_orderdate").dt.year().alias("o_year"),
                (
                    pl.col("l_extendedprice") * (1 - pl.col("l_discount"))
                    - pl.col("ps_supplycost") * pl.col("l_quantity")
                ).alias("amount"),
            )
            .group_by("nation", "o_year")
            .agg(pl.sum("amount").round(2).alias("sum_profit"))
            .sort(by=["nation", "o_year"], descending=[False, True])
        )

    @staticmethod
    def q10(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 10.

        Revenue from returned line items (``l_returnflag == "R"``) per
        customer for orders placed in [1993-10-01, 1994-01-01). Returns the
        top 20 customers by revenue.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        customer = get_data(path, "customer", suffix)
        lineitem = get_data(path, "lineitem", suffix)
        nation = get_data(path, "nation", suffix)
        orders = get_data(path, "orders", suffix)

        var1 = date(1993, 10, 1)
        var2 = date(1994, 1, 1)

        return (
            customer.join(orders, left_on="c_custkey", right_on="o_custkey")
            .join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
            .join(nation, left_on="c_nationkey", right_on="n_nationkey")
            .filter(pl.col("o_orderdate").is_between(var1, var2, closed="left"))
            .filter(pl.col("l_returnflag") == "R")
            .group_by(
                "c_custkey",
                "c_name",
                "c_acctbal",
                "c_phone",
                "n_name",
                "c_address",
                "c_comment",
            )
            .agg(
                (pl.col("l_extendedprice") * (1 - pl.col("l_discount")))
                .sum()
                .round(2)
                .alias("revenue")
            )
            .select(
                "c_custkey",
                "c_name",
                "revenue",
                "c_acctbal",
                "n_name",
                "c_address",
                "c_phone",
                "c_comment",
            )
            .sort(by="revenue", descending=True)
            .head(20)
        )

    @staticmethod
    def q11(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 11.

        Parts supplied from "GERMANY" whose stock value
        (``ps_supplycost * ps_availqty`` summed per part) exceeds a fraction
        of the total stock value. The fraction is
        ``0.0001 / run_config.scale_factor``.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        nation = get_data(path, "nation", suffix)
        partsupp = get_data(path, "partsupp", suffix)
        supplier = get_data(path, "supplier", suffix)

        var1 = "GERMANY"
        var2 = 0.0001 / run_config.scale_factor

        q1 = (
            partsupp.join(supplier, left_on="ps_suppkey", right_on="s_suppkey")
            .join(nation, left_on="s_nationkey", right_on="n_nationkey")
            .filter(pl.col("n_name") == var1)
        )
        # Single-row frame holding the threshold in column "tmp".
        q2 = q1.select(
            (pl.col("ps_supplycost") * pl.col("ps_availqty"))
            .sum()
            .round(2)
            .alias("tmp")
            * var2
        )

        return (
            q1.group_by("ps_partkey")
            .agg(
                (pl.col("ps_supplycost") * pl.col("ps_availqty"))
                .sum()
                .round(2)
                .alias("value")
            )
            # The cross join attaches the threshold to every per-part row.
            .join(q2, how="cross")
            .filter(pl.col("value") > pl.col("tmp"))
            .select("ps_partkey", "value")
            .sort("value", descending=True)
        )

    @staticmethod
    def q12(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 12.

        Per ship mode ("MAIL", "SHIP"), count line items received in
        [1994-01-01, 1995-01-01) that shipped before their commit date but
        were received after it, split by whether the order priority is
        "1-URGENT"/"2-HIGH" or not.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        lineitem = get_data(path, "lineitem", suffix)
        orders = get_data(path, "orders", suffix)

        var1 = "MAIL"
        var2 = "SHIP"
        var3 = date(1994, 1, 1)
        var4 = date(1995, 1, 1)

        return (
            orders.join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
            .filter(pl.col("l_shipmode").is_in([var1, var2]))
            .filter(pl.col("l_commitdate") < pl.col("l_receiptdate"))
            .filter(pl.col("l_shipdate") < pl.col("l_commitdate"))
            .filter(pl.col("l_receiptdate").is_between(var3, var4, closed="left"))
            .with_columns(
                pl.when(pl.col("o_orderpriority").is_in(["1-URGENT", "2-HIGH"]))
                .then(1)
                .otherwise(0)
                .alias("high_line_count"),
                pl.when(pl.col("o_orderpriority").is_in(["1-URGENT", "2-HIGH"]).not_())
                .then(1)
                .otherwise(0)
                .alias("low_line_count"),
            )
            .group_by("l_shipmode")
            .agg(pl.col("high_line_count").sum(), pl.col("low_line_count").sum())
            .sort("l_shipmode")
        )

    @staticmethod
    def q13(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 13.

        Distribution of customers by number of orders, ignoring orders whose
        comment matches ``special.*requests``. Customers without orders are
        kept by the left join and counted as zero.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        customer = get_data(path, "customer", suffix)
        orders = get_data(path, "orders", suffix)

        var1 = "special"
        var2 = "requests"

        orders = orders.filter(
            pl.col("o_comment").str.contains(f"{var1}.*{var2}").not_()
        )
        return (
            customer.join(orders, left_on="c_custkey", right_on="o_custkey", how="left")
            .group_by("c_custkey")
            .agg(pl.col("o_orderkey").count().alias("c_count"))
            .group_by("c_count")
            .len()
            .select(pl.col("c_count"), pl.col("len").alias("custdist"))
            .sort(by=["custdist", "c_count"], descending=[True, True])
        )

    @staticmethod
    def q14(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 14.

        Percentage of discounted revenue that comes from parts whose type
        starts with "PROMO", over line items shipped in
        [1995-09-01, 1995-10-01), rounded to two decimals.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        lineitem = get_data(path, "lineitem", suffix)
        part = get_data(path, "part", suffix)

        var1 = date(1995, 9, 1)
        var2 = date(1995, 10, 1)

        return (
            lineitem.join(part, left_on="l_partkey", right_on="p_partkey")
            .filter(pl.col("l_shipdate").is_between(var1, var2, closed="left"))
            .select(
                (
                    100.00
                    # Prefix test. The previous regex "PROMO*" was unanchored
                    # and matched any string containing "PROM".
                    * pl.when(pl.col("p_type").str.starts_with("PROMO"))
                    .then(pl.col("l_extendedprice") * (1 - pl.col("l_discount")))
                    .otherwise(0)
                    .sum()
                    / (pl.col("l_extendedprice") * (1 - pl.col("l_discount"))).sum()
                )
                .round(2)
                .alias("promo_revenue")
            )
        )

    @staticmethod
    def q15(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 15.

        Supplier(s) with the maximum total discounted revenue over line
        items shipped in [1996-01-01, 1996-04-01), sorted by supplier key.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        lineitem = get_data(path, "lineitem", suffix)
        supplier = get_data(path, "supplier", suffix)

        var1 = date(1996, 1, 1)
        var2 = date(1996, 4, 1)

        revenue = (
            lineitem.filter(pl.col("l_shipdate").is_between(var1, var2, closed="left"))
            .group_by("l_suppkey")
            .agg(
                (pl.col("l_extendedprice") * (1 - pl.col("l_discount")))
                .sum()
                .alias("total_revenue")
            )
            .select(pl.col("l_suppkey").alias("supplier_no"), pl.col("total_revenue"))
        )

        return (
            supplier.join(revenue, left_on="s_suppkey", right_on="supplier_no")
            # Compare against the unrounded maximum; rounding happens after.
            .filter(pl.col("total_revenue") == pl.col("total_revenue").max())
            .with_columns(pl.col("total_revenue").round(2))
            .select("s_suppkey", "s_name", "s_address", "s_phone", "total_revenue")
            .sort("s_suppkey")
        )

    @staticmethod
    def q16(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 16.

        Number of distinct suppliers per (brand, type, size) for parts that
        are not "Brand#45", whose type does not start with
        "MEDIUM POLISHED" and whose size is in a fixed list. Suppliers whose
        comment matches ``Customer.*Complaints`` are excluded.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        part = get_data(path, "part", suffix)
        partsupp = get_data(path, "partsupp", suffix)
        supplier = get_data(path, "supplier", suffix)

        var1 = "Brand#45"

        # Suppliers to exclude. The duplicated key column surfaces as
        # "ps_suppkey_right" after the left join below.
        supplier = supplier.filter(
            pl.col("s_comment").str.contains(".*Customer.*Complaints.*")
        ).select(pl.col("s_suppkey"), pl.col("s_suppkey").alias("ps_suppkey"))

        return (
            part.join(partsupp, left_on="p_partkey", right_on="ps_partkey")
            .filter(pl.col("p_brand") != var1)
            # Prefix test. The previous regex "MEDIUM POLISHED*" was
            # unanchored and matched "MEDIUM POLISHE" anywhere in the string.
            .filter(pl.col("p_type").str.starts_with("MEDIUM POLISHED").not_())
            .filter(pl.col("p_size").is_in([49, 14, 23, 45, 19, 3, 36, 9]))
            # Left join + null check acts as an anti join against the
            # excluded suppliers.
            .join(supplier, left_on="ps_suppkey", right_on="s_suppkey", how="left")
            .filter(pl.col("ps_suppkey_right").is_null())
            .group_by("p_brand", "p_type", "p_size")
            .agg(pl.col("ps_suppkey").n_unique().alias("supplier_cnt"))
            .sort(
                by=["supplier_cnt", "p_brand", "p_type", "p_size"],
                descending=[True, False, False, False],
            )
        )

    @staticmethod
    def q17(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 17.

        For "Brand#23" parts in "MED BOX" containers, sum the extended price
        of line items whose quantity is below 20% of that part's mean
        quantity, divided by 7.0 and rounded to two decimals.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        lineitem = get_data(path, "lineitem", suffix)
        part = get_data(path, "part", suffix)

        var1 = "Brand#23"
        var2 = "MED BOX"

        q1 = (
            part.filter(pl.col("p_brand") == var1)
            .filter(pl.col("p_container") == var2)
            .join(lineitem, how="left", left_on="p_partkey", right_on="l_partkey")
        )

        return (
            q1.group_by("p_partkey")
            .agg((0.2 * pl.col("l_quantity").mean()).alias("avg_quantity"))
            .select(pl.col("p_partkey").alias("key"), pl.col("avg_quantity"))
            .join(q1, left_on="key", right_on="p_partkey")
            .filter(pl.col("l_quantity") < pl.col("avg_quantity"))
            .select(
                (pl.col("l_extendedprice").sum() / 7.0).round(2).alias("avg_yearly")
            )
        )

    @staticmethod
    def q18(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 18.

        Orders whose total line-item quantity exceeds 300, with customer and
        order details. Returns the top 100 rows by total price (descending),
        then order date.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        customer = get_data(path, "customer", suffix)
        lineitem = get_data(path, "lineitem", suffix)
        orders = get_data(path, "orders", suffix)

        var1 = 300

        q1 = (
            lineitem.group_by("l_orderkey")
            .agg(pl.col("l_quantity").sum().alias("sum_quantity"))
            .filter(pl.col("sum_quantity") > var1)
        )

        return (
            orders.join(q1, left_on="o_orderkey", right_on="l_orderkey", how="semi")
            .join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
            .join(customer, left_on="o_custkey", right_on="c_custkey")
            .group_by(
                "c_name", "o_custkey", "o_orderkey", "o_orderdate", "o_totalprice"
            )
            .agg(pl.col("l_quantity").sum().alias("col6"))
            .select(
                pl.col("c_name"),
                pl.col("o_custkey").alias("c_custkey"),
                pl.col("o_orderkey"),
                # NOTE(review): "o_orderdat" looks like a truncated
                # "o_orderdate"; kept as-is because it is an output column
                # name that consumers may rely on -- confirm.
                pl.col("o_orderdate").alias("o_orderdat"),
                pl.col("o_totalprice"),
                pl.col("col6"),
            )
            .sort(by=["o_totalprice", "o_orderdat"], descending=[True, False])
            .head(100)
        )

    @staticmethod
    def q19(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 19.

        Discounted revenue of line items shipped by "AIR"/"AIR REG" with
        instruction "DELIVER IN PERSON" that satisfy one of three
        brand/container/quantity/size combinations, rounded to two decimals.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        lineitem = get_data(path, "lineitem", suffix)
        part = get_data(path, "part", suffix)

        return (
            part.join(lineitem, left_on="p_partkey", right_on="l_partkey")
            .filter(pl.col("l_shipmode").is_in(["AIR", "AIR REG"]))
            .filter(pl.col("l_shipinstruct") == "DELIVER IN PERSON")
            .filter(
                (
                    (pl.col("p_brand") == "Brand#12")
                    & pl.col("p_container").is_in(
                        ["SM CASE", "SM BOX", "SM PACK", "SM PKG"]
                    )
                    & (pl.col("l_quantity").is_between(1, 11))
                    & (pl.col("p_size").is_between(1, 5))
                )
                | (
                    (pl.col("p_brand") == "Brand#23")
                    & pl.col("p_container").is_in(
                        ["MED BAG", "MED BOX", "MED PKG", "MED PACK"]
                    )
                    & (pl.col("l_quantity").is_between(10, 20))
                    & (pl.col("p_size").is_between(1, 10))
                )
                | (
                    (pl.col("p_brand") == "Brand#34")
                    & pl.col("p_container").is_in(
                        ["LG CASE", "LG BOX", "LG PACK", "LG PKG"]
                    )
                    & (pl.col("l_quantity").is_between(20, 30))
                    & (pl.col("p_size").is_between(1, 15))
                )
            )
            .select(
                (pl.col("l_extendedprice") * (1 - pl.col("l_discount")))
                .sum()
                .round(2)
                .alias("revenue")
            )
        )

    @staticmethod
    def q20(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 20.

        Name and address of suppliers in "CANADA" that hold, for some part
        whose name starts with "forest", more available quantity than half
        of what they shipped of it in [1994-01-01, 1995-01-01).
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        lineitem = get_data(path, "lineitem", suffix)
        nation = get_data(path, "nation", suffix)
        part = get_data(path, "part", suffix)
        partsupp = get_data(path, "partsupp", suffix)
        supplier = get_data(path, "supplier", suffix)

        var1 = date(1994, 1, 1)
        var2 = date(1995, 1, 1)
        var3 = "CANADA"
        var4 = "forest"

        # Half of the shipped quantity per (part, supplier) in the window.
        q1 = (
            lineitem.filter(pl.col("l_shipdate").is_between(var1, var2, closed="left"))
            .group_by("l_partkey", "l_suppkey")
            .agg((pl.col("l_quantity").sum() * 0.5).alias("sum_quantity"))
        )
        q2 = nation.filter(pl.col("n_name") == var3)
        q3 = supplier.join(q2, left_on="s_nationkey", right_on="n_nationkey")

        return (
            part.filter(pl.col("p_name").str.starts_with(var4))
            .select(pl.col("p_partkey").unique())
            .join(partsupp, left_on="p_partkey", right_on="ps_partkey")
            .join(
                q1,
                left_on=["ps_suppkey", "p_partkey"],
                right_on=["l_suppkey", "l_partkey"],
            )
            .filter(pl.col("ps_availqty") > pl.col("sum_quantity"))
            .select(pl.col("ps_suppkey").unique())
            .join(q3, left_on="ps_suppkey", right_on="s_suppkey")
            .select("s_name", "s_address")
            .sort("s_name")
        )

    @staticmethod
    def q21(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 21.

        Per supplier in "SAUDI ARABIA", count line items on orders with
        status "F" where the order has more than one line item and exactly
        one line item received after its commit date. Returns the top 100
        rows by count (descending), then supplier name.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        lineitem = get_data(path, "lineitem", suffix)
        nation = get_data(path, "nation", suffix)
        orders = get_data(path, "orders", suffix)
        supplier = get_data(path, "supplier", suffix)

        var1 = "SAUDI ARABIA"

        # Late line items belonging to orders with more than one line item.
        q1 = (
            lineitem.group_by("l_orderkey")
            .agg(pl.col("l_suppkey").len().alias("n_supp_by_order"))
            .filter(pl.col("n_supp_by_order") > 1)
            .join(
                lineitem.filter(pl.col("l_receiptdate") > pl.col("l_commitdate")),
                on="l_orderkey",
            )
        )

        return (
            # Recount per order over the late rows only; the filter on
            # n_supp_by_order == 1 below applies to this recount.
            q1.group_by("l_orderkey")
            .agg(pl.col("l_suppkey").len().alias("n_supp_by_order"))
            .join(q1, on="l_orderkey")
            .join(supplier, left_on="l_suppkey", right_on="s_suppkey")
            .join(nation, left_on="s_nationkey", right_on="n_nationkey")
            .join(orders, left_on="l_orderkey", right_on="o_orderkey")
            .filter(pl.col("n_supp_by_order") == 1)
            .filter(pl.col("n_name") == var1)
            .filter(pl.col("o_orderstatus") == "F")
            .group_by("s_name")
            .agg(pl.len().alias("numwait"))
            .sort(by=["numwait", "s_name"], descending=[True, False])
            .head(100)
        )

    @staticmethod
    def q22(run_config: RunConfig) -> pl.LazyFrame:
        """
        Query 22.

        Per two-character phone prefix (from a fixed set), count customers
        without any order whose account balance exceeds the mean positive
        balance of customers with those prefixes, and sum their balances.
        """
        path = run_config.dataset_path
        suffix = run_config.suffix
        customer = get_data(path, "customer", suffix)
        orders = get_data(path, "orders", suffix)

        q1 = (
            customer.with_columns(pl.col("c_phone").str.slice(0, 2).alias("cntrycode"))
            .filter(pl.col("cntrycode").str.contains("13|31|23|29|30|18|17"))
            .select("c_acctbal", "c_custkey", "cntrycode")
        )

        # Single-row frame with the mean of the positive balances.
        q2 = q1.filter(pl.col("c_acctbal") > 0.0).select(
            pl.col("c_acctbal").mean().alias("avg_acctbal")
        )

        q3 = orders.select(pl.col("o_custkey").unique()).with_columns(
            pl.col("o_custkey").alias("c_custkey")
        )

        return (
            # Left join + null check keeps customers that never ordered.
            q1.join(q3, on="c_custkey", how="left")
            .filter(pl.col("o_custkey").is_null())
            .join(q2, how="cross")
            .filter(pl.col("c_acctbal") > pl.col("avg_acctbal"))
            .group_by("cntrycode")
            .agg(
                pl.col("c_acctbal").count().alias("numcust"),
                pl.col("c_acctbal").sum().round(2).alias("totacctbal"),
            )
            .sort("cntrycode")
        )
811
+
812
+
813
if __name__ == "__main__":
    # Script entry point: pass the query collection to run_polars.
    run_polars(PDSHQueries)