cudf-polars-cu13 25.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -0
- cudf_polars/VERSION +1 -0
- cudf_polars/__init__.py +28 -0
- cudf_polars/_version.py +21 -0
- cudf_polars/callback.py +318 -0
- cudf_polars/containers/__init__.py +13 -0
- cudf_polars/containers/column.py +495 -0
- cudf_polars/containers/dataframe.py +361 -0
- cudf_polars/containers/datatype.py +137 -0
- cudf_polars/dsl/__init__.py +8 -0
- cudf_polars/dsl/expr.py +66 -0
- cudf_polars/dsl/expressions/__init__.py +8 -0
- cudf_polars/dsl/expressions/aggregation.py +226 -0
- cudf_polars/dsl/expressions/base.py +272 -0
- cudf_polars/dsl/expressions/binaryop.py +120 -0
- cudf_polars/dsl/expressions/boolean.py +326 -0
- cudf_polars/dsl/expressions/datetime.py +271 -0
- cudf_polars/dsl/expressions/literal.py +97 -0
- cudf_polars/dsl/expressions/rolling.py +643 -0
- cudf_polars/dsl/expressions/selection.py +74 -0
- cudf_polars/dsl/expressions/slicing.py +46 -0
- cudf_polars/dsl/expressions/sorting.py +85 -0
- cudf_polars/dsl/expressions/string.py +1002 -0
- cudf_polars/dsl/expressions/struct.py +137 -0
- cudf_polars/dsl/expressions/ternary.py +49 -0
- cudf_polars/dsl/expressions/unary.py +517 -0
- cudf_polars/dsl/ir.py +2607 -0
- cudf_polars/dsl/nodebase.py +164 -0
- cudf_polars/dsl/to_ast.py +359 -0
- cudf_polars/dsl/tracing.py +16 -0
- cudf_polars/dsl/translate.py +939 -0
- cudf_polars/dsl/traversal.py +224 -0
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +481 -0
- cudf_polars/dsl/utils/groupby.py +98 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +61 -0
- cudf_polars/dsl/utils/reshape.py +74 -0
- cudf_polars/dsl/utils/rolling.py +121 -0
- cudf_polars/dsl/utils/windows.py +192 -0
- cudf_polars/experimental/__init__.py +8 -0
- cudf_polars/experimental/base.py +386 -0
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds.py +220 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
- cudf_polars/experimental/benchmarks/pdsh.py +814 -0
- cudf_polars/experimental/benchmarks/utils.py +832 -0
- cudf_polars/experimental/dask_registers.py +200 -0
- cudf_polars/experimental/dispatch.py +156 -0
- cudf_polars/experimental/distinct.py +197 -0
- cudf_polars/experimental/explain.py +157 -0
- cudf_polars/experimental/expressions.py +590 -0
- cudf_polars/experimental/groupby.py +327 -0
- cudf_polars/experimental/io.py +943 -0
- cudf_polars/experimental/join.py +391 -0
- cudf_polars/experimental/parallel.py +423 -0
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +188 -0
- cudf_polars/experimental/shuffle.py +354 -0
- cudf_polars/experimental/sort.py +609 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/statistics.py +795 -0
- cudf_polars/experimental/utils.py +169 -0
- cudf_polars/py.typed +0 -0
- cudf_polars/testing/__init__.py +8 -0
- cudf_polars/testing/asserts.py +448 -0
- cudf_polars/testing/io.py +122 -0
- cudf_polars/testing/plugin.py +236 -0
- cudf_polars/typing/__init__.py +219 -0
- cudf_polars/utils/__init__.py +8 -0
- cudf_polars/utils/config.py +741 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +118 -0
- cudf_polars/utils/sorting.py +53 -0
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +27 -0
- cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
- cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
- cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
- cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
- cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,814 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Experimental PDS-H benchmarks.
|
|
6
|
+
|
|
7
|
+
Based on https://github.com/pola-rs/polars-benchmark.
|
|
8
|
+
|
|
9
|
+
WARNING: This is an experimental (and unofficial)
|
|
10
|
+
benchmark script. It is not intended for public use
|
|
11
|
+
and may be modified or removed at any time.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import contextlib
|
|
17
|
+
import os
|
|
18
|
+
from datetime import date
|
|
19
|
+
from typing import TYPE_CHECKING
|
|
20
|
+
|
|
21
|
+
import polars as pl
|
|
22
|
+
|
|
23
|
+
with contextlib.suppress(ImportError):
|
|
24
|
+
from cudf_polars.experimental.benchmarks.utils import (
|
|
25
|
+
get_data,
|
|
26
|
+
run_polars,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from cudf_polars.experimental.benchmarks.utils import RunConfig
|
|
32
|
+
|
|
33
|
+
# Give KvikIO sensible defaults unless the caller already exported them.
# Without these settings, the first IO task to run on each worker
# takes ~15 sec extra.
os.environ.setdefault("KVIKIO_COMPAT_MODE", "on")
os.environ.setdefault("KVIKIO_NTHREADS", "8")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class PDSHQueries:
|
|
40
|
+
"""PDS-H query definitions."""
|
|
41
|
+
|
|
42
|
+
name: str = "pdsh"
|
|
43
|
+
|
|
44
|
+
@staticmethod
def q0(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 0.

    Placeholder query: ``run_config`` is not read and an empty
    LazyFrame is returned.
    """
    empty_frame = pl.LazyFrame()
    return empty_frame
|
|
48
|
+
|
|
49
|
+
@staticmethod
def q1(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 1.

    Summarise ``lineitem`` rows shipped on or before 1998-09-02 per
    (``l_returnflag``, ``l_linestatus``) pair: quantity and price sums,
    discounted and taxed price sums, three means and a row count.
    The result is ordered by the two grouping keys.
    """
    ship_cutoff = date(1998, 9, 2)
    line_items = get_data(run_config.dataset_path, "lineitem", run_config.suffix)

    # Price net of discount; the charge additionally applies the tax rate.
    disc_price = pl.col("l_extendedprice") * (1.0 - pl.col("l_discount"))
    charge = disc_price * (1.0 + pl.col("l_tax"))

    summaries = [
        pl.sum("l_quantity").alias("sum_qty"),
        pl.sum("l_extendedprice").alias("sum_base_price"),
        disc_price.sum().alias("sum_disc_price"),
        charge.sum().alias("sum_charge"),
        pl.mean("l_quantity").alias("avg_qty"),
        pl.mean("l_extendedprice").alias("avg_price"),
        pl.mean("l_discount").alias("avg_disc"),
        pl.len().alias("count_order"),
    ]

    shipped = line_items.filter(pl.col("l_shipdate") <= ship_cutoff)
    per_status = shipped.group_by("l_returnflag", "l_linestatus").agg(*summaries)
    return per_status.sort("l_returnflag", "l_linestatus")
|
|
79
|
+
|
|
80
|
+
@staticmethod
def q2(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 2.

    For parts of size 15 whose type ends with ``BRASS`` and whose supplier
    is in region ``EUROPE``, keep the supplier rows offering the minimum
    ``ps_supplycost`` per part. Returns the first 100 rows ordered by
    account balance (descending), nation, supplier name and part key.
    """
    path = run_config.dataset_path
    suffix = run_config.suffix
    nation = get_data(path, "nation", suffix)
    part = get_data(path, "part", suffix)
    partsupp = get_data(path, "partsupp", suffix)
    region = get_data(path, "region", suffix)
    supplier = get_data(path, "supplier", suffix)

    wanted_size = 15
    type_ending = "BRASS"
    region_name = "EUROPE"

    # Every (part, supplier) offer that satisfies the three predicates.
    offers = part.join(partsupp, left_on="p_partkey", right_on="ps_partkey")
    offers = offers.join(supplier, left_on="ps_suppkey", right_on="s_suppkey")
    offers = offers.join(nation, left_on="s_nationkey", right_on="n_nationkey")
    offers = offers.join(region, left_on="n_regionkey", right_on="r_regionkey")
    offers = offers.filter(pl.col("p_size") == wanted_size)
    offers = offers.filter(pl.col("p_type").str.ends_with(type_ending))
    offers = offers.filter(pl.col("r_name") == region_name)

    # Joining the per-part minimum back onto the offers keeps only the
    # cheapest offer(s) of each part.
    cheapest = offers.group_by("p_partkey").agg(pl.min("ps_supplycost"))
    best_offers = cheapest.join(offers, on=["p_partkey", "ps_supplycost"])

    projected = best_offers.select(
        "s_acctbal",
        "s_name",
        "n_name",
        "p_partkey",
        "p_mfgr",
        "s_address",
        "s_phone",
        "s_comment",
    )
    ordered = projected.sort(
        by=["s_acctbal", "n_name", "s_name", "p_partkey"],
        descending=[True, False, False, False],
    )
    return ordered.head(100)
|
|
123
|
+
|
|
124
|
+
@staticmethod
def q3(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 3.

    Sum discounted revenue per order for ``BUILDING`` customers whose
    order was placed before 1995-03-15 and whose line item shipped after
    that date. Returns the 10 top rows by revenue (descending), then
    order date.
    """
    path = run_config.dataset_path
    suffix = run_config.suffix
    customer = get_data(path, "customer", suffix)
    lineitem = get_data(path, "lineitem", suffix)
    orders = get_data(path, "orders", suffix)

    segment = "BUILDING"
    pivot_date = date(1995, 3, 15)

    revenue = (pl.col("l_extendedprice") * (1 - pl.col("l_discount"))).alias(
        "revenue"
    )

    joined = customer.filter(pl.col("c_mktsegment") == segment)
    joined = joined.join(orders, left_on="c_custkey", right_on="o_custkey")
    joined = joined.join(lineitem, left_on="o_orderkey", right_on="l_orderkey")

    pending = joined.filter(pl.col("o_orderdate") < pivot_date)
    pending = pending.filter(pl.col("l_shipdate") > pivot_date)

    per_order = (
        pending.with_columns(revenue)
        .group_by("o_orderkey", "o_orderdate", "o_shippriority")
        .agg(pl.sum("revenue"))
    )
    projected = per_order.select(
        pl.col("o_orderkey").alias("l_orderkey"),
        "revenue",
        "o_orderdate",
        "o_shippriority",
    )
    ordered = projected.sort(by=["revenue", "o_orderdate"], descending=[True, False])
    return ordered.head(10)
|
|
156
|
+
|
|
157
|
+
@staticmethod
def q4(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 4.

    Count, per ``o_orderpriority``, the orders placed in
    [1993-07-01, 1993-10-01) that have at least one line item whose
    commit date precedes its receipt date.
    """
    path = run_config.dataset_path
    suffix = run_config.suffix
    lineitem = get_data(path, "lineitem", suffix)
    orders = get_data(path, "orders", suffix)

    window_start = date(1993, 7, 1)
    window_end = date(1993, 10, 1)

    late_items = lineitem.filter(pl.col("l_commitdate") < pl.col("l_receiptdate"))

    # SQL EXISTS translates to a semi join in the Polars API.
    orders_with_late_item = orders.join(
        late_items,
        how="semi",
        left_on="o_orderkey",
        right_on="l_orderkey",
    )
    in_window = orders_with_late_item.filter(
        pl.col("o_orderdate").is_between(window_start, window_end, closed="left")
    )
    counts = in_window.group_by("o_orderpriority").agg(
        pl.len().alias("order_count")
    )
    return counts.sort("o_orderpriority")
|
|
179
|
+
|
|
180
|
+
@staticmethod
def q5(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 5.

    Sum discounted revenue per nation for region ``ASIA`` and orders
    placed in [1994-01-01, 1995-01-01), counting only line items whose
    supplier is in the same nation as the customer. Ordered by revenue,
    descending.
    """
    dataset, ext = run_config.dataset_path, run_config.suffix
    customer = get_data(dataset, "customer", ext)
    lineitem = get_data(dataset, "lineitem", ext)
    nation = get_data(dataset, "nation", ext)
    orders = get_data(dataset, "orders", ext)
    region = get_data(dataset, "region", ext)
    supplier = get_data(dataset, "supplier", ext)

    region_name = "ASIA"
    first_day = date(1994, 1, 1)
    end_day = date(1995, 1, 1)

    joined = region.join(nation, left_on="r_regionkey", right_on="n_regionkey")
    joined = joined.join(customer, left_on="n_nationkey", right_on="c_nationkey")
    joined = joined.join(orders, left_on="c_custkey", right_on="o_custkey")
    joined = joined.join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
    # The supplier must match on both its key and the customer's nation.
    joined = joined.join(
        supplier,
        left_on=["l_suppkey", "n_nationkey"],
        right_on=["s_suppkey", "s_nationkey"],
    )

    selected = joined.filter(pl.col("r_name") == region_name)
    selected = selected.filter(
        pl.col("o_orderdate").is_between(first_day, end_day, closed="left")
    )

    revenue = (pl.col("l_extendedprice") * (1 - pl.col("l_discount"))).alias(
        "revenue"
    )
    per_nation = (
        selected.with_columns(revenue).group_by("n_name").agg(pl.sum("revenue"))
    )
    return per_nation.sort(by="revenue", descending=True)
|
|
217
|
+
|
|
218
|
+
@staticmethod
def q6(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 6.

    Sum ``l_extendedprice * l_discount`` over line items shipped in
    [1994-01-01, 1995-01-01) with a discount between 0.05 and 0.07
    (both inclusive) and a quantity below 24.
    """
    lineitem = get_data(run_config.dataset_path, "lineitem", run_config.suffix)

    ship_from = date(1994, 1, 1)
    ship_until = date(1995, 1, 1)
    min_discount = 0.05
    max_discount = 0.07
    quantity_limit = 24

    shipped_in_window = pl.col("l_shipdate").is_between(
        ship_from, ship_until, closed="left"
    )
    discount_in_band = pl.col("l_discount").is_between(min_discount, max_discount)
    small_quantity = pl.col("l_quantity") < quantity_limit

    qualifying = (
        lineitem.filter(shipped_in_window)
        .filter(discount_in_band)
        .filter(small_quantity)
    )
    with_revenue = qualifying.with_columns(
        (pl.col("l_extendedprice") * pl.col("l_discount")).alias("revenue")
    )
    return with_revenue.select(pl.sum("revenue"))
|
|
240
|
+
|
|
241
|
+
@staticmethod
def q7(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 7.

    Sum discounted revenue by supplier nation, customer nation and ship
    year for shipments between ``FRANCE`` and ``GERMANY`` (in either
    direction) with a ship date from 1995-01-01 to 1996-12-31 inclusive.
    """
    path = run_config.dataset_path
    suffix = run_config.suffix
    customer = get_data(path, "customer", suffix)
    lineitem = get_data(path, "lineitem", suffix)
    nation = get_data(path, "nation", suffix)
    orders = get_data(path, "orders", suffix)
    supplier = get_data(path, "supplier", suffix)

    first_ship = date(1995, 1, 1)
    last_ship = date(1996, 12, 31)

    france = nation.filter(pl.col("n_name") == "FRANCE")
    germany = nation.filter(pl.col("n_name") == "GERMANY")

    def shipments(cust_side: pl.LazyFrame, supp_side: pl.LazyFrame) -> pl.LazyFrame:
        # Line items ordered by customers of one nation and supplied
        # from the other, with both nation names carried along.
        return (
            customer.join(cust_side, left_on="c_nationkey", right_on="n_nationkey")
            .join(orders, left_on="c_custkey", right_on="o_custkey")
            .rename({"n_name": "cust_nation"})
            .join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
            .join(supplier, left_on="l_suppkey", right_on="s_suppkey")
            .join(supp_side, left_on="s_nationkey", right_on="n_nationkey")
            .rename({"n_name": "supp_nation"})
        )

    both_directions = pl.concat(
        [shipments(france, germany), shipments(germany, france)]
    )

    in_period = both_directions.filter(
        pl.col("l_shipdate").is_between(first_ship, last_ship)
    )
    enriched = in_period.with_columns(
        (pl.col("l_extendedprice") * (1 - pl.col("l_discount"))).alias("volume"),
        pl.col("l_shipdate").dt.year().alias("l_year"),
    )
    totals = enriched.group_by("supp_nation", "cust_nation", "l_year").agg(
        pl.sum("volume").alias("revenue")
    )
    return totals.sort(by=["supp_nation", "cust_nation", "l_year"])
|
|
291
|
+
|
|
292
|
+
@staticmethod
def q8(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 8.

    Per order year, compute the share (rounded to 2 decimals) of
    discounted volume supplied from ``BRAZIL`` among
    ``ECONOMY ANODIZED STEEL`` parts ordered by customers in region
    ``AMERICA`` between 1995-01-01 and 1996-12-31 inclusive.
    """
    path = run_config.dataset_path
    suffix = run_config.suffix
    customer = get_data(path, "customer", suffix)
    lineitem = get_data(path, "lineitem", suffix)
    nation = get_data(path, "nation", suffix)
    orders = get_data(path, "orders", suffix)
    part = get_data(path, "part", suffix)
    region = get_data(path, "region", suffix)
    supplier = get_data(path, "supplier", suffix)

    target_nation = "BRAZIL"
    target_region = "AMERICA"
    target_type = "ECONOMY ANODIZED STEEL"
    first_order = date(1995, 1, 1)
    last_order = date(1996, 12, 31)

    # Two views of nation: one locates the customer's region, the other
    # names the supplier's nation.
    customer_nation = nation.select("n_nationkey", "n_regionkey")
    supplier_nation = nation.select("n_nationkey", "n_name")

    joined = part.join(lineitem, left_on="p_partkey", right_on="l_partkey")
    joined = joined.join(supplier, left_on="l_suppkey", right_on="s_suppkey")
    joined = joined.join(orders, left_on="l_orderkey", right_on="o_orderkey")
    joined = joined.join(customer, left_on="o_custkey", right_on="c_custkey")
    joined = joined.join(
        customer_nation, left_on="c_nationkey", right_on="n_nationkey"
    )
    joined = joined.join(region, left_on="n_regionkey", right_on="r_regionkey")
    joined = joined.filter(pl.col("r_name") == target_region)
    joined = joined.join(
        supplier_nation, left_on="s_nationkey", right_on="n_nationkey"
    )
    joined = joined.filter(pl.col("o_orderdate").is_between(first_order, last_order))
    joined = joined.filter(pl.col("p_type") == target_type)

    volumes = joined.select(
        pl.col("o_orderdate").dt.year().alias("o_year"),
        (pl.col("l_extendedprice") * (1 - pl.col("l_discount"))).alias("volume"),
        pl.col("n_name").alias("nation"),
    )
    # "_tmp" holds the volume for the target nation and 0 for all others.
    target_volume = (
        pl.when(pl.col("nation") == target_nation)
        .then(pl.col("volume"))
        .otherwise(0)
        .alias("_tmp")
    )
    share = (pl.sum("_tmp") / pl.sum("volume")).round(2).alias("mkt_share")

    per_year = volumes.with_columns(target_volume).group_by("o_year").agg(share)
    return per_year.sort("o_year")
|
|
340
|
+
|
|
341
|
+
@staticmethod
def q9(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 9.

    For parts whose name contains ``green``, sum per supplier nation and
    order year the amount ``l_extendedprice * (1 - l_discount) -
    ps_supplycost * l_quantity``, rounded to 2 decimals. Ordered by
    nation ascending and year descending.
    """
    dataset, ext = run_config.dataset_path, run_config.suffix
    lineitem = get_data(dataset, "lineitem", ext)
    nation = get_data(dataset, "nation", ext)
    orders = get_data(dataset, "orders", ext)
    part = get_data(dataset, "part", ext)
    partsupp = get_data(dataset, "partsupp", ext)
    supplier = get_data(dataset, "supplier", ext)

    joined = part.join(partsupp, left_on="p_partkey", right_on="ps_partkey")
    joined = joined.join(supplier, left_on="ps_suppkey", right_on="s_suppkey")
    joined = joined.join(
        lineitem,
        left_on=["p_partkey", "ps_suppkey"],
        right_on=["l_partkey", "l_suppkey"],
    )
    joined = joined.join(orders, left_on="l_orderkey", right_on="o_orderkey")
    joined = joined.join(nation, left_on="s_nationkey", right_on="n_nationkey")

    green_parts = joined.filter(pl.col("p_name").str.contains("green"))

    income = pl.col("l_extendedprice") * (1 - pl.col("l_discount"))
    cost = pl.col("ps_supplycost") * pl.col("l_quantity")
    amounts = green_parts.select(
        pl.col("n_name").alias("nation"),
        pl.col("o_orderdate").dt.year().alias("o_year"),
        (income - cost).alias("amount"),
    )

    profit = amounts.group_by("nation", "o_year").agg(
        pl.sum("amount").round(2).alias("sum_profit")
    )
    return profit.sort(by=["nation", "o_year"], descending=[False, True])
|
|
376
|
+
|
|
377
|
+
@staticmethod
def q10(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 10.

    Sum discounted revenue (rounded to 2 decimals) per customer over line
    items with return flag ``R`` on orders placed in
    [1993-10-01, 1994-01-01). Returns the 20 rows with the highest revenue.
    """
    dataset, ext = run_config.dataset_path, run_config.suffix
    customer = get_data(dataset, "customer", ext)
    lineitem = get_data(dataset, "lineitem", ext)
    nation = get_data(dataset, "nation", ext)
    orders = get_data(dataset, "orders", ext)

    period_start = date(1993, 10, 1)
    period_end = date(1994, 1, 1)

    joined = customer.join(orders, left_on="c_custkey", right_on="o_custkey")
    joined = joined.join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
    joined = joined.join(nation, left_on="c_nationkey", right_on="n_nationkey")

    returned = joined.filter(
        pl.col("o_orderdate").is_between(period_start, period_end, closed="left")
    )
    returned = returned.filter(pl.col("l_returnflag") == "R")

    revenue = (
        (pl.col("l_extendedprice") * (1 - pl.col("l_discount")))
        .sum()
        .round(2)
        .alias("revenue")
    )
    per_customer = returned.group_by(
        "c_custkey",
        "c_name",
        "c_acctbal",
        "c_phone",
        "n_name",
        "c_address",
        "c_comment",
    ).agg(revenue)

    projected = per_customer.select(
        "c_custkey",
        "c_name",
        "revenue",
        "c_acctbal",
        "n_name",
        "c_address",
        "c_phone",
        "c_comment",
    )
    return projected.sort(by="revenue", descending=True).head(20)
|
|
424
|
+
|
|
425
|
+
@staticmethod
def q11(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 11.

    For suppliers in ``GERMANY``, report each part whose total
    ``ps_supplycost * ps_availqty`` exceeds a threshold. The threshold is
    the overall total multiplied by ``0.0001 / run_config.scale_factor``.
    Ordered by value, descending.
    """
    path = run_config.dataset_path
    suffix = run_config.suffix
    nation = get_data(path, "nation", suffix)
    partsupp = get_data(path, "partsupp", suffix)
    supplier = get_data(path, "supplier", suffix)

    nation_name = "GERMANY"
    fraction = 0.0001 / run_config.scale_factor

    stock = (
        partsupp.join(supplier, left_on="ps_suppkey", right_on="s_suppkey")
        .join(nation, left_on="s_nationkey", right_on="n_nationkey")
        .filter(pl.col("n_name") == nation_name)
    )

    # The same rounded total is used for the threshold and the per-part value.
    stock_value = (pl.col("ps_supplycost") * pl.col("ps_availqty")).sum().round(2)

    # Single-row frame holding the threshold in column "tmp".
    threshold = stock.select(stock_value.alias("tmp") * fraction)

    per_part = stock.group_by("ps_partkey").agg(stock_value.alias("value"))
    # The cross join attaches the one threshold row to every part.
    above_threshold = per_part.join(threshold, how="cross").filter(
        pl.col("value") > pl.col("tmp")
    )
    return above_threshold.select("ps_partkey", "value").sort(
        "value", descending=True
    )
|
|
461
|
+
|
|
462
|
+
@staticmethod
def q12(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 12.

    Per ship mode (``MAIL`` or ``SHIP``), count line items from
    ``1-URGENT``/``2-HIGH`` orders and from all other orders. Only items
    received in [1994-01-01, 1995-01-01) that shipped before their commit
    date and were committed before receipt are counted.
    """
    path = run_config.dataset_path
    suffix = run_config.suffix
    lineitem = get_data(path, "lineitem", suffix)
    orders = get_data(path, "orders", suffix)

    ship_modes = ["MAIL", "SHIP"]
    received_from = date(1994, 1, 1)
    received_until = date(1995, 1, 1)

    is_high_priority = pl.col("o_orderpriority").is_in(["1-URGENT", "2-HIGH"])

    joined = orders.join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
    late = joined.filter(pl.col("l_shipmode").is_in(ship_modes))
    late = late.filter(pl.col("l_commitdate") < pl.col("l_receiptdate"))
    late = late.filter(pl.col("l_shipdate") < pl.col("l_commitdate"))
    late = late.filter(
        pl.col("l_receiptdate").is_between(
            received_from, received_until, closed="left"
        )
    )

    # 0/1 indicator columns so that the sums below act as counts.
    flagged = late.with_columns(
        pl.when(is_high_priority).then(1).otherwise(0).alias("high_line_count"),
        pl.when(is_high_priority.not_())
        .then(1)
        .otherwise(0)
        .alias("low_line_count"),
    )
    counts = flagged.group_by("l_shipmode").agg(
        pl.col("high_line_count").sum(),
        pl.col("low_line_count").sum(),
    )
    return counts.sort("l_shipmode")
|
|
493
|
+
|
|
494
|
+
@staticmethod
def q13(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 13.

    Build the distribution of customers by number of orders, ignoring
    orders whose comment matches ``special.*requests``. Customers with no
    remaining orders get a count of 0 through the left join.
    """
    path = run_config.dataset_path
    suffix = run_config.suffix
    customer = get_data(path, "customer", suffix)
    orders = get_data(path, "orders", suffix)

    first_word = "special"
    second_word = "requests"
    comment_pattern = f"{first_word}.*{second_word}"

    kept_orders = orders.filter(
        pl.col("o_comment").str.contains(comment_pattern).not_()
    )

    orders_per_customer = (
        customer.join(
            kept_orders, how="left", left_on="c_custkey", right_on="o_custkey"
        )
        .group_by("c_custkey")
        .agg(pl.col("o_orderkey").count().alias("c_count"))
    )
    distribution = (
        orders_per_customer.group_by("c_count")
        .len()
        .select(pl.col("c_count"), pl.col("len").alias("custdist"))
    )
    return distribution.sort(by=["custdist", "c_count"], descending=[True, True])
|
|
515
|
+
|
|
516
|
+
@staticmethod
def q14(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 14.

    Compute ``promo_revenue``: the percentage (rounded to 2 decimals) of
    discounted revenue that comes from parts whose type starts with
    ``PROMO``, over line items shipped in [1995-09-01, 1995-10-01).

    Parameters
    ----------
    run_config
        Supplies ``dataset_path`` and ``suffix`` used to locate the
        ``lineitem`` and ``part`` tables.

    Returns
    -------
    Lazy query yielding a single ``promo_revenue`` value.
    """
    lineitem = get_data(run_config.dataset_path, "lineitem", run_config.suffix)
    part = get_data(run_config.dataset_path, "part", run_config.suffix)

    var1 = date(1995, 9, 1)
    var2 = date(1995, 10, 1)

    discounted = pl.col("l_extendedprice") * (1 - pl.col("l_discount"))

    # ``str.contains`` interprets its pattern as a regular expression, so
    # the former "PROMO*" meant "PROM followed by any number of O, anywhere
    # in the string" rather than "starts with PROMO". Use an explicit
    # prefix test instead.
    is_promo = pl.col("p_type").str.starts_with("PROMO")

    return (
        lineitem.join(part, left_on="l_partkey", right_on="p_partkey")
        .filter(pl.col("l_shipdate").is_between(var1, var2, closed="left"))
        .select(
            (
                100.00
                * pl.when(is_promo).then(discounted).otherwise(0).sum()
                / discounted.sum()
            )
            .round(2)
            .alias("promo_revenue")
        )
    )
|
|
541
|
+
|
|
542
|
+
@staticmethod
def q15(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 15.

    Find the supplier(s) with the highest total discounted revenue from
    line items shipped in [1996-01-01, 1996-04-01). The reported revenue
    is rounded to 2 decimals and rows are ordered by supplier key.
    """
    path = run_config.dataset_path
    suffix = run_config.suffix
    lineitem = get_data(path, "lineitem", suffix)
    supplier = get_data(path, "supplier", suffix)

    quarter_start = date(1996, 1, 1)
    quarter_end = date(1996, 4, 1)

    in_quarter = lineitem.filter(
        pl.col("l_shipdate").is_between(quarter_start, quarter_end, closed="left")
    )
    per_supplier = in_quarter.group_by("l_suppkey").agg(
        (pl.col("l_extendedprice") * (1 - pl.col("l_discount")))
        .sum()
        .alias("total_revenue")
    )
    revenue_view = per_supplier.select(
        pl.col("l_suppkey").alias("supplier_no"),
        pl.col("total_revenue"),
    )

    top_suppliers = supplier.join(
        revenue_view, left_on="s_suppkey", right_on="supplier_no"
    ).filter(pl.col("total_revenue") == pl.col("total_revenue").max())

    # Rounding is applied only after the comparison against the maximum.
    rounded = top_suppliers.with_columns(pl.col("total_revenue").round(2))
    return rounded.select(
        "s_suppkey", "s_name", "s_address", "s_phone", "total_revenue"
    ).sort("s_suppkey")
|
|
569
|
+
|
|
570
|
+
@staticmethod
def q16(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 16.

    Count distinct suppliers per (``p_brand``, ``p_type``, ``p_size``) for
    parts that are not ``Brand#45``, whose type does not start with
    ``MEDIUM POLISHED`` and whose size is one of eight listed values.
    Suppliers whose comment matches ``Customer.*Complaints`` are excluded.

    Parameters
    ----------
    run_config
        Supplies ``dataset_path`` and ``suffix`` used to locate the
        ``part``, ``partsupp`` and ``supplier`` tables.

    Returns
    -------
    Lazy query ordered by supplier count (descending), then brand, type
    and size.
    """
    part = get_data(run_config.dataset_path, "part", run_config.suffix)
    partsupp = get_data(run_config.dataset_path, "partsupp", run_config.suffix)
    supplier = get_data(run_config.dataset_path, "supplier", run_config.suffix)

    var1 = "Brand#45"

    # Suppliers to exclude. The key is duplicated under the name
    # "ps_suppkey" so that the left join below exposes it as
    # "ps_suppkey_right", which is null for every supplier we keep.
    supplier = supplier.filter(
        pl.col("s_comment").str.contains(".*Customer.*Complaints.*")
    ).select(pl.col("s_suppkey"), pl.col("s_suppkey").alias("ps_suppkey"))

    # ``str.contains`` interprets its pattern as a regular expression, so
    # the former "MEDIUM POLISHED*" meant "MEDIUM POLISHE followed by any
    # number of D, anywhere in the string". Use an explicit prefix test.
    is_medium_polished = pl.col("p_type").str.starts_with("MEDIUM POLISHED")

    return (
        part.join(partsupp, left_on="p_partkey", right_on="ps_partkey")
        .filter(pl.col("p_brand") != var1)
        .filter(is_medium_polished.not_())
        .filter(pl.col("p_size").is_in([49, 14, 23, 45, 19, 3, 36, 9]))
        .join(supplier, left_on="ps_suppkey", right_on="s_suppkey", how="left")
        .filter(pl.col("ps_suppkey_right").is_null())
        .group_by("p_brand", "p_type", "p_size")
        .agg(pl.col("ps_suppkey").n_unique().alias("supplier_cnt"))
        .sort(
            by=["supplier_cnt", "p_brand", "p_type", "p_size"],
            descending=[True, False, False, False],
        )
    )
|
|
597
|
+
|
|
598
|
+
@staticmethod
def q17(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 17.

    For ``Brand#23`` parts in ``MED BOX`` containers, sum the extended
    price of line items whose quantity is below 0.2 times the part's mean
    quantity, divide by 7.0 and round to 2 decimals (``avg_yearly``).
    """
    path = run_config.dataset_path
    suffix = run_config.suffix
    lineitem = get_data(path, "lineitem", suffix)
    part = get_data(path, "part", suffix)

    brand = "Brand#23"
    container = "MED BOX"

    part_items = (
        part.filter(pl.col("p_brand") == brand)
        .filter(pl.col("p_container") == container)
        .join(lineitem, left_on="p_partkey", right_on="l_partkey", how="left")
    )

    # Per-part threshold: 20% of the part's mean ordered quantity.
    thresholds = (
        part_items.group_by("p_partkey")
        .agg((0.2 * pl.col("l_quantity").mean()).alias("avg_quantity"))
        .select(pl.col("p_partkey").alias("key"), pl.col("avg_quantity"))
    )

    small_orders = thresholds.join(
        part_items, left_on="key", right_on="p_partkey"
    ).filter(pl.col("l_quantity") < pl.col("avg_quantity"))

    return small_orders.select(
        (pl.col("l_extendedprice").sum() / 7.0).round(2).alias("avg_yearly")
    )
|
|
623
|
+
|
|
624
|
+
@staticmethod
def q18(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 18.

    List orders whose total line-item quantity exceeds 300, together with
    the customer and that quantity (column ``col6``). Returns the first
    100 rows ordered by total price (descending), then order date.
    """
    dataset, ext = run_config.dataset_path, run_config.suffix
    customer = get_data(dataset, "customer", ext)
    lineitem = get_data(dataset, "lineitem", ext)
    orders = get_data(dataset, "orders", ext)

    quantity_threshold = 300

    large_orders = (
        lineitem.group_by("l_orderkey")
        .agg(pl.col("l_quantity").sum().alias("sum_quantity"))
        .filter(pl.col("sum_quantity") > quantity_threshold)
    )

    # The semi join keeps only the orders that appear in ``large_orders``.
    joined = orders.join(
        large_orders, how="semi", left_on="o_orderkey", right_on="l_orderkey"
    )
    joined = joined.join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
    joined = joined.join(customer, left_on="o_custkey", right_on="c_custkey")

    per_order = joined.group_by(
        "c_name", "o_custkey", "o_orderkey", "o_orderdate", "o_totalprice"
    ).agg(pl.col("l_quantity").sum().alias("col6"))

    projected = per_order.select(
        pl.col("c_name"),
        pl.col("o_custkey").alias("c_custkey"),
        pl.col("o_orderkey"),
        pl.col("o_orderdate").alias("o_orderdat"),
        pl.col("o_totalprice"),
        pl.col("col6"),
    )
    ordered = projected.sort(
        by=["o_totalprice", "o_orderdat"], descending=[True, False]
    )
    return ordered.head(100)
|
|
660
|
+
|
|
661
|
+
@staticmethod
def q19(run_config: RunConfig) -> pl.LazyFrame:
    """
    Query 19.

    Sum discounted revenue (rounded to 2 decimals) over line items
    delivered in person by ``AIR``/``AIR REG`` whose part matches one of
    three brand, container, quantity-range and size-range combinations.
    """
    path = run_config.dataset_path
    suffix = run_config.suffix
    lineitem = get_data(path, "lineitem", suffix)
    part = get_data(path, "part", suffix)

    def combination(
        brand: str, containers: list[str], quantity: tuple[int, int], max_size: int
    ) -> pl.Expr:
        # One brand/container/quantity/size predicate; sizes start at 1.
        min_quantity, max_quantity = quantity
        return (
            (pl.col("p_brand") == brand)
            & pl.col("p_container").is_in(containers)
            & (pl.col("l_quantity").is_between(min_quantity, max_quantity))
            & (pl.col("p_size").is_between(1, max_size))
        )

    small = combination(
        "Brand#12", ["SM CASE", "SM BOX", "SM PACK", "SM PKG"], (1, 11), 5
    )
    medium = combination(
        "Brand#23", ["MED BAG", "MED BOX", "MED PKG", "MED PACK"], (10, 20), 10
    )
    large = combination(
        "Brand#34", ["LG CASE", "LG BOX", "LG PACK", "LG PKG"], (20, 30), 15
    )

    delivered = (
        part.join(lineitem, left_on="p_partkey", right_on="l_partkey")
        .filter(pl.col("l_shipmode").is_in(["AIR", "AIR REG"]))
        .filter(pl.col("l_shipinstruct") == "DELIVER IN PERSON")
    )
    matching = delivered.filter(small | medium | large)

    return matching.select(
        (pl.col("l_extendedprice") * (1 - pl.col("l_discount")))
        .sum()
        .round(2)
        .alias("revenue")
    )
|
|
704
|
+
|
|
705
|
+
@staticmethod
def q20(run_config: RunConfig) -> pl.LazyFrame:
    """Query 20: CANADA suppliers with surplus stock of "forest" parts.

    For parts whose name starts with ``forest``, keeps the part/supplier
    pairs whose ``ps_availqty`` exceeds half of the quantity shipped with
    ``l_shipdate`` in ``[1994-01-01, 1995-01-01)``, then returns ``s_name``
    and ``s_address`` of the matching suppliers located in ``CANADA``,
    sorted by ``s_name``.
    """
    dataset, ext = run_config.dataset_path, run_config.suffix
    lineitem = get_data(dataset, "lineitem", ext)
    nation = get_data(dataset, "nation", ext)
    part = get_data(dataset, "part", ext)
    partsupp = get_data(dataset, "partsupp", ext)
    supplier = get_data(dataset, "supplier", ext)

    window_start = date(1994, 1, 1)
    window_end = date(1995, 1, 1)
    nation_name = "CANADA"
    part_prefix = "forest"

    # Half of the quantity shipped per (part, supplier) inside the window;
    # closed="left" includes the start date and excludes the end date.
    shipped_in_window = pl.col("l_shipdate").is_between(
        window_start, window_end, closed="left"
    )
    half_shipped = (
        lineitem.filter(shipped_in_window)
        .group_by("l_partkey", "l_suppkey")
        .agg((pl.col("l_quantity").sum() * 0.5).alias("sum_quantity"))
    )

    target_nation = nation.filter(pl.col("n_name") == nation_name)
    national_suppliers = supplier.join(
        target_nation, left_on="s_nationkey", right_on="n_nationkey"
    )

    part_keys = part.filter(
        pl.col("p_name").str.starts_with(part_prefix)
    ).select(pl.col("p_partkey").unique())
    stock = part_keys.join(
        partsupp, left_on="p_partkey", right_on="ps_partkey"
    ).join(
        half_shipped,
        left_on=["ps_suppkey", "p_partkey"],
        right_on=["l_suppkey", "l_partkey"],
    )
    surplus_suppliers = stock.filter(
        pl.col("ps_availqty") > pl.col("sum_quantity")
    ).select(pl.col("ps_suppkey").unique())

    matched = surplus_suppliers.join(
        national_suppliers, left_on="ps_suppkey", right_on="s_suppkey"
    )
    return matched.select("s_name", "s_address").sort("s_name")
|
|
742
|
+
|
|
743
|
+
@staticmethod
def q21(run_config: RunConfig) -> pl.LazyFrame:
    """Query 21: per-supplier count of late line items, as ``numwait``.

    Starts from orders having more than one line item, restricted to line
    items whose ``l_receiptdate`` is after ``l_commitdate``. Orders with
    exactly one such late line item are kept, limited to suppliers in
    ``SAUDI ARABIA`` and orders with status ``F``. Returns ``s_name`` and
    ``numwait`` sorted by ``numwait`` descending then ``s_name`` ascending,
    limited to the first 100 rows.
    """
    dataset, ext = run_config.dataset_path, run_config.suffix
    lineitem = get_data(dataset, "lineitem", ext)
    nation = get_data(dataset, "nation", ext)
    orders = get_data(dataset, "orders", ext)
    supplier = get_data(dataset, "supplier", ext)

    nation_name = "SAUDI ARABIA"

    # The same row-count aggregation is applied twice: first to all line
    # items of an order, then to the late ones only.
    rows_per_order = pl.col("l_suppkey").len().alias("n_supp_by_order")

    multi_item_orders = (
        lineitem.group_by("l_orderkey")
        .agg(rows_per_order)
        .filter(pl.col("n_supp_by_order") > 1)
    )
    late_items = lineitem.filter(pl.col("l_receiptdate") > pl.col("l_commitdate"))
    late_in_multi = multi_item_orders.join(late_items, on="l_orderkey")

    late_counts = late_in_multi.group_by("l_orderkey").agg(rows_per_order)
    enriched = (
        late_counts.join(late_in_multi, on="l_orderkey")
        .join(supplier, left_on="l_suppkey", right_on="s_suppkey")
        .join(nation, left_on="s_nationkey", right_on="n_nationkey")
        .join(orders, left_on="l_orderkey", right_on="o_orderkey")
    )
    waiting = (
        enriched.filter(pl.col("n_supp_by_order") == 1)
        .filter(pl.col("n_name") == nation_name)
        .filter(pl.col("o_orderstatus") == "F")
    )

    per_supplier = waiting.group_by("s_name").agg(pl.len().alias("numwait"))
    ranked = per_supplier.sort(by=["numwait", "s_name"], descending=[True, False])
    return ranked.head(100)
|
|
778
|
+
|
|
779
|
+
@staticmethod
def q22(run_config: RunConfig) -> pl.LazyFrame:
    """Query 22: order-less customers with above-average balance, by code.

    ``cntrycode`` is ``c_phone`` sliced at offset 0 with length 2, and only
    codes matching the pattern ``13|31|23|29|30|18|17`` are considered.
    Among those customers, the ones without any row in ``orders`` and with
    ``c_acctbal`` above the mean of the positive balances are counted
    (``numcust``) and their balances summed (``totacctbal``, rounded to two
    decimals) per ``cntrycode``. The result is sorted by ``cntrycode``.
    """
    dataset, ext = run_config.dataset_path, run_config.suffix
    customer = get_data(dataset, "customer", ext)
    orders = get_data(dataset, "orders", ext)

    country_code = pl.col("c_phone").str.slice(0, 2).alias("cntrycode")
    candidates = (
        customer.with_columns(country_code)
        .filter(pl.col("cntrycode").str.contains("13|31|23|29|30|18|17"))
        .select("c_acctbal", "c_custkey", "cntrycode")
    )

    # Single-row frame holding the mean of the strictly positive balances.
    positive_balances = candidates.filter(pl.col("c_acctbal") > 0.0)
    average_balance = positive_balances.select(
        pl.col("c_acctbal").mean().alias("avg_acctbal")
    )

    # Distinct customers with at least one order; the key is duplicated
    # under the name used by the join below.
    ordering_customers = orders.select(pl.col("o_custkey").unique()).with_columns(
        pl.col("o_custkey").alias("c_custkey")
    )

    # After the left join, a null o_custkey marks a customer with no orders.
    without_orders = candidates.join(
        ordering_customers, on="c_custkey", how="left"
    ).filter(pl.col("o_custkey").is_null())
    above_average = without_orders.join(average_balance, how="cross").filter(
        pl.col("c_acctbal") > pl.col("avg_acctbal")
    )

    summary = above_average.group_by("cntrycode").agg(
        pl.col("c_acctbal").count().alias("numcust"),
        pl.col("c_acctbal").sum().round(2).alias("totacctbal"),
    )
    return summary.sort("cntrycode")
|
|
811
|
+
|
|
812
|
+
|
|
813
|
+
if __name__ == "__main__":
    # Executed as a script: pass the query collection to the runner.
    run_polars(PDSHQueries)
|