spark-connect 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +82 -0
- data/LICENSE +202 -0
- data/NOTICE +16 -0
- data/README.md +166 -0
- data/lib/spark-connect.rb +5 -0
- data/lib/spark_connect/arrow.rb +115 -0
- data/lib/spark_connect/catalog.rb +190 -0
- data/lib/spark_connect/channel_builder.rb +134 -0
- data/lib/spark_connect/client.rb +264 -0
- data/lib/spark_connect/column.rb +379 -0
- data/lib/spark_connect/conf.rb +79 -0
- data/lib/spark_connect/data_frame.rb +828 -0
- data/lib/spark_connect/errors.rb +58 -0
- data/lib/spark_connect/functions.rb +903 -0
- data/lib/spark_connect/grouped_data.rb +101 -0
- data/lib/spark_connect/na_functions.rb +98 -0
- data/lib/spark_connect/observation.rb +61 -0
- data/lib/spark_connect/pipelines.rb +221 -0
- data/lib/spark_connect/plan.rb +39 -0
- data/lib/spark_connect/proto/spark/connect/base_pb.rb +118 -0
- data/lib/spark_connect/proto/spark/connect/base_services_pb.rb +82 -0
- data/lib/spark_connect/proto/spark/connect/catalog_pb.rb +46 -0
- data/lib/spark_connect/proto/spark/connect/commands_pb.rb +67 -0
- data/lib/spark_connect/proto/spark/connect/common_pb.rb +32 -0
- data/lib/spark_connect/proto/spark/connect/expressions_pb.rb +63 -0
- data/lib/spark_connect/proto/spark/connect/ml_common_pb.rb +22 -0
- data/lib/spark_connect/proto/spark/connect/ml_pb.rb +32 -0
- data/lib/spark_connect/proto/spark/connect/pipelines_pb.rb +45 -0
- data/lib/spark_connect/proto/spark/connect/relations_pb.rb +102 -0
- data/lib/spark_connect/proto/spark/connect/types_pb.rb +46 -0
- data/lib/spark_connect/proto.rb +32 -0
- data/lib/spark_connect/reader.rb +98 -0
- data/lib/spark_connect/row.rb +105 -0
- data/lib/spark_connect/session.rb +317 -0
- data/lib/spark_connect/stat_functions.rb +109 -0
- data/lib/spark_connect/streaming.rb +351 -0
- data/lib/spark_connect/types.rb +490 -0
- data/lib/spark_connect/version.rb +11 -0
- data/lib/spark_connect/window.rb +119 -0
- data/lib/spark_connect/writer.rb +208 -0
- data/lib/spark_connect.rb +58 -0
- data/proto/spark/connect/base.proto +1275 -0
- data/proto/spark/connect/catalog.proto +243 -0
- data/proto/spark/connect/commands.proto +553 -0
- data/proto/spark/connect/common.proto +179 -0
- data/proto/spark/connect/expressions.proto +557 -0
- data/proto/spark/connect/ml.proto +147 -0
- data/proto/spark/connect/ml_common.proto +64 -0
- data/proto/spark/connect/pipelines.proto +307 -0
- data/proto/spark/connect/relations.proto +1252 -0
- data/proto/spark/connect/types.proto +227 -0
- metadata +149 -0
|
@@ -0,0 +1,828 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SparkConnect
|
|
4
|
+
# A distributed, lazily-evaluated collection of rows organised into named
|
|
5
|
+
# columns - the central abstraction of the DataFrame API.
|
|
6
|
+
#
|
|
7
|
+
# A {DataFrame} is immutable: every transformation ({#select}, {#filter},
|
|
8
|
+
# {#join}, ...) returns a new {DataFrame} wrapping a new logical plan; nothing
|
|
9
|
+
# is sent to the server until an action ({#collect}, {#show}, {#count}, ...) is
|
|
10
|
+
# invoked.
|
|
11
|
+
#
|
|
12
|
+
# Method names are snake_case (Ruby idiom); camelCase aliases are provided for
|
|
13
|
+
# the highest-traffic PySpark names (`groupBy`, `withColumn`, `orderBy`, ...).
|
|
14
|
+
#
|
|
15
|
+
# @example
|
|
16
|
+
# F = SparkConnect::F
|
|
17
|
+
# df = spark.range(100)
|
|
18
|
+
# df.filter(F.col("id") % 2 == 0)
|
|
19
|
+
# .select((F.col("id") * 10).alias("ten_x"))
|
|
20
|
+
# .order_by(F.col("ten_x").desc)
|
|
21
|
+
# .show(5)
|
|
22
|
+
class DataFrame
|
|
23
|
+
Proto = SparkConnect::Proto
|
|
24
|
+
|
|
25
|
+
# Accepted `how:` spellings for {#join} (as lower-case symbols), each mapped to
# the join-type enum symbol that {#join} places on the Join message.
JOIN_TYPES = [
  [:JOIN_TYPE_INNER, %i[inner]],
  [:JOIN_TYPE_CROSS, %i[cross]],
  [:JOIN_TYPE_FULL_OUTER, %i[outer full fullouter full_outer]],
  [:JOIN_TYPE_LEFT_OUTER, %i[left leftouter left_outer]],
  [:JOIN_TYPE_RIGHT_OUTER, %i[right rightouter right_outer]],
  [:JOIN_TYPE_LEFT_SEMI, %i[semi leftsemi left_semi]],
  [:JOIN_TYPE_LEFT_ANTI, %i[anti leftanti left_anti]]
].each_with_object({}) do |(join_type, spellings), table|
  spellings.each { |spelling| table[spelling] = join_type }
end.freeze
|
|
45
|
+
|
|
46
|
+
# @return [SparkSession]
|
|
47
|
+
attr_reader :session
|
|
48
|
+
# @return [Spark::Connect::Relation] the logical plan this DataFrame builds.
|
|
49
|
+
attr_reader :relation
|
|
50
|
+
|
|
51
|
+
# @param session [SparkSession]
|
|
52
|
+
# @param relation [Spark::Connect::Relation]
|
|
53
|
+
def initialize(session, relation)
  # Store the owning session and the logical plan this DataFrame wraps.
  @session, @relation = session, relation
end
|
|
57
|
+
|
|
58
|
+
# ---- Projection --------------------------------------------------------
|
|
59
|
+
|
|
60
|
+
# Select a set of columns or expressions.
|
|
61
|
+
#
|
|
62
|
+
# @param cols [Array<Column, String, Symbol>]
|
|
63
|
+
# @return [DataFrame]
|
|
64
|
+
def select(*cols)
  # normalize_columns is defined elsewhere; its results must respond to #to_expr.
  projection = Proto::Project.new(
    input: @relation,
    expressions: normalize_columns(cols).map { |column| column.to_expr }
  )
  build(project: projection)
end
|
|
68
|
+
|
|
69
|
+
# Select using SQL expression strings.
|
|
70
|
+
#
|
|
71
|
+
# @param exprs [Array<String>]
|
|
72
|
+
# @return [DataFrame]
|
|
73
|
+
def select_expr(*exprs)
  # Each SQL snippet is wrapped as an ExpressionString inside an Expression.
  expressions = exprs.flatten.map do |sql|
    wrapped = Proto::Expression::ExpressionString.new(expression: sql)
    Proto::Expression.new(expression_string: wrapped)
  end
  build(project: Proto::Project.new(input: @relation, expressions: expressions))
end
alias selectExpr select_expr
|
|
80
|
+
|
|
81
|
+
# ---- Filtering ---------------------------------------------------------
|
|
82
|
+
|
|
83
|
+
# Filter rows by a condition.
|
|
84
|
+
#
|
|
85
|
+
# @param condition [Column, String] a boolean column or SQL expression string.
|
|
86
|
+
# @return [DataFrame]
|
|
87
|
+
def filter(condition)
  # A String goes through Functions.expr; anything else is used as-is and
  # must respond to #to_expr.
  predicate = condition
  predicate = Functions.expr(condition) if condition.is_a?(String)
  node = Proto::Filter.new(input: @relation, condition: predicate.to_expr)
  build(filter: node)
end
alias where filter
|
|
92
|
+
|
|
93
|
+
# ---- Column manipulation ----------------------------------------------
|
|
94
|
+
|
|
95
|
+
# Add or replace a single column.
|
|
96
|
+
#
|
|
97
|
+
# @param name [String]
|
|
98
|
+
# @param col [Column]
|
|
99
|
+
# @return [DataFrame]
|
|
100
|
+
def with_column(name, col)
  # Single-column convenience: delegate to #with_columns with a one-entry Hash.
  with_columns({ name => col })
end
alias withColumn with_column
|
|
104
|
+
|
|
105
|
+
# Add or replace multiple columns.
|
|
106
|
+
#
|
|
107
|
+
# @param assigns [Hash{String=>Column}]
|
|
108
|
+
# @return [DataFrame]
|
|
109
|
+
def with_columns(assigns)
  # One Alias per (name, column) pair; the name is stringified, the value is
  # coerced through Column.to_col.
  aliases = assigns.map do |name, col|
    expr = Column.to_col(col).to_expr
    Proto::Expression::Alias.new(expr: expr, name: [name.to_s])
  end
  node = Proto::WithColumns.new(input: @relation, aliases: aliases)
  build(with_columns: node)
end
alias withColumns with_columns
|
|
116
|
+
|
|
117
|
+
# Rename a single column.
|
|
118
|
+
# @return [DataFrame]
|
|
119
|
+
def with_column_renamed(existing, new_name)
  # Single-rename convenience: delegate to #with_columns_renamed.
  with_columns_renamed({ existing => new_name })
end
alias withColumnRenamed with_column_renamed
|
|
123
|
+
|
|
124
|
+
# Rename multiple columns.
|
|
125
|
+
#
|
|
126
|
+
# @param renames [Hash{String=>String}]
|
|
127
|
+
# @return [DataFrame]
|
|
128
|
+
def with_columns_renamed(renames)
  # Both sides of each pair are stringified, so Symbol keys/values work too.
  pairs = renames.map do |old, new_name|
    Proto::WithColumnsRenamed::Rename.new(col_name: old.to_s, new_col_name: new_name.to_s)
  end
  node = Proto::WithColumnsRenamed.new(input: @relation, renames: pairs)
  build(with_columns_renamed: node)
end
alias withColumnsRenamed with_columns_renamed
|
|
135
|
+
|
|
136
|
+
# Drop one or more columns (by name or {Column}).
|
|
137
|
+
# @return [DataFrame]
|
|
138
|
+
def drop(*cols)
  # Column instances are sent as expressions; everything else is sent by name.
  column_args, name_args = cols.flatten.partition { |c| Column === c }
  node = Proto::Drop.new(
    input: @relation,
    columns: column_args.map(&:to_expr),
    column_names: name_args.map(&:to_s)
  )
  build(drop: node)
end
|
|
149
|
+
|
|
150
|
+
# Rename all columns positionally.
|
|
151
|
+
# @return [DataFrame]
|
|
152
|
+
def to_df(*names)
  # Accepts either a splat or a single Array of names; all are stringified.
  column_names = names.flatten.map { |n| n.to_s }
  build(to_df: Proto::ToDF.new(input: @relation, column_names: column_names))
end
alias toDF to_df
|
|
156
|
+
|
|
157
|
+
# Apply a {Types::StructType} (reconciling/casting columns to it).
|
|
158
|
+
# @return [DataFrame]
|
|
159
|
+
def to(schema)
  # `schema` must respond to #to_proto.
  node = Proto::ToSchema.new(input: @relation, schema: schema.to_proto)
  build(to_schema: node)
end
|
|
162
|
+
|
|
163
|
+
# ---- Deduplication -----------------------------------------------------
|
|
164
|
+
|
|
165
|
+
# Distinct rows.
|
|
166
|
+
# @return [DataFrame]
|
|
167
|
+
def distinct
  # Deduplicate using every column as the key.
  node = Proto::Deduplicate.new(input: @relation, all_columns_as_keys: true)
  build(deduplicate: node)
end
|
|
170
|
+
|
|
171
|
+
# Drop duplicate rows, optionally restricted to a subset of columns.
|
|
172
|
+
#
|
|
173
|
+
# @param subset [Array<String>, nil]
|
|
174
|
+
# @return [DataFrame]
|
|
175
|
+
def drop_duplicates(subset = nil)
  build(deduplicate: deduplicate_message(subset))
end
alias dropDuplicates drop_duplicates

# Drop duplicate rows within the event-time watermark of a streaming
# DataFrame, optionally restricted to a subset of columns.
#
# This is a separate method rather than an alias of {#drop_duplicates}: it
# sets `within_watermark` on the Deduplicate message, which a plain alias
# never did.
#
# @param subset [Array<String>, String, nil]
# @return [DataFrame]
def drop_duplicates_within_watermark(subset = nil)
  dedup = deduplicate_message(subset)
  dedup.within_watermark = true
  build(deduplicate: dedup)
end
alias dropDuplicatesWithinWatermark drop_duplicates_within_watermark

# Build the Deduplicate message shared by both dedup variants: all columns
# are keys when `subset` is nil or empty, otherwise only the named columns.
private def deduplicate_message(subset)
  if subset.nil? || subset.empty?
    Proto::Deduplicate.new(input: @relation, all_columns_as_keys: true)
  else
    Proto::Deduplicate.new(input: @relation, column_names: Array(subset).map(&:to_s))
  end
end
|
|
186
|
+
|
|
187
|
+
# ---- Ordering ----------------------------------------------------------
|
|
188
|
+
|
|
189
|
+
# Sort by the given columns (globally).
|
|
190
|
+
#
|
|
191
|
+
# @param cols [Array<Column, String>]
|
|
192
|
+
# @return [DataFrame]
|
|
193
|
+
def order_by(*cols)
  # Global sort: each normalised column is turned into a sort order.
  sort_orders = normalize_columns(cols).map { |column| to_sort_order(column) }
  node = Proto::Sort.new(input: @relation, order: sort_orders, is_global: true)
  build(sort: node)
end
alias sort order_by
alias orderBy order_by
|
|
199
|
+
|
|
200
|
+
# Sort within each partition (no global shuffle).
|
|
201
|
+
# @return [DataFrame]
|
|
202
|
+
def sort_within_partitions(*cols)
  # Same message as #order_by, but with is_global set to false.
  sort_orders = normalize_columns(cols).map { |column| to_sort_order(column) }
  node = Proto::Sort.new(input: @relation, order: sort_orders, is_global: false)
  build(sort: node)
end
alias sortWithinPartitions sort_within_partitions
|
|
207
|
+
|
|
208
|
+
# ---- Limiting ----------------------------------------------------------
|
|
209
|
+
|
|
210
|
+
# @return [DataFrame] the first `n` rows.
|
|
211
|
+
def limit(n)
  # Wrap the current plan in a Limit node of `n` rows.
  node = Proto::Limit.new(input: @relation, limit: n)
  build(limit: node)
end
|
|
214
|
+
|
|
215
|
+
# @return [DataFrame] all rows except the first `n`.
|
|
216
|
+
def offset(n)
  # Wrap the current plan in an Offset node skipping `n` rows.
  node = Proto::Offset.new(input: @relation, offset: n)
  build(offset: node)
end
|
|
219
|
+
|
|
220
|
+
# ---- Grouping & aggregation -------------------------------------------
|
|
221
|
+
|
|
222
|
+
# Group by the given columns.
|
|
223
|
+
#
|
|
224
|
+
# @param cols [Array<Column, String>]
|
|
225
|
+
# @return [GroupedData]
|
|
226
|
+
def group_by(*cols)
  # Grouping is deferred: the GroupedData object carries the keys and group type.
  keys = normalize_columns(cols)
  GroupedData.new(self, keys, :GROUP_TYPE_GROUPBY)
end
alias groupBy group_by
alias groupby group_by
|
|
231
|
+
|
|
232
|
+
# Multi-dimensional rollup.
|
|
233
|
+
# @return [GroupedData]
|
|
234
|
+
def rollup(*cols)
  # Same shape as #group_by, with the ROLLUP group type.
  keys = normalize_columns(cols)
  GroupedData.new(self, keys, :GROUP_TYPE_ROLLUP)
end
|
|
237
|
+
|
|
238
|
+
# Multi-dimensional cube.
|
|
239
|
+
# @return [GroupedData]
|
|
240
|
+
def cube(*cols)
  # Same shape as #group_by, with the CUBE group type.
  keys = normalize_columns(cols)
  GroupedData.new(self, keys, :GROUP_TYPE_CUBE)
end
|
|
243
|
+
|
|
244
|
+
# Aggregate over the whole DataFrame (a group-by with no grouping columns).
|
|
245
|
+
#
|
|
246
|
+
# @param exprs [Array<Column>, Hash]
|
|
247
|
+
# @return [DataFrame]
|
|
248
|
+
def agg(*exprs)
  # A group-by with no keys, then aggregate over it.
  ungrouped = group_by
  ungrouped.agg(*exprs)
end
|
|
251
|
+
|
|
252
|
+
# ---- Joins -------------------------------------------------------------
|
|
253
|
+
|
|
254
|
+
# Join with another DataFrame.
|
|
255
|
+
#
|
|
256
|
+
# @param other [DataFrame]
|
|
257
|
+
# @param on [String, Array<String>, Column, nil] join key column name(s) or a
|
|
258
|
+
# boolean join condition.
|
|
259
|
+
# @param how [Symbol, String] join type (see {JOIN_TYPES}).
|
|
260
|
+
# @return [DataFrame]
|
|
261
|
+
def join(other, on: nil, how: :inner)
  # Normalise `how` (String or Symbol, any case) before the table lookup.
  lookup_key = how.to_s.downcase.to_sym
  join_type = JOIN_TYPES[lookup_key]
  raise IllegalArgumentError, "Unsupported join type: #{how}" unless join_type

  message = Proto::Join.new(left: @relation, right: other.relation, join_type: join_type)
  case on
  when nil
    # No condition and no USING columns.
  when Column
    message.join_condition = on.to_expr
  when Array
    on.each { |key| message.using_columns << key.to_s }
  else
    message.using_columns << on.to_s
  end
  build(join: message)
end
|
|
273
|
+
|
|
274
|
+
# Cartesian product with another DataFrame.
|
|
275
|
+
# @return [DataFrame]
|
|
276
|
+
def cross_join(other)
  # A Join message with the CROSS type and no condition.
  message = Proto::Join.new(left: @relation, right: other.relation, join_type: :JOIN_TYPE_CROSS)
  build(join: message)
end
alias crossJoin cross_join
|
|
280
|
+
|
|
281
|
+
# ---- Set operations ----------------------------------------------------
|
|
282
|
+
|
|
283
|
+
# Union (by position; keeps duplicates - equivalent to Spark's `unionAll`).
|
|
284
|
+
# @return [DataFrame]
|
|
285
|
+
def union(other)
  # is_all: true, so duplicates are kept.
  op_type = :SET_OP_TYPE_UNION
  set_op(other, op_type, is_all: true)
end
alias union_all union
alias unionAll union
|
|
290
|
+
|
|
291
|
+
# Union by column name.
|
|
292
|
+
#
|
|
293
|
+
# @param other [DataFrame]
|
|
294
|
+
# @param allow_missing_columns [Boolean]
|
|
295
|
+
# @return [DataFrame]
|
|
296
|
+
def union_by_name(other, allow_missing_columns: false)
  # Same as #union, but with by_name set and the caller's missing-column policy.
  set_op(
    other,
    :SET_OP_TYPE_UNION,
    is_all: true,
    by_name: true,
    allow_missing_columns: allow_missing_columns
  )
end
alias unionByName union_by_name
|
|
300
|
+
|
|
301
|
+
# Set intersection (distinct).
|
|
302
|
+
# @return [DataFrame]
|
|
303
|
+
def intersect(other)
  # is_all: false, so the result is distinct.
  op_type = :SET_OP_TYPE_INTERSECT
  set_op(other, op_type, is_all: false)
end
|
|
306
|
+
|
|
307
|
+
# Set intersection keeping duplicates.
|
|
308
|
+
# @return [DataFrame]
|
|
309
|
+
def intersect_all(other)
  # is_all: true, so duplicates are kept.
  op_type = :SET_OP_TYPE_INTERSECT
  set_op(other, op_type, is_all: true)
end
alias intersectAll intersect_all
|
|
313
|
+
|
|
314
|
+
# Rows in this DataFrame not in `other`, keeping duplicates - Spark's `EXCEPT ALL`.
|
|
315
|
+
# @return [DataFrame]
|
|
316
|
+
def except_all(other)
  # is_all: true, so duplicates are kept.
  op_type = :SET_OP_TYPE_EXCEPT
  set_op(other, op_type, is_all: true)
end
alias exceptAll except_all
|
|
320
|
+
|
|
321
|
+
# Rows in this DataFrame not in `other` (distinct) - Spark's `EXCEPT`.
|
|
322
|
+
# @return [DataFrame]
|
|
323
|
+
def subtract(other)
  # is_all: false, so the result is distinct.
  op_type = :SET_OP_TYPE_EXCEPT
  set_op(other, op_type, is_all: false)
end
|
|
326
|
+
|
|
327
|
+
# ---- Partitioning ------------------------------------------------------
|
|
328
|
+
|
|
329
|
+
# Repartition into `num_partitions`, optionally hash-partitioned by columns.
|
|
330
|
+
#
|
|
331
|
+
# @param num_partitions [Integer]
|
|
332
|
+
# @param cols [Array<Column, String>]
|
|
333
|
+
# @return [DataFrame]
|
|
334
|
+
def repartition(num_partitions, *cols)
  # Column-only form, e.g. `repartition("a", "b")`: a leading Column, String
  # or Symbol is a partitioning column, not a partition count. Previously it
  # was placed in the integer num_partitions field.
  case num_partitions
  when Column, String, Symbol
    cols = [num_partitions, *cols]
    num_partitions = nil
  end

  if cols.empty?
    # No columns: plain repartition with a shuffle.
    node = Proto::Repartition.new(input: @relation, num_partitions: num_partitions, shuffle: true)
    return build(repartition: node)
  end

  rbe = Proto::RepartitionByExpression.new(
    input: @relation, partition_exprs: normalize_columns(cols).map(&:to_expr)
  )
  # Left unset when no count was given, as in #repartition_by_range.
  rbe.num_partitions = num_partitions if num_partitions
  build(repartition_by_expression: rbe)
end
|
|
344
|
+
|
|
345
|
+
# Reduce to `num_partitions` without a full shuffle.
|
|
346
|
+
# @return [DataFrame]
|
|
347
|
+
def coalesce(num_partitions)
  # Same message as a plain repartition, but with shuffle set to false.
  node = Proto::Repartition.new(input: @relation, num_partitions: num_partitions, shuffle: false)
  build(repartition: node)
end
|
|
350
|
+
|
|
351
|
+
# Range-partition by the given columns (rows are range-partitioned on the
|
|
352
|
+
# sort order of the columns).
|
|
353
|
+
#
|
|
354
|
+
# @overload repartition_by_range(*cols)
|
|
355
|
+
# @overload repartition_by_range(num_partitions, *cols)
|
|
356
|
+
# @return [DataFrame]
|
|
357
|
+
def repartition_by_range(*args)
  # A leading Integer is the partition count; the rest are columns.
  num_partitions = args.shift if args.first.is_a?(Integer)

  orders = normalize_columns(args).map do |column|
    expr = column.to_expr
    # Expressions that already are sort orders pass through untouched.
    next expr if expr.expr_type == :sort_order

    ascending = Proto::Expression::SortOrder.new(
      child: expr, direction: :SORT_DIRECTION_ASCENDING, null_ordering: :SORT_NULLS_FIRST
    )
    Proto::Expression.new(sort_order: ascending)
  end

  rbe = Proto::RepartitionByExpression.new(input: @relation, partition_exprs: orders)
  rbe.num_partitions = num_partitions if num_partitions
  build(repartition_by_expression: rbe)
end
alias repartitionByRange repartition_by_range
|
|
374
|
+
|
|
375
|
+
# ---- Sampling ----------------------------------------------------------
|
|
376
|
+
|
|
377
|
+
# Random sample of rows.
|
|
378
|
+
#
|
|
379
|
+
# @param fraction [Float] expected fraction (0.0..1.0).
|
|
380
|
+
# @param with_replacement [Boolean]
|
|
381
|
+
# @param seed [Integer, nil]
|
|
382
|
+
# @return [DataFrame]
|
|
383
|
+
def sample(fraction, with_replacement: false, seed: nil)
  # The fraction is expressed as the range [0.0, fraction].
  message = Proto::Sample.new(
    input: @relation,
    lower_bound: 0.0,
    upper_bound: fraction,
    with_replacement: with_replacement
  )
  # The seed is only set when the caller supplied one.
  message.seed = seed if seed
  build(sample: message)
end
|
|
390
|
+
|
|
391
|
+
# ---- Misc transforms ---------------------------------------------------
|
|
392
|
+
|
|
393
|
+
# Alias this DataFrame (a subquery alias usable in join conditions).
|
|
394
|
+
# @return [DataFrame]
|
|
395
|
+
def alias(name)
  # Wrap the plan in a SubqueryAlias carrying the stringified name.
  aliased = Proto::SubqueryAlias.new(input: @relation, alias: name.to_s)
  build(subquery_alias: aliased)
end
alias as alias
|
|
399
|
+
|
|
400
|
+
# Attach a planner hint (e.g. `"broadcast"`).
|
|
401
|
+
#
|
|
402
|
+
# @param name [String]
|
|
403
|
+
# @param params [Array]
|
|
404
|
+
# @return [DataFrame]
|
|
405
|
+
def hint(name, *params)
  # Every parameter is coerced through Column.to_col before being sent.
  parameters = params.map { |param| Column.to_col(param).to_expr }
  message = Proto::Hint.new(input: @relation, name: name.to_s, parameters: parameters)
  build(hint: message)
end
|
|
410
|
+
|
|
411
|
+
# Unpivot (melt) columns from wide to long format.
|
|
412
|
+
#
|
|
413
|
+
# @param ids [Array<Column, String>] identifier columns.
|
|
414
|
+
# @param values [Array<Column, String>, nil] value columns (nil = all others).
|
|
415
|
+
# @param variable_column_name [String]
|
|
416
|
+
# @param value_column_name [String]
|
|
417
|
+
# @return [DataFrame]
|
|
418
|
+
def unpivot(ids, values, variable_column_name, value_column_name)
  id_exprs = normalize_columns(Array(ids)).map(&:to_expr)
  message = Proto::Unpivot.new(
    input: @relation,
    ids: id_exprs,
    variable_column_name: variable_column_name,
    value_column_name: value_column_name
  )
  # `values` is left unset for nil; an empty list is still sent explicitly.
  if !values.nil?
    value_exprs = normalize_columns(Array(values)).map(&:to_expr)
    message.values = Proto::Unpivot::Values.new(values: value_exprs)
  end
  build(unpivot: message)
end
alias melt unpivot
|
|
429
|
+
|
|
430
|
+
# ---- NA / stat / IO facades -------------------------------------------
|
|
431
|
+
|
|
432
|
+
# @return [DataFrameNaFunctions] missing-data helpers (`drop`, `fill`, `replace`).
|
|
433
|
+
def na
  # A fresh facade is created on every call.
  DataFrameNaFunctions.new(self)
end
|
|
436
|
+
|
|
437
|
+
# @return [DataFrameStatFunctions] statistical helpers.
|
|
438
|
+
def stat
  # A fresh facade is created on every call.
  DataFrameStatFunctions.new(self)
end
|
|
441
|
+
|
|
442
|
+
# @return [DataFrameWriter] interface for saving this DataFrame.
|
|
443
|
+
def write
  # A fresh writer bound to this DataFrame is created on every call.
  DataFrameWriter.new(self)
end
|
|
446
|
+
|
|
447
|
+
# @return [DataFrameWriterV2] the v2 (catalog) write interface.
|
|
448
|
+
def write_to(table)
  # A fresh v2 writer bound to this DataFrame and the given table.
  DataFrameWriterV2.new(self, table)
end
alias writeTo write_to
|
|
452
|
+
|
|
453
|
+
# @return [DataStreamWriter] interface for starting a streaming query from
|
|
454
|
+
# this (streaming) DataFrame.
|
|
455
|
+
def write_stream
  # A fresh streaming writer bound to this DataFrame.
  DataStreamWriter.new(self)
end
alias writeStream write_stream
|
|
459
|
+
|
|
460
|
+
# ---- Temporary views ---------------------------------------------------
|
|
461
|
+
|
|
462
|
+
# Register this DataFrame as a session-scoped temporary view, failing if a
|
|
463
|
+
# view of the same name already exists.
|
|
464
|
+
# @return [void]
|
|
465
|
+
def create_temp_view(name)
  # Not global, no replace.
  register_view(name, replace: false, global: false)
end
alias createTempView create_temp_view
|
|
469
|
+
|
|
470
|
+
# Register (or replace) this DataFrame as a session-scoped temporary view.
|
|
471
|
+
# @return [void]
|
|
472
|
+
def create_or_replace_temp_view(name)
  # Not global, replace enabled.
  register_view(name, replace: true, global: false)
end
alias createOrReplaceTempView create_or_replace_temp_view
|
|
476
|
+
|
|
477
|
+
# Register this DataFrame as a global (cross-session) temporary view.
|
|
478
|
+
# @return [void]
|
|
479
|
+
def create_global_temp_view(name)
  # Global, no replace.
  register_view(name, replace: false, global: true)
end
alias createGlobalTempView create_global_temp_view
|
|
483
|
+
|
|
484
|
+
# Register (or replace) this DataFrame as a global temporary view.
|
|
485
|
+
# @return [void]
|
|
486
|
+
def create_or_replace_global_temp_view(name)
  # Global, replace enabled.
  register_view(name, replace: true, global: true)
end
alias createOrReplaceGlobalTempView create_or_replace_global_temp_view
|
|
490
|
+
|
|
491
|
+
# Select columns by a regular expression matched against their names.
|
|
492
|
+
#
|
|
493
|
+
# @param regex [String]
|
|
494
|
+
# @return [Column]
|
|
495
|
+
def col_regex(regex)
  # The pattern is sent as a string in an UnresolvedRegex expression.
  pattern = Proto::Expression::UnresolvedRegex.new(col_name: regex.to_s)
  Column.new(Proto::Expression.new(unresolved_regex: pattern))
end
alias colRegex col_regex
|
|
499
|
+
|
|
500
|
+
# @return [DataFrame] a single-column (`value`) DataFrame of each row encoded
|
|
501
|
+
# as a JSON string.
|
|
502
|
+
def to_json(*_args)
  # Arguments are accepted and ignored. All columns are packed into one
  # struct, serialised with to_json, and exposed as `value`.
  whole_row = Functions.struct(Functions.col("*"))
  select(Functions.to_json(whole_row).alias("value"))
end
alias toJSON to_json
|
|
506
|
+
|
|
507
|
+
# Define an event-time watermark for late-data handling on a streaming
|
|
508
|
+
# DataFrame.
|
|
509
|
+
#
|
|
510
|
+
# @param event_time [String] the event-time column name.
|
|
511
|
+
# @param delay_threshold [String] e.g. `"10 minutes"`.
|
|
512
|
+
# @return [DataFrame]
|
|
513
|
+
def with_watermark(event_time, delay_threshold)
  # Both arguments are stringified before being placed on the message.
  message = Proto::WithWatermark.new(
    input: @relation,
    event_time: event_time.to_s,
    delay_threshold: delay_threshold.to_s
  )
  build(with_watermark: message)
end
alias withWatermark with_watermark
|
|
519
|
+
|
|
520
|
+
# Apply a function to this DataFrame and return its result, enabling a
|
|
521
|
+
# fluent chain of custom transformations.
|
|
522
|
+
#
|
|
523
|
+
# @yieldparam df [DataFrame] self
|
|
524
|
+
# @return [DataFrame] whatever the block returns
|
|
525
|
+
def transform
  # Yield this DataFrame to the block and return whatever the block returns.
  yield self
end
|
|
528
|
+
|
|
529
|
+
# Checkpoint this DataFrame (eagerly by default): materialise it server-side and return a
|
|
530
|
+
# new DataFrame backed by the cached result (truncates the logical plan).
|
|
531
|
+
#
|
|
532
|
+
# @param eager [Boolean] materialise immediately.
|
|
533
|
+
# @return [DataFrame]
|
|
534
|
+
def checkpoint(eager: true)
  # Non-local variant; the work is done by checkpoint_command (defined elsewhere).
  checkpoint_command(eager: eager, local: false)
end
|
|
537
|
+
|
|
538
|
+
# Like {#checkpoint} but uses the executors' local storage (no reliable
|
|
539
|
+
# storage), which is faster but not fault-tolerant.
|
|
540
|
+
#
|
|
541
|
+
# @param eager [Boolean]
|
|
542
|
+
# @return [DataFrame]
|
|
543
|
+
def local_checkpoint(eager: true)
  # Local variant of #checkpoint.
  checkpoint_command(eager: eager, local: true)
end
alias localCheckpoint local_checkpoint
|
|
547
|
+
|
|
548
|
+
# Observe named metrics over this DataFrame.
|
|
549
|
+
#
|
|
550
|
+
# @param name [String, Observation]
|
|
551
|
+
# @param exprs [Array<Column>]
|
|
552
|
+
# @return [DataFrame]
|
|
553
|
+
def observe(name, *exprs)
  # `name` is either an Observation (its own name is used and it is bound to
  # the resulting DataFrame) or anything stringifiable.
  observation = name.is_a?(Observation) ? name : nil
  metric_name = observation ? observation.name : name.to_s
  metrics = exprs.map { |metric| Column.to_col(metric).to_expr }

  message = Proto::CollectMetrics.new(input: @relation, name: metric_name, metrics: metrics)
  observed = build(collect_metrics: message)
  observation.bind(observed) if observation
  observed
end
|
|
562
|
+
|
|
563
|
+
# ---- Schema introspection ---------------------------------------------
|
|
564
|
+
|
|
565
|
+
# @return [Types::StructType] the DataFrame's schema.
|
|
566
|
+
def schema
  # Memoised: the analyze call runs only while @schema is unset.
  return @schema if @schema

  request = Proto::AnalyzePlanRequest::Schema.new(plan: plan)
  @schema = Types.from_proto(analyze(schema: request).schema.schema)
end
|
|
569
|
+
|
|
570
|
+
# @return [Array<String>] column names.
|
|
571
|
+
def columns
  # Derived from #schema, so the first call may trigger schema resolution.
  struct_type = schema
  struct_type.names
end
|
|
574
|
+
|
|
575
|
+
# @return [Array<Array(String, String)>] (name, simpleString-type) pairs.
|
|
576
|
+
def dtypes
  # One [name, simple type string] pair per schema field.
  schema.fields.map do |field|
    [field.name, field.data_type.simple_string]
  end
end
|
|
579
|
+
|
|
580
|
+
# @return [Array<Column>] one {Column} per output column.
|
|
581
|
+
def column_objects
  # One column reference per output column name.
  columns.map { |column_name| Functions.col(column_name) }
end
|
|
584
|
+
|
|
585
|
+
# Print the schema as an indented tree to `io`.
|
|
586
|
+
# @return [void]
|
|
587
|
+
def print_schema(io = $stdout)
  # Writes to any IO-like object responding to #puts.
  tree = schema.tree_string
  io.puts(tree)
end
alias printSchema print_schema
|
|
591
|
+
|
|
592
|
+
# Index into a column by name (`df["id"]`) or position (`df[0]`).
|
|
593
|
+
#
|
|
594
|
+
# @param key [String, Symbol, Integer]
|
|
595
|
+
# @return [Column]
|
|
596
|
+
def [](key)
  case key
  when Integer
    # fetch raises IndexError for an out-of-range position instead of
    # building a column from nil; negative indices still count from the end.
    Functions.col(columns.fetch(key))
  else
    Functions.col(key.to_s)
  end
end
|
|
602
|
+
|
|
603
|
+
# Allows `df.column_name` for valid identifier column names.
|
|
604
|
+
def method_missing(name, *args)
  # Only a zero-argument call matching a column name becomes a column;
  # everything else goes to the default handling via super.
  column_call = args.empty? && columns.include?(name.to_s)
  return super unless column_call

  Functions.col(name.to_s)
end
|
|
611
|
+
|
|
612
|
+
def respond_to_missing?(name, include_private = false)
  # A failed column lookup (any StandardError) counts as "not a column".
  known_column =
    begin
      columns.include?(name.to_s)
    rescue StandardError
      false
    end
  known_column || super
end
|
|
619
|
+
|
|
620
|
+
# ---- Actions -----------------------------------------------------------
|
|
621
|
+
|
|
622
|
+
# Execute the plan and return all rows.
|
|
623
|
+
# @return [Array<Row>]
|
|
624
|
+
def collect
  # Execute the plan, then convert the returned Arrow batches into rows.
  batches = @session.client.execute_plan(@relation).arrow_batches
  ArrowConverter.to_rows(batches)
end
alias to_a collect
|
|
629
|
+
|
|
630
|
+
# Iterate over all rows (materialises the result). Returns an Enumerator when
|
|
631
|
+
# no block is given.
|
|
632
|
+
#
|
|
633
|
+
# @yieldparam row [Row]
|
|
634
|
+
# @return [Enumerator, void]
|
|
635
|
+
def each(&block)
  # Forwarding a nil block is the same as calling #each without one, so this
  # returns an Enumerator when no block is given.
  collect.each(&block)
end
|
|
640
|
+
|
|
641
|
+
# @return [Enumerator<Row>] an enumerator over all rows.
|
|
642
|
+
def to_local_iterator
  # The full result is collected first; the enumerator walks that array.
  rows = collect
  rows.each
end
alias toLocalIterator to_local_iterator
|
|
646
|
+
|
|
647
|
+
# @return [Array<Row>] the first `n` rows.
|
|
648
|
+
def take(n)
  # Collect a plan limited to `n` rows.
  limited = limit(n)
  limited.collect
end
|
|
651
|
+
|
|
652
|
+
# @return [Array<Row>, Row] `n` rows (array) or the single first row when called with no arg.
|
|
653
|
+
def head(n = nil)
  # No argument: a single Row (or nil). With `n`: an Array of rows.
  n.nil? ? first : take(n)
end
|
|
658
|
+
|
|
659
|
+
# @return [Row, nil] the first row.
|
|
660
|
+
def first
  # Fetch one row; nil when there are no rows.
  rows = take(1)
  rows.first
end
|
|
663
|
+
|
|
664
|
+
# @return [Integer] the number of rows.
|
|
665
|
+
def count
  # A count(1) aggregate with no grouping keys.
  counter = Column.invoke("count", Column.lit(1)).to_expr
  aggregate = Proto::Aggregate.new(
    input: @relation,
    group_type: :GROUP_TYPE_GROUPBY,
    grouping_expressions: [],
    aggregate_expressions: [counter]
  )
  result_row = build(aggregate: aggregate).collect.first
  return 0 unless result_row

  result_row[0]
end
|
|
675
|
+
|
|
676
|
+
# @return [Boolean] whether the DataFrame has no rows.
|
|
677
|
+
def empty?
  # Fetches at most one row.
  probe = limit(1).collect
  probe.empty?
end
alias is_empty empty?
|
|
681
|
+
|
|
682
|
+
# Render the first `n` rows as a formatted table.
|
|
683
|
+
#
|
|
684
|
+
# @param n [Integer]
|
|
685
|
+
# @param truncate [Boolean, Integer] truncate long values to 20 chars (true)
|
|
686
|
+
# or to the given width (Integer).
|
|
687
|
+
# @param vertical [Boolean]
|
|
688
|
+
# @return [void]
|
|
689
|
+
def show(n = 20, truncate: true, vertical: false)
  # Rendering is done by #show_string; this only prints to $stdout.
  table = show_string(n, truncate: truncate, vertical: vertical)
  $stdout.puts(table)
end
|
|
692
|
+
|
|
693
|
+
# @return [String] the formatted table string (what {#show} prints).
|
|
694
|
+
def show_string(n = 20, truncate: true, vertical: false)
  # true -> 20, false -> 0, anything else -> its integer value.
  width =
    if truncate == true
      20
    elsif truncate == false
      0
    else
      truncate.to_i
    end

  message = Proto::ShowString.new(input: @relation, num_rows: n, truncate: width, vertical: vertical)
  # The rendered table is the first cell of the first row; "" when no row is returned.
  first_row = build(show_string: message).collect.first
  first_row.nil? ? nil.to_s : first_row[0].to_s
end
|
|
704
|
+
|
|
705
|
+
# Materialise the result as an Arrow {Arrow::Table} (columnar).
|
|
706
|
+
# @return [Arrow::Table, nil]
|
|
707
|
+
def to_arrow
  # Execute the plan, then hand the Arrow batches to the table converter.
  batches = @session.client.execute_plan(@relation).arrow_batches
  ArrowConverter.to_table(batches)
end
|
|
711
|
+
|
|
712
|
+
# @return [Array<Hash>] all rows as Hashes.
|
|
713
|
+
def to_h_array
  # Collect all rows, then convert each with #to_h.
  collect.map { |row| row.to_h }
end
|
|
716
|
+
|
|
717
|
+
# ---- Explain / metadata ------------------------------------------------
|
|
718
|
+
|
|
719
|
+
# Return the query plan as a string.
#
# @param mode [Symbol, String] one of `:simple`, `:extended`, `:codegen`,
#   `:cost`, `:formatted` (matched case-insensitively).
# @return [String]
# @raise [ArgumentError] if `mode` is not one of the modes listed above.
def explain_string(mode = :simple)
  mode_name = mode.to_s.downcase
  # Validate up front: the enum symbol built below is only meaningful for
  # these five modes, and a typo should be reported in the caller's terms.
  unless %w[simple extended codegen cost formatted].include?(mode_name)
    raise ArgumentError,
          "unknown explain mode #{mode.inspect}; expected one of simple, extended, codegen, cost, formatted"
  end

  em = :"EXPLAIN_MODE_#{mode_name.upcase}"
  request = Proto::AnalyzePlanRequest::Explain.new(plan: plan, explain_mode: em)
  analyze(explain: request).explain.explain_string
end
|
|
727
|
+
|
|
728
|
+
# Write the query plan to standard output.
#
# @param mode [Symbol] forwarded to {#explain_string}.
# @return [void]
def explain(mode = :simple)
  text = explain_string(mode)
  $stdout.puts(text)
end
|
|
733
|
+
|
|
734
|
+
# @return [Array<String>] the `files` of the input-files analysis response,
#   converted to an Array.
def input_files
  request = Proto::AnalyzePlanRequest::InputFiles.new(plan: plan)
  analyze(input_files: request).input_files.files.to_a
end
|
|
738
|
+
|
|
739
|
+
# @return [Boolean] the `is_local` flag of the is-local analysis response
#   for this plan.
def local?
  request = Proto::AnalyzePlanRequest::IsLocal.new(plan: plan)
  analyze(is_local: request).is_local.is_local
end
|
|
743
|
+
|
|
744
|
+
# @return [Boolean] the `is_streaming` flag of the is-streaming analysis
#   response for this plan.
def streaming?
  request = Proto::AnalyzePlanRequest::IsStreaming.new(plan: plan)
  analyze(is_streaming: request).is_streaming.is_streaming
end
|
|
748
|
+
|
|
749
|
+
# Compare this DataFrame's plan with another one's.
#
# @param other [DataFrame] its `plan` is sent as `other_plan`.
# @return [Boolean] the `result` of the same-semantics analysis response.
def same_semantics?(other)
  request = Proto::AnalyzePlanRequest::SameSemantics.new(target_plan: plan, other_plan: other.plan)
  analyze(same_semantics: request).same_semantics.result
end
|
|
755
|
+
|
|
756
|
+
# @return [Integer] the `result` of the semantic-hash analysis response for
#   this plan.
def semantic_hash
  request = Proto::AnalyzePlanRequest::SemanticHash.new(plan: plan)
  analyze(semantic_hash: request).semantic_hash.result
end
|
|
760
|
+
|
|
761
|
+
# @api private - wraps this DataFrame's relation via `PlanBuilder.root_plan`.
# @return [Spark::Connect::Plan]
def plan
  root = @relation
  PlanBuilder.root_plan(root)
end
|
|
766
|
+
|
|
767
|
+
# @return [String] a fixed label; no relation or session details are included.
def to_s
  label = "#<SparkConnect::DataFrame>"
  label
end
alias inspect to_s
|
|
771
|
+
|
|
772
|
+
# @api private - wrap a relation in a new DataFrame on the same session.
# Used by facade classes (GroupedData, NaFunctions, ...).
#
# @param rel [Hash] keyword arguments forwarded to `PlanBuilder.relation`.
# @return [DataFrame]
def build(**rel)
  derived = PlanBuilder.relation(@session, **rel)
  DataFrame.new(@session, derived)
end
|
|
777
|
+
|
|
778
|
+
private
|
|
779
|
+
|
|
780
|
+
# Forward an analysis request to the session's client.
#
# @param kw [Hash] keyword arguments passed straight to `client.analyze`.
# @return [Object] whatever `client.analyze` returns.
def analyze(**kw)
  client = @session.client
  client.analyze(**kw)
end
|
|
783
|
+
|
|
784
|
+
# Execute a CreateDataFrameViewCommand for this relation.
#
# @param name [#to_s] view name; converted with `to_s`.
# @param global [Boolean] sent as `is_global`.
# @param replace [Boolean] sent as `replace`.
# @return [nil]
def register_view(name, global:, replace:)
  view = Proto::CreateDataFrameViewCommand.new(
    input: @relation,
    name: name.to_s,
    is_global: global,
    replace: replace
  )
  command = Proto::Command.new(create_dataframe_view: view)
  @session.client.execute_command(command)
  nil
end
|
|
791
|
+
|
|
792
|
+
# Execute a CheckpointCommand and wrap the relation named in the result.
#
# @param local [Boolean] sent as the command's `local` field.
# @param eager [Boolean] sent as the command's `eager` field.
# @return [DataFrame] a DataFrame over a CachedRemoteRelation carrying the
#   returned relation id and a fresh plan id.
# @raise [SparkConnectError] when the result has no checkpoint relation.
def checkpoint_command(local:, eager:)
  checkpoint = Proto::CheckpointCommand.new(relation: @relation, local: local, eager: eager)
  command = Proto::Command.new(checkpoint_command: checkpoint)
  handle = @session.client.execute_command(command).checkpoint_relation
  raise SparkConnectError, "Server did not return a checkpointed relation" unless handle

  common = Proto::RelationCommon.new(plan_id: @session.next_plan_id)
  remote = Proto::CachedRemoteRelation.new(relation_id: handle.relation_id)
  DataFrame.new(@session, Proto::Relation.new(common: common, cached_remote_relation: remote))
end
|
|
804
|
+
|
|
805
|
+
# Flatten the arguments and make sure every entry is a Column.
#
# @param cols [Array] Columns and/or names, possibly nested in arrays.
# @return [Array<Column>] Columns are kept as-is; anything else becomes
#   `Functions.col(value.to_s)`.
def normalize_columns(cols)
  cols.flatten.map do |entry|
    next entry if entry.is_a?(Column)

    Functions.col(entry.to_s)
  end
end
|
|
808
|
+
|
|
809
|
+
# Turn a column into a SortOrder message.
#
# @param col [#to_expr] column whose expression is inspected.
# @return [Proto::Expression::SortOrder] the expression's own sort order when
#   its `expr_type` is `:sort_order`; otherwise a new ascending, nulls-first
#   order wrapping the expression.
def to_sort_order(col)
  expr = col.to_expr
  return expr.sort_order if expr.expr_type == :sort_order

  Proto::Expression::SortOrder.new(
    child: expr,
    direction: :SORT_DIRECTION_ASCENDING,
    null_ordering: :SORT_NULLS_FIRST
  )
end
|
|
819
|
+
|
|
820
|
+
# Combine this DataFrame with another through a SetOperation relation.
#
# @param other [DataFrame] right-hand side; its `relation` becomes `right_input`.
# @param type [Object] sent as `set_op_type`.
# @param is_all [Boolean] sent as `is_all`.
# @param by_name [Boolean] sent as `by_name`.
# @param allow_missing_columns [Boolean] sent as `allow_missing_columns`.
# @return [DataFrame] the result of {#build} for the new relation.
def set_op(other, type, is_all:, by_name: false, allow_missing_columns: false)
  fields = {
    left_input: @relation,
    right_input: other.relation,
    set_op_type: type,
    is_all: is_all,
    by_name: by_name,
    allow_missing_columns: allow_missing_columns
  }
  build(set_op: Proto::SetOperation.new(**fields))
end
|
|
827
|
+
end
|
|
828
|
+
end
|