spark-connect 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +82 -0
  3. data/LICENSE +202 -0
  4. data/NOTICE +16 -0
  5. data/README.md +166 -0
  6. data/lib/spark-connect.rb +5 -0
  7. data/lib/spark_connect/arrow.rb +115 -0
  8. data/lib/spark_connect/catalog.rb +190 -0
  9. data/lib/spark_connect/channel_builder.rb +134 -0
  10. data/lib/spark_connect/client.rb +264 -0
  11. data/lib/spark_connect/column.rb +379 -0
  12. data/lib/spark_connect/conf.rb +79 -0
  13. data/lib/spark_connect/data_frame.rb +828 -0
  14. data/lib/spark_connect/errors.rb +58 -0
  15. data/lib/spark_connect/functions.rb +903 -0
  16. data/lib/spark_connect/grouped_data.rb +101 -0
  17. data/lib/spark_connect/na_functions.rb +98 -0
  18. data/lib/spark_connect/observation.rb +61 -0
  19. data/lib/spark_connect/pipelines.rb +221 -0
  20. data/lib/spark_connect/plan.rb +39 -0
  21. data/lib/spark_connect/proto/spark/connect/base_pb.rb +118 -0
  22. data/lib/spark_connect/proto/spark/connect/base_services_pb.rb +82 -0
  23. data/lib/spark_connect/proto/spark/connect/catalog_pb.rb +46 -0
  24. data/lib/spark_connect/proto/spark/connect/commands_pb.rb +67 -0
  25. data/lib/spark_connect/proto/spark/connect/common_pb.rb +32 -0
  26. data/lib/spark_connect/proto/spark/connect/expressions_pb.rb +63 -0
  27. data/lib/spark_connect/proto/spark/connect/ml_common_pb.rb +22 -0
  28. data/lib/spark_connect/proto/spark/connect/ml_pb.rb +32 -0
  29. data/lib/spark_connect/proto/spark/connect/pipelines_pb.rb +45 -0
  30. data/lib/spark_connect/proto/spark/connect/relations_pb.rb +102 -0
  31. data/lib/spark_connect/proto/spark/connect/types_pb.rb +46 -0
  32. data/lib/spark_connect/proto.rb +32 -0
  33. data/lib/spark_connect/reader.rb +98 -0
  34. data/lib/spark_connect/row.rb +105 -0
  35. data/lib/spark_connect/session.rb +317 -0
  36. data/lib/spark_connect/stat_functions.rb +109 -0
  37. data/lib/spark_connect/streaming.rb +351 -0
  38. data/lib/spark_connect/types.rb +490 -0
  39. data/lib/spark_connect/version.rb +11 -0
  40. data/lib/spark_connect/window.rb +119 -0
  41. data/lib/spark_connect/writer.rb +208 -0
  42. data/lib/spark_connect.rb +58 -0
  43. data/proto/spark/connect/base.proto +1275 -0
  44. data/proto/spark/connect/catalog.proto +243 -0
  45. data/proto/spark/connect/commands.proto +553 -0
  46. data/proto/spark/connect/common.proto +179 -0
  47. data/proto/spark/connect/expressions.proto +557 -0
  48. data/proto/spark/connect/ml.proto +147 -0
  49. data/proto/spark/connect/ml_common.proto +64 -0
  50. data/proto/spark/connect/pipelines.proto +307 -0
  51. data/proto/spark/connect/relations.proto +1252 -0
  52. data/proto/spark/connect/types.proto +227 -0
  53. metadata +149 -0
@@ -0,0 +1,828 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SparkConnect
4
+ # A distributed, lazily-evaluated collection of rows organised into named
5
+ # columns - the central abstraction of the DataFrame API.
6
+ #
7
+ # A {DataFrame} is immutable: every transformation ({#select}, {#filter},
8
+ # {#join}, ...) returns a new {DataFrame} wrapping a new logical plan; nothing
9
+ # is sent to the server until an action ({#collect}, {#show}, {#count}, ...) is
10
+ # invoked.
11
+ #
12
+ # Method names are snake_case (Ruby idiom); camelCase aliases are provided for
13
+ # the highest-traffic PySpark names (`groupBy`, `withColumn`, `orderBy`, ...).
14
+ #
15
+ # @example
16
+ # F = SparkConnect::F
17
+ # df = spark.range(100)
18
+ # df.filter(F.col("id") % 2 == 0)
19
+ # .select((F.col("id") * 10).alias("ten_x"))
20
+ # .order_by(F.col("ten_x").desc)
21
+ # .show(5)
22
+ class DataFrame
23
+ Proto = SparkConnect::Proto
24
+
25
# Accepted join-type spellings (Symbol) mapped to the proto enum symbol.
# Declared per enum value; the expansion below yields one entry per spelling.
JOIN_TYPES = {
  JOIN_TYPE_INNER: %i[inner],
  JOIN_TYPE_CROSS: %i[cross],
  JOIN_TYPE_FULL_OUTER: %i[outer full fullouter full_outer],
  JOIN_TYPE_LEFT_OUTER: %i[left leftouter left_outer],
  JOIN_TYPE_RIGHT_OUTER: %i[right rightouter right_outer],
  JOIN_TYPE_LEFT_SEMI: %i[semi leftsemi left_semi],
  JOIN_TYPE_LEFT_ANTI: %i[anti leftanti left_anti]
}.each_with_object({}) do |(proto_type, spellings), table|
  spellings.each { |spelling| table[spelling] = proto_type }
end.freeze
45
+
46
+ # @return [SparkSession]
47
+ attr_reader :session
48
+ # @return [Spark::Connect::Relation] the logical plan this DataFrame builds.
49
+ attr_reader :relation
50
+
51
+ # @param session [SparkSession]
52
+ # @param relation [Spark::Connect::Relation]
53
def initialize(session, relation)
  # Store the owning session and the logical plan this DataFrame wraps.
  @session, @relation = session, relation
end
57
+
58
+ # ---- Projection --------------------------------------------------------
59
+
60
+ # Select a set of columns or expressions.
61
+ #
62
+ # @param cols [Array<Column, String, Symbol>]
63
+ # @return [DataFrame]
64
def select(*cols)
  # Non-Column arguments are turned into column references by normalize_columns.
  projection = Proto::Project.new(
    input: @relation,
    expressions: normalize_columns(cols).map { |column| column.to_expr }
  )
  build(project: projection)
end
68
+
69
+ # Select using SQL expression strings.
70
+ #
71
+ # @param exprs [Array<String>]
72
+ # @return [DataFrame]
73
def select_expr(*exprs)
  # Each SQL snippet is wrapped in an ExpressionString expression.
  expressions = exprs.flatten.map do |sql|
    wrapped = Proto::Expression::ExpressionString.new(expression: sql)
    Proto::Expression.new(expression_string: wrapped)
  end
  projection = Proto::Project.new(input: @relation, expressions: expressions)
  build(project: projection)
end
alias selectExpr select_expr
80
+
81
+ # ---- Filtering ---------------------------------------------------------
82
+
83
+ # Filter rows by a condition.
84
+ #
85
+ # @param condition [Column, String] a boolean column or SQL expression string.
86
+ # @return [DataFrame]
87
def filter(condition)
  # A String goes through Functions.expr; anything else must respond to #to_expr.
  predicate = condition
  predicate = Functions.expr(condition) if condition.is_a?(String)
  build(filter: Proto::Filter.new(input: @relation, condition: predicate.to_expr))
end
alias where filter
92
+
93
+ # ---- Column manipulation ----------------------------------------------
94
+
95
+ # Add or replace a single column.
96
+ #
97
+ # @param name [String]
98
+ # @param col [Column]
99
+ # @return [DataFrame]
100
def with_column(name, col)
  # Single-column convenience wrapper around #with_columns.
  with_columns({ name => col })
end
alias withColumn with_column
104
+
105
+ # Add or replace multiple columns.
106
+ #
107
+ # @param assigns [Hash{String=>Column}]
108
+ # @return [DataFrame]
109
def with_columns(assigns)
  # One Alias per (name, value) pair; values are coerced with Column.to_col.
  aliases = assigns.each_with_object([]) do |(name, col), acc|
    expr = Column.to_col(col).to_expr
    acc << Proto::Expression::Alias.new(expr: expr, name: [name.to_s])
  end
  build(with_columns: Proto::WithColumns.new(input: @relation, aliases: aliases))
end
alias withColumns with_columns
116
+
117
+ # Rename a single column.
118
+ # @return [DataFrame]
119
def with_column_renamed(existing, new_name)
  # Single-rename convenience wrapper around #with_columns_renamed.
  with_columns_renamed({ existing => new_name })
end
alias withColumnRenamed with_column_renamed
123
+
124
+ # Rename multiple columns.
125
+ #
126
+ # @param renames [Hash{String=>String}]
127
+ # @return [DataFrame]
128
def with_columns_renamed(renames)
  # One Rename message per (current, replacement) pair; both sides are stringified.
  rename_msgs = renames.map do |current, replacement|
    Proto::WithColumnsRenamed::Rename.new(col_name: current.to_s, new_col_name: replacement.to_s)
  end
  renamed = Proto::WithColumnsRenamed.new(input: @relation, renames: rename_msgs)
  build(with_columns_renamed: renamed)
end
alias withColumnsRenamed with_columns_renamed
135
+
136
+ # Drop one or more columns (by name or {Column}).
137
+ # @return [DataFrame]
138
def drop(*cols)
  # Column objects are sent as expressions; everything else is sent by name.
  column_args, name_args = cols.flatten.partition { |c| c.is_a?(Column) }
  dropped = Proto::Drop.new(
    input: @relation,
    columns: column_args.map(&:to_expr),
    column_names: name_args.map(&:to_s)
  )
  build(drop: dropped)
end
149
+
150
+ # Rename all columns positionally.
151
+ # @return [DataFrame]
152
def to_df(*names)
  # Nested arrays are flattened and every name is stringified.
  column_names = names.flatten.map { |n| n.to_s }
  build(to_df: Proto::ToDF.new(input: @relation, column_names: column_names))
end
alias toDF to_df
156
+
157
+ # Apply a {Types::StructType} (reconciling/casting columns to it).
158
+ # @return [DataFrame]
159
def to(schema)
  # The schema object must respond to #to_proto.
  target = Proto::ToSchema.new(input: @relation, schema: schema.to_proto)
  build(to_schema: target)
end
162
+
163
+ # ---- Deduplication -----------------------------------------------------
164
+
165
+ # Distinct rows.
166
+ # @return [DataFrame]
167
def distinct
  # Deduplicate using every column as the key.
  dedup = Proto::Deduplicate.new(input: @relation, all_columns_as_keys: true)
  build(deduplicate: dedup)
end
170
+
171
# Drop duplicate rows, optionally restricted to a subset of columns.
#
# @param subset [Array<String, Symbol>, String, Symbol, nil] key columns; nil
#   or empty means every column is part of the key.
# @return [DataFrame]
def drop_duplicates(subset = nil)
  build(deduplicate: deduplicate_message(subset, within_watermark: false))
end
alias dropDuplicates drop_duplicates

# Like {#drop_duplicates}, but sets the `within_watermark` flag on the
# Deduplicate relation. This used to be a plain alias of {#drop_duplicates},
# so the flag was never sent.
#
# @param subset [Array<String, Symbol>, String, Symbol, nil]
# @return [DataFrame]
def drop_duplicates_within_watermark(subset = nil)
  build(deduplicate: deduplicate_message(subset, within_watermark: true))
end
alias dropDuplicatesWithinWatermark drop_duplicates_within_watermark

# Builds the Deduplicate message shared by both public entry points.
private def deduplicate_message(subset, within_watermark:)
  fields = { input: @relation }
  if subset.nil? || subset.empty?
    fields[:all_columns_as_keys] = true
  else
    fields[:column_names] = Array(subset).map(&:to_s)
  end
  # Only set the flag when requested so plain drop_duplicates is unchanged.
  fields[:within_watermark] = true if within_watermark
  Proto::Deduplicate.new(**fields)
end
186
+
187
+ # ---- Ordering ----------------------------------------------------------
188
+
189
+ # Sort by the given columns (globally).
190
+ #
191
+ # @param cols [Array<Column, String>]
192
+ # @return [DataFrame]
193
def order_by(*cols)
  # Plain columns get the default ordering from to_sort_order; is_global is true here.
  ordering = normalize_columns(cols).map(&method(:to_sort_order))
  sorted = Proto::Sort.new(input: @relation, order: ordering, is_global: true)
  build(sort: sorted)
end
alias sort order_by
alias orderBy order_by
199
+
200
+ # Sort within each partition (no global shuffle).
201
+ # @return [DataFrame]
202
def sort_within_partitions(*cols)
  # Same as #order_by except the Sort relation is flagged is_global: false.
  ordering = normalize_columns(cols).map(&method(:to_sort_order))
  sorted = Proto::Sort.new(input: @relation, order: ordering, is_global: false)
  build(sort: sorted)
end
alias sortWithinPartitions sort_within_partitions
207
+
208
+ # ---- Limiting ----------------------------------------------------------
209
+
210
+ # @return [DataFrame] the first `n` rows.
211
def limit(n)
  # Wrap the current plan in a Limit relation.
  limited = Proto::Limit.new(input: @relation, limit: n)
  build(limit: limited)
end
214
+
215
+ # @return [DataFrame] all rows except the first `n`.
216
def offset(n)
  # Wrap the current plan in an Offset relation.
  skipped = Proto::Offset.new(input: @relation, offset: n)
  build(offset: skipped)
end
219
+
220
+ # ---- Grouping & aggregation -------------------------------------------
221
+
222
+ # Group by the given columns.
223
+ #
224
+ # @param cols [Array<Column, String>]
225
+ # @return [GroupedData]
226
def group_by(*cols)
  # Grouping keys are normalised to Column objects before being handed over.
  keys = normalize_columns(cols)
  GroupedData.new(self, keys, :GROUP_TYPE_GROUPBY)
end
alias groupBy group_by
alias groupby group_by
231
+
232
+ # Multi-dimensional rollup.
233
+ # @return [GroupedData]
234
def rollup(*cols)
  # Same as #group_by but with the ROLLUP group type.
  keys = normalize_columns(cols)
  GroupedData.new(self, keys, :GROUP_TYPE_ROLLUP)
end
237
+
238
+ # Multi-dimensional cube.
239
+ # @return [GroupedData]
240
def cube(*cols)
  # Same as #group_by but with the CUBE group type.
  keys = normalize_columns(cols)
  GroupedData.new(self, keys, :GROUP_TYPE_CUBE)
end
243
+
244
+ # Aggregate over the whole DataFrame (a group-by with no grouping columns).
245
+ #
246
+ # @param exprs [Array<Column>, Hash]
247
+ # @return [DataFrame]
248
def agg(*exprs)
  # Group by no columns, then delegate to GroupedData#agg.
  ungrouped = group_by
  ungrouped.agg(*exprs)
end
251
+
252
+ # ---- Joins -------------------------------------------------------------
253
+
254
# Join with another DataFrame.
#
# @param other [DataFrame]
# @param on [String, Symbol, Array<String, Symbol>, Column, Array<Column>, nil]
#   join key column name(s), a boolean join condition, or a list of
#   conditions that are combined with logical AND.
# @param how [Symbol, String] join type (see {JOIN_TYPES}); matched
#   case-insensitively.
# @return [DataFrame]
# @raise [IllegalArgumentError] if `how` is not a key of {JOIN_TYPES}, or if
#   `on` is an Array mixing {Column} objects with column names.
def join(other, on: nil, how: :inner)
  join_type = JOIN_TYPES.fetch(how.to_s.downcase.to_sym) do
    raise IllegalArgumentError, "Unsupported join type: #{how}"
  end
  j = Proto::Join.new(left: @relation, right: other.relation, join_type: join_type)
  case on
  when nil then nil
  when Column then j.join_condition = on.to_expr
  when Array
    if on.any? { |item| item.is_a?(Column) }
      # Previously Column entries were stringified into using_columns, which
      # produced meaningless key names. Treat them as conditions instead.
      unless on.all? { |item| item.is_a?(Column) }
        raise IllegalArgumentError,
              "Join `on` array must contain only column names or only Column conditions"
      end
      j.join_condition = on.reduce { |lhs, rhs| Column.invoke("and", lhs, rhs) }.to_expr
    else
      j.using_columns += on.map(&:to_s)
    end
  else j.using_columns << on.to_s
  end
  build(join: j)
end
273
+
274
+ # Cartesian product with another DataFrame.
275
+ # @return [DataFrame]
276
def cross_join(other)
  # A Join relation with the CROSS type and no condition or key columns.
  product = Proto::Join.new(left: @relation, right: other.relation, join_type: :JOIN_TYPE_CROSS)
  build(join: product)
end
alias crossJoin cross_join
280
+
281
+ # ---- Set operations ----------------------------------------------------
282
+
283
+ # Union (by position; keeps duplicates - equivalent to Spark's `unionAll`).
284
+ # @return [DataFrame]
285
def union(other)
  # is_all: true, so duplicate rows are kept.
  op_type = :SET_OP_TYPE_UNION
  set_op(other, op_type, is_all: true)
end
alias union_all union
alias unionAll union
290
+
291
+ # Union by column name.
292
+ #
293
+ # @param other [DataFrame]
294
+ # @param allow_missing_columns [Boolean]
295
+ # @return [DataFrame]
296
def union_by_name(other, allow_missing_columns: false)
  # Same UNION set operation as #union, with by_name switched on.
  options = { is_all: true, by_name: true, allow_missing_columns: allow_missing_columns }
  set_op(other, :SET_OP_TYPE_UNION, **options)
end
alias unionByName union_by_name
300
+
301
+ # Set intersection (distinct).
302
+ # @return [DataFrame]
303
def intersect(other)
  # is_all: false requests the distinct form.
  op_type = :SET_OP_TYPE_INTERSECT
  set_op(other, op_type, is_all: false)
end
306
+
307
+ # Set intersection keeping duplicates.
308
+ # @return [DataFrame]
309
def intersect_all(other)
  # is_all: true requests the duplicate-preserving form.
  op_type = :SET_OP_TYPE_INTERSECT
  set_op(other, op_type, is_all: true)
end
alias intersectAll intersect_all
313
+
314
+ # Rows in this DataFrame not in `other`, keeping duplicates - Spark's `EXCEPT ALL`.
315
+ # @return [DataFrame]
316
def except_all(other)
  # is_all: true requests the duplicate-preserving form of EXCEPT.
  op_type = :SET_OP_TYPE_EXCEPT
  set_op(other, op_type, is_all: true)
end
alias exceptAll except_all
320
+
321
+ # Rows in this DataFrame not in `other` (distinct) - Spark's `EXCEPT`.
322
+ # @return [DataFrame]
323
def subtract(other)
  # is_all: false requests the distinct form of EXCEPT.
  op_type = :SET_OP_TYPE_EXCEPT
  set_op(other, op_type, is_all: false)
end
326
+
327
+ # ---- Partitioning ------------------------------------------------------
328
+
329
# Repartition into `num_partitions`, optionally partitioned by column
# expressions.
#
# @overload repartition(num_partitions)
# @overload repartition(num_partitions, *cols)
# @overload repartition(*cols)
#   Partition by the given columns and leave the partition count unset.
# @param num_partitions [Integer, Column, String, Symbol] the partition count,
#   or the first partitioning column in the column-only form.
# @param cols [Array<Column, String>]
# @return [DataFrame]
# @raise [IllegalArgumentError] if neither a partition count nor a column is given.
def repartition(num_partitions, *cols)
  # Column-only form: a non-numeric first argument is really the first column.
  unless num_partitions.nil? || num_partitions.is_a?(Numeric)
    cols = [num_partitions, *cols]
    num_partitions = nil
  end

  if cols.empty?
    if num_partitions.nil?
      raise IllegalArgumentError, "repartition requires a partition count or at least one column"
    end

    build(repartition: Proto::Repartition.new(input: @relation, num_partitions: num_partitions, shuffle: true))
  else
    rbe = Proto::RepartitionByExpression.new(
      input: @relation, partition_exprs: normalize_columns(cols).map(&:to_expr)
    )
    # Assigned separately so the field stays unset when no count was given.
    rbe.num_partitions = num_partitions if num_partitions
    build(repartition_by_expression: rbe)
  end
end
344
+
345
+ # Reduce to `num_partitions` without a full shuffle.
346
+ # @return [DataFrame]
347
def coalesce(num_partitions)
  # Same Repartition relation as #repartition, but with shuffle: false.
  merged = Proto::Repartition.new(input: @relation, num_partitions: num_partitions, shuffle: false)
  build(repartition: merged)
end
350
+
351
# Range-partition by the given columns (rows are range-partitioned on the
# sort order of the columns).
#
# Columns that are not already sort-order expressions are wrapped in an
# ascending, nulls-first sort order.
#
# @overload repartition_by_range(*cols)
# @overload repartition_by_range(num_partitions, *cols)
# @return [DataFrame]
# @raise [IllegalArgumentError] if no column is given. Without a sort order
#   the request is not a range partitioning at all, so fail fast instead of
#   sending an empty expression list.
def repartition_by_range(*args)
  num_partitions = args.first if args.first.is_a?(Integer)
  cols = num_partitions ? args.drop(1) : args
  orders = normalize_columns(cols).map do |c|
    expr = c.to_expr
    next expr if expr.expr_type == :sort_order

    Proto::Expression.new(sort_order: Proto::Expression::SortOrder.new(
      child: expr, direction: :SORT_DIRECTION_ASCENDING, null_ordering: :SORT_NULLS_FIRST
    ))
  end
  raise IllegalArgumentError, "repartition_by_range requires at least one column" if orders.empty?

  rbe = Proto::RepartitionByExpression.new(input: @relation, partition_exprs: orders)
  rbe.num_partitions = num_partitions if num_partitions
  build(repartition_by_expression: rbe)
end
alias repartitionByRange repartition_by_range
374
+
375
+ # ---- Sampling ----------------------------------------------------------
376
+
377
# Random sample of rows.
#
# @param fraction [Float] expected fraction (0.0..1.0).
# @param with_replacement [Boolean]
# @param seed [Integer, nil] sampling seed. When omitted a random seed is
#   chosen once, here on the client, and stored in the plan. Leaving the
#   field unset would let each execution pick its own seed, so repeated
#   actions on the same DataFrame could see different rows.
# @return [DataFrame]
def sample(fraction, with_replacement: false, seed: nil)
  s = Proto::Sample.new(
    input: @relation, lower_bound: 0.0, upper_bound: fraction, with_replacement: with_replacement
  )
  # Non-negative value below 2**63 so it fits a signed 64-bit field.
  s.seed = seed || Random.rand(1 << 63)
  build(sample: s)
end
390
+
391
+ # ---- Misc transforms ---------------------------------------------------
392
+
393
+ # Alias this DataFrame (a subquery alias usable in join conditions).
394
+ # @return [DataFrame]
395
def alias(name)
  # The alias name is stringified before being put on the SubqueryAlias relation.
  aliased = Proto::SubqueryAlias.new(input: @relation, alias: name.to_s)
  build(subquery_alias: aliased)
end
alias as alias
399
+
400
+ # Attach a planner hint (e.g. `"broadcast"`).
401
+ #
402
+ # @param name [String]
403
+ # @param params [Array]
404
+ # @return [DataFrame]
405
def hint(name, *params)
  # Every parameter is coerced with Column.to_col and sent as an expression.
  parameters = params.map { |param| Column.to_col(param).to_expr }
  hinted = Proto::Hint.new(input: @relation, name: name.to_s, parameters: parameters)
  build(hint: hinted)
end
410
+
411
+ # Unpivot (melt) columns from wide to long format.
412
+ #
413
+ # @param ids [Array<Column, String>] identifier columns.
414
+ # @param values [Array<Column, String>, nil] value columns (nil = all others).
415
+ # @param variable_column_name [String]
416
+ # @param value_column_name [String]
417
+ # @return [DataFrame]
418
def unpivot(ids, values, variable_column_name, value_column_name)
  id_exprs = normalize_columns(Array(ids)).map(&:to_expr)
  message = Proto::Unpivot.new(
    input: @relation,
    ids: id_exprs,
    variable_column_name: variable_column_name,
    value_column_name: value_column_name
  )
  # The values field is only assigned when the caller passed a non-nil list.
  unless values.nil?
    value_exprs = normalize_columns(Array(values)).map(&:to_expr)
    message.values = Proto::Unpivot::Values.new(values: value_exprs)
  end
  build(unpivot: message)
end
alias melt unpivot
429
+
430
+ # ---- NA / stat / IO facades -------------------------------------------
431
+
432
+ # @return [DataFrameNaFunctions] missing-data helpers (`drop`, `fill`, `replace`).
433
def na
  # A fresh facade object is created on every call.
  DataFrameNaFunctions.new(self)
end
436
+
437
+ # @return [DataFrameStatFunctions] statistical helpers.
438
def stat
  # A fresh facade object is created on every call.
  DataFrameStatFunctions.new(self)
end
441
+
442
+ # @return [DataFrameWriter] interface for saving this DataFrame.
443
def write
  # A fresh writer bound to this DataFrame is created on every call.
  DataFrameWriter.new(self)
end
446
+
447
+ # @return [DataFrameWriterV2] the v2 (catalog) write interface.
448
def write_to(table)
  # The table argument is passed through to the writer unchanged.
  DataFrameWriterV2.new(self, table)
end
alias writeTo write_to
452
+
453
+ # @return [DataStreamWriter] interface for starting a streaming query from
454
+ # this (streaming) DataFrame.
455
def write_stream
  # A fresh stream writer bound to this DataFrame is created on every call.
  DataStreamWriter.new(self)
end
alias writeStream write_stream
459
+
460
+ # ---- Temporary views ---------------------------------------------------
461
+
462
+ # Register this DataFrame as a session-scoped temporary view, failing if a
463
+ # view of the same name already exists.
464
+ # @return [void]
465
def create_temp_view(name)
  # Non-global view, no replacement.
  flags = { global: false, replace: false }
  register_view(name, **flags)
end
alias createTempView create_temp_view
469
+
470
+ # Register (or replace) this DataFrame as a session-scoped temporary view.
471
+ # @return [void]
472
def create_or_replace_temp_view(name)
  # Non-global view, replacement allowed.
  flags = { global: false, replace: true }
  register_view(name, **flags)
end
alias createOrReplaceTempView create_or_replace_temp_view
476
+
477
+ # Register this DataFrame as a global (cross-session) temporary view.
478
+ # @return [void]
479
def create_global_temp_view(name)
  # Global view, no replacement.
  flags = { global: true, replace: false }
  register_view(name, **flags)
end
alias createGlobalTempView create_global_temp_view
483
+
484
+ # Register (or replace) this DataFrame as a global temporary view.
485
+ # @return [void]
486
def create_or_replace_global_temp_view(name)
  # Global view, replacement allowed.
  flags = { global: true, replace: true }
  register_view(name, **flags)
end
alias createOrReplaceGlobalTempView create_or_replace_global_temp_view
490
+
491
+ # Select columns by a regular expression matched against their names.
492
+ #
493
+ # @param regex [String]
494
+ # @return [Column]
495
def col_regex(regex)
  # The pattern is stringified and carried in an UnresolvedRegex expression.
  pattern = Proto::Expression::UnresolvedRegex.new(col_name: regex.to_s)
  Column.new(Proto::Expression.new(unresolved_regex: pattern))
end
alias colRegex col_regex
499
+
500
+ # @return [DataFrame] a single-column (`value`) DataFrame of each row encoded
501
+ # as a JSON string.
502
def to_json(*_args)
  # Arguments are accepted and ignored; all columns are packed into one struct first.
  all_columns = Functions.struct(Functions.col("*"))
  select(Functions.to_json(all_columns).alias("value"))
end
alias toJSON to_json
506
+
507
+ # Define an event-time watermark for late-data handling on a streaming
508
+ # DataFrame.
509
+ #
510
+ # @param event_time [String] the event-time column name.
511
+ # @param delay_threshold [String] e.g. `"10 minutes"`.
512
+ # @return [DataFrame]
513
def with_watermark(event_time, delay_threshold)
  # Both arguments are stringified before being put on the relation.
  watermark = Proto::WithWatermark.new(
    input: @relation,
    event_time: event_time.to_s,
    delay_threshold: delay_threshold.to_s
  )
  build(with_watermark: watermark)
end
alias withWatermark with_watermark
519
+
520
+ # Apply a function to this DataFrame and return its result, enabling a
521
+ # fluent chain of custom transformations.
522
+ #
523
+ # @yieldparam df [DataFrame] self
524
+ # @return [DataFrame] whatever the block returns
525
def transform
  # The block receives this DataFrame; its return value is passed back untouched.
  yield self
end
528
+
529
+ # Eagerly checkpoint this DataFrame: materialise it server-side and return a
530
+ # new DataFrame backed by the cached result (truncates the logical plan).
531
+ #
532
+ # @param eager [Boolean] materialise immediately.
533
+ # @return [DataFrame]
534
def checkpoint(eager: true)
  # Non-local variant of the shared checkpoint command.
  options = { local: false, eager: eager }
  checkpoint_command(**options)
end
537
+
538
+ # Like {#checkpoint} but uses the executors' local storage (no reliable
539
+ # storage), which is faster but not fault-tolerant.
540
+ #
541
+ # @param eager [Boolean]
542
+ # @return [DataFrame]
543
def local_checkpoint(eager: true)
  # Local variant of the shared checkpoint command.
  options = { local: true, eager: eager }
  checkpoint_command(**options)
end
alias localCheckpoint local_checkpoint
547
+
548
+ # Observe named metrics over this DataFrame.
549
+ #
550
+ # @param name [String, Observation]
551
+ # @param exprs [Array<Column>]
552
+ # @return [DataFrame]
553
def observe(name, *exprs)
  # An Observation supplies its own name and is bound to the resulting DataFrame.
  observation = name if name.is_a?(Observation)
  label = observation ? observation.name : name.to_s
  metrics = exprs.map { |metric| Column.to_col(metric).to_expr }
  observed = build(
    collect_metrics: Proto::CollectMetrics.new(input: @relation, name: label, metrics: metrics)
  )
  observation&.bind(observed)
  observed
end
562
+
563
+ # ---- Schema introspection ---------------------------------------------
564
+
565
+ # @return [Types::StructType] the DataFrame's schema.
566
def schema
  # Memoised in @schema: the analyze call runs at most once per instance.
  return @schema if @schema

  request = Proto::AnalyzePlanRequest::Schema.new(plan: plan)
  @schema = Types.from_proto(analyze(schema: request).schema.schema)
end
569
+
570
+ # @return [Array<String>] column names.
571
def columns
  # Derived from #schema, so the first call may trigger a schema lookup.
  resolved = schema
  resolved.names
end
574
+
575
+ # @return [Array<Array(String, String)>] (name, simpleString-type) pairs.
576
def dtypes
  # One [name, simple type string] pair per schema field, in schema order.
  schema.fields.map do |field|
    [field.name, field.data_type.simple_string]
  end
end
579
+
580
+ # @return [Array<Column>] one {Column} per output column.
581
def column_objects
  # Wrap every column name in a Column via Functions.col.
  columns.map { |column_name| Functions.col(column_name) }
end
584
+
585
+ # Print the schema as an indented tree to `io`.
586
+ # @return [void]
587
def print_schema(io = $stdout)
  # Any object responding to #puts works as the target.
  tree = schema.tree_string
  io.puts(tree)
end
alias printSchema print_schema
591
+
592
# Index into a column by name (`df["id"]`) or position (`df[0]`).
#
# Integer positions are resolved against {#columns} (negative positions
# count from the end); any other key is stringified and used as a name.
#
# @param key [String, Symbol, Integer]
# @return [Column]
# @raise [IndexError] if an Integer position is outside the column list.
#   Previously this passed `nil` on to `Functions.col`.
def [](key)
  return Functions.col(key.to_s) unless key.is_a?(Integer)

  names = columns
  name = names.fetch(key) do
    raise IndexError, "column index #{key} out of range for #{names.size} column(s)"
  end
  Functions.col(name)
end
602
+
603
# Allows `df.column_name` for valid identifier column names.
#
# Names that can never be column accessors are rejected before the schema is
# consulted, so they do not trigger a schema lookup and the DataFrame does not
# pass itself off as array- or string-convertible. That covers Ruby's
# implicit-conversion hooks and names ending in `?`, `!` or `=`.
#
# @return [Column]
# @raise [NoMethodError] if the name is not a column of this DataFrame.
def method_missing(name, *args)
  if args.empty? && column_accessor?(name)
    Functions.col(name.to_s)
  else
    super
  end
end

# Mirrors {#method_missing}. A failing schema lookup is treated as "does not
# respond", so `respond_to?` itself never raises.
def respond_to_missing?(name, include_private = false)
  begin
    column_accessor?(name)
  rescue StandardError
    false
  end || super
end

# Whether `name` could be served as a column accessor. The cheap name checks
# run before `columns`, which may need a schema lookup.
private def column_accessor?(name)
  method_name = name.to_s
  implicit_conversions = %w[to_ary to_str to_hash to_proc to_io to_path to_int to_sym to_regexp to_open]
  return false if implicit_conversions.include?(method_name)
  return false if method_name.end_with?("?", "!", "=")

  columns.include?(method_name)
end
619
+
620
+ # ---- Actions -----------------------------------------------------------
621
+
622
+ # Execute the plan and return all rows.
623
+ # @return [Array<Row>]
624
def collect
  # Execute the plan, then convert the returned Arrow batches into rows.
  batches = @session.client.execute_plan(@relation).arrow_batches
  ArrowConverter.to_rows(batches)
end
alias to_a collect
629
+
630
+ # Iterate over all rows (materialises the result). Returns an Enumerator when
631
+ # no block is given.
632
+ #
633
+ # @yieldparam row [Row]
634
+ # @return [Enumerator, void]
635
def each(&block)
  # The full result is collected first; without a block an Enumerator over it is returned.
  rows = collect
  block ? rows.each(&block) : rows.each
end
640
+
641
+ # @return [Enumerator<Row>] an enumerator over all rows.
642
def to_local_iterator
  # Collects everything first, then exposes the rows through an Enumerator.
  rows = collect
  rows.each
end
alias toLocalIterator to_local_iterator
646
+
647
+ # @return [Array<Row>] the first `n` rows.
648
def take(n)
  # The limit is part of the plan, so only the limited plan is collected.
  limited = limit(n)
  limited.collect
end
651
+
652
+ # @return [Array<Row>, Row] `n` rows (array) or the single first row when called with no arg.
653
def head(n = nil)
  # No argument: a single Row (or nil); with n: an Array of up to n rows.
  n.nil? ? first : take(n)
end
658
+
659
+ # @return [Row, nil] the first row.
660
def first
  # nil when the result has no rows.
  rows = take(1)
  rows.first
end
663
+
664
+ # @return [Integer] the number of rows.
665
def count
  # Aggregate with no grouping expressions computing count(1).
  counter = Column.invoke("count", Column.lit(1)).to_expr
  aggregate = Proto::Aggregate.new(
    input: @relation,
    group_type: :GROUP_TYPE_GROUPBY,
    grouping_expressions: [],
    aggregate_expressions: [counter]
  )
  first_row = build(aggregate: aggregate).collect.first
  # Fall back to 0 if no row came back.
  return 0 unless first_row

  first_row[0]
end
675
+
676
+ # @return [Boolean] whether the DataFrame has no rows.
677
def empty?
  # Fetch at most one row; an empty result means there are no rows.
  probe = limit(1).collect
  probe.empty?
end
alias is_empty empty?
681
+
682
+ # Render the first `n` rows as a formatted table.
683
+ #
684
+ # @param n [Integer]
685
+ # @param truncate [Boolean, Integer] truncate long values to 20 chars (true)
686
+ # or to the given width (Integer).
687
+ # @param vertical [Boolean]
688
+ # @return [void]
689
def show(n = 20, truncate: true, vertical: false)
  # Rendering is done by #show_string; this method only prints to $stdout.
  rendered = show_string(n, truncate: truncate, vertical: vertical)
  $stdout.puts(rendered)
end
692
+
693
+ # @return [String] the formatted table string (what {#show} prints).
694
def show_string(n = 20, truncate: true, vertical: false)
  # true -> 20, false -> 0, anything else -> its #to_i value.
  width =
    if truncate == true
      20
    elsif truncate == false
      0
    else
      truncate.to_i
    end
  request = Proto::ShowString.new(input: @relation, num_rows: n, truncate: width, vertical: vertical)
  first_row = build(show_string: request).collect.first
  # The text is read from the first cell of the first row; no row gives "".
  (first_row && first_row[0]).to_s
end
704
+
705
+ # Materialise the result as an Arrow {Arrow::Table} (columnar).
706
+ # @return [Arrow::Table, nil]
707
def to_arrow
  # Same execution path as #collect, but the batches are turned into a table.
  batches = @session.client.execute_plan(@relation).arrow_batches
  ArrowConverter.to_table(batches)
end
711
+
712
+ # @return [Array<Hash>] all rows as Hashes.
713
def to_h_array
  # Collect all rows and convert each one with Row#to_h.
  collect.map { |row| row.to_h }
end
716
+
717
+ # ---- Explain / metadata ------------------------------------------------
718
+
719
+ # Return the query plan as a string.
720
+ #
721
+ # @param mode [Symbol] `:simple`, `:extended`, `:codegen`, `:cost`, `:formatted`.
722
+ # @return [String]
723
def explain_string(mode = :simple)
  # The enum symbol is derived from the mode name, e.g. :simple -> :EXPLAIN_MODE_SIMPLE.
  enum_name = "EXPLAIN_MODE_#{mode.to_s.upcase}".to_sym
  request = Proto::AnalyzePlanRequest::Explain.new(plan: plan, explain_mode: enum_name)
  analyze(explain: request).explain.explain_string
end
727
+
728
+ # Print the query plan.
729
+ # @return [void]
730
def explain(mode = :simple)
  # Prints whatever #explain_string returns for the given mode.
  text = explain_string(mode)
  $stdout.puts(text)
end
733
+
734
+ # @return [Array<String>] the input files backing this DataFrame.
735
def input_files
  # The repeated field from the response is copied into a plain Array.
  request = Proto::AnalyzePlanRequest::InputFiles.new(plan: plan)
  analyze(input_files: request).input_files.files.to_a
end
738
+
739
+ # @return [Boolean] whether the data is small enough to be local.
740
def local?
  # Answered by an analyze request, not computed locally.
  request = Proto::AnalyzePlanRequest::IsLocal.new(plan: plan)
  analyze(is_local: request).is_local.is_local
end
743
+
744
+ # @return [Boolean] whether this is a streaming DataFrame.
745
def streaming?
  # Answered by an analyze request, not computed locally.
  request = Proto::AnalyzePlanRequest::IsStreaming.new(plan: plan)
  analyze(is_streaming: request).is_streaming.is_streaming
end
748
+
749
+ # @return [Boolean] whether `other` has the same logical plan.
750
def same_semantics?(other)
  # Sends both plans in one analyze request and returns its result field.
  request = Proto::AnalyzePlanRequest::SameSemantics.new(target_plan: plan, other_plan: other.plan)
  analyze(same_semantics: request).same_semantics.result
end
755
+
756
+ # @return [Integer] a hash of the logical plan.
757
def semantic_hash
  # Answered by an analyze request, not computed locally.
  request = Proto::AnalyzePlanRequest::SemanticHash.new(plan: plan)
  analyze(semantic_hash: request).semantic_hash.result
end
760
+
761
+ # @api private - the executable plan rooted at this relation.
762
+ # @return [Spark::Connect::Plan]
763
def plan
  # Not memoised: PlanBuilder.root_plan is called on every invocation.
  root = PlanBuilder.root_plan(@relation)
  root
end
766
+
767
def to_s
  # Constant label; no plan or schema details are included.
  label = "#<SparkConnect::DataFrame>"
  label
end
alias inspect to_s
771
+
772
+ # @api private - build a derived DataFrame from the given relation fields; kept
773
+ # public so the facade classes (GroupedData, NaFunctions, ...) can call it.
774
def build(**rel)
  # The derived DataFrame shares this DataFrame's session.
  derived = PlanBuilder.relation(@session, **rel)
  DataFrame.new(@session, derived)
end
777
+
778
+ private
779
+
780
def analyze(**request)
  # Thin pass-through to the session client's analyze call.
  client = @session.client
  client.analyze(**request)
end
783
+
784
def register_view(name, global:, replace:)
  # Shared implementation behind the four public create_*_view methods.
  view = Proto::CreateDataFrameViewCommand.new(
    input: @relation,
    name: name.to_s,
    is_global: global,
    replace: replace
  )
  command = Proto::Command.new(create_dataframe_view: view)
  @session.client.execute_command(command)
  # The command result is discarded; callers always get nil.
  nil
end
791
+
792
def checkpoint_command(local:, eager:)
  # Shared implementation behind #checkpoint and #local_checkpoint.
  request = Proto::Command.new(
    checkpoint_command: Proto::CheckpointCommand.new(relation: @relation, local: local, eager: eager)
  )
  cached = @session.client.execute_command(request).checkpoint_relation
  unless cached
    raise SparkConnectError, "Server did not return a checkpointed relation"
  end

  # The new DataFrame points at the cached relation by id, under a fresh plan id.
  common = Proto::RelationCommon.new(plan_id: @session.next_plan_id)
  remote = Proto::CachedRemoteRelation.new(relation_id: cached.relation_id)
  DataFrame.new(@session, Proto::Relation.new(common: common, cached_remote_relation: remote))
end
804
+
805
def normalize_columns(cols)
  # Flatten nested arrays; Column objects pass through, the rest become named columns.
  cols.flatten.map do |item|
    next item if item.is_a?(Column)

    Functions.col(item.to_s)
  end
end
808
+
809
def to_sort_order(col)
  expr = col.to_expr
  # An expression that already is a sort order is unwrapped and used as-is.
  return expr.sort_order if expr.expr_type == :sort_order

  # Default ordering for a plain column: ascending, nulls first.
  Proto::Expression::SortOrder.new(
    child: expr,
    direction: :SORT_DIRECTION_ASCENDING,
    null_ordering: :SORT_NULLS_FIRST
  )
end
819
+
820
def set_op(other, type, is_all:, by_name: false, allow_missing_columns: false)
  # Shared implementation behind union / intersect / except and their variants.
  fields = {
    left_input: @relation,
    right_input: other.relation,
    set_op_type: type,
    is_all: is_all,
    by_name: by_name,
    allow_missing_columns: allow_missing_columns
  }
  build(set_op: Proto::SetOperation.new(**fields))
end
827
+ end
828
+ end