spark-connect 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +82 -0
  3. data/LICENSE +202 -0
  4. data/NOTICE +16 -0
  5. data/README.md +166 -0
  6. data/lib/spark-connect.rb +5 -0
  7. data/lib/spark_connect/arrow.rb +115 -0
  8. data/lib/spark_connect/catalog.rb +190 -0
  9. data/lib/spark_connect/channel_builder.rb +134 -0
  10. data/lib/spark_connect/client.rb +264 -0
  11. data/lib/spark_connect/column.rb +379 -0
  12. data/lib/spark_connect/conf.rb +79 -0
  13. data/lib/spark_connect/data_frame.rb +828 -0
  14. data/lib/spark_connect/errors.rb +58 -0
  15. data/lib/spark_connect/functions.rb +903 -0
  16. data/lib/spark_connect/grouped_data.rb +101 -0
  17. data/lib/spark_connect/na_functions.rb +98 -0
  18. data/lib/spark_connect/observation.rb +61 -0
  19. data/lib/spark_connect/pipelines.rb +221 -0
  20. data/lib/spark_connect/plan.rb +39 -0
  21. data/lib/spark_connect/proto/spark/connect/base_pb.rb +118 -0
  22. data/lib/spark_connect/proto/spark/connect/base_services_pb.rb +82 -0
  23. data/lib/spark_connect/proto/spark/connect/catalog_pb.rb +46 -0
  24. data/lib/spark_connect/proto/spark/connect/commands_pb.rb +67 -0
  25. data/lib/spark_connect/proto/spark/connect/common_pb.rb +32 -0
  26. data/lib/spark_connect/proto/spark/connect/expressions_pb.rb +63 -0
  27. data/lib/spark_connect/proto/spark/connect/ml_common_pb.rb +22 -0
  28. data/lib/spark_connect/proto/spark/connect/ml_pb.rb +32 -0
  29. data/lib/spark_connect/proto/spark/connect/pipelines_pb.rb +45 -0
  30. data/lib/spark_connect/proto/spark/connect/relations_pb.rb +102 -0
  31. data/lib/spark_connect/proto/spark/connect/types_pb.rb +46 -0
  32. data/lib/spark_connect/proto.rb +32 -0
  33. data/lib/spark_connect/reader.rb +98 -0
  34. data/lib/spark_connect/row.rb +105 -0
  35. data/lib/spark_connect/session.rb +317 -0
  36. data/lib/spark_connect/stat_functions.rb +109 -0
  37. data/lib/spark_connect/streaming.rb +351 -0
  38. data/lib/spark_connect/types.rb +490 -0
  39. data/lib/spark_connect/version.rb +11 -0
  40. data/lib/spark_connect/window.rb +119 -0
  41. data/lib/spark_connect/writer.rb +208 -0
  42. data/lib/spark_connect.rb +58 -0
  43. data/proto/spark/connect/base.proto +1275 -0
  44. data/proto/spark/connect/catalog.proto +243 -0
  45. data/proto/spark/connect/commands.proto +553 -0
  46. data/proto/spark/connect/common.proto +179 -0
  47. data/proto/spark/connect/expressions.proto +557 -0
  48. data/proto/spark/connect/ml.proto +147 -0
  49. data/proto/spark/connect/ml_common.proto +64 -0
  50. data/proto/spark/connect/pipelines.proto +307 -0
  51. data/proto/spark/connect/relations.proto +1252 -0
  52. data/proto/spark/connect/types.proto +227 -0
  53. metadata +149 -0
@@ -0,0 +1,379 @@
1
+ # frozen_string_literal: true
2
+
3
require "bigdecimal"
require "date"
require "json"
5
+
6
+ module SparkConnect
7
# A column expression: a lazily-evaluated reference to a column or a
# computation over columns. Columns are immutable; operators and methods
# return new {Column}s.
#
# A {Column} wraps a protobuf `Expression`. Build them with
# {SparkConnect::Functions.col}, {SparkConnect::Functions.lit}, by indexing a
# DataFrame (`df["id"]`), or by combining other columns with operators.
#
# Note that `==` and `!=` are overloaded to build expressions, so they return a
# (truthy) {Column} rather than a Boolean; use `equal?` for identity checks.
#
# @example
#   F = SparkConnect::F
#   (F.col("age") + 1).alias("next_age")
#   F.col("name").like("a%") & (F.col("age") >= 18)
class Column
  Proto = SparkConnect::Proto

  # Inclusive bounds of a signed 32-bit integer; anything outside is sent as a long.
  INT32_RANGE = (-2_147_483_648..2_147_483_647).freeze

  # @return [Spark::Connect::Expression] the wrapped protobuf expression.
  attr_reader :expr

  # @param expr [Spark::Connect::Expression]
  def initialize(expr)
    @expr = expr
  end

  # @return [Spark::Connect::Expression]
  def to_expr
    @expr
  end

  class << self
    # Wrap an existing protobuf expression.
    # @return [Column]
    def from_expr(expr)
      new(expr)
    end

    # An unresolved attribute reference by (possibly dotted) name. The special
    # name `"*"` expands to all columns.
    #
    # @param name [String, Symbol]
    # @return [Column]
    def from_name(name)
      # Normalise first so that a Symbol star (`:*`) is treated like "*".
      identifier = name.to_s
      if identifier == "*"
        new(Proto::Expression.new(unresolved_star: Proto::Expression::UnresolvedStar.new))
      else
        new(Proto::Expression.new(
          unresolved_attribute: Proto::Expression::UnresolvedAttribute.new(unparsed_identifier: identifier)
        ))
      end
    end

    # Build a literal column from a Ruby value.
    #
    # @param value [Object] nil, Boolean, Integer, Float, String, Symbol,
    #   Time, Date, BigDecimal, Array, Hash, or an existing {Column}.
    # @return [Column]
    def lit(value)
      return value if value.is_a?(Column)

      new(Proto::Expression.new(literal: to_literal(value)))
    end

    # Build an `UnresolvedFunction` call column.
    #
    # @param name [String] the Spark function name.
    # @param args [Array<Column, Object>] arguments (non-columns become literals).
    # @param is_distinct [Boolean]
    # @return [Column]
    def invoke(name, *args, is_distinct: false)
      new(Proto::Expression.new(
        unresolved_function: Proto::Expression::UnresolvedFunction.new(
          function_name: name.to_s,
          arguments: args.map { |a| to_col(a).to_expr },
          is_distinct: is_distinct
        )
      ))
    end

    # Coerce a value into a {Column} (literals are wrapped).
    # @return [Column]
    def to_col(value)
      value.is_a?(Column) ? value : lit(value)
    end

    # Encode a Ruby value as a protobuf `Expression.Literal`.
    #
    # Integers are sent as `integer` when they fit in 32 bits and as `long`
    # otherwise. Inside an Array or Hash all integers are widened to `long` as
    # soon as one of them needs it, so that every element agrees with the
    # single element type declared for the collection.
    #
    # @param value [Object]
    # @return [Spark::Connect::Expression::Literal]
    # @raise [IllegalArgumentError] if the value's class has no literal encoding.
    def to_literal(value)
      l = Proto::Expression::Literal
      case value
      when nil
        l.new(null: Types.null.to_proto)
      when true, false
        l.new(boolean: value)
      when Integer
        int32?(value) ? l.new(integer: value) : l.new(long: value)
      when Float
        l.new(double: value)
      when BigDecimal
        l.new(decimal: l::Decimal.new(value: value.to_s("F")))
      when Rational
        l.new(double: value.to_f)
      when String
        # Binary-encoded strings are raw bytes, everything else is text.
        value.encoding == Encoding::ASCII_8BIT ? l.new(binary: value) : l.new(string: value)
      when Symbol
        l.new(string: value.to_s)
      when Time
        l.new(timestamp: epoch_micros(value))
      when DateTime
        # DateTime is a Date subclass, so it must be matched before Date.
        l.new(timestamp: epoch_micros(value.to_time))
      when Date
        l.new(date: (value - Date.new(1970, 1, 1)).to_i)
      when Array
        l.new(array: l::Array.new(
          element_type: infer_element_type(value, Types.null).to_proto,
          elements: element_literals(value)
        ))
      when Hash
        l.new(map: l::Map.new(
          key_type: infer_element_type(value.keys, Types.string).to_proto,
          value_type: infer_element_type(value.values, Types.string).to_proto,
          keys: element_literals(value.keys),
          values: element_literals(value.values)
        ))
      else
        raise IllegalArgumentError, "Unsupported literal value of type #{value.class}: #{value.inspect}"
      end
    end

    # Infer the Spark {Types::DataType} for a Ruby value (used when building
    # array/map literals). Mirrors PySpark's literal type inference.
    #
    # Collection element types come from the first non-nil element, except
    # that an all-Integer collection containing a value outside the 32-bit
    # range is typed as long.
    #
    # @param value [Object]
    # @return [Types::DataType]
    # @raise [IllegalArgumentError] if no Spark type corresponds to the value's class.
    def infer_type(value)
      case value
      when nil then Types.null
      when true, false then Types.boolean
      when Integer then int32?(value) ? Types.integer : Types.long
      when Float, Rational then Types.double
      when BigDecimal then Types.decimal(38, 18)
      when String then value.encoding == Encoding::ASCII_8BIT ? Types.binary : Types.string
      when Symbol then Types.string
      when Time, DateTime then Types.timestamp
      when Date then Types.date
      when Array then Types.array(infer_element_type(value, Types.null))
      when Hash
        Types.map(infer_element_type(value.keys, Types.string),
                  infer_element_type(value.values, Types.string))
      else
        raise IllegalArgumentError, "Cannot infer Spark type for #{value.class}"
      end
    end

    private

    # Whether `value` fits in a signed 32-bit integer.
    def int32?(value)
      INT32_RANGE.cover?(value)
    end

    # Microseconds since the Unix epoch, rounded towards negative infinity so
    # that pre-epoch instants with sub-microsecond precision are not shifted
    # forward by truncation.
    def epoch_micros(time)
      (time.to_r * 1_000_000).floor
    end

    # True when the non-nil values are all Integers and at least one of them
    # does not fit in 32 bits, i.e. the collection must be typed as long.
    def wide_integers?(present)
      present.all?(Integer) && present.any? { |v| !int32?(v) }
    end

    # Element type shared by all `values` of a collection literal.
    # `empty_type` is used for an empty collection; a non-empty collection
    # holding only nils is typed as null.
    def infer_element_type(values, empty_type)
      present = values.compact
      return(values.empty? ? empty_type : Types.null) if present.empty?

      wide_integers?(present) ? Types.long : infer_type(present.first)
    end

    # Encode collection elements so that they agree with {#infer_element_type}:
    # when the collection is widened to long, every integer is sent as a long.
    def element_literals(values)
      widen = wide_integers?(values.compact)
      values.map do |v|
        widen && !v.nil? ? Proto::Expression::Literal.new(long: v) : to_literal(v)
      end
    end
  end

  # ---- Arithmetic --------------------------------------------------------
  # Each operator builds a Spark function call; `other` may be a Column or a
  # plain Ruby value (which becomes a literal).
  def +(other) = bin_op("+", other)
  def -(other) = bin_op("-", other)
  def *(other) = bin_op("*", other)
  def /(other) = bin_op("/", other)
  def %(other) = bin_op("%", other)

  # Unary minus.
  # @return [Column]
  def -@
    Column.invoke("negative", self)
  end

  # Unary plus is the identity.
  # @return [Column]
  def +@
    self
  end

  # Raise this column to the power of `other`.
  # @return [Column]
  def **(other) = bin_op("power", other)

  # ---- Comparison --------------------------------------------------------
  def ==(other) = bin_op("==", other)

  # Inequality is expressed as `not(==)`, as PySpark's Connect client does,
  # rather than as a function named "!=".
  # @return [Column]
  def !=(other) = !(self == other)

  def <(other) = bin_op("<", other)
  def <=(other) = bin_op("<=", other)
  def >(other) = bin_op(">", other)
  def >=(other) = bin_op(">=", other)

  # Null-safe equality (`<=>` in Spark SQL): `null <=> null` is true.
  # @return [Column]
  def eq_null_safe(other) = bin_op("<=>", other)

  # ---- Boolean -----------------------------------------------------------
  def &(other) = bin_op("and", other)
  def |(other) = bin_op("or", other)

  # Logical negation.
  # @return [Column]
  def !
    Column.invoke("not", self)
  end
  alias not !

  # ---- Bitwise -----------------------------------------------------------
  def bitwise_and(other) = bin_op("&", other)
  def bitwise_or(other) = bin_op("|", other)
  def bitwise_xor(other) = bin_op("^", other)

  # ---- Null / membership predicates -------------------------------------
  def is_null = Column.invoke("isNull", self)
  def is_not_null = Column.invoke("isNotNull", self)
  def is_nan = Column.invoke("isNaN", self)
  alias isNull is_null
  alias isNotNull is_not_null

  # True if the column's value is in `values`. Accepts either a splat of
  # values or a single Array.
  # @return [Column]
  def isin(*values)
    values = values.first if values.size == 1 && values.first.is_a?(Array)
    Column.invoke("in", self, *values)
  end
  alias in_list isin

  # True if `lower <= self <= upper`.
  # @return [Column]
  def between(lower, upper)
    (self >= lower) & (self <= upper)
  end

  # ---- String predicates -------------------------------------------------
  def like(pattern) = bin_op("like", pattern)
  def rlike(pattern) = bin_op("rlike", pattern)
  def ilike(pattern) = bin_op("ilike", pattern)
  def contains(other) = bin_op("contains", other)
  def startswith(other) = bin_op("startswith", other)
  def endswith(other) = bin_op("endswith", other)

  # Substring of length `len` starting at 1-based position `start`.
  # @return [Column]
  def substr(start, len)
    Column.invoke("substr", self, start, len)
  end

  # ---- Complex-type access ----------------------------------------------
  # Extract an array element by index, a map value by key, or a struct field.
  # @return [Column]
  def [](key)
    get_item(key)
  end

  # Extract a value from this column using `key` as the extraction expression.
  # @param key [Object, Column] index, map key or field name.
  # @return [Column]
  def get_item(key)
    Column.new(Proto::Expression.new(
      unresolved_extract_value: Proto::Expression::UnresolvedExtractValue.new(
        child: @expr, extraction: Column.lit(key).to_expr
      )
    ))
  end

  # Extract a struct field by name.
  # @return [Column]
  def get_field(name)
    get_item(name.to_s)
  end

  # ---- Aliasing / naming -------------------------------------------------
  # Assign one or more output names. With multiple names the expression must
  # produce a struct/multiple columns (e.g. `inline`).
  #
  # @param names [Array<String, Symbol>] one or more names; a single Array of
  #   names is accepted as well.
  # @param metadata [Hash, nil] optional JSON metadata for a single alias.
  # @return [Column]
  # @raise [IllegalArgumentError] if no name is given, or if metadata is
  #   combined with more than one name.
  def alias(*names, metadata: nil)
    labels = names.flatten.map(&:to_s)
    raise IllegalArgumentError, "alias() requires at least one name" if labels.empty?
    if metadata && labels.size > 1
      raise IllegalArgumentError, "metadata can only be provided for a single column alias"
    end

    a = Proto::Expression::Alias.new(expr: @expr, name: labels)
    a.metadata = JSON.generate(metadata) if metadata
    Column.new(Proto::Expression.new(alias: a))
  end
  alias name alias
  alias as alias

  # ---- Casting -----------------------------------------------------------
  # Cast to another type, given either a {Types::DataType} or a DDL type
  # string (e.g. `"int"`, `"decimal(10,2)"`). A Symbol is treated like its
  # String form.
  #
  # @param data_type [Types::DataType, String, Symbol]
  # @return [Column]
  def cast(data_type)
    c = Proto::Expression::Cast.new(expr: @expr)
    case data_type
    when String, Symbol
      c.type_str = data_type.to_s
    else
      c.type = data_type.to_proto
    end
    Column.new(Proto::Expression.new(cast: c))
  end
  alias as_type cast
  alias astype cast

  # ---- Sort ordering -----------------------------------------------------
  # Plain `asc` puts nulls first and plain `desc` puts nulls last.
  def asc = sort_order(:SORT_DIRECTION_ASCENDING, :SORT_NULLS_FIRST)
  def desc = sort_order(:SORT_DIRECTION_DESCENDING, :SORT_NULLS_LAST)
  def asc_nulls_first = sort_order(:SORT_DIRECTION_ASCENDING, :SORT_NULLS_FIRST)
  def asc_nulls_last = sort_order(:SORT_DIRECTION_ASCENDING, :SORT_NULLS_LAST)
  def desc_nulls_first = sort_order(:SORT_DIRECTION_DESCENDING, :SORT_NULLS_FIRST)
  def desc_nulls_last = sort_order(:SORT_DIRECTION_DESCENDING, :SORT_NULLS_LAST)

  # ---- CASE WHEN ---------------------------------------------------------
  # Add a branch to a CASE expression started by {Functions.when}.
  #
  # @return [Column]
  # @raise [IllegalArgumentError] if this column is not a `when` expression or
  #   already has a default value.
  def when(condition, value)
    extend_case_when(
      "when", "when() cannot be applied once otherwise() is applied",
      condition, value
    )
  end

  # Provide the default (ELSE) value for a CASE expression.
  #
  # @return [Column]
  # @raise [IllegalArgumentError] if this column is not a `when` expression or
  #   already has a default value.
  def otherwise(value)
    extend_case_when(
      "otherwise", "otherwise() can only be applied once on a Column previously generated by when()",
      value
    )
  end

  # ---- Windowing ---------------------------------------------------------
  # Define a windowed aggregation / analytic computation over this column.
  #
  # @param window [WindowSpec]
  # @return [Column]
  def over(window)
    w = Proto::Expression::Window.new(
      window_function: @expr,
      partition_spec: window.partition_spec,
      order_spec: window.order_spec
    )
    # The frame is optional; leave the field unset when the spec has none.
    w.frame_spec = window.frame_spec if window.frame_spec
    Column.new(Proto::Expression.new(window: w))
  end

  # Short description naming the kind of wrapped expression.
  # @return [String]
  def to_s
    "Column<#{@expr.expr_type}>"
  end
  alias inspect to_s

  private

  # Call the two-argument Spark function `name` with this column and `other`.
  def bin_op(name, other)
    Column.invoke(name, self, Column.to_col(other))
  end

  # Wrap this column in a `SortOrder` expression.
  def sort_order(direction, null_ordering)
    Column.new(Proto::Expression.new(
      sort_order: Proto::Expression::SortOrder.new(
        child: @expr, direction: direction, null_ordering: null_ordering
      )
    ))
  end

  # Append `extra` values to the argument list of the wrapped `when` call.
  #
  # The arguments are (condition, value) pairs optionally followed by a single
  # default, so an odd count means a default is already present; appending
  # anything after it would silently turn that default into a condition.
  def extend_case_when(caller_name, closed_message, *extra)
    unless @expr.expr_type == :unresolved_function && @expr.unresolved_function.function_name == "when"
      raise IllegalArgumentError, "#{caller_name}() can only be applied on a Column previously generated by when()"
    end

    existing = @expr.unresolved_function.arguments.to_a
    raise IllegalArgumentError, closed_message if existing.size.odd?

    args = existing + extra.map { |v| Column.to_col(v).to_expr }
    Column.new(Proto::Expression.new(
      unresolved_function: Proto::Expression::UnresolvedFunction.new(function_name: "when", arguments: args)
    ))
  end
end
379
+ end
@@ -0,0 +1,79 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SparkConnect
4
# Runtime configuration interface, returned by {SparkSession#conf}. Mirrors
# PySpark's `spark.conf`.
#
# Every method sends one `ConfigRequest` operation through the client's
# `config` method.
#
# @example
#   spark.conf.set("spark.sql.shuffle.partitions", "8")
#   spark.conf.get("spark.sql.shuffle.partitions") #=> "8"
class RuntimeConfig
  Proto = SparkConnect::Proto
  Op = Proto::ConfigRequest::Operation
  CR = Proto::ConfigRequest

  # @param client [SparkConnectClient]
  def initialize(client)
    @client = client
  end

  # Set a configuration property. Both key and value are sent as strings.
  #
  # @param key [String]
  # @param value [String, Integer, Boolean]
  # @return [void]
  def set(key, value)
    pair = Proto::KeyValue.new(key: key.to_s, value: value.to_s)
    @client.config(Op.new(set: CR::Set.new(pairs: [pair])))
    nil
  end

  # Get the value of a configuration property.
  #
  # @param key [String]
  # @param default [String, nil] returned when the key is unset (when given).
  #   Non-nil defaults are converted with `to_s`, like values passed to {#set}.
  # @return [String, nil] nil when the response carries no value for the key.
  def get(key, default = :__unset__)
    op =
      # Identity comparison: `==` would dispatch to the argument's own
      # (possibly overloaded) operator.
      if :__unset__.equal?(default)
        Op.new(get: CR::Get.new(keys: [key.to_s]))
      else
        pair = Proto::KeyValue.new(key: key.to_s)
        # A nil default is expressed by leaving the optional value unset.
        pair.value = default.to_s unless default.nil?
        Op.new(get_with_default: CR::GetWithDefault.new(pairs: [pair]))
      end
    found = @client.config(op).pairs.first
    # The getter of an unset optional string yields "", so consult presence
    # to tell "no value" apart from an empty string.
    found.value if found&.has_value?
  end

  # Unset a configuration property.
  #
  # @param key [String]
  # @return [void]
  def unset(key)
    @client.config(Op.new(unset: CR::Unset.new(keys: [key.to_s])))
    nil
  end

  # All configuration properties (optionally filtered by `prefix`).
  #
  # @param prefix [String, nil]
  # @return [Hash{String=>String}]
  def get_all(prefix = nil)
    request = CR::GetAll.new
    request.prefix = prefix if prefix
    resp = @client.config(Op.new(get_all: request))
    resp.pairs.to_h { |pair| [pair.key, pair.value] }
  end

  # Whether a configuration property is modifiable in the current session.
  #
  # @param key [String]
  # @return [Boolean] true only when the first returned pair's value is "true".
  def modifiable?(key)
    resp = @client.config(Op.new(is_modifiable: CR::IsModifiable.new(keys: [key.to_s])))
    resp.pairs.first&.value == "true"
  end
end
79
+ end