spark-connect 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +82 -0
  3. data/LICENSE +202 -0
  4. data/NOTICE +16 -0
  5. data/README.md +166 -0
  6. data/lib/spark-connect.rb +5 -0
  7. data/lib/spark_connect/arrow.rb +115 -0
  8. data/lib/spark_connect/catalog.rb +190 -0
  9. data/lib/spark_connect/channel_builder.rb +134 -0
  10. data/lib/spark_connect/client.rb +264 -0
  11. data/lib/spark_connect/column.rb +379 -0
  12. data/lib/spark_connect/conf.rb +79 -0
  13. data/lib/spark_connect/data_frame.rb +828 -0
  14. data/lib/spark_connect/errors.rb +58 -0
  15. data/lib/spark_connect/functions.rb +903 -0
  16. data/lib/spark_connect/grouped_data.rb +101 -0
  17. data/lib/spark_connect/na_functions.rb +98 -0
  18. data/lib/spark_connect/observation.rb +61 -0
  19. data/lib/spark_connect/pipelines.rb +221 -0
  20. data/lib/spark_connect/plan.rb +39 -0
  21. data/lib/spark_connect/proto/spark/connect/base_pb.rb +118 -0
  22. data/lib/spark_connect/proto/spark/connect/base_services_pb.rb +82 -0
  23. data/lib/spark_connect/proto/spark/connect/catalog_pb.rb +46 -0
  24. data/lib/spark_connect/proto/spark/connect/commands_pb.rb +67 -0
  25. data/lib/spark_connect/proto/spark/connect/common_pb.rb +32 -0
  26. data/lib/spark_connect/proto/spark/connect/expressions_pb.rb +63 -0
  27. data/lib/spark_connect/proto/spark/connect/ml_common_pb.rb +22 -0
  28. data/lib/spark_connect/proto/spark/connect/ml_pb.rb +32 -0
  29. data/lib/spark_connect/proto/spark/connect/pipelines_pb.rb +45 -0
  30. data/lib/spark_connect/proto/spark/connect/relations_pb.rb +102 -0
  31. data/lib/spark_connect/proto/spark/connect/types_pb.rb +46 -0
  32. data/lib/spark_connect/proto.rb +32 -0
  33. data/lib/spark_connect/reader.rb +98 -0
  34. data/lib/spark_connect/row.rb +105 -0
  35. data/lib/spark_connect/session.rb +317 -0
  36. data/lib/spark_connect/stat_functions.rb +109 -0
  37. data/lib/spark_connect/streaming.rb +351 -0
  38. data/lib/spark_connect/types.rb +490 -0
  39. data/lib/spark_connect/version.rb +11 -0
  40. data/lib/spark_connect/window.rb +119 -0
  41. data/lib/spark_connect/writer.rb +208 -0
  42. data/lib/spark_connect.rb +58 -0
  43. data/proto/spark/connect/base.proto +1275 -0
  44. data/proto/spark/connect/catalog.proto +243 -0
  45. data/proto/spark/connect/commands.proto +553 -0
  46. data/proto/spark/connect/common.proto +179 -0
  47. data/proto/spark/connect/expressions.proto +557 -0
  48. data/proto/spark/connect/ml.proto +147 -0
  49. data/proto/spark/connect/ml_common.proto +64 -0
  50. data/proto/spark/connect/pipelines.proto +307 -0
  51. data/proto/spark/connect/relations.proto +1252 -0
  52. data/proto/spark/connect/types.proto +227 -0
  53. metadata +149 -0
@@ -0,0 +1,490 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module SparkConnect
6
+ # The Spark SQL type system.
7
+ #
8
+ # Every Spark data type is represented by an instance of a {DataType}
9
+ # subclass. Types convert to and from the protobuf `DataType` message via
10
+ # {DataType#to_proto} and {Types.from_proto}, and render a Spark-compatible
11
+ # `simpleString` (e.g. `"array<int>"`) and `typeName` (e.g. `"integer"`).
12
+ #
13
+ # @example
14
+ # SparkConnect::Types::IntegerType.new.simple_string #=> "int"
15
+ # SparkConnect::Types.array(SparkConnect::Types::StringType.new).simple_string
16
+ # #=> "array<string>"
17
+ module Types
18
+ Proto = SparkConnect::Proto
19
+
20
# Abstract base class for all Spark data types.
#
# Subclasses provide {#to_proto} and usually override {#type_name} and
# {#simple_string}. Parameterised subclasses override {#==}; {#eql?} is
# routed through `==` at call time so those overrides are also honoured when
# a type is used as a Hash key or a Set member.
class DataType
  # @return [String] the Spark `simpleString` representation, e.g. `"int"`,
  #   `"array<string>"`, `"struct<a:int>"`. Defaults to {#type_name}.
  def simple_string
    type_name
  end

  # The default is derived from the class name: the namespace and a trailing
  # `Type` are dropped and CamelCase is converted to snake_case.
  #
  # @return [String] the short type name used by Spark's JSON schema, e.g.
  #   `"integer"`, `"long"`, `"string"`.
  def type_name
    base = self.class.name.split("::").last.sub(/Type$/, "")
    base.gsub(/([a-z\d])([A-Z])/, '\1_\2').downcase
  end

  # @return [String, Hash] the Spark JSON schema fragment for this type.
  def json_value
    type_name
  end

  # @return [String] the JSON schema string for this type.
  def json
    JSON.generate(json_value)
  end

  # @return [Spark::Connect::DataType] the protobuf representation.
  # @raise [NotImplementedError] unless a subclass overrides it.
  def to_proto
    raise NotImplementedError, "#{self.class}#to_proto is not implemented"
  end

  # Two parameterless types are equal when they are of exactly the same class.
  #
  # @param other [Object]
  # @return [Boolean]
  def ==(other)
    other.class == self.class
  end

  # Delegates to `==` dynamically. An `alias` would freeze the base
  # implementation and silently ignore `==` overrides in subclasses.
  #
  # @param other [Object]
  # @return [Boolean]
  def eql?(other)
    self == other
  end

  # Class-level hash: equal types always share a class, so this stays
  # consistent with every `==` override.
  #
  # @return [Integer]
  def hash
    self.class.hash
  end

  # @return [String] same as {#simple_string}.
  def to_s
    simple_string
  end

  # @return [String] e.g. `#<SparkConnect::Types::IntegerType int>`.
  def inspect
    "#<#{self.class.name} #{simple_string}>"
  end
end
67
+
68
# Builds a `DataType` proto from a single `kind: message` keyword pair.
#
# @param kind_fields [Hash] keyword arguments forwarded to `Proto::DataType.new`.
# @return [Spark::Connect::DataType]
def self.wrap(**kind_fields) = Proto::DataType.new(**kind_fields)
72
+
73
# The null type; rendered as `"void"`.
class NullType < DataType
  def type_name
    "void"
  end

  def simple_string
    "void"
  end

  def to_proto
    Types.wrap(null: Proto::DataType::NULL.new)
  end
end
78
+
79
# The boolean type.
class BooleanType < DataType
  def type_name
    "boolean"
  end

  def to_proto
    Types.wrap(boolean: Proto::DataType::Boolean.new)
  end
end
83
+
84
# The byte type; `simple_string` is `"tinyint"`.
class ByteType < DataType
  def type_name
    "byte"
  end

  def simple_string
    "tinyint"
  end

  def to_proto
    Types.wrap(byte: Proto::DataType::Byte.new)
  end
end
89
+
90
# The short type; `simple_string` is `"smallint"`.
class ShortType < DataType
  def type_name
    "short"
  end

  def simple_string
    "smallint"
  end

  def to_proto
    Types.wrap(short: Proto::DataType::Short.new)
  end
end
95
+
96
# The integer type; `simple_string` is `"int"`.
class IntegerType < DataType
  def type_name
    "integer"
  end

  def simple_string
    "int"
  end

  def to_proto
    Types.wrap(integer: Proto::DataType::Integer.new)
  end
end
101
+
102
# The long type; `simple_string` is `"bigint"`.
class LongType < DataType
  def type_name
    "long"
  end

  def simple_string
    "bigint"
  end

  def to_proto
    Types.wrap(long: Proto::DataType::Long.new)
  end
end
107
+
108
# The float type.
class FloatType < DataType
  def type_name
    "float"
  end

  def to_proto
    Types.wrap(float: Proto::DataType::Float.new)
  end
end
112
+
113
# The double type.
class DoubleType < DataType
  def type_name
    "double"
  end

  def to_proto
    Types.wrap(double: Proto::DataType::Double.new)
  end
end
117
+
118
# The string type, optionally carrying a collation name.
#
# Two string types are equal only when their collations match.
class StringType < DataType
  # @return [String] the collation name (default `"UTF8_BINARY"`).
  attr_reader :collation

  # @param collation [String] the collation name.
  def initialize(collation = "UTF8_BINARY")
    super()
    @collation = collation
  end

  def type_name = "string"
  def to_proto = Types.wrap(string: Proto::DataType::String.new(collation: @collation))

  # Without this override the inherited class-only comparison would treat
  # strings with different collations as the same type.
  #
  # @param other [Object]
  # @return [Boolean]
  def ==(other) = other.is_a?(StringType) && other.collation == collation
  alias eql? ==

  # @return [Integer] hash consistent with {#==}.
  def hash = [StringType, collation].hash
end
130
+
131
# The binary type.
class BinaryType < DataType
  def type_name
    "binary"
  end

  def to_proto
    Types.wrap(binary: Proto::DataType::Binary.new)
  end
end
135
+
136
# The date type.
class DateType < DataType
  def type_name
    "date"
  end

  def to_proto
    Types.wrap(date: Proto::DataType::Date.new)
  end
end
140
+
141
# The timestamp type.
class TimestampType < DataType
  def type_name
    "timestamp"
  end

  def to_proto
    Types.wrap(timestamp: Proto::DataType::Timestamp.new)
  end
end
145
+
146
# The `timestamp_ntz` type. The name is spelled out explicitly because the
# default CamelCase conversion would not produce `"timestamp_ntz"`.
class TimestampNTZType < DataType
  def type_name
    "timestamp_ntz"
  end

  def simple_string
    "timestamp_ntz"
  end

  def to_proto
    Types.wrap(timestamp_ntz: Proto::DataType::TimestampNTZ.new)
  end
end
151
+
152
# The variant type.
class VariantType < DataType
  def type_name
    "variant"
  end

  def to_proto
    Types.wrap(variant: Proto::DataType::Variant.new)
  end
end
156
+
157
# A fixed-precision decimal type, rendered as `decimal(precision,scale)`.
class DecimalType < DataType
  # @return [Integer] total number of digits (max 38).
  attr_reader :precision
  # @return [Integer] number of digits to the right of the decimal point.
  attr_reader :scale

  # @param precision [Integer]
  # @param scale [Integer]
  def initialize(precision = 10, scale = 0)
    super()
    @precision = precision
    @scale = scale
  end

  def simple_string = "decimal(#{precision},#{scale})"
  def type_name = "decimal"
  def json_value = "decimal(#{precision},#{scale})"
  def to_proto = Types.wrap(decimal: Proto::DataType::Decimal.new(precision: precision, scale: scale))

  # @param other [Object]
  # @return [Boolean] true when `other` is a decimal with the same precision and scale.
  def ==(other)
    other.is_a?(DecimalType) && other.precision == precision && other.scale == scale
  end
  # Keep `eql?`/`hash` in step with `==`; otherwise decimals of different
  # precision or scale would be treated as the same Hash key.
  alias eql? ==

  # @return [Integer] hash consistent with {#==}.
  def hash = [DecimalType, precision, scale].hash
end
178
+
179
# A fixed-length character type, rendered as `char(length)`.
class CharType < DataType
  # @return [Integer] the declared length.
  attr_reader :length

  # @param length [Integer]
  def initialize(length)
    super()
    @length = length
  end

  def simple_string = "char(#{length})"
  def type_name = "char"
  def json_value = "char(#{length})"
  def to_proto = Types.wrap(char: Proto::DataType::Char.new(length: length))

  # @param other [Object]
  # @return [Boolean] true when `other` is a char type of the same length.
  def ==(other) = other.is_a?(CharType) && other.length == length
  # Keep `eql?`/`hash` in step with `==` so lengths are respected in Hash keys.
  alias eql? ==

  # @return [Integer] hash consistent with {#==}.
  def hash = [CharType, length].hash
end
194
+
195
# A variable-length character type, rendered as `varchar(length)`.
class VarcharType < DataType
  # @return [Integer] the declared maximum length.
  attr_reader :length

  # @param length [Integer]
  def initialize(length)
    super()
    @length = length
  end

  def simple_string = "varchar(#{length})"
  def type_name = "varchar"
  def json_value = "varchar(#{length})"
  def to_proto = Types.wrap(var_char: Proto::DataType::VarChar.new(length: length))

  # @param other [Object]
  # @return [Boolean] true when `other` is a varchar type of the same length.
  def ==(other) = other.is_a?(VarcharType) && other.length == length
  # Keep `eql?`/`hash` in step with `==` so lengths are respected in Hash keys.
  alias eql? ==

  # @return [Integer] hash consistent with {#==}.
  def hash = [VarcharType, length].hash
end
210
+
211
# A day-time interval bounded by a start and an end field
# (`DAY`, `HOUR`, `MINUTE` or `SECOND`).
class DayTimeIntervalType < DataType
  DAY = 0
  HOUR = 1
  MINUTE = 2
  SECOND = 3

  # Field index => name used when rendering the type.
  FIELD_NAMES = { DAY => "day", HOUR => "hour", MINUTE => "minute", SECOND => "second" }.freeze

  # @return [Integer] one of {DAY}, {HOUR}, {MINUTE}, {SECOND}.
  attr_reader :start_field, :end_field

  # @param start_field [Integer]
  # @param end_field [Integer]
  def initialize(start_field = DAY, end_field = SECOND)
    super()
    @start_field = start_field
    @end_field = end_field
  end

  # Renders the actual field range, e.g. `"interval day to second"` or
  # `"interval hour"` when both fields coincide.
  #
  # @return [String]
  def simple_string
    from = field_name(start_field)
    to = field_name(end_field)
    from == to ? "interval #{from}" : "interval #{from} to #{to}"
  end

  def type_name = "interval"

  # The JSON fragment carries the field range; a bare `"interval"` would not
  # identify which interval type (or which fields) is meant.
  #
  # @return [String]
  def json_value = simple_string

  def to_proto
    Types.wrap(day_time_interval: Proto::DataType::DayTimeInterval.new(start_field: start_field, end_field: end_field))
  end

  # @param other [Object]
  # @return [Boolean] true when `other` is a day-time interval over the same fields.
  def ==(other)
    other.is_a?(DayTimeIntervalType) && other.start_field == start_field && other.end_field == end_field
  end
  alias eql? ==

  # @return [Integer] hash consistent with {#==}.
  def hash = [DayTimeIntervalType, start_field, end_field].hash

  private

  # Unknown indices are rendered verbatim rather than raising, because the
  # constructor has never validated its arguments.
  def field_name(index) = FIELD_NAMES.fetch(index) { index.to_s }
end
231
+
232
# A year-month interval bounded by a start and an end field (`YEAR` or `MONTH`).
class YearMonthIntervalType < DataType
  YEAR = 0
  MONTH = 1

  # Field index => name used when rendering the type.
  FIELD_NAMES = { YEAR => "year", MONTH => "month" }.freeze

  # @return [Integer] one of {YEAR}, {MONTH}.
  attr_reader :start_field, :end_field

  # @param start_field [Integer]
  # @param end_field [Integer]
  def initialize(start_field = YEAR, end_field = MONTH)
    super()
    @start_field = start_field
    @end_field = end_field
  end

  # Renders the actual field range, e.g. `"interval year to month"` or
  # `"interval month"` when both fields coincide.
  #
  # @return [String]
  def simple_string
    from = field_name(start_field)
    to = field_name(end_field)
    from == to ? "interval #{from}" : "interval #{from} to #{to}"
  end

  def type_name = "interval"

  # The JSON fragment carries the field range; a bare `"interval"` would not
  # identify which interval type (or which fields) is meant.
  #
  # @return [String]
  def json_value = simple_string

  def to_proto
    Types.wrap(year_month_interval: Proto::DataType::YearMonthInterval.new(start_field: start_field, end_field: end_field))
  end

  # @param other [Object]
  # @return [Boolean] true when `other` is a year-month interval over the same fields.
  def ==(other)
    other.is_a?(YearMonthIntervalType) && other.start_field == start_field && other.end_field == end_field
  end
  alias eql? ==

  # @return [Integer] hash consistent with {#==}.
  def hash = [YearMonthIntervalType, start_field, end_field].hash

  private

  # Unknown indices are rendered verbatim rather than raising, because the
  # constructor has never validated its arguments.
  def field_name(index) = FIELD_NAMES.fetch(index) { index.to_s }
end
250
+
251
# The calendar interval type; `simple_string` is `"interval"`.
class CalendarIntervalType < DataType
  def type_name
    "calendar_interval"
  end

  def simple_string
    "interval"
  end

  def to_proto
    Types.wrap(calendar_interval: Proto::DataType::CalendarInterval.new)
  end
end
256
+
257
# An array type. `element_type` is the type of every element; `contains_null`
# indicates whether the array may contain `null` values.
class ArrayType < DataType
  # @return [DataType] the element type.
  attr_reader :element_type
  # @return [Boolean] whether elements may be null.
  attr_reader :contains_null

  # @param element_type [DataType]
  # @param contains_null [Boolean]
  def initialize(element_type, contains_null: true)
    super()
    @element_type = element_type
    @contains_null = contains_null
  end

  def simple_string = "array<#{element_type.simple_string}>"
  def type_name = "array"

  # @return [Hash] the Spark JSON schema fragment.
  def json_value
    { "type" => "array", "elementType" => element_type.json_value, "containsNull" => contains_null }
  end

  def to_proto
    Types.wrap(array: Proto::DataType::Array.new(element_type: element_type.to_proto, contains_null: contains_null))
  end

  # @param other [Object]
  # @return [Boolean] true when element type and nullability both match.
  def ==(other)
    other.is_a?(ArrayType) && other.element_type == element_type && other.contains_null == contains_null
  end
  # Keep `eql?`/`hash` in step with `==`; otherwise arrays of different
  # element types would be treated as the same Hash key.
  alias eql? ==

  # @return [Integer] hash consistent with {#==}.
  def hash = [ArrayType, element_type, contains_null].hash
end
283
+
284
# A map type with key and value element types.
class MapType < DataType
  # @return [DataType] the key type.
  attr_reader :key_type
  # @return [DataType] the value type.
  attr_reader :value_type
  # @return [Boolean] whether values may be null.
  attr_reader :value_contains_null

  # @param key_type [DataType]
  # @param value_type [DataType]
  # @param value_contains_null [Boolean]
  def initialize(key_type, value_type, value_contains_null: true)
    super()
    @key_type = key_type
    @value_type = value_type
    @value_contains_null = value_contains_null
  end

  def simple_string = "map<#{key_type.simple_string},#{value_type.simple_string}>"
  def type_name = "map"

  # @return [Hash] the Spark JSON schema fragment.
  def json_value
    {
      "type" => "map",
      "keyType" => key_type.json_value,
      "valueType" => value_type.json_value,
      "valueContainsNull" => value_contains_null,
    }
  end

  def to_proto
    Types.wrap(map: Proto::DataType::Map.new(
      key_type: key_type.to_proto,
      value_type: value_type.to_proto,
      value_contains_null: value_contains_null
    ))
  end

  # @param other [Object]
  # @return [Boolean] true when key type, value type and value nullability all match.
  def ==(other)
    other.is_a?(MapType) && other.key_type == key_type &&
      other.value_type == value_type && other.value_contains_null == value_contains_null
  end
  # Keep `eql?`/`hash` in step with `==`; otherwise maps of different
  # key/value types would be treated as the same Hash key.
  alias eql? ==

  # @return [Integer] hash consistent with {#==}.
  def hash = [MapType, key_type, value_type, value_contains_null].hash
end
320
+
321
# A single field within a {StructType}.
#
# Equality (`==`, `eql?`, `hash`) is based on name, data type and
# nullability; `metadata` does not take part in it.
class StructField
  # @return [String] the field name (always stored as a String).
  attr_reader :name
  # @return [DataType] the field's type.
  attr_reader :data_type
  # @return [Boolean] whether the field may be null.
  attr_reader :nullable
  # @return [Hash, nil] optional metadata attached to the field.
  attr_reader :metadata

  # @param name [String, Symbol] converted with `to_s`.
  # @param data_type [DataType]
  # @param nullable [Boolean]
  # @param metadata [Hash, nil]
  def initialize(name, data_type, nullable: true, metadata: nil)
    @name = name.to_s
    @data_type = data_type
    @nullable = nullable
    @metadata = metadata
  end

  # @return [String] `"name:type"`, as used inside `struct<...>`.
  def simple_string = "#{name}:#{data_type.simple_string}"

  # @return [Hash] the Spark JSON schema fragment; `"metadata"` is included
  #   only when metadata is set.
  def json_value
    h = { "name" => name, "type" => data_type.json_value, "nullable" => nullable }
    h["metadata"] = metadata if metadata
    h
  end

  # @return [Spark::Connect::DataType::StructField] the protobuf
  #   representation; metadata is serialised as a JSON string when present.
  def to_proto
    Proto::DataType::StructField.new(
      name: name,
      data_type: data_type.to_proto,
      nullable: nullable,
      metadata: metadata ? JSON.generate(metadata) : nil
    )
  end

  # @param other [Object]
  # @return [Boolean] true when name, data type and nullability match.
  def ==(other)
    other.is_a?(StructField) && other.name == name &&
      other.data_type == data_type && other.nullable == nullable
  end
  # Without a matching `eql?`/`hash`, equal fields would still be distinct
  # for `uniq`, Hash keys and Set membership.
  alias eql? ==

  # @return [Integer] hash consistent with {#==}.
  def hash = [StructField, name, data_type, nullable].hash
end
354
+
355
# A struct (row) type: an ordered collection of {StructField}s. This is the
# type of every {DataFrame}'s schema.
class StructType < DataType
  include Enumerable

  # @return [Array<StructField>] the fields in declaration order.
  attr_reader :fields

  # @param fields [Array<StructField>] initial fields. The list is copied so
  #   that {#add} never mutates an array owned by the caller.
  def initialize(fields = [])
    super()
    @fields = Array(fields).dup
  end

  # Append a field and return self (chainable builder).
  #
  # @param name [String, StructField] a ready-made field, or the name of a
  #   field to build from the remaining arguments.
  # @param data_type [DataType, nil] ignored when `name` is a {StructField}.
  # @param nullable [Boolean]
  # @param metadata [Hash, nil]
  # @return [StructType]
  def add(name, data_type = nil, nullable: true, metadata: nil)
    @fields << if name.is_a?(StructField)
                 name
               else
                 StructField.new(name, data_type, nullable: nullable, metadata: metadata)
               end
    self
  end

  # Yields each field in order (backs the Enumerable methods).
  def each(&) = fields.each(&)

  # @param key [Integer, String, Symbol] a position, or a field name.
  # @return [StructField, nil]
  def [](key) = key.is_a?(Integer) ? fields[key] : fields.find { |f| f.name == key.to_s }

  # @return [Array<String>] the field names in order.
  def names = fields.map(&:name)

  # @return [Integer] the number of fields.
  def length = fields.length
  alias size length

  def simple_string = "struct<#{fields.map(&:simple_string).join(',')}>"
  def type_name = "struct"

  # @return [Hash] the Spark JSON schema fragment.
  def json_value
    { "type" => "struct", "fields" => fields.map(&:json_value) }
  end

  def to_proto
    Types.wrap(struct: Proto::DataType::Struct.new(fields: fields.map(&:to_proto)))
  end

  # A human-readable, indented tree (used by {DataFrame#print_schema}).
  #
  # @return [String]
  def tree_string
    lines = ["root"]
    fields.each { |f| append_tree(lines, f, " |") }
    "#{lines.join("\n")}\n"
  end

  # @param other [Object]
  # @return [Boolean] true when `other` is a struct with equal fields in the same order.
  def ==(other) = other.is_a?(StructType) && other.fields == fields
  # Keep `eql?`/`hash` in step with `==` so structs behave as Hash keys.
  alias eql? ==

  # Hashes the same attributes StructField#== compares, so equal structs
  # always hash alike regardless of how StructField itself hashes.
  #
  # @return [Integer]
  def hash = [StructType, fields.map { |f| [f.name, f.data_type, f.nullable] }].hash

  private

  # Appends the line for `field` and, for struct/array/map types, the lines
  # describing its children.
  def append_tree(lines, field, prefix)
    dt = field.data_type
    lines << "#{prefix}-- #{field.name}: #{dt.type_name} (nullable = #{field.nullable})"
    case dt
    when StructType
      dt.fields.each { |f| append_tree(lines, f, "#{prefix} |") }
    when ArrayType
      lines << "#{prefix} |-- element: #{dt.element_type.type_name} (containsNull = #{dt.contains_null})"
      dt.element_type.fields.each { |f| append_tree(lines, f, "#{prefix} | |") } if dt.element_type.is_a?(StructType)
    when MapType
      lines << "#{prefix} |-- key: #{dt.key_type.type_name}"
      lines << "#{prefix} |-- value: #{dt.value_type.type_name} (valueContainsNull = #{dt.value_contains_null})"
    end
  end
end
425
+
426
+ # ---- Convenience constructors -----------------------------------------
427
+
428
+ module_function
429
+
430
# @return [NullType]
def null
  NullType.new
end

# @return [BooleanType]
def boolean
  BooleanType.new
end

# @return [ByteType]
def byte
  ByteType.new
end

# @return [ShortType]
def short
  ShortType.new
end

# @return [IntegerType]
def integer
  IntegerType.new
end

# @return [LongType]
def long
  LongType.new
end

# @return [FloatType]
def float
  FloatType.new
end

# @return [DoubleType]
def double
  DoubleType.new
end

# @return [StringType]
def string
  StringType.new
end

# @return [BinaryType]
def binary
  BinaryType.new
end

# @return [DateType]
def date
  DateType.new
end

# @return [TimestampType]
def timestamp
  TimestampType.new
end

# @return [TimestampNTZType]
def timestamp_ntz
  TimestampNTZType.new
end

# @return [VariantType]
def variant
  VariantType.new
end

# @return [DecimalType]
def decimal(precision = 10, scale = 0)
  DecimalType.new(precision, scale)
end

# @return [ArrayType]
def array(element_type, contains_null: true)
  ArrayType.new(element_type, contains_null:)
end

# @return [MapType]
def map(key_type, value_type, value_contains_null: true)
  MapType.new(key_type, value_type, value_contains_null:)
end

# Accepts fields individually or as (nested) arrays; they are flattened.
# @return [StructType]
def struct(*fields)
  StructType.new(fields.flatten)
end

# @return [StructField]
def field(name, data_type, nullable: true, metadata: nil)
  StructField.new(name, data_type, nullable:, metadata:)
end
449
+
450
# Convert a protobuf `DataType` message into a {DataType} instance.
#
# Nested element, key, value and field types are converted recursively.
#
# @param proto [Spark::Connect::DataType]
# @return [DataType]
# @raise [IllegalArgumentError] when no kind is set or the kind is not supported.
def from_proto(proto)
  kind = proto.kind
  # An empty oneof reports nil; fail clearly instead of in public_send(nil).
  raise IllegalArgumentError, "Unsupported proto DataType kind: #{kind.inspect}" if kind.nil?

  sub = proto.public_send(kind)

  # Reads an optional scalar field. Getters return the zero value (which is
  # truthy in Ruby) when the field is unset, so `value || default` can never
  # fall back; consult the `has_<field>?` presence predicate when available.
  optional = lambda do |msg, field, default|
    presence = :"has_#{field}?"
    next default if msg.respond_to?(presence) && !msg.public_send(presence)

    value = msg.public_send(field)
    value.nil? ? default : value
  end

  case kind
  when :null then NullType.new
  when :boolean then BooleanType.new
  when :byte then ByteType.new
  when :short then ShortType.new
  when :integer then IntegerType.new
  when :long then LongType.new
  when :float then FloatType.new
  when :double then DoubleType.new
  when :string then StringType.new(sub.collation.empty? ? "UTF8_BINARY" : sub.collation)
  when :binary then BinaryType.new
  when :date then DateType.new
  when :timestamp then TimestampType.new
  when :timestamp_ntz then TimestampNTZType.new
  when :variant then VariantType.new
  when :calendar_interval then CalendarIntervalType.new
  when :day_time_interval
    DayTimeIntervalType.new(optional.call(sub, :start_field, 0), optional.call(sub, :end_field, 3))
  when :year_month_interval
    YearMonthIntervalType.new(optional.call(sub, :start_field, 0), optional.call(sub, :end_field, 1))
  when :decimal
    DecimalType.new(optional.call(sub, :precision, 10), optional.call(sub, :scale, 0))
  when :char then CharType.new(sub.length)
  when :var_char then VarcharType.new(sub.length)
  when :array then ArrayType.new(from_proto(sub.element_type), contains_null: sub.contains_null)
  when :map
    MapType.new(from_proto(sub.key_type), from_proto(sub.value_type), value_contains_null: sub.value_contains_null)
  when :struct
    StructType.new(sub.fields.map do |f|
      # Metadata travels as a JSON string; an empty string means "none".
      StructField.new(f.name, from_proto(f.data_type), nullable: f.nullable,
                      metadata: (f.metadata && !f.metadata.empty? ? JSON.parse(f.metadata) : nil))
    end)
  else
    raise IllegalArgumentError, "Unsupported proto DataType kind: #{kind}"
  end
end
489
+ end
490
+ end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
module SparkConnect
  # The released version of the spark-connect gem.
  VERSION = "0.2.0"

  # The Apache Spark version whose Spark Connect protocol definitions this
  # client is generated against. The client aims to be wire-compatible with
  # Spark Connect servers of this major/minor line and newer.
  SPARK_VERSION = "4.1.0"
end
@@ -0,0 +1,119 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SparkConnect
4
# Defines the partitioning, ordering, and frame for a window aggregation.
# Build one with the {Window} factory and attach it to an analytic column via
# {Column#over}.
#
# Every builder method returns a new {WindowSpec}; the receiver is left
# unchanged.
#
# @example
#   w = SparkConnect::Window.partition_by("dept").order_by(F.col("salary").desc)
#   df.with_column("rank", F.rank.over(w))
class WindowSpec
  Proto = SparkConnect::Proto

  # Boundary sentinels (matching Spark's `Window.unboundedPreceding`, etc.).
  UNBOUNDED_PRECEDING = -(2**63)
  UNBOUNDED_FOLLOWING = (2**63) - 1
  CURRENT_ROW = 0

  # @return [Array<Spark::Connect::Expression>]
  attr_reader :partition_spec
  # @return [Array<Spark::Connect::Expression::SortOrder>]
  attr_reader :order_spec
  # @return [Spark::Connect::Expression::Window::WindowFrame, nil]
  attr_reader :frame_spec

  def initialize(partition_spec: [], order_spec: [], frame_spec: nil)
    @partition_spec = partition_spec
    @order_spec = order_spec
    @frame_spec = frame_spec
  end

  # @param cols [Array<Column, String, Symbol>] columns or column names (nested arrays are flattened).
  # @return [WindowSpec] a copy partitioned by the given columns.
  def partition_by(*cols)
    copy(partition_spec: to_exprs(cols))
  end

  # @param cols [Array<Column, String, Symbol>] columns or column names (nested arrays are flattened).
  # @return [WindowSpec] a copy ordered by the given columns.
  def order_by(*cols)
    copy(order_spec: to_sort_orders(cols))
  end

  # Row-based frame between `start` and `end` (offsets relative to the current row).
  # @return [WindowSpec]
  def rows_between(start_, end_)
    copy(frame_spec: frame(:FRAME_TYPE_ROW, start_, end_))
  end

  # Range-based frame between `start` and `end` (value offsets over the ordering).
  # @return [WindowSpec]
  def range_between(start_, end_)
    copy(frame_spec: frame(:FRAME_TYPE_RANGE, start_, end_))
  end

  private

  # Builds a new spec, defaulting every part to the receiver's current value.
  def copy(partition_spec: @partition_spec, order_spec: @order_spec, frame_spec: @frame_spec)
    WindowSpec.new(partition_spec: partition_spec, order_spec: order_spec, frame_spec: frame_spec)
  end

  # Converts columns / column names into expressions.
  def to_exprs(cols)
    cols.flatten.map { |c| (c.is_a?(Column) ? c : Functions.col(c.to_s)).to_expr }
  end

  # Converts columns / column names into sort orders. Expressions that are
  # already sort orders are used as-is; anything else is wrapped as
  # ascending with nulls first.
  def to_sort_orders(cols)
    cols.flatten.map do |c|
      col = c.is_a?(Column) ? c : Functions.col(c.to_s)
      expr = col.to_expr
      if expr.expr_type == :sort_order
        expr.sort_order
      else
        Proto::Expression::SortOrder.new(child: expr, direction: :SORT_DIRECTION_ASCENDING,
                                         null_ordering: :SORT_NULLS_FIRST)
      end
    end
  end

  # Builds a frame of the given type from two boundary values.
  def frame(type, start_, end_)
    Proto::Expression::Window::WindowFrame.new(frame_type: type, lower: boundary(start_), upper: boundary(end_))
  end

  # Maps a boundary value onto a FrameBoundary message.
  #
  # Real numbers at or beyond the sentinels (including +/-Infinity) are
  # treated as unbounded, so out-of-range values are never sent as literals.
  # Non-numeric values are never compared against the sentinels and always
  # become literal boundaries.
  def boundary(value)
    fb = Proto::Expression::Window::WindowFrame::FrameBoundary
    if value.is_a?(Numeric) && value.real?
      return fb.new(current_row: true) if value == CURRENT_ROW
      return fb.new(unbounded: true) if value <= UNBOUNDED_PRECEDING || value >= UNBOUNDED_FOLLOWING
    end
    fb.new(value: Column.lit(value).to_expr)
  end
end
90
+
91
# Factory entry point for building {WindowSpec}s. Mirrors PySpark's `Window`.
# Each factory starts from an empty {WindowSpec} and applies one builder call.
module Window
  UNBOUNDED_PRECEDING = WindowSpec::UNBOUNDED_PRECEDING
  UNBOUNDED_FOLLOWING = WindowSpec::UNBOUNDED_FOLLOWING
  CURRENT_ROW = WindowSpec::CURRENT_ROW

  module_function

  # @return [WindowSpec] partitioned by the given columns.
  def partition_by(*cols) = WindowSpec.new.partition_by(*cols)

  # @return [WindowSpec] ordered by the given columns.
  def order_by(*cols) = WindowSpec.new.order_by(*cols)

  # @return [WindowSpec] with a row-based frame.
  def rows_between(start_, end_) = WindowSpec.new.rows_between(start_, end_)

  # @return [WindowSpec] with a range-based frame.
  def range_between(start_, end_) = WindowSpec.new.range_between(start_, end_)
end
119
+ end