spark-connect 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +82 -0
- data/LICENSE +202 -0
- data/NOTICE +16 -0
- data/README.md +166 -0
- data/lib/spark-connect.rb +5 -0
- data/lib/spark_connect/arrow.rb +115 -0
- data/lib/spark_connect/catalog.rb +190 -0
- data/lib/spark_connect/channel_builder.rb +134 -0
- data/lib/spark_connect/client.rb +264 -0
- data/lib/spark_connect/column.rb +379 -0
- data/lib/spark_connect/conf.rb +79 -0
- data/lib/spark_connect/data_frame.rb +828 -0
- data/lib/spark_connect/errors.rb +58 -0
- data/lib/spark_connect/functions.rb +903 -0
- data/lib/spark_connect/grouped_data.rb +101 -0
- data/lib/spark_connect/na_functions.rb +98 -0
- data/lib/spark_connect/observation.rb +61 -0
- data/lib/spark_connect/pipelines.rb +221 -0
- data/lib/spark_connect/plan.rb +39 -0
- data/lib/spark_connect/proto/spark/connect/base_pb.rb +118 -0
- data/lib/spark_connect/proto/spark/connect/base_services_pb.rb +82 -0
- data/lib/spark_connect/proto/spark/connect/catalog_pb.rb +46 -0
- data/lib/spark_connect/proto/spark/connect/commands_pb.rb +67 -0
- data/lib/spark_connect/proto/spark/connect/common_pb.rb +32 -0
- data/lib/spark_connect/proto/spark/connect/expressions_pb.rb +63 -0
- data/lib/spark_connect/proto/spark/connect/ml_common_pb.rb +22 -0
- data/lib/spark_connect/proto/spark/connect/ml_pb.rb +32 -0
- data/lib/spark_connect/proto/spark/connect/pipelines_pb.rb +45 -0
- data/lib/spark_connect/proto/spark/connect/relations_pb.rb +102 -0
- data/lib/spark_connect/proto/spark/connect/types_pb.rb +46 -0
- data/lib/spark_connect/proto.rb +32 -0
- data/lib/spark_connect/reader.rb +98 -0
- data/lib/spark_connect/row.rb +105 -0
- data/lib/spark_connect/session.rb +317 -0
- data/lib/spark_connect/stat_functions.rb +109 -0
- data/lib/spark_connect/streaming.rb +351 -0
- data/lib/spark_connect/types.rb +490 -0
- data/lib/spark_connect/version.rb +11 -0
- data/lib/spark_connect/window.rb +119 -0
- data/lib/spark_connect/writer.rb +208 -0
- data/lib/spark_connect.rb +58 -0
- data/proto/spark/connect/base.proto +1275 -0
- data/proto/spark/connect/catalog.proto +243 -0
- data/proto/spark/connect/commands.proto +553 -0
- data/proto/spark/connect/common.proto +179 -0
- data/proto/spark/connect/expressions.proto +557 -0
- data/proto/spark/connect/ml.proto +147 -0
- data/proto/spark/connect/ml_common.proto +64 -0
- data/proto/spark/connect/pipelines.proto +307 -0
- data/proto/spark/connect/relations.proto +1252 -0
- data/proto/spark/connect/types.proto +227 -0
- metadata +149 -0
|
@@ -0,0 +1,490 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module SparkConnect
|
|
6
|
+
# The Spark SQL type system.
|
|
7
|
+
#
|
|
8
|
+
# Every Spark data type is represented by an instance of a {DataType}
|
|
9
|
+
# subclass. Types convert to and from the protobuf `DataType` message via
|
|
10
|
+
# {DataType#to_proto} and {Types.from_proto}, and render a Spark-compatible
|
|
11
|
+
# `simpleString` (e.g. `"array<int>"`) and `typeName` (e.g. `"integer"`).
|
|
12
|
+
#
|
|
13
|
+
# @example
|
|
14
|
+
# SparkConnect::Types::IntegerType.new.simple_string #=> "int"
|
|
15
|
+
# SparkConnect::Types.array(SparkConnect::Types::StringType.new).simple_string
|
|
16
|
+
# #=> "array<string>"
|
|
17
|
+
module Types
|
|
18
|
+
Proto = SparkConnect::Proto
|
|
19
|
+
|
|
20
|
+
# Abstract base class for all Spark data types.
#
# Subclasses normally override {#type_name} (and {#simple_string} /
# {#json_value} where the spelling differs) and must implement {#to_proto}.
class DataType
  # @return [String] the Spark `simpleString` representation, e.g. `"int"`,
  #   `"array<string>"`, `"struct<a:int>"`. Defaults to {#type_name}.
  def simple_string
    type_name
  end

  # Derived from the class name: the namespace and a trailing `Type` are
  # dropped and the remainder is snake-cased.
  #
  # @return [String] the short type name used by Spark's JSON schema, e.g.
  #   `"integer"`, `"long"`, `"string"`.
  def type_name
    n = self.class.name.split("::").last.sub(/Type\z/, "")
    n.gsub(/([a-z\d])([A-Z])/, '\1_\2').downcase
  end

  # @return [String, Hash] the Spark JSON schema fragment for this type.
  def json_value
    type_name
  end

  # @return [String] the JSON schema string for this type.
  def json
    JSON.generate(json_value)
  end

  # @return [Spark::Connect::DataType] the protobuf representation.
  # @raise [NotImplementedError] unless overridden by the subclass.
  def to_proto
    raise NotImplementedError, "#{self.class}#to_proto is not implemented"
  end

  # Two plain data types are equal when they are instances of exactly the
  # same class. Parameterized subclasses override this.
  def ==(other)
    other.instance_of?(self.class)
  end

  # Dispatch to `==` at call time. An `alias` would pin `eql?` to this base
  # implementation, so subclasses overriding `==` would still be treated as
  # identical Hash keys (and collapsed by `uniq`) whenever the class matched.
  def eql?(other)
    self == other
  end

  # Class-based hash: instances that are `==` share a class, so the
  # `eql?`/`hash` contract holds even for subclasses that compare parameters.
  def hash
    self.class.hash
  end

  def to_s
    simple_string
  end

  def inspect
    "#<#{self.class.name} #{simple_string}>"
  end
end
|
|
67
|
+
|
|
68
|
+
# Builds a `DataType` proto whose fields are given as keywords, typically a
# single `kind: message` pair naming the oneof member to set.
#
# @return [Spark::Connect::DataType]
def self.wrap(**kind_fields)
  Proto::DataType.new(**kind_fields)
end
|
|
72
|
+
|
|
73
|
+
# The null type, spelled `void` in both its simple string and type name.
class NullType < DataType
  def simple_string
    "void"
  end

  def type_name
    "void"
  end

  # @return [Spark::Connect::DataType] message with the `null` kind set.
  def to_proto
    Types.wrap(null: Proto::DataType::NULL.new)
  end
end
|
|
78
|
+
|
|
79
|
+
# Boolean type; the simple string falls back to the type name.
class BooleanType < DataType
  def type_name
    "boolean"
  end

  # @return [Spark::Connect::DataType] message with the `boolean` kind set.
  def to_proto
    Types.wrap(boolean: Proto::DataType::Boolean.new)
  end
end
|
|
83
|
+
|
|
84
|
+
# Byte type: `tinyint` as a simple string, `byte` as a type name.
class ByteType < DataType
  def simple_string
    "tinyint"
  end

  def type_name
    "byte"
  end

  # @return [Spark::Connect::DataType] message with the `byte` kind set.
  def to_proto
    Types.wrap(byte: Proto::DataType::Byte.new)
  end
end
|
|
89
|
+
|
|
90
|
+
# Short type: `smallint` as a simple string, `short` as a type name.
class ShortType < DataType
  def simple_string
    "smallint"
  end

  def type_name
    "short"
  end

  # @return [Spark::Connect::DataType] message with the `short` kind set.
  def to_proto
    Types.wrap(short: Proto::DataType::Short.new)
  end
end
|
|
95
|
+
|
|
96
|
+
# Integer type: `int` as a simple string, `integer` as a type name.
class IntegerType < DataType
  def simple_string
    "int"
  end

  def type_name
    "integer"
  end

  # @return [Spark::Connect::DataType] message with the `integer` kind set.
  def to_proto
    Types.wrap(integer: Proto::DataType::Integer.new)
  end
end
|
|
101
|
+
|
|
102
|
+
# Long type: `bigint` as a simple string, `long` as a type name.
class LongType < DataType
  def simple_string
    "bigint"
  end

  def type_name
    "long"
  end

  # @return [Spark::Connect::DataType] message with the `long` kind set.
  def to_proto
    Types.wrap(long: Proto::DataType::Long.new)
  end
end
|
|
107
|
+
|
|
108
|
+
# Float type; the simple string falls back to the type name.
class FloatType < DataType
  def type_name
    "float"
  end

  # @return [Spark::Connect::DataType] message with the `float` kind set.
  def to_proto
    Types.wrap(float: Proto::DataType::Float.new)
  end
end
|
|
112
|
+
|
|
113
|
+
# Double type; the simple string falls back to the type name.
class DoubleType < DataType
  def type_name
    "double"
  end

  # @return [Spark::Connect::DataType] message with the `double` kind set.
  def to_proto
    Types.wrap(double: Proto::DataType::Double.new)
  end
end
|
|
117
|
+
|
|
118
|
+
# String type, parameterized by a collation name.
class StringType < DataType
  # Collation used when none is given.
  DEFAULT_COLLATION = "UTF8_BINARY"

  # @return [String] the collation name (default `"UTF8_BINARY"`).
  attr_reader :collation

  # @param collation [String] the collation name.
  def initialize(collation = DEFAULT_COLLATION)
    super()
    @collation = collation
  end

  def type_name
    "string"
  end

  # @return [Spark::Connect::DataType] message with the `string` kind set,
  #   carrying this type's collation.
  def to_proto
    Types.wrap(string: Proto::DataType::String.new(collation: @collation))
  end

  # String types are equal only when their collations match; the inherited
  # comparison looked at the class alone and so ignored the collation.
  def ==(other)
    other.is_a?(StringType) && other.collation == collation
  end
  alias eql? ==

  # Keeps Hash lookups consistent with `==`.
  def hash
    [self.class, collation].hash
  end
end
|
|
130
|
+
|
|
131
|
+
# Binary type; the simple string falls back to the type name.
class BinaryType < DataType
  def type_name
    "binary"
  end

  # @return [Spark::Connect::DataType] message with the `binary` kind set.
  def to_proto
    Types.wrap(binary: Proto::DataType::Binary.new)
  end
end
|
|
135
|
+
|
|
136
|
+
# Date type; the simple string falls back to the type name.
class DateType < DataType
  def type_name
    "date"
  end

  # @return [Spark::Connect::DataType] message with the `date` kind set.
  def to_proto
    Types.wrap(date: Proto::DataType::Date.new)
  end
end
|
|
140
|
+
|
|
141
|
+
# Timestamp type; the simple string falls back to the type name.
class TimestampType < DataType
  def type_name
    "timestamp"
  end

  # @return [Spark::Connect::DataType] message with the `timestamp` kind set.
  def to_proto
    Types.wrap(timestamp: Proto::DataType::Timestamp.new)
  end
end
|
|
145
|
+
|
|
146
|
+
# Timestamp-NTZ type. Both names are spelled out explicitly because the
# class-name-derived default would not produce `timestamp_ntz`.
class TimestampNTZType < DataType
  def simple_string
    "timestamp_ntz"
  end

  def type_name
    "timestamp_ntz"
  end

  # @return [Spark::Connect::DataType] message with the `timestamp_ntz` kind set.
  def to_proto
    Types.wrap(timestamp_ntz: Proto::DataType::TimestampNTZ.new)
  end
end
|
|
151
|
+
|
|
152
|
+
# Variant type; the simple string falls back to the type name.
class VariantType < DataType
  def type_name
    "variant"
  end

  # @return [Spark::Connect::DataType] message with the `variant` kind set.
  def to_proto
    Types.wrap(variant: Proto::DataType::Variant.new)
  end
end
|
|
156
|
+
|
|
157
|
+
# Fixed-precision decimal type.
class DecimalType < DataType
  # @return [Integer] total number of digits (max 38).
  attr_reader :precision
  # @return [Integer] number of digits to the right of the decimal point.
  attr_reader :scale

  # @param precision [Integer] total number of digits.
  # @param scale [Integer] digits to the right of the decimal point.
  def initialize(precision = 10, scale = 0)
    super()
    @precision = precision
    @scale = scale
  end

  # @return [String] e.g. `"decimal(10,0)"`.
  def simple_string
    "decimal(#{precision},#{scale})"
  end

  def type_name
    "decimal"
  end

  # The JSON form carries precision and scale, unlike the bare type name.
  def json_value
    "decimal(#{precision},#{scale})"
  end

  # @return [Spark::Connect::DataType] message with the `decimal` kind set.
  def to_proto
    Types.wrap(decimal: Proto::DataType::Decimal.new(precision: precision, scale: scale))
  end

  def ==(other)
    other.is_a?(DecimalType) && other.precision == precision && other.scale == scale
  end
  # Re-alias after overriding `==`; otherwise `eql?` keeps the inherited
  # class-only comparison and decimals of different precision/scale would be
  # interchangeable as Hash keys.
  alias eql? ==

  # Keeps Hash lookups consistent with `==`.
  def hash
    [self.class, precision, scale].hash
  end
end
|
|
178
|
+
|
|
179
|
+
# Fixed-length character type.
class CharType < DataType
  # @return [Integer] the declared length.
  attr_reader :length

  # @param length [Integer] the declared length.
  def initialize(length)
    super()
    @length = length
  end

  # @return [String] e.g. `"char(5)"`.
  def simple_string
    "char(#{length})"
  end

  def type_name
    "char"
  end

  # The JSON form carries the length, unlike the bare type name.
  def json_value
    "char(#{length})"
  end

  # @return [Spark::Connect::DataType] message with the `char` kind set.
  def to_proto
    Types.wrap(char: Proto::DataType::Char.new(length: length))
  end

  def ==(other)
    other.is_a?(CharType) && other.length == length
  end
  # Re-alias after overriding `==` so `eql?` (used by Hash and `uniq`) also
  # compares the length instead of only the class.
  alias eql? ==

  # Keeps Hash lookups consistent with `==`.
  def hash
    [self.class, length].hash
  end
end
|
|
194
|
+
|
|
195
|
+
# Variable-length character type with a declared maximum length.
class VarcharType < DataType
  # @return [Integer] the declared length.
  attr_reader :length

  # @param length [Integer] the declared length.
  def initialize(length)
    super()
    @length = length
  end

  # @return [String] e.g. `"varchar(20)"`.
  def simple_string
    "varchar(#{length})"
  end

  def type_name
    "varchar"
  end

  # The JSON form carries the length, unlike the bare type name.
  def json_value
    "varchar(#{length})"
  end

  # @return [Spark::Connect::DataType] message with the `var_char` kind set.
  def to_proto
    Types.wrap(var_char: Proto::DataType::VarChar.new(length: length))
  end

  def ==(other)
    other.is_a?(VarcharType) && other.length == length
  end
  # Re-alias after overriding `==` so `eql?` (used by Hash and `uniq`) also
  # compares the length instead of only the class.
  alias eql? ==

  # Keeps Hash lookups consistent with `==`.
  def hash
    [self.class, length].hash
  end
end
|
|
210
|
+
|
|
211
|
+
# Day-time interval type bounded by a start and end field
# (`DAY`, `HOUR`, `MINUTE`, `SECOND`).
class DayTimeIntervalType < DataType
  DAY = 0
  HOUR = 1
  MINUTE = 2
  SECOND = 3
  # Field names indexed by the field constants above.
  FIELD_NAMES = %w[day hour minute second].freeze

  # @return [Integer] one of the field constants.
  attr_reader :start_field, :end_field

  # @param start_field [Integer] first field of the interval (default `DAY`).
  # @param end_field [Integer] last field of the interval (default `SECOND`).
  def initialize(start_field = DAY, end_field = SECOND)
    super()
    @start_field = start_field
    @end_field = end_field
  end

  # Renders the actual field range, e.g. `"interval day to second"`,
  # `"interval hour to minute"`, or `"interval hour"` when both fields are
  # the same. Previously the default range was returned for every instance.
  #
  # @return [String]
  def simple_string
    first = field_name(start_field)
    return "interval #{first}" if start_field == end_field

    "interval #{first} to #{field_name(end_field)}"
  end

  # NOTE(review): the type name (and therefore the inherited JSON value) is
  # the bare "interval" and does not carry the field range -- confirm whether
  # the JSON schema form should match `simple_string`.
  def type_name
    "interval"
  end

  # @return [Spark::Connect::DataType] message with the `day_time_interval`
  #   kind set.
  def to_proto
    Types.wrap(day_time_interval: Proto::DataType::DayTimeInterval.new(start_field: start_field, end_field: end_field))
  end

  # Intervals over different field ranges are different types; the inherited
  # comparison only looked at the class.
  def ==(other)
    other.is_a?(DayTimeIntervalType) && other.start_field == start_field && other.end_field == end_field
  end
  alias eql? ==

  # Keeps Hash lookups consistent with `==`.
  def hash
    [self.class, start_field, end_field].hash
  end

  private

  # Name for a field constant; unknown values are rendered via `to_s` rather
  # than raising, so `simple_string`/`inspect` never fail.
  def field_name(field)
    return field.to_s unless field.is_a?(Integer) && field.between?(0, FIELD_NAMES.length - 1)

    FIELD_NAMES[field]
  end
end
|
|
231
|
+
|
|
232
|
+
# Year-month interval type bounded by a start and end field
# (`YEAR`, `MONTH`).
class YearMonthIntervalType < DataType
  YEAR = 0
  MONTH = 1
  # Field names indexed by the field constants above.
  FIELD_NAMES = %w[year month].freeze

  # @return [Integer] one of the field constants.
  attr_reader :start_field, :end_field

  # @param start_field [Integer] first field of the interval (default `YEAR`).
  # @param end_field [Integer] last field of the interval (default `MONTH`).
  def initialize(start_field = YEAR, end_field = MONTH)
    super()
    @start_field = start_field
    @end_field = end_field
  end

  # Renders the actual field range: `"interval year to month"`, or
  # `"interval year"` / `"interval month"` when both fields are the same.
  # Previously the default range was returned for every instance.
  #
  # @return [String]
  def simple_string
    first = field_name(start_field)
    return "interval #{first}" if start_field == end_field

    "interval #{first} to #{field_name(end_field)}"
  end

  # NOTE(review): the type name (and therefore the inherited JSON value) is
  # the bare "interval" and does not carry the field range -- confirm whether
  # the JSON schema form should match `simple_string`.
  def type_name
    "interval"
  end

  # @return [Spark::Connect::DataType] message with the `year_month_interval`
  #   kind set.
  def to_proto
    Types.wrap(year_month_interval: Proto::DataType::YearMonthInterval.new(start_field: start_field, end_field: end_field))
  end

  # Intervals over different field ranges are different types; the inherited
  # comparison only looked at the class.
  def ==(other)
    other.is_a?(YearMonthIntervalType) && other.start_field == start_field && other.end_field == end_field
  end
  alias eql? ==

  # Keeps Hash lookups consistent with `==`.
  def hash
    [self.class, start_field, end_field].hash
  end

  private

  # Name for a field constant; unknown values are rendered via `to_s` rather
  # than raising, so `simple_string`/`inspect` never fail.
  def field_name(field)
    return field.to_s unless field.is_a?(Integer) && field.between?(0, FIELD_NAMES.length - 1)

    FIELD_NAMES[field]
  end
end
|
|
250
|
+
|
|
251
|
+
# Calendar interval type: `interval` as a simple string,
# `calendar_interval` as a type name.
class CalendarIntervalType < DataType
  def simple_string
    "interval"
  end

  def type_name
    "calendar_interval"
  end

  # @return [Spark::Connect::DataType] message with the `calendar_interval`
  #   kind set.
  def to_proto
    Types.wrap(calendar_interval: Proto::DataType::CalendarInterval.new)
  end
end
|
|
256
|
+
|
|
257
|
+
# An array type. `element_type` is the type of every element; `contains_null`
# indicates whether the array may contain `null` values.
class ArrayType < DataType
  # @return [DataType] the element type.
  attr_reader :element_type
  # @return [Boolean] whether elements may be null.
  attr_reader :contains_null

  # @param element_type [DataType]
  # @param contains_null [Boolean]
  def initialize(element_type, contains_null: true)
    super()
    @element_type = element_type
    @contains_null = contains_null
  end

  # @return [String] e.g. `"array<int>"`.
  def simple_string
    "array<#{element_type.simple_string}>"
  end

  def type_name
    "array"
  end

  # @return [Hash] the JSON schema fragment, nesting the element type's own.
  def json_value
    { "type" => "array", "elementType" => element_type.json_value, "containsNull" => contains_null }
  end

  # @return [Spark::Connect::DataType] message with the `array` kind set.
  def to_proto
    Types.wrap(array: Proto::DataType::Array.new(element_type: element_type.to_proto, contains_null: contains_null))
  end

  def ==(other)
    other.is_a?(ArrayType) && other.element_type == element_type && other.contains_null == contains_null
  end
  # Re-alias after overriding `==`; the inherited `eql?` compares the class
  # only, which would make every array type the same Hash key.
  alias eql? ==

  # Keeps Hash lookups consistent with `==`.
  def hash
    [self.class, element_type, contains_null].hash
  end
end
|
|
283
|
+
|
|
284
|
+
# A map type with key and value element types.
class MapType < DataType
  # @return [DataType] the key type.
  attr_reader :key_type
  # @return [DataType] the value type.
  attr_reader :value_type
  # @return [Boolean] whether values may be null.
  attr_reader :value_contains_null

  # @param key_type [DataType]
  # @param value_type [DataType]
  # @param value_contains_null [Boolean]
  def initialize(key_type, value_type, value_contains_null: true)
    super()
    @key_type = key_type
    @value_type = value_type
    @value_contains_null = value_contains_null
  end

  # @return [String] e.g. `"map<string,int>"`.
  def simple_string
    "map<#{key_type.simple_string},#{value_type.simple_string}>"
  end

  def type_name
    "map"
  end

  # @return [Hash] the JSON schema fragment, nesting the key and value types.
  def json_value
    {
      "type" => "map",
      "keyType" => key_type.json_value,
      "valueType" => value_type.json_value,
      "valueContainsNull" => value_contains_null,
    }
  end

  # @return [Spark::Connect::DataType] message with the `map` kind set.
  def to_proto
    Types.wrap(map: Proto::DataType::Map.new(
      key_type: key_type.to_proto,
      value_type: value_type.to_proto,
      value_contains_null: value_contains_null
    ))
  end

  def ==(other)
    other.is_a?(MapType) && other.key_type == key_type &&
      other.value_type == value_type && other.value_contains_null == value_contains_null
  end
  # Re-alias after overriding `==`; the inherited `eql?` compares the class
  # only, which would make every map type the same Hash key.
  alias eql? ==

  # Keeps Hash lookups consistent with `==`.
  def hash
    [self.class, key_type, value_type, value_contains_null].hash
  end
end
|
|
320
|
+
|
|
321
|
+
# A single field within a {StructType}.
class StructField
  # @return [String] the field name (always stored as a String).
  attr_reader :name
  # @return [DataType] the field's type.
  attr_reader :data_type
  # @return [Boolean] whether the field may be null.
  attr_reader :nullable
  # @return [Object, nil] optional metadata, serialized with JSON when present.
  attr_reader :metadata

  # @param name [#to_s] the field name.
  # @param data_type [DataType]
  # @param nullable [Boolean]
  # @param metadata [Object, nil] JSON-serializable metadata.
  def initialize(name, data_type, nullable: true, metadata: nil)
    @name = name.to_s
    @data_type = data_type
    @nullable = nullable
    @metadata = metadata
  end

  # @return [String] `"name:type"`, as embedded in a struct's simple string.
  def simple_string
    "#{name}:#{data_type.simple_string}"
  end

  # @return [Hash] the JSON schema fragment; `"metadata"` is only included
  #   when metadata was given.
  def json_value
    h = { "name" => name, "type" => data_type.json_value, "nullable" => nullable }
    h["metadata"] = metadata if metadata
    h
  end

  # @return [Spark::Connect::DataType::StructField] the protobuf form, with
  #   metadata encoded as a JSON string when present.
  def to_proto
    Proto::DataType::StructField.new(
      name: name,
      data_type: data_type.to_proto,
      nullable: nullable,
      metadata: metadata ? JSON.generate(metadata) : nil
    )
  end

  # Fields are equal when name, type and nullability match. Metadata is not
  # part of the comparison.
  def ==(other)
    other.is_a?(StructField) && other.name == name &&
      other.data_type == data_type && other.nullable == nullable
  end
  # `==` was defined without `eql?`/`hash`, so equal fields were still
  # distinct Hash keys and survived `uniq`. Keep all three in agreement.
  alias eql? ==

  # Built from the same attributes `==` compares.
  def hash
    [self.class, name, data_type, nullable].hash
  end
end
|
|
354
|
+
|
|
355
|
+
# A struct (row) type: an ordered collection of {StructField}s. This is the
# type of every {DataFrame}'s schema.
class StructType < DataType
  include Enumerable

  # @return [Array<StructField>] the fields, in order.
  attr_reader :fields

  # @param fields [Array<StructField>] initial fields. The array is copied so
  #   that {#add} never mutates the caller's array.
  def initialize(fields = [])
    super()
    @fields = Array(fields).dup
  end

  # Append a field and return self (chainable builder).
  #
  # @param name [String, StructField] a ready-made field, or the name of a
  #   field to build from the remaining arguments.
  # @param data_type [DataType, nil] ignored when `name` is a StructField.
  # @return [StructType]
  def add(name, data_type = nil, nullable: true, metadata: nil)
    @fields << if name.is_a?(StructField)
                 name
               else
                 StructField.new(name, data_type, nullable: nullable, metadata: metadata)
               end
    self
  end

  # Iterates over the fields (backs the Enumerable methods).
  def each(&block)
    fields.each(&block)
  end

  # @param key [Integer, #to_s] a position, or a field name.
  # @return [StructField, nil]
  def [](key)
    key.is_a?(Integer) ? fields[key] : fields.find { |f| f.name == key.to_s }
  end

  # @return [Array<String>] the field names, in order.
  def names
    fields.map(&:name)
  end

  # @return [Integer] the number of fields.
  def length
    fields.length
  end
  alias size length

  # @return [String] e.g. `"struct<a:int,b:string>"`.
  def simple_string
    "struct<#{fields.map(&:simple_string).join(',')}>"
  end

  def type_name
    "struct"
  end

  # @return [Hash] the JSON schema fragment, nesting each field's own.
  def json_value
    { "type" => "struct", "fields" => fields.map(&:json_value) }
  end

  # @return [Spark::Connect::DataType] message with the `struct` kind set.
  def to_proto
    Types.wrap(struct: Proto::DataType::Struct.new(fields: fields.map(&:to_proto)))
  end

  # A human-readable, indented tree (used by {DataFrame#print_schema}).
  #
  # @return [String]
  def tree_string
    lines = ["root"]
    fields.each { |f| append_tree(lines, f, " |") }
    "#{lines.join("\n")}\n"
  end

  def ==(other)
    other.is_a?(StructType) && other.fields == fields
  end
  # Re-alias after overriding `==`; the inherited `eql?` compares the class
  # only, which would make every struct type the same Hash key.
  alias eql? ==

  # Built from the attributes StructField#== compares, so it stays consistent
  # with `==` regardless of how StructField itself hashes.
  def hash
    [self.class, fields.map { |f| [f.name, f.data_type, f.nullable] }].hash
  end

  private

  # Appends the line for `field`, then recurses into nested structs and adds
  # element/key/value lines for arrays and maps.
  def append_tree(lines, field, prefix)
    dt = field.data_type
    lines << "#{prefix}-- #{field.name}: #{dt.type_name} (nullable = #{field.nullable})"
    case dt
    when StructType
      dt.fields.each { |f| append_tree(lines, f, "#{prefix} |") }
    when ArrayType
      lines << "#{prefix} |-- element: #{dt.element_type.type_name} (containsNull = #{dt.contains_null})"
      dt.element_type.fields.each { |f| append_tree(lines, f, "#{prefix} | |") } if dt.element_type.is_a?(StructType)
    when MapType
      lines << "#{prefix} |-- key: #{dt.key_type.type_name}"
      lines << "#{prefix} |-- value: #{dt.value_type.type_name} (valueContainsNull = #{dt.value_contains_null})"
    end
  end
end
|
|
425
|
+
|
|
426
|
+
# ---- Convenience constructors -----------------------------------------
|
|
427
|
+
|
|
428
|
+
module_function
|
|
429
|
+
|
|
430
|
+
# Parameterless types: each call returns a fresh instance.
def null
  NullType.new
end

def boolean
  BooleanType.new
end

def byte
  ByteType.new
end

def short
  ShortType.new
end

def integer
  IntegerType.new
end

def long
  LongType.new
end

def float
  FloatType.new
end

def double
  DoubleType.new
end

def string
  StringType.new
end

def binary
  BinaryType.new
end

def date
  DateType.new
end

def timestamp
  TimestampType.new
end

def timestamp_ntz
  TimestampNTZType.new
end

def variant
  VariantType.new
end

# @return [DecimalType]
def decimal(precision = 10, scale = 0)
  DecimalType.new(precision, scale)
end

# @return [ArrayType]
def array(element_type, contains_null: true)
  ArrayType.new(element_type, contains_null: contains_null)
end

# @return [MapType]
def map(key_type, value_type, value_contains_null: true)
  MapType.new(key_type, value_type, value_contains_null: value_contains_null)
end

# Accepts fields individually or as (nested) arrays; they are flattened.
#
# @return [StructType]
def struct(*fields)
  StructType.new(fields.flatten)
end

# @return [StructField]
def field(name, data_type, nullable: true, metadata: nil)
  StructField.new(name, data_type, nullable: nullable, metadata: metadata)
end
|
|
449
|
+
|
|
450
|
+
# Convert a protobuf `DataType` message into a {DataType} instance.
#
# Optional scalar fields (decimal precision/scale, interval start/end fields)
# fall back to their defaults only when the message does not carry them.
#
# @param proto [Spark::Connect::DataType]
# @return [DataType]
# @raise [IllegalArgumentError] if no kind is set or the kind is unsupported.
def from_proto(proto)
  kind = proto.kind
  # `kind` is nil when the oneof is unset; fail with the library error rather
  # than the TypeError `public_send(nil)` would raise.
  raise IllegalArgumentError, "Unsupported proto DataType kind: (not set)" if kind.nil?

  sub = proto.public_send(kind)

  # Reads an optional scalar from `sub`. Unset proto3 `optional` scalars read
  # as 0 rather than nil, so `value || default` cannot detect absence; use the
  # generated `has_<field>?` predicate when the message provides one.
  optional = lambda do |name, default|
    presence = :"has_#{name}?"
    next default if sub.respond_to?(presence) && !sub.public_send(presence)

    value = sub.public_send(name)
    value.nil? ? default : value
  end

  case kind
  when :null then NullType.new
  when :boolean then BooleanType.new
  when :byte then ByteType.new
  when :short then ShortType.new
  when :integer then IntegerType.new
  when :long then LongType.new
  when :float then FloatType.new
  when :double then DoubleType.new
  when :string then StringType.new(sub.collation.empty? ? "UTF8_BINARY" : sub.collation)
  when :binary then BinaryType.new
  when :date then DateType.new
  when :timestamp then TimestampType.new
  when :timestamp_ntz then TimestampNTZType.new
  when :variant then VariantType.new
  when :calendar_interval then CalendarIntervalType.new
  when :day_time_interval
    # Defaults 0 and 3 are DayTimeIntervalType::DAY and ::SECOND.
    DayTimeIntervalType.new(optional.call(:start_field, 0), optional.call(:end_field, 3))
  when :year_month_interval
    # Defaults 0 and 1 are YearMonthIntervalType::YEAR and ::MONTH.
    YearMonthIntervalType.new(optional.call(:start_field, 0), optional.call(:end_field, 1))
  when :decimal then DecimalType.new(optional.call(:precision, 10), optional.call(:scale, 0))
  when :char then CharType.new(sub.length)
  when :var_char then VarcharType.new(sub.length)
  when :array then ArrayType.new(from_proto(sub.element_type), contains_null: sub.contains_null)
  when :map
    MapType.new(from_proto(sub.key_type), from_proto(sub.value_type),
                value_contains_null: sub.value_contains_null)
  when :struct
    struct_fields = sub.fields.map do |f|
      # Metadata travels as a JSON string; absent or empty means none.
      metadata = f.metadata && !f.metadata.empty? ? JSON.parse(f.metadata) : nil
      StructField.new(f.name, from_proto(f.data_type), nullable: f.nullable, metadata: metadata)
    end
    StructType.new(struct_fields)
  else
    raise IllegalArgumentError, "Unsupported proto DataType kind: #{kind}"
  end
end
|
|
489
|
+
end
|
|
490
|
+
end
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SparkConnect
|
|
4
|
+
# The released version of the spark-connect gem (the gem, not Spark itself).
|
|
5
|
+
VERSION = "0.2.0"
|
|
6
|
+
|
|
7
|
+
# The Apache Spark version whose Spark Connect protocol definitions this
|
|
8
|
+
# client is generated against. The client aims to be wire-compatible with
|
|
9
|
+
# Spark Connect servers of this major/minor line (4.1) and newer.
|
|
10
|
+
SPARK_VERSION = "4.1.0"
|
|
11
|
+
end
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SparkConnect
|
|
4
|
+
# Immutable description of a window: partitioning expressions, sort orders
# and an optional frame. Every builder method returns a new {WindowSpec}
# and leaves the receiver untouched. Start from the {Window} factory and
# attach the result to an analytic column with {Column#over}.
#
# @example
#   w = SparkConnect::Window.partition_by("dept").order_by(F.col("salary").desc)
#   df.with_column("rank", F.rank.over(w))
class WindowSpec
  Proto = SparkConnect::Proto

  # Boundary sentinels (matching Spark's `Window.unboundedPreceding`, etc.).
  UNBOUNDED_PRECEDING = -(2**63)
  UNBOUNDED_FOLLOWING = (2**63) - 1
  CURRENT_ROW = 0

  # @return [Array<Spark::Connect::Expression>]
  attr_reader :partition_spec
  # @return [Array<Spark::Connect::Expression::SortOrder>]
  attr_reader :order_spec
  # @return [Spark::Connect::Expression::Window::WindowFrame, nil]
  attr_reader :frame_spec

  def initialize(partition_spec: [], order_spec: [], frame_spec: nil)
    @partition_spec = partition_spec
    @order_spec = order_spec
    @frame_spec = frame_spec
  end

  # @param cols [Array<Column, #to_s>] columns or column names (nested arrays
  #   are flattened).
  # @return [WindowSpec] a copy partitioned by the given columns.
  def partition_by(*cols)
    expressions = cols.flatten.map { |c| as_column(c).to_expr }
    copy(partition_spec: expressions)
  end

  # @param cols [Array<Column, #to_s>] columns or column names (nested arrays
  #   are flattened).
  # @return [WindowSpec] a copy ordered by the given columns.
  def order_by(*cols)
    orders = cols.flatten.map { |c| sort_order_for(as_column(c).to_expr) }
    copy(order_spec: orders)
  end

  # Row-based frame between `start` and `end` (offsets relative to the current row).
  # @return [WindowSpec]
  def rows_between(start_, end_)
    row_frame = frame(:FRAME_TYPE_ROW, start_, end_)
    copy(frame_spec: row_frame)
  end

  # Range-based frame between `start` and `end` (value offsets over the ordering).
  # @return [WindowSpec]
  def range_between(start_, end_)
    range_frame = frame(:FRAME_TYPE_RANGE, start_, end_)
    copy(frame_spec: range_frame)
  end

  private

  # New spec with the given parts replaced; omitted parts carry over.
  def copy(partition_spec: @partition_spec, order_spec: @order_spec, frame_spec: @frame_spec)
    WindowSpec.new(partition_spec: partition_spec, order_spec: order_spec, frame_spec: frame_spec)
  end

  # Columns pass through; anything else is treated as a column name.
  def as_column(candidate)
    return candidate if candidate.is_a?(Column)

    Functions.col(candidate.to_s)
  end

  # An expression that already is a sort order is unwrapped; any other
  # expression is sorted ascending with nulls first.
  def sort_order_for(expr)
    return expr.sort_order if expr.expr_type == :sort_order

    Proto::Expression::SortOrder.new(child: expr, direction: :SORT_DIRECTION_ASCENDING,
                                     null_ordering: :SORT_NULLS_FIRST)
  end

  # Builds the frame message; the lower boundary is built before the upper.
  def frame(type, start_, end_)
    lower = boundary(start_)
    upper = boundary(end_)
    Proto::Expression::Window::WindowFrame.new(frame_type: type, lower: lower, upper: upper)
  end

  # Maps a sentinel to the matching boundary flag; any other value becomes a
  # literal offset expression.
  def boundary(value)
    boundary_class = Proto::Expression::Window::WindowFrame::FrameBoundary
    case value
    when CURRENT_ROW
      boundary_class.new(current_row: true)
    when UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING
      boundary_class.new(unbounded: true)
    else
      boundary_class.new(value: Column.lit(value).to_expr)
    end
  end
end
|
|
90
|
+
|
|
91
|
+
# Entry point for building {WindowSpec}s, mirroring PySpark's `Window`.
# Each method starts from an empty spec and applies one builder step.
module Window
  UNBOUNDED_PRECEDING = WindowSpec::UNBOUNDED_PRECEDING
  UNBOUNDED_FOLLOWING = WindowSpec::UNBOUNDED_FOLLOWING
  CURRENT_ROW = WindowSpec::CURRENT_ROW

  module_function

  # @return [WindowSpec] partitioned by the given columns.
  def partition_by(*cols) = WindowSpec.new.partition_by(*cols)

  # @return [WindowSpec] ordered by the given columns.
  def order_by(*cols) = WindowSpec.new.order_by(*cols)

  # @return [WindowSpec] with a row-based frame.
  def rows_between(start_, end_) = WindowSpec.new.rows_between(start_, end_)

  # @return [WindowSpec] with a range-based frame.
  def range_between(start_, end_) = WindowSpec.new.range_between(start_, end_)
end
|
|
119
|
+
end
|