spark-connect 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +82 -0
- data/LICENSE +202 -0
- data/NOTICE +16 -0
- data/README.md +166 -0
- data/lib/spark-connect.rb +5 -0
- data/lib/spark_connect/arrow.rb +115 -0
- data/lib/spark_connect/catalog.rb +190 -0
- data/lib/spark_connect/channel_builder.rb +134 -0
- data/lib/spark_connect/client.rb +264 -0
- data/lib/spark_connect/column.rb +379 -0
- data/lib/spark_connect/conf.rb +79 -0
- data/lib/spark_connect/data_frame.rb +828 -0
- data/lib/spark_connect/errors.rb +58 -0
- data/lib/spark_connect/functions.rb +903 -0
- data/lib/spark_connect/grouped_data.rb +101 -0
- data/lib/spark_connect/na_functions.rb +98 -0
- data/lib/spark_connect/observation.rb +61 -0
- data/lib/spark_connect/pipelines.rb +221 -0
- data/lib/spark_connect/plan.rb +39 -0
- data/lib/spark_connect/proto/spark/connect/base_pb.rb +118 -0
- data/lib/spark_connect/proto/spark/connect/base_services_pb.rb +82 -0
- data/lib/spark_connect/proto/spark/connect/catalog_pb.rb +46 -0
- data/lib/spark_connect/proto/spark/connect/commands_pb.rb +67 -0
- data/lib/spark_connect/proto/spark/connect/common_pb.rb +32 -0
- data/lib/spark_connect/proto/spark/connect/expressions_pb.rb +63 -0
- data/lib/spark_connect/proto/spark/connect/ml_common_pb.rb +22 -0
- data/lib/spark_connect/proto/spark/connect/ml_pb.rb +32 -0
- data/lib/spark_connect/proto/spark/connect/pipelines_pb.rb +45 -0
- data/lib/spark_connect/proto/spark/connect/relations_pb.rb +102 -0
- data/lib/spark_connect/proto/spark/connect/types_pb.rb +46 -0
- data/lib/spark_connect/proto.rb +32 -0
- data/lib/spark_connect/reader.rb +98 -0
- data/lib/spark_connect/row.rb +105 -0
- data/lib/spark_connect/session.rb +317 -0
- data/lib/spark_connect/stat_functions.rb +109 -0
- data/lib/spark_connect/streaming.rb +351 -0
- data/lib/spark_connect/types.rb +490 -0
- data/lib/spark_connect/version.rb +11 -0
- data/lib/spark_connect/window.rb +119 -0
- data/lib/spark_connect/writer.rb +208 -0
- data/lib/spark_connect.rb +58 -0
- data/proto/spark/connect/base.proto +1275 -0
- data/proto/spark/connect/catalog.proto +243 -0
- data/proto/spark/connect/commands.proto +553 -0
- data/proto/spark/connect/common.proto +179 -0
- data/proto/spark/connect/expressions.proto +557 -0
- data/proto/spark/connect/ml.proto +147 -0
- data/proto/spark/connect/ml_common.proto +64 -0
- data/proto/spark/connect/pipelines.proto +307 -0
- data/proto/spark/connect/relations.proto +1252 -0
- data/proto/spark/connect/types.proto +227 -0
- metadata +149 -0
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module SparkConnect
|
|
6
|
+
# Loads a streaming {DataFrame} from a streaming source. Returned by
|
|
7
|
+
# {SparkSession#read_stream}. Mirrors PySpark's `DataStreamReader`.
|
|
8
|
+
#
|
|
9
|
+
# @example
|
|
10
|
+
# df = spark.read_stream.format("rate").option("rowsPerSecond", 5).load
|
|
11
|
+
class DataStreamReader
  Proto = SparkConnect::Proto

  # Create a reader bound to +session+ with no format, schema or options set.
  #
  # @param session [SparkSession]
  def initialize(session)
    @session = session
    @format = @schema = nil
    @options = {}
  end

  # Choose the streaming source format (`"rate"`, `"kafka"`, ...).
  #
  # @param source [#to_s] stored as a String.
  # @return [self]
  def format(source)
    tap { @format = source.to_s }
  end

  # Declare the input schema.
  #
  # @param schema [Types::StructType, #to_s] a struct type (stored via its
  #   `simple_string`) or anything else, which is stored via `to_s`
  #   (e.g. a DDL string).
  # @return [self]
  def schema(schema)
    @schema =
      if schema.is_a?(Types::StructType)
        schema.simple_string
      else
        schema.to_s
      end
    self
  end

  # Set one source option. Key and value are both stored as Strings.
  #
  # @return [self]
  def option(key, value)
    tap { @options[key.to_s] = value.to_s }
  end

  # Set several source options at once; each pair is stored like {#option}.
  #
  # @param opts [Hash, #each] key/value pairs.
  # @return [self]
  def options(opts)
    opts.each { |key, value| option(key, value) }
    self
  end

  # Build a streaming DataFrame that reads from the configured data source.
  # Format and schema are only sent when they have been set.
  #
  # @param path [String, nil] optional single path handed to the source.
  # @return [DataFrame]
  def load(path = nil)
    paths = path ? [path.to_s] : []
    source = Proto::Read::DataSource.new(options: @options, paths: paths)
    source.format = @format if @format
    source.schema = @schema if @schema
    stream_relation(data_source: source)
  end

  # Build a streaming DataFrame that reads from a named table, passing along
  # the options configured so far.
  #
  # @param name [#to_s] table identifier.
  # @return [DataFrame]
  def table(name)
    named = Proto::Read::NamedTable.new(unparsed_identifier: name.to_s, options: @options)
    stream_relation(named_table: named)
  end

  # @return [DataFrame] shortcut for `format("csv").load(path)`.
  def csv(path)
    format("csv").load(path)
  end

  # @return [DataFrame] shortcut for `format("json").load(path)`.
  def json(path)
    format("json").load(path)
  end

  # @return [DataFrame] shortcut for `format("parquet").load(path)`.
  def parquet(path)
    format("parquet").load(path)
  end

  # @return [DataFrame] shortcut for `format("orc").load(path)`.
  def orc(path)
    format("orc").load(path)
  end

  # @return [DataFrame] shortcut for `format("text").load(path)`.
  def text(path)
    format("text").load(path)
  end

  private

  # Wrap the given read source in a `Proto::Read` flagged as streaming and
  # return a DataFrame over the resulting relation.
  def stream_relation(**read_kw)
    streaming_read = Proto::Read.new(is_streaming: true, **read_kw)
    plan = PlanBuilder.relation(@session, read: streaming_read)
    DataFrame.new(@session, plan)
  end
end
|
|
77
|
+
|
|
78
|
+
# Writes a streaming {DataFrame} to a streaming sink and starts the query.
|
|
79
|
+
# Returned by {DataFrame#write_stream}. Mirrors PySpark's `DataStreamWriter`.
|
|
80
|
+
#
|
|
81
|
+
# `foreach`/`foreach_batch` are intentionally unsupported: they require
|
|
82
|
+
# user-defined functions, whose Spark Connect protobuf definitions are not yet
|
|
83
|
+
# finalized.
|
|
84
|
+
#
|
|
85
|
+
# @example
|
|
86
|
+
# query = df.write_stream
|
|
87
|
+
# .format("console")
|
|
88
|
+
# .output_mode("append")
|
|
89
|
+
# .trigger(processing_time: "1 second")
|
|
90
|
+
# .start
|
|
91
|
+
# query.stop
|
|
92
|
+
class DataStreamWriter
  Proto = SparkConnect::Proto
  WSO = Proto::WriteStreamOperationStart

  # Create a writer for +df+ with nothing configured yet.
  #
  # @param df [DataFrame]
  def initialize(df)
    @df = df
    @format = nil
    @options = {}
    @partitioning = []
    @output_mode = nil
    @query_name = nil
    @trigger = nil
    @path = nil
    @table = nil
  end

  # @return [self] set the sink format (`"console"`, `"memory"`, `"kafka"`, ...).
  def format(source)
    @format = source.to_s
    self
  end

  # @return [self] set the output mode (`"append"`, `"complete"`, `"update"`).
  def output_mode(mode)
    @output_mode = mode.to_s
    self
  end

  # @return [self] set a single sink option (key and value are stored as Strings).
  def option(key, value)
    @options[key.to_s] = value.to_s
    self
  end

  # @return [self] set multiple sink options (keys and values are stored as Strings).
  def options(opts)
    opts.each { |k, v| @options[k.to_s] = v.to_s }
    self
  end

  # Replaces any previously configured partition columns.
  #
  # @param cols [Array<#to_s>, #to_s] column names; nested arrays are flattened.
  # @return [self] partition the output by these columns.
  def partition_by(*cols)
    @partitioning = cols.flatten.map(&:to_s)
    self
  end

  # @return [self] name the streaming query (required by the memory sink).
  def query_name(name)
    @query_name = name.to_s
    self
  end

  # Configure the query trigger. Provide at most one of the keywords; calling
  # with none of them clears a previously configured trigger.
  #
  # @param processing_time [String, nil] e.g. `"10 seconds"` (micro-batch interval).
  # @param once [Boolean, nil] process all available data once and stop.
  # @param available_now [Boolean, nil] process all available data in (possibly) multiple batches, then stop.
  # @param continuous [String, nil] continuous-processing checkpoint interval.
  # @raise [ArgumentError] if more than one trigger kind is supplied.
  # @return [self]
  def trigger(processing_time: nil, once: nil, available_now: nil, continuous: nil)
    requested = {
      processing_time: processing_time, once: once,
      available_now: available_now, continuous: continuous
    }.select { |_, value| value }
    # Previously the first truthy keyword silently won; reject the ambiguity instead.
    if requested.size > 1
      raise ArgumentError, "Provide only one trigger, got: #{requested.keys.join(', ')}"
    end

    # Stored as [protobuf field name, value]; applied in #run.
    @trigger =
      if processing_time then [:processing_time_interval, processing_time.to_s]
      elsif once then [:once, true]
      elsif available_now then [:available_now, true]
      elsif continuous then [:continuous_checkpoint_interval, continuous.to_s]
      end
    self
  end

  # Start the streaming query to a file/data path.
  #
  # When a path is given it becomes the destination and replaces any table
  # chosen by an earlier {#to_table} call on this writer.
  #
  # @param path [#to_s, nil]
  # @return [StreamingQuery]
  def start(path = nil)
    if path
      # Coerce like DataStreamReader#load and #to_table do.
      @path = path.to_s
      @table = nil
    end
    run
  end

  # Start the streaming query, writing into the given table.
  #
  # The table becomes the destination and replaces any path chosen by an
  # earlier {#start} call on this writer.
  #
  # @param name [#to_s]
  # @return [StreamingQuery]
  def to_table(name)
    @table = name.to_s
    @path = nil
    run
  end
  alias toTable to_table

  private

  # Build the `WriteStreamOperationStart` command from the configured state,
  # execute it and wrap the returned query handle.
  #
  # @raise [SparkConnectError] if the result carries no `write_stream_result`.
  # @return [StreamingQuery]
  def run
    op = WSO.new(
      input: @df.relation, format: @format || "", options: @options,
      partitioning_column_names: @partitioning
    )
    op.output_mode = @output_mode if @output_mode
    op.query_name = @query_name if @query_name
    if @trigger
      field, value = @trigger
      op.public_send("#{field}=", value)
    end
    # Only one destination is sent; #start and #to_table keep at most one set.
    if @path then op.path = @path
    elsif @table then op.table_name = @table
    end
    result = @df.session.client.execute_command(Proto::Command.new(write_stream_operation_start: op))
    wsr = result.write_stream_result
    raise SparkConnectError, "Server did not return a streaming query handle" unless wsr

    StreamingQuery.new(@df.session, wsr.query_id, wsr.name)
  end
end
|
|
201
|
+
|
|
202
|
+
# A handle to a running streaming query. Returned by {DataStreamWriter#start}.
|
|
203
|
+
# Mirrors PySpark's `StreamingQuery`.
|
|
204
|
+
class StreamingQuery
  Proto = SparkConnect::Proto
  Cmd = Proto::StreamingQueryCommand

  # @return [String] the stable query id (survives restarts from a checkpoint).
  attr_reader :id
  # @return [String] the run id (unique per start).
  attr_reader :run_id
  # @return [String, nil] the query name, if one was set.
  attr_reader :name

  # Wrap a query instance id returned by the server.
  #
  # @param session [SparkSession]
  # @param instance_id [Spark::Connect::StreamingQueryInstanceId] supplies
  #   both {#id} and {#run_id}, and is sent back with every command.
  # @param name [String, nil] a nil or empty name is exposed as nil.
  def initialize(session, instance_id, name)
    @session = session
    @instance_id = instance_id
    @id = instance_id.id
    @run_id = instance_id.run_id
    unnamed = name.nil? || name.empty?
    @name = unnamed ? nil : name
  end

  # Fetch the current status of the query.
  #
  # @return [Hash] String-keyed hash with `"message"`, `"isDataAvailable"`,
  #   `"isTriggerActive"` and `"isActive"`.
  def status
    current = command(status: true).status
    {
      "message" => current.status_message,
      "isDataAvailable" => current.is_data_available,
      "isTriggerActive" => current.is_trigger_active,
      "isActive" => current.is_active,
    }
  end

  # Issues a {#status} request and reads its `"isActive"` entry.
  #
  # @return [Boolean] whether the query is still running.
  def active?
    status.fetch("isActive")
  end

  # @return [Array<Hash>] parsed JSON progress objects for recent micro-batches.
  def recent_progress
    parsed_progress(command(recent_progress: true))
  end

  # @return [Hash, nil] the most recent progress object, if any.
  def last_progress
    parsed_progress(command(last_progress: true)).last
  end

  # Block until the query terminates, or until `timeout_ms` elapses.
  #
  # @param timeout_ms [Integer, nil] only sent when given.
  # @return [Boolean] whether the query has terminated.
  def await_termination(timeout_ms = nil)
    request = Cmd::AwaitTerminationCommand.new
    request.timeout_ms = timeout_ms if timeout_ms
    command(await_termination: request).await_termination.terminated
  end

  # Process all available data, then return (useful for tests with bounded sources).
  #
  # @return [void]
  def process_all_available
    command(process_all_available: true)
    nil
  end

  # Stop the query.
  #
  # @return [void]
  def stop
    command(stop: true)
    nil
  end

  # @return [String, nil] the query's exception message; an empty message is
  #   reported as nil.
  def exception
    message = command(exception: true).exception.exception_message
    message unless message && message.empty?
  end

  # @param extended [Boolean] forwarded to the explain command.
  # @return [String] the query's execution plan.
  def explain(extended: false)
    request = Cmd::ExplainCommand.new(extended: extended)
    command(explain: request).explain.result
  end

  # @return [String] short description showing the id and name.
  def to_s
    "#<SparkConnect::StreamingQuery id=#{@id} name=#{@name.inspect}>"
  end
  alias inspect to_s

  private

  # Parse every JSON document in a recent-progress result.
  def parsed_progress(result)
    result.recent_progress.recent_progress_json.map { |raw| JSON.parse(raw) }
  end

  # Send a `StreamingQueryCommand` for this query and return the
  # `streaming_query_result` part of the response.
  def command(**kw)
    query_command = Cmd.new(query_id: @instance_id, **kw)
    envelope = Proto::Command.new(streaming_query_command: query_command)
    @session.client.execute_command(envelope).streaming_query_result
  end
end
|
|
299
|
+
|
|
300
|
+
# Manages the streaming queries of a session. Returned by {SparkSession#streams}.
|
|
301
|
+
# Mirrors PySpark's `StreamingQueryManager`.
|
|
302
|
+
class StreamingQueryManager
  Proto = SparkConnect::Proto
  MCmd = Proto::StreamingQueryManagerCommand

  # @param session [SparkSession] session whose client executes the commands.
  def initialize(session)
    @session = session
  end

  # @return [Array<StreamingQuery>] the currently active queries.
  def active
    instances = command(active: true).active.active_queries
    instances.map { |instance| wrap(instance) }
  end

  # Look up an active query by its id.
  #
  # @param id [#to_s]
  # @return [StreamingQuery, nil] nil unless the response's result type is `:query`.
  def get(id)
    response = command(get_query: id.to_s)
    response.result_type == :query ? wrap(response.query) : nil
  end

  # Block until any query terminates, or until `timeout_ms` elapses.
  #
  # @param timeout_ms [Integer, nil] only sent when given.
  # @return [Boolean]
  def await_any_termination(timeout_ms = nil)
    request = MCmd::AwaitAnyTerminationCommand.new
    request.timeout_ms = timeout_ms if timeout_ms
    command(await_any_termination: request).await_any_termination.terminated
  end

  # Forget the cached termination state of all queries (so a subsequent
  # {#await_any_termination} blocks again).
  #
  # @return [void]
  def reset_terminated
    command(reset_terminated: true)
    nil
  end

  private

  # Turn a query instance from a manager response into a StreamingQuery handle.
  def wrap(instance)
    StreamingQuery.new(@session, instance.id, instance.name)
  end

  # Send a `StreamingQueryManagerCommand` and return the
  # `streaming_manager_result` part of the response.
  def command(**kw)
    manager_command = MCmd.new(**kw)
    envelope = Proto::Command.new(streaming_query_manager_command: manager_command)
    @session.client.execute_command(envelope).streaming_manager_result
  end
end
|
|
351
|
+
end
|