spark-connect 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +82 -0
  3. data/LICENSE +202 -0
  4. data/NOTICE +16 -0
  5. data/README.md +166 -0
  6. data/lib/spark-connect.rb +5 -0
  7. data/lib/spark_connect/arrow.rb +115 -0
  8. data/lib/spark_connect/catalog.rb +190 -0
  9. data/lib/spark_connect/channel_builder.rb +134 -0
  10. data/lib/spark_connect/client.rb +264 -0
  11. data/lib/spark_connect/column.rb +379 -0
  12. data/lib/spark_connect/conf.rb +79 -0
  13. data/lib/spark_connect/data_frame.rb +828 -0
  14. data/lib/spark_connect/errors.rb +58 -0
  15. data/lib/spark_connect/functions.rb +903 -0
  16. data/lib/spark_connect/grouped_data.rb +101 -0
  17. data/lib/spark_connect/na_functions.rb +98 -0
  18. data/lib/spark_connect/observation.rb +61 -0
  19. data/lib/spark_connect/pipelines.rb +221 -0
  20. data/lib/spark_connect/plan.rb +39 -0
  21. data/lib/spark_connect/proto/spark/connect/base_pb.rb +118 -0
  22. data/lib/spark_connect/proto/spark/connect/base_services_pb.rb +82 -0
  23. data/lib/spark_connect/proto/spark/connect/catalog_pb.rb +46 -0
  24. data/lib/spark_connect/proto/spark/connect/commands_pb.rb +67 -0
  25. data/lib/spark_connect/proto/spark/connect/common_pb.rb +32 -0
  26. data/lib/spark_connect/proto/spark/connect/expressions_pb.rb +63 -0
  27. data/lib/spark_connect/proto/spark/connect/ml_common_pb.rb +22 -0
  28. data/lib/spark_connect/proto/spark/connect/ml_pb.rb +32 -0
  29. data/lib/spark_connect/proto/spark/connect/pipelines_pb.rb +45 -0
  30. data/lib/spark_connect/proto/spark/connect/relations_pb.rb +102 -0
  31. data/lib/spark_connect/proto/spark/connect/types_pb.rb +46 -0
  32. data/lib/spark_connect/proto.rb +32 -0
  33. data/lib/spark_connect/reader.rb +98 -0
  34. data/lib/spark_connect/row.rb +105 -0
  35. data/lib/spark_connect/session.rb +317 -0
  36. data/lib/spark_connect/stat_functions.rb +109 -0
  37. data/lib/spark_connect/streaming.rb +351 -0
  38. data/lib/spark_connect/types.rb +490 -0
  39. data/lib/spark_connect/version.rb +11 -0
  40. data/lib/spark_connect/window.rb +119 -0
  41. data/lib/spark_connect/writer.rb +208 -0
  42. data/lib/spark_connect.rb +58 -0
  43. data/proto/spark/connect/base.proto +1275 -0
  44. data/proto/spark/connect/catalog.proto +243 -0
  45. data/proto/spark/connect/commands.proto +553 -0
  46. data/proto/spark/connect/common.proto +179 -0
  47. data/proto/spark/connect/expressions.proto +557 -0
  48. data/proto/spark/connect/ml.proto +147 -0
  49. data/proto/spark/connect/ml_common.proto +64 -0
  50. data/proto/spark/connect/pipelines.proto +307 -0
  51. data/proto/spark/connect/relations.proto +1252 -0
  52. data/proto/spark/connect/types.proto +227 -0
  53. metadata +149 -0
@@ -0,0 +1,351 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module SparkConnect
6
# Builder that collects a source format, an optional schema and source options,
# then produces a streaming {DataFrame}. Returned by
# {SparkSession#read_stream}. Mirrors PySpark's `DataStreamReader`.
#
# @example
#   df = spark.read_stream.format("rate").option("rowsPerSecond", 5).load
class DataStreamReader
  Proto = SparkConnect::Proto

  # @param session [SparkSession] session the resulting DataFrame is bound to.
  def initialize(session)
    @session = session
    @options = {}
    @format = nil
    @schema = nil
  end

  # Choose the streaming source format (`"rate"`, `"kafka"`, ...).
  #
  # @param source [#to_s]
  # @return [self]
  def format(source)
    tap { @format = source.to_s }
  end

  # Declare the input schema. A {Types::StructType} is stored as its
  # `simple_string`; anything else is stored as `to_s` (e.g. a DDL string).
  #
  # @param schema [Types::StructType, #to_s]
  # @return [self]
  def schema(schema)
    @schema =
      case schema
      when Types::StructType then schema.simple_string
      else schema.to_s
      end
    self
  end

  # Set one source option. Key and value are both stored as strings.
  #
  # @param key [#to_s]
  # @param value [#to_s]
  # @return [self]
  def option(key, value)
    @options.store(key.to_s, value.to_s)
    self
  end

  # Set several source options at once; later keys overwrite earlier ones.
  #
  # @param opts [Hash, #each] key/value pairs.
  # @return [self]
  def options(opts)
    opts.each { |key, value| option(key, value) }
    self
  end

  # Build a streaming DataFrame from the configured source.
  #
  # @param path [#to_s, nil] optional single input path.
  # @return [DataFrame]
  def load(path = nil)
    paths = path ? [path.to_s] : []
    source = Proto::Read::DataSource.new(options: @options, paths: paths)
    # Format and schema are only assigned when the caller configured them.
    source.format = @format if @format
    source.schema = @schema if @schema
    stream_relation(data_source: source)
  end

  # Build a streaming DataFrame that reads from a registered table. The
  # options configured on this reader are sent along with the table name.
  #
  # @param name [#to_s]
  # @return [DataFrame]
  def table(name)
    named = Proto::Read::NamedTable.new(unparsed_identifier: name.to_s, options: @options)
    stream_relation(named_table: named)
  end

  # Shorthand for `format("csv").load(path)`.
  # @return [DataFrame]
  def csv(path)
    format("csv").load(path)
  end

  # Shorthand for `format("json").load(path)`.
  # @return [DataFrame]
  def json(path)
    format("json").load(path)
  end

  # Shorthand for `format("parquet").load(path)`.
  # @return [DataFrame]
  def parquet(path)
    format("parquet").load(path)
  end

  # Shorthand for `format("orc").load(path)`.
  # @return [DataFrame]
  def orc(path)
    format("orc").load(path)
  end

  # Shorthand for `format("text").load(path)`.
  # @return [DataFrame]
  def text(path)
    format("text").load(path)
  end

  private

  # Wrap the given read source in a `Read` relation flagged as streaming and
  # return it as a DataFrame on this reader's session.
  def stream_relation(**read_kw)
    streaming_read = Proto::Read.new(is_streaming: true, **read_kw)
    plan = PlanBuilder.relation(@session, read: streaming_read)
    DataFrame.new(@session, plan)
  end
end
77
+
78
# Configures how a streaming {DataFrame} is written to a sink and starts the
# query. Returned by {DataFrame#write_stream}. Mirrors PySpark's
# `DataStreamWriter`.
#
# `foreach`/`foreach_batch` are intentionally unsupported: they require
# user-defined functions, whose Spark Connect protobuf definitions are not yet
# finalized.
#
# @example
#   query = df.write_stream
#             .format("console")
#             .output_mode("append")
#             .trigger(processing_time: "1 second")
#             .start
#   query.stop
class DataStreamWriter
  Proto = SparkConnect::Proto
  WSO = Proto::WriteStreamOperationStart

  # @param df [DataFrame] the streaming DataFrame to write.
  def initialize(df)
    @df = df
    @format = nil
    @options = {}
    @partitioning = []
    @output_mode = nil
    @query_name = nil
    @trigger = nil
    @path = nil
    @table = nil
  end

  # @param source [#to_s] sink format (`"console"`, `"memory"`, `"kafka"`, ...).
  # @return [self]
  def format(source)
    @format = source.to_s
    self
  end

  # @param mode [#to_s] output mode (`"append"`, `"complete"`, `"update"`).
  # @return [self]
  def output_mode(mode)
    @output_mode = mode.to_s
    self
  end

  # Set a single sink option. Key and value are both stored as strings.
  #
  # @return [self]
  def option(key, value)
    @options[key.to_s] = value.to_s
    self
  end

  # Set multiple sink options; later keys overwrite earlier ones.
  #
  # @param opts [Hash, #each] key/value pairs.
  # @return [self]
  def options(opts)
    opts.each { |k, v| @options[k.to_s] = v.to_s }
    self
  end

  # Partition the output by these columns. Replaces any previous partitioning.
  #
  # @param cols [Array<#to_s>] column names (nested arrays are flattened).
  # @return [self]
  def partition_by(*cols)
    @partitioning = cols.flatten.map(&:to_s)
    self
  end

  # @param name [#to_s] name of the streaming query (required by the memory sink).
  # @return [self]
  def query_name(name)
    @query_name = name.to_s
    self
  end

  # Configure the query trigger. Provide at most one of the keywords; calling
  # with none (or only falsy values) clears a previously configured trigger.
  #
  # @param processing_time [String, nil] e.g. `"10 seconds"` (micro-batch interval).
  # @param once [Boolean, nil] process all available data once and stop.
  # @param available_now [Boolean, nil] process all available data in (possibly) multiple batches, then stop.
  # @param continuous [String, nil] continuous-processing checkpoint interval.
  # @return [self]
  # @raise [ArgumentError] if more than one trigger is given.
  def trigger(processing_time: nil, once: nil, available_now: nil, continuous: nil)
    # Keyed by the protobuf field that `run` assigns through its setter.
    requested = {
      processing_time_interval: processing_time,
      once: once,
      available_now: available_now,
      continuous_checkpoint_interval: continuous,
    }.select { |_field, value| value }
    if requested.size > 1
      raise ArgumentError,
            "trigger accepts only one of processing_time:, once:, available_now:, continuous:"
    end

    field, value = requested.first
    @trigger =
      case field
      when nil then nil
      when :once, :available_now then [field, true]
      else [field, value.to_s]
      end
    self
  end

  # Start the streaming query, optionally writing to a file/data path.
  #
  # @param path [#to_s, nil] output path; when given it replaces any table
  #   destination chosen by an earlier {#to_table} call on this writer.
  # @return [StreamingQuery]
  def start(path = nil)
    if path
      @path = path.to_s
      # `run` sends a single destination, so the most recent choice must win.
      @table = nil
    end
    run
  end

  # Start the streaming query, writing into the given table.
  #
  # @param name [#to_s] table name; replaces any path destination chosen by
  #   an earlier {#start} call on this writer.
  # @return [StreamingQuery]
  def to_table(name)
    @table = name.to_s
    # Without this, a stale path would take precedence over the table in `run`.
    @path = nil
    run
  end
  alias toTable to_table

  private

  # Assemble the `WriteStreamOperationStart` command from the configured
  # state, execute it and wrap the returned handle in a {StreamingQuery}.
  def run
    op = WSO.new(
      input: @df.relation, format: @format || "", options: @options,
      partitioning_column_names: @partitioning
    )
    op.output_mode = @output_mode if @output_mode
    op.query_name = @query_name if @query_name
    # @trigger is a [field, value] pair; assign it through the field's setter.
    op.public_send("#{@trigger[0]}=", @trigger[1]) if @trigger
    if @path then op.path = @path
    elsif @table then op.table_name = @table
    end
    result = @df.session.client.execute_command(Proto::Command.new(write_stream_operation_start: op))
    wsr = result.write_stream_result
    raise SparkConnectError, "Server did not return a streaming query handle" unless wsr

    StreamingQuery.new(@df.session, wsr.query_id, wsr.name)
  end
end
201
+
202
# A handle to a running streaming query. Returned by {DataStreamWriter#start}.
# Mirrors PySpark's `StreamingQuery`. Every query method sends a
# `StreamingQueryCommand` through the session's client.
class StreamingQuery
  Proto = SparkConnect::Proto
  Cmd = Proto::StreamingQueryCommand

  # @return [String] the stable query id (survives restarts from a checkpoint).
  attr_reader :id
  # @return [String] the run id (unique per start).
  attr_reader :run_id
  # @return [String, nil] the query name, if one was set.
  attr_reader :name

  # @param session [SparkSession]
  # @param instance_id [Spark::Connect::StreamingQueryInstanceId]
  # @param name [String, nil] a nil or empty name is normalized to nil.
  def initialize(session, instance_id, name)
    @session = session
    @instance_id = instance_id
    @id = instance_id.id
    @run_id = instance_id.run_id
    @name = name.nil? || name.empty? ? nil : name
  end

  # @return [Hash{String => Object}] the current status, keyed by
  #   `"message"`, `"isDataAvailable"`, `"isTriggerActive"` and `"isActive"`.
  def status
    s = command(status: true).status
    {
      "message" => s.status_message,
      "isDataAvailable" => s.is_data_available,
      "isTriggerActive" => s.is_trigger_active,
      "isActive" => s.is_active,
    }
  end

  # @return [Boolean] whether the query is still running.
  def active?
    status["isActive"]
  end

  # @return [Array<Hash>] parsed JSON progress objects for recent micro-batches.
  def recent_progress
    command(recent_progress: true).recent_progress.recent_progress_json.map { |j| JSON.parse(j) }
  end

  # @return [Hash, nil] the most recent progress object, if any.
  def last_progress
    # Only the final entry is returned, so only that entry is parsed.
    latest = command(last_progress: true).recent_progress.recent_progress_json.last
    latest && JSON.parse(latest)
  end

  # Block until the query terminates, or until `timeout_ms` elapses.
  #
  # @param timeout_ms [Integer, nil] when nil, no timeout is sent.
  # @return [Boolean] whether the query has terminated.
  def await_termination(timeout_ms = nil)
    ac = Cmd::AwaitTerminationCommand.new
    ac.timeout_ms = timeout_ms if timeout_ms
    command(await_termination: ac).await_termination.terminated
  end

  # Process all available data, then return (useful for tests with bounded sources).
  # @return [void]
  def process_all_available
    command(process_all_available: true)
    nil
  end

  # Stop the query.
  # @return [void]
  def stop
    command(stop: true)
    nil
  end

  # @return [String, nil] the query's exception message, if it has failed;
  #   nil when the result has no exception payload or the message is empty.
  def exception
    message = command(exception: true).exception&.exception_message
    message unless message.nil? || message.empty?
  end

  # @param extended [Boolean] forwarded to the server's explain command.
  # @return [String] the query's execution plan.
  def explain(extended: false)
    command(explain: Cmd::ExplainCommand.new(extended: extended)).explain.result
  end

  # @return [String] short description with the query id and name.
  def to_s
    "#<SparkConnect::StreamingQuery id=#{@id} name=#{@name.inspect}>"
  end
  alias inspect to_s

  private

  # Send a `StreamingQueryCommand` addressed to this query and return the
  # `streaming_query_result` part of the response.
  def command(**kw)
    cmd = Cmd.new(query_id: @instance_id, **kw)
    @session.client.execute_command(Proto::Command.new(streaming_query_command: cmd)).streaming_query_result
  end
end
299
+
300
# Entry point for inspecting and awaiting the streaming queries of a session.
# Returned by {SparkSession#streams}. Mirrors PySpark's
# `StreamingQueryManager`.
class StreamingQueryManager
  Proto = SparkConnect::Proto
  MCmd = Proto::StreamingQueryManagerCommand

  # @param session [SparkSession]
  def initialize(session)
    @session = session
  end

  # @return [Array<StreamingQuery>] handles for the currently active queries.
  def active
    instances = command(active: true).active.active_queries
    instances.map { |instance| build_query(instance) }
  end

  # Look up an active query by its id.
  #
  # @param id [#to_s]
  # @return [StreamingQuery, nil] nil when the response carries no query.
  def get(id)
    reply = command(get_query: id.to_s)
    reply.result_type == :query ? build_query(reply.query) : nil
  end

  # Block until any query terminates, or until `timeout_ms` elapses.
  #
  # @param timeout_ms [Integer, nil] when nil, no timeout is sent.
  # @return [Boolean]
  def await_any_termination(timeout_ms = nil)
    wait = MCmd::AwaitAnyTerminationCommand.new.tap do |request|
      request.timeout_ms = timeout_ms if timeout_ms
    end
    command(await_any_termination: wait).await_any_termination.terminated
  end

  # Forget the cached termination state of all queries (so a subsequent
  # {#await_any_termination} blocks again).
  #
  # @return [void]
  def reset_terminated
    command(reset_terminated: true)
    nil
  end

  private

  # Wrap a query instance from a manager response in a {StreamingQuery}
  # bound to this manager's session.
  def build_query(instance)
    StreamingQuery.new(@session, instance.id, instance.name)
  end

  # Send a `StreamingQueryManagerCommand` and return the
  # `streaming_manager_result` part of the response.
  def command(**kw)
    manager_cmd = MCmd.new(**kw)
    client = @session.client
    envelope = Proto::Command.new(streaming_query_manager_command: manager_cmd)
    client.execute_command(envelope).streaming_manager_result
  end
end
351
+ end