spark-connect 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +82 -0
  3. data/LICENSE +202 -0
  4. data/NOTICE +16 -0
  5. data/README.md +166 -0
  6. data/lib/spark-connect.rb +5 -0
  7. data/lib/spark_connect/arrow.rb +115 -0
  8. data/lib/spark_connect/catalog.rb +190 -0
  9. data/lib/spark_connect/channel_builder.rb +134 -0
  10. data/lib/spark_connect/client.rb +264 -0
  11. data/lib/spark_connect/column.rb +379 -0
  12. data/lib/spark_connect/conf.rb +79 -0
  13. data/lib/spark_connect/data_frame.rb +828 -0
  14. data/lib/spark_connect/errors.rb +58 -0
  15. data/lib/spark_connect/functions.rb +903 -0
  16. data/lib/spark_connect/grouped_data.rb +101 -0
  17. data/lib/spark_connect/na_functions.rb +98 -0
  18. data/lib/spark_connect/observation.rb +61 -0
  19. data/lib/spark_connect/pipelines.rb +221 -0
  20. data/lib/spark_connect/plan.rb +39 -0
  21. data/lib/spark_connect/proto/spark/connect/base_pb.rb +118 -0
  22. data/lib/spark_connect/proto/spark/connect/base_services_pb.rb +82 -0
  23. data/lib/spark_connect/proto/spark/connect/catalog_pb.rb +46 -0
  24. data/lib/spark_connect/proto/spark/connect/commands_pb.rb +67 -0
  25. data/lib/spark_connect/proto/spark/connect/common_pb.rb +32 -0
  26. data/lib/spark_connect/proto/spark/connect/expressions_pb.rb +63 -0
  27. data/lib/spark_connect/proto/spark/connect/ml_common_pb.rb +22 -0
  28. data/lib/spark_connect/proto/spark/connect/ml_pb.rb +32 -0
  29. data/lib/spark_connect/proto/spark/connect/pipelines_pb.rb +45 -0
  30. data/lib/spark_connect/proto/spark/connect/relations_pb.rb +102 -0
  31. data/lib/spark_connect/proto/spark/connect/types_pb.rb +46 -0
  32. data/lib/spark_connect/proto.rb +32 -0
  33. data/lib/spark_connect/reader.rb +98 -0
  34. data/lib/spark_connect/row.rb +105 -0
  35. data/lib/spark_connect/session.rb +317 -0
  36. data/lib/spark_connect/stat_functions.rb +109 -0
  37. data/lib/spark_connect/streaming.rb +351 -0
  38. data/lib/spark_connect/types.rb +490 -0
  39. data/lib/spark_connect/version.rb +11 -0
  40. data/lib/spark_connect/window.rb +119 -0
  41. data/lib/spark_connect/writer.rb +208 -0
  42. data/lib/spark_connect.rb +58 -0
  43. data/proto/spark/connect/base.proto +1275 -0
  44. data/proto/spark/connect/catalog.proto +243 -0
  45. data/proto/spark/connect/commands.proto +553 -0
  46. data/proto/spark/connect/common.proto +179 -0
  47. data/proto/spark/connect/expressions.proto +557 -0
  48. data/proto/spark/connect/ml.proto +147 -0
  49. data/proto/spark/connect/ml_common.proto +64 -0
  50. data/proto/spark/connect/pipelines.proto +307 -0
  51. data/proto/spark/connect/relations.proto +1252 -0
  52. data/proto/spark/connect/types.proto +227 -0
  53. metadata +149 -0
@@ -0,0 +1,101 @@
1
+ # frozen_string_literal: true
2
+
3
module SparkConnect
  # The result of {DataFrame#group_by} / {DataFrame#rollup} / {DataFrame#cube}.
  # Call an aggregate ({#agg}, {#count}, {#sum}, {#avg}, {#max}, {#min}, ...) to
  # produce a new {DataFrame}, optionally after {#pivot}.
  #
  # @example
  #   df.group_by("dept").agg(F.avg("salary").alias("avg_salary"), F.count("*"))
  #   df.group_by("dept").pivot("year").sum("revenue")
  class GroupedData
    Proto = SparkConnect::Proto

    # @param df [DataFrame] the DataFrame whose relation is aggregated.
    # @param grouping [Array<Column>] grouping columns.
    # @param group_type [Symbol] a `GROUP_TYPE_*` enum symbol.
    # @param pivot_col [Column, nil] the pivot column (only read for pivot groups).
    # @param pivot_values [Array, nil] explicit pivot values, if any.
    def initialize(df, grouping, group_type, pivot_col: nil, pivot_values: nil)
      @df = df
      @grouping = grouping
      @group_type = group_type
      @pivot_col = pivot_col
      @pivot_values = pivot_values
    end

    # Compute aggregate expressions.
    #
    # @overload agg(*columns)
    #   @param columns [Array<Column>] aggregate columns, e.g. `F.sum("x")`.
    #     Nested arrays are flattened.
    # @overload agg(hash)
    #   @param hash [Hash{String=>String}] column-to-function map, e.g.
    #     `{"age" => "max", "salary" => "avg"}`. Keys and values are converted
    #     with `to_s`, so Symbols work too.
    # @return [DataFrame]
    def agg(*exprs)
      agg_exprs =
        if exprs.size == 1 && exprs.first.is_a?(Hash)
          exprs.first.map { |col, fn| Column.invoke(fn.to_s, Functions.col(col.to_s)).to_expr }
        else
          exprs.flatten.map { |c| Column.to_col(c).to_expr }
        end
      build(agg_exprs)
    end

    # Count rows per group. The result column is aliased `count`.
    # @return [DataFrame]
    def count
      build([Column.invoke("count", Column.lit(1)).alias("count").to_expr])
    end

    # Sum of each given column, per group.
    #
    # @param cols [Array<String, Symbol, Column>] columns to aggregate. When none
    #   are given the aggregate is built without any aggregate expression; the
    #   columns are not inferred from the schema.
    # @return [DataFrame]
    def sum(*cols) = numeric_agg("sum", cols)

    # Mean of each given column, per group.
    # @param cols [Array<String, Symbol, Column>]
    # @return [DataFrame]
    def avg(*cols) = numeric_agg("avg", cols)
    alias mean avg

    # Maximum of each given column, per group.
    # @param cols [Array<String, Symbol, Column>]
    # @return [DataFrame]
    def max(*cols) = numeric_agg("max", cols)

    # Minimum of each given column, per group.
    # @param cols [Array<String, Symbol, Column>]
    # @return [DataFrame]
    def min(*cols) = numeric_agg("min", cols)

    # Pivot a column into multiple output columns.
    #
    # @param pivot_col [String, Symbol, Column] a column name or a column.
    # @param values [Array, nil] optional explicit pivot values (faster, deterministic).
    # @return [GroupedData]
    def pivot(pivot_col, values = nil)
      # Names (String or Symbol) become column references, matching how the
      # aggregate helpers in this class treat names; anything else is handed to
      # Column.to_col unchanged.
      name_like = pivot_col.is_a?(String) || pivot_col.is_a?(Symbol)
      column = Column.to_col(name_like ? Functions.col(pivot_col.to_s) : pivot_col)
      GroupedData.new(@df, @grouping, :GROUP_TYPE_PIVOT, pivot_col: column, pivot_values: values)
    end

    private

    # Apply the aggregate function `fn` to every entry of `cols`.
    #
    # A Column is used as given; any other value is treated as a column name.
    # Stringifying a Column would instead reference a column named after the
    # object's `to_s` output.
    def numeric_agg(fn, cols)
      exprs = cols.flatten.map do |c|
        target = c.is_a?(Column) ? c : Functions.col(c.to_s)
        Column.invoke(fn, target).to_expr
      end
      build(exprs)
    end

    # Assemble the Aggregate relation and wrap it in a new DataFrame.
    def build(agg_exprs)
      agg = Proto::Aggregate.new(
        input: @df.relation,
        group_type: @group_type,
        grouping_expressions: @grouping.map(&:to_expr),
        aggregate_expressions: agg_exprs
      )
      if @group_type == :GROUP_TYPE_PIVOT
        pivot = Proto::Aggregate::Pivot.new(col: @pivot_col.to_expr)
        # Pivot values are sent as bare literals, not full expressions.
        pivot.values += @pivot_values.map { |v| Column.lit(v).to_expr.literal } if @pivot_values
        agg.pivot = pivot
      end
      @df.build(aggregate: agg)
    end
  end
end
@@ -0,0 +1,98 @@
1
+ # frozen_string_literal: true
2
+
3
module SparkConnect
  # Missing-data helpers, returned by {DataFrame#na}. Mirrors PySpark's
  # `DataFrame.na` (`DataFrameNaFunctions`).
  #
  # @example
  #   df.na.drop(how: :any)
  #   df.na.fill(0)
  #   df.na.fill({ "name" => "unknown", "age" => 0 })
  #   df.na.replace("UNKNOWN", nil, subset: ["name"])
  class DataFrameNaFunctions
    Proto = SparkConnect::Proto

    # @param df [DataFrame]
    def initialize(df)
      @df = df
    end

    # Drop rows containing null values.
    #
    # @param how [Symbol] `:any` (drop if any field is null) or `:all`.
    # @param thresh [Integer, nil] keep rows with at least this many non-null
    #   values (overrides `how` when given).
    # @param subset [Array<String>, nil] only consider these columns.
    # @return [DataFrame]
    def drop(how: :any, thresh: nil, subset: nil)
      cols = Array(subset).map(&:to_s)
      # `thresh` wins; `:all` needs one non-null value; otherwise every listed
      # column must be non-null. With no subset the field is left unset.
      min_non_nulls =
        if thresh
          thresh
        elsif how.to_sym == :all
          1
        elsif !cols.empty?
          cols.size
        end
      nd = Proto::NADrop.new(input: @df.relation, cols: cols)
      nd.min_non_nulls = min_non_nulls if min_non_nulls
      @df.build(drop_na: nd)
    end

    # Replace null values.
    #
    # @overload fill(value, subset: nil)
    #   @param value [Object] a scalar used to fill all (or `subset`) columns.
    # @overload fill(value_map)
    #   @param value_map [Hash{String=>Object}] per-column fill values
    #     (`subset` is not consulted in this form).
    # @return [DataFrame]
    def fill(value, subset: nil)
      if value.is_a?(Hash)
        cols = value.keys.map(&:to_s)
        values = value.values
      else
        cols = Array(subset).map(&:to_s)
        # One value per listed column, or a single value when no subset is given.
        values = cols.empty? ? [value] : Array.new(cols.size, value)
      end
      nf = Proto::NAFill.new(
        input: @df.relation, cols: cols, values: values.map { |v| na_literal(v) }
      )
      @df.build(fill_na: nf)
    end

    # Replace specific values with others.
    #
    # @param to_replace [Object, Array, Hash] value(s) to replace, or a
    #   `{old => new}` mapping.
    # @param value [Object, Array, nil] replacement value(s) when `to_replace`
    #   is not a Hash. A non-Array value is applied to every entry of
    #   `to_replace`; an Array is paired with `to_replace` position by position
    #   (missing positions are replaced with null).
    # @param subset [Array<String>, nil]
    # @return [DataFrame]
    def replace(to_replace, value = nil, subset: nil)
      mapping = to_replace.is_a?(Hash) ? to_replace : pair_replacements(to_replace, value)
      replacements = mapping.map do |old, new_value|
        Proto::NAReplace::Replacement.new(
          old_value: na_literal(old), new_value: na_literal(new_value)
        )
      end
      nr = Proto::NAReplace.new(
        input: @df.relation, cols: Array(subset).map(&:to_s), replacements: replacements
      )
      @df.build(replace: nr)
    end

    private

    # Build the `{old => new}` mapping for the non-Hash form of {#replace}.
    #
    # A scalar replacement is broadcast to every old value. Zipping against
    # `Array(value)` would pair only the first old value with it and map the
    # rest to nil.
    def pair_replacements(to_replace, value)
      olds = Array(to_replace)
      news = value.is_a?(Array) ? value : Array.new(olds.size, value)
      olds.zip(news).to_h
    end

    # Spark's fill/replace handlers only accept Long, Double, String, or Boolean
    # literal values (not 32-bit Int), so widen Ruby Integers to Long and
    # Floats to Double.
    def na_literal(value)
      case value
      when Integer then Proto::Expression::Literal.new(long: value)
      when Float then Proto::Expression::Literal.new(double: value)
      else Column.to_literal(value)
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
module SparkConnect
  # Collects named aggregate metrics that are computed while a {DataFrame} is
  # materialised, so no additional pass over the data is needed. Use it
  # together with {DataFrame#observe}.
  #
  # @example
  #   obs = SparkConnect::Observation.new("metrics")
  #   df.observe(obs, F.count(F.lit(1)).alias("rows"), F.max("id").alias("max_id")).collect
  #   obs.get #=> {"rows"=>100, "max_id"=>99}
  class Observation
    # Number of Observation instances created so far; feeds the generated names.
    @counter = 0

    class << self
      # @api private
      attr_accessor :counter
    end

    # @return [String] the observation name.
    attr_reader :name

    # @param name [String, nil] a unique name (auto-generated when omitted).
    def initialize(name = nil)
      # The counter advances for every instance, named or not.
      sequence = (Observation.counter += 1)
      @name = name || "observation_#{sequence}"
      @df = nil
    end

    # @api private - remember the observed DataFrame so {#get} can fetch metrics.
    # @return [Observation] self
    def bind(df)
      @df = df
      self
    end

    # The observed metric values. The first successful call executes the bound
    # DataFrame's plan; the result is cached for later calls.
    #
    # @return [Hash{String=>Object}]
    # @raise [IllegalArgumentError] when no DataFrame has been bound yet.
    def get
      raise IllegalArgumentError, "Observation has not been attached to a DataFrame yet" unless @df

      @metrics ||= fetch_metrics
    end

    private

    # Run the plan and turn this observation's metrics into a Hash. When no
    # metrics entry carries this observation's name, the first entry is used;
    # with no entries at all the result is an empty Hash.
    def fetch_metrics
      response = @df.session.client.execute_plan(@df.relation)
      entry = response.observed_metrics.find { |m| m.name == @name } || response.observed_metrics.first
      return {} unless entry

      names = entry.keys.to_a
      names.zip(entry.values.map { |lit| decode_literal(lit) }).to_h
    end

    # Read the value out of whichever field `literal_type` reports as set;
    # nil when it reports none.
    def decode_literal(literal)
      field = literal.literal_type
      return nil unless field

      literal.public_send(field)
    end
  end
end
@@ -0,0 +1,221 @@
1
+ # frozen_string_literal: true
2
+
3
module SparkConnect
  # A timestamped event emitted by the server during a pipeline run.
  PipelineEvent = Struct.new(:timestamp, :message)

  # A Spark Declarative Pipeline (SDP) dataflow graph.
  #
  # A pipeline is built by registering **outputs** (tables, materialized views,
  # temporary views, or sinks) and the **flows** that populate them, then
  # started with {#start_run}. Each flow is defined by a {DataFrame} (an
  # unresolved relation), so you compose flows with the same API you use for
  # ordinary queries.
  #
  # Create one with {SparkSession#pipeline}.
  #
  # @example
  #   pipe = spark.pipeline(storage: "/tmp/pipeline_storage")
  #   pipe.create_materialized_view("bronze", spark.read.json("/data/raw"))
  #   pipe.create_table("silver", pipe.read("bronze").filter(F.col("ok")))
  #   events = pipe.start_run
  #
  # @note `foreach`/`foreachBatch` flows and Python query-function evaluation are
  #   not supported (they require UDFs); define each flow with a relation instead.
  class Pipeline
    Proto = SparkConnect::Proto
    PC = Proto::PipelineCommand

    # Maps the Ruby-side output kind to the protocol's output type enum symbol.
    OUTPUT_TYPES = {
      table: :TABLE,
      materialized_view: :MATERIALIZED_VIEW,
      temporary_view: :TEMPORARY_VIEW,
      sink: :SINK,
    }.freeze

    # @return [String] the server-assigned dataflow graph id.
    attr_reader :graph_id

    # Creates the dataflow graph on the server and records its id.
    #
    # @param session [SparkSession]
    # @param default_catalog [String, nil]
    # @param default_database [String, nil]
    # @param sql_conf [Hash{String=>String}]
    def initialize(session, default_catalog: nil, default_database: nil, sql_conf: {})
      @session = session
      cmd = PC::CreateDataflowGraph.new(sql_conf: stringify(sql_conf))
      cmd.default_catalog = default_catalog if default_catalog
      cmd.default_database = default_database if default_database
      result = dispatch(PC.new(create_dataflow_graph: cmd))
      @graph_id = result.pipeline_command_result.create_dataflow_graph_result.dataflow_graph_id
    end

    # Reference a dataset defined in this pipeline as a {DataFrame} (so later
    # flows can read from earlier outputs).
    #
    # @param name [String]
    # @return [DataFrame]
    def read(name)
      @session.read.table(name)
    end

    # Define a published table and the flow that populates it.
    #
    # @param name [String]
    # @param df [DataFrame, nil] the query that populates the table (a flow);
    #   no flow is defined when omitted.
    # @return [String] the resolved output identifier.
    def create_table(name, df = nil, comment: nil, format: nil, partition_cols: [],
                     clustering_columns: [], table_properties: {}, schema: nil)
      define_table_output(name, :table, df, comment: comment, format: format,
                          partition_cols: partition_cols, clustering_columns: clustering_columns,
                          table_properties: table_properties, schema: schema)
    end

    # Define a materialized view and the flow that populates it.
    # @return [String]
    def create_materialized_view(name, df = nil, comment: nil, format: nil, partition_cols: [],
                                 clustering_columns: [], table_properties: {}, schema: nil)
      define_table_output(name, :materialized_view, df, comment: comment, format: format,
                          partition_cols: partition_cols, clustering_columns: clustering_columns,
                          table_properties: table_properties, schema: schema)
    end

    # Define a (non-published) temporary view and its flow.
    # @return [String]
    def create_temporary_view(name, df = nil, comment: nil)
      define_table_output(name, :temporary_view, df, comment: comment)
    end

    # Define a streaming sink.
    #
    # @param name [String]
    # @param df [DataFrame] the flow feeding the sink.
    # @param format [String, nil]
    # @param options [Hash{String=>String}]
    # @return [String] `name`, as given.
    def create_sink(name, df, format: nil, options: {})
      sink = PC::DefineOutput::SinkDetails.new(options: stringify(options))
      sink.format = format if format
      define_output(name, :sink, sink_details: sink)
      define_flow(name, df, target: name)
      name
    end

    # Define a flow that writes the contents of `df` into `target`.
    #
    # @param name [String] the flow name.
    # @param df [DataFrame]
    # @param target [String] the dataset the flow writes to (defaults to `name`).
    # @param once [Boolean] define as a one-time (batch) flow.
    # @param sql_conf [Hash{String=>String}]
    # @return [String] the resolved flow name, or `name` when the server
    #   returns no usable identifier.
    def define_flow(name, df, target: nil, once: false, sql_conf: {})
      flow = PC::DefineFlow.new(
        dataflow_graph_id: @graph_id, flow_name: name.to_s, target_dataset_name: (target || name).to_s,
        sql_conf: stringify(sql_conf),
        relation_flow_details: PC::DefineFlow::WriteRelationFlowDetails.new(relation: df.relation)
      )
      # `once` is optional: only set it when true, since the server rejects the
      # option being present at all for non-one-time flows (e.g. MV flows).
      flow.once = true if once
      result = dispatch(PC.new(define_flow: flow))
      identifier_string(result.pipeline_command_result&.define_flow_result&.resolved_identifier) || name.to_s
    end

    # Register datasets and flows from a SQL definition file.
    #
    # @param sql_text [String] the SQL source.
    # @param sql_file_path [String, nil]
    # @return [void]
    def define_sql(sql_text, sql_file_path: nil)
      el = PC::DefineSqlGraphElements.new(dataflow_graph_id: @graph_id, sql_text: sql_text.to_s)
      el.sql_file_path = sql_file_path if sql_file_path
      dispatch(PC.new(define_sql_graph_elements: el))
      nil
    end

    # Resolve the graph and run a pipeline update. Blocks until the update
    # completes, returning the events emitted during the run.
    #
    # @param full_refresh [Array<String>] datasets to reset and recompute.
    # @param full_refresh_all [Boolean] reset and recompute everything.
    # @param refresh [Array<String>] datasets to update.
    # @param dry [Boolean] validate the graph without executing flows.
    # @param storage [String, nil] checkpoint/metadata storage location.
    # @yieldparam event [PipelineEvent] each event, yielded in order once the
    #   command has returned and all events have been gathered.
    # @return [Array<PipelineEvent>]
    def start_run(full_refresh: [], full_refresh_all: false, refresh: [], dry: false, storage: nil, &block)
      run = PC::StartRun.new(
        dataflow_graph_id: @graph_id,
        full_refresh_selection: Array(full_refresh).map(&:to_s),
        full_refresh_all: full_refresh_all,
        refresh_selection: Array(refresh).map(&:to_s),
        dry: dry
      )
      run.storage = storage if storage
      result = dispatch(PC.new(start_run: run))
      events = result.pipeline_events.map { |e| PipelineEvent.new(e.timestamp, e.message) }
      events.each(&block) if block
      events
    end

    # Drop this dataflow graph and stop any attached flows.
    # @return [void]
    def drop
      dispatch(PC.new(drop_dataflow_graph: PC::DropDataflowGraph.new(dataflow_graph_id: @graph_id)))
      nil
    end

    private

    # Shared implementation of the table-like outputs: defines the output and,
    # when `df` is given, a flow of the same name targeting it.
    def define_table_output(name, type, df, comment: nil, format: nil, partition_cols: [],
                            clustering_columns: [], table_properties: {}, schema: nil)
      details = nil
      # Temporary views only carry table details when a format or schema is given.
      if type != :temporary_view || format || schema
        details = PC::DefineOutput::TableDetails.new(
          table_properties: stringify(table_properties),
          partition_cols: Array(partition_cols).map(&:to_s),
          clustering_columns: Array(clustering_columns).map(&:to_s)
        )
        details.format = format if format
        apply_schema(details, schema) if schema
      end
      resolved = define_output(name, type, table_details: details, comment: comment)
      define_flow(name, df, target: name) if df
      resolved
    end

    # Send a DefineOutput command and return the resolved identifier, falling
    # back to `name` when the server returns no usable identifier.
    def define_output(name, type, table_details: nil, sink_details: nil, comment: nil)
      output = PC::DefineOutput.new(
        dataflow_graph_id: @graph_id, output_name: name.to_s, output_type: OUTPUT_TYPES.fetch(type)
      )
      output.comment = comment if comment
      output.table_details = table_details if table_details
      output.sink_details = sink_details if sink_details
      result = dispatch(PC.new(define_output: output))
      identifier_string(result.pipeline_command_result&.define_output_result&.resolved_identifier) || name.to_s
    end

    # A {Types::DataType} is sent as a structured schema; anything else as a
    # schema string.
    def apply_schema(details, schema)
      if schema.is_a?(Types::DataType)
        details.schema_data_type = schema.to_proto
      else
        details.schema_string = schema.to_s
      end
    end

    # Wrap a pipeline command in a Command and execute it on the session's client.
    def dispatch(pipeline_command)
      @session.client.execute_command(Proto::Command.new(pipeline_command: pipeline_command))
    end

    # Join the non-empty parts of a resolved identifier with dots.
    #
    # Returns nil both when there is no identifier and when every part is
    # empty. An empty String is truthy, so returning "" would defeat the
    # callers' `|| name.to_s` fallback.
    def identifier_string(resolved)
      return nil unless resolved

      parts = [resolved.catalog_name, *resolved.namespace, resolved.table_name].reject { |p| p.nil? || p.empty? }
      parts.join(".") unless parts.empty?
    end

    # Convert every key and value to a String.
    def stringify(hash)
      hash.to_h { |k, v| [k.to_s, v.to_s] }
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module SparkConnect
  # Small helpers that assemble the protobuf logical plan sent to the server.
  # {DataFrame} and {SparkSession} create relations through them so that each
  # relation is tagged with its own `plan_id` (used by the server to resolve
  # columns to a specific subtree, e.g. for self-joins).
  module PlanBuilder
    Proto = SparkConnect::Proto

    module_function

    # Build a {Spark::Connect::Relation} from a `rel_type` oneof keyword and
    # tag it with a new `plan_id` obtained from `id_source`.
    #
    # @param id_source [#next_plan_id] usually a {SparkSession}.
    # @param rel [Hash] exactly one `rel_type` keyword, e.g. `project:`.
    # @return [Spark::Connect::Relation]
    def relation(id_source, **rel)
      common = Proto::RelationCommon.new(plan_id: id_source.next_plan_id)
      Proto::Relation.new(common: common, **rel)
    end

    # Make `relation` the root of an executable {Spark::Connect::Plan}.
    #
    # @param relation [Spark::Connect::Relation]
    # @return [Spark::Connect::Plan]
    def root_plan(relation) = Proto::Plan.new(root: relation)

    # Make `command` the payload of an executable {Spark::Connect::Plan}.
    #
    # @param command [Spark::Connect::Command]
    # @return [Spark::Connect::Plan]
    def command_plan(command) = Proto::Plan.new(command: command)
  end
end