spark-connect 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +82 -0
- data/LICENSE +202 -0
- data/NOTICE +16 -0
- data/README.md +166 -0
- data/lib/spark-connect.rb +5 -0
- data/lib/spark_connect/arrow.rb +115 -0
- data/lib/spark_connect/catalog.rb +190 -0
- data/lib/spark_connect/channel_builder.rb +134 -0
- data/lib/spark_connect/client.rb +264 -0
- data/lib/spark_connect/column.rb +379 -0
- data/lib/spark_connect/conf.rb +79 -0
- data/lib/spark_connect/data_frame.rb +828 -0
- data/lib/spark_connect/errors.rb +58 -0
- data/lib/spark_connect/functions.rb +903 -0
- data/lib/spark_connect/grouped_data.rb +101 -0
- data/lib/spark_connect/na_functions.rb +98 -0
- data/lib/spark_connect/observation.rb +61 -0
- data/lib/spark_connect/pipelines.rb +221 -0
- data/lib/spark_connect/plan.rb +39 -0
- data/lib/spark_connect/proto/spark/connect/base_pb.rb +118 -0
- data/lib/spark_connect/proto/spark/connect/base_services_pb.rb +82 -0
- data/lib/spark_connect/proto/spark/connect/catalog_pb.rb +46 -0
- data/lib/spark_connect/proto/spark/connect/commands_pb.rb +67 -0
- data/lib/spark_connect/proto/spark/connect/common_pb.rb +32 -0
- data/lib/spark_connect/proto/spark/connect/expressions_pb.rb +63 -0
- data/lib/spark_connect/proto/spark/connect/ml_common_pb.rb +22 -0
- data/lib/spark_connect/proto/spark/connect/ml_pb.rb +32 -0
- data/lib/spark_connect/proto/spark/connect/pipelines_pb.rb +45 -0
- data/lib/spark_connect/proto/spark/connect/relations_pb.rb +102 -0
- data/lib/spark_connect/proto/spark/connect/types_pb.rb +46 -0
- data/lib/spark_connect/proto.rb +32 -0
- data/lib/spark_connect/reader.rb +98 -0
- data/lib/spark_connect/row.rb +105 -0
- data/lib/spark_connect/session.rb +317 -0
- data/lib/spark_connect/stat_functions.rb +109 -0
- data/lib/spark_connect/streaming.rb +351 -0
- data/lib/spark_connect/types.rb +490 -0
- data/lib/spark_connect/version.rb +11 -0
- data/lib/spark_connect/window.rb +119 -0
- data/lib/spark_connect/writer.rb +208 -0
- data/lib/spark_connect.rb +58 -0
- data/proto/spark/connect/base.proto +1275 -0
- data/proto/spark/connect/catalog.proto +243 -0
- data/proto/spark/connect/commands.proto +553 -0
- data/proto/spark/connect/common.proto +179 -0
- data/proto/spark/connect/expressions.proto +557 -0
- data/proto/spark/connect/ml.proto +147 -0
- data/proto/spark/connect/ml_common.proto +64 -0
- data/proto/spark/connect/pipelines.proto +307 -0
- data/proto/spark/connect/relations.proto +1252 -0
- data/proto/spark/connect/types.proto +227 -0
- metadata +149 -0
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
3
|
+
* contributor license agreements. See the NOTICE file distributed with
|
|
4
|
+
* this work for additional information regarding copyright ownership.
|
|
5
|
+
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
6
|
+
* (the "License"); you may not use this file except in compliance with
|
|
7
|
+
* the License. You may obtain a copy of the License at
|
|
8
|
+
*
|
|
9
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
*
|
|
11
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
* See the License for the specific language governing permissions and
|
|
15
|
+
* limitations under the License.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
syntax = 'proto3';
|
|
19
|
+
|
|
20
|
+
package spark.connect;
|
|
21
|
+
|
|
22
|
+
import "spark/connect/expressions.proto";
|
|
23
|
+
|
|
24
|
+
option java_multiple_files = true;
|
|
25
|
+
option java_package = "org.apache.spark.connect.proto";
|
|
26
|
+
option go_package = "internal/generated";
|
|
27
|
+
|
|
28
|
+
// MlParams stores the param settings for an ML Estimator, Transformer, or Evaluator.
|
|
29
|
+
message MlParams {
|
|
30
|
+
// User-supplied params, as a map from a string key to a literal value.
|
|
31
|
+
map<string, Expression.Literal> params = 1;
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
// MlOperator represents an ML operator: an Estimator, Transformer, Evaluator, or Model.
|
|
35
|
+
message MlOperator {
|
|
36
|
+
// (Required) The qualified name of the ML operator.
|
|
37
|
+
string name = 1;
|
|
38
|
+
|
|
39
|
+
// (Required) Unique id of the ML operator.
|
|
40
|
+
string uid = 2;
|
|
41
|
+
|
|
42
|
+
// (Required) The kind of ML operator this is; see OperatorType.
|
|
43
|
+
OperatorType type = 3;
|
|
44
|
+
|
|
45
|
+
enum OperatorType {
|
|
46
|
+
OPERATOR_TYPE_UNSPECIFIED = 0; // Zero (default) value: no operator type specified.
|
|
47
|
+
// ML estimator.
|
|
48
|
+
OPERATOR_TYPE_ESTIMATOR = 1;
|
|
49
|
+
// ML transformer (non-model).
|
|
50
|
+
OPERATOR_TYPE_TRANSFORMER = 2;
|
|
51
|
+
// ML evaluator.
|
|
52
|
+
OPERATOR_TYPE_EVALUATOR = 3;
|
|
53
|
+
// ML model.
|
|
54
|
+
OPERATOR_TYPE_MODEL = 4;
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
// Represents a reference to a cached object, which could be a model
|
|
59
|
+
// or a summary evaluated by a model.
|
|
60
|
+
message ObjectRef {
|
|
61
|
+
// (Required) The ID used to look up the object on the server side.
|
|
62
|
+
// Note that it is different from the 'uid' of an ML object.
|
|
63
|
+
string id = 1;
|
|
64
|
+
}
|
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
3
|
+
* contributor license agreements. See the NOTICE file distributed with
|
|
4
|
+
* this work for additional information regarding copyright ownership.
|
|
5
|
+
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
6
|
+
* (the "License"); you may not use this file except in compliance with
|
|
7
|
+
* the License. You may obtain a copy of the License at
|
|
8
|
+
*
|
|
9
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
*
|
|
11
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
* See the License for the specific language governing permissions and
|
|
15
|
+
* limitations under the License.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
syntax = "proto3";
|
|
19
|
+
|
|
20
|
+
package spark.connect;
|
|
21
|
+
|
|
22
|
+
import "google/protobuf/any.proto";
|
|
23
|
+
import "google/protobuf/timestamp.proto";
|
|
24
|
+
import "spark/connect/common.proto";
|
|
25
|
+
import "spark/connect/relations.proto";
|
|
26
|
+
import "spark/connect/types.proto";
|
|
27
|
+
|
|
28
|
+
option java_multiple_files = true;
|
|
29
|
+
option java_package = "org.apache.spark.connect.proto";
|
|
30
|
+
option go_package = "internal/generated";
|
|
31
|
+
|
|
32
|
+
// Dispatch object for pipeline commands. See each individual command for documentation.
|
|
33
|
+
message PipelineCommand {
|
|
34
|
+
oneof command_type {
|
|
35
|
+
CreateDataflowGraph create_dataflow_graph = 1;
|
|
36
|
+
DefineOutput define_output = 2;
|
|
37
|
+
DefineFlow define_flow = 3;
|
|
38
|
+
DropDataflowGraph drop_dataflow_graph = 4;
|
|
39
|
+
StartRun start_run = 5;
|
|
40
|
+
DefineSqlGraphElements define_sql_graph_elements = 6;
|
|
41
|
+
GetQueryFunctionExecutionSignalStream get_query_function_execution_signal_stream = 7;
|
|
42
|
+
DefineFlowQueryFunctionResult define_flow_query_function_result = 8;
|
|
43
|
+
// Reserved field for protocol extensions.
|
|
44
|
+
// Used to support forward-compatibility by carrying additional command types
|
|
45
|
+
// that are not yet defined in this version of the proto. During planning, the
|
|
46
|
+
// engine will resolve and dispatch the concrete command contained in this field.
|
|
47
|
+
google.protobuf.Any extension = 999;
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
// Request to create a new dataflow graph.
|
|
51
|
+
message CreateDataflowGraph {
|
|
52
|
+
// The default catalog.
|
|
53
|
+
optional string default_catalog = 1;
|
|
54
|
+
|
|
55
|
+
// The default database.
|
|
56
|
+
optional string default_database = 2;
|
|
57
|
+
|
|
58
|
+
// SQL configurations for all flows in this graph. NOTE(review): field numbers 3 and 4 are unused in this message; confirm whether they should be marked reserved.
|
|
59
|
+
map<string, string> sql_conf = 5;
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
// Drops the graph and stops any running attached flows.
|
|
63
|
+
message DropDataflowGraph {
|
|
64
|
+
// The graph to drop.
|
|
65
|
+
optional string dataflow_graph_id = 1;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
// Request to define an output: a table, a materialized view, a temporary view, or a sink.
|
|
69
|
+
message DefineOutput {
|
|
70
|
+
// The graph to attach this output to.
|
|
71
|
+
optional string dataflow_graph_id = 1;
|
|
72
|
+
|
|
73
|
+
// Name of the output. Can be partially or fully qualified.
|
|
74
|
+
optional string output_name = 2;
|
|
75
|
+
|
|
76
|
+
// The type of the output.
|
|
77
|
+
optional OutputType output_type = 3;
|
|
78
|
+
|
|
79
|
+
// Optional comment for the output.
|
|
80
|
+
optional string comment = 4;
|
|
81
|
+
|
|
82
|
+
// The location in source code where this output was defined.
|
|
83
|
+
optional SourceCodeLocation source_code_location = 5;
|
|
84
|
+
|
|
85
|
+
oneof details {
|
|
86
|
+
TableDetails table_details = 6;
|
|
87
|
+
SinkDetails sink_details = 7;
|
|
88
|
+
google.protobuf.Any extension = 999;
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
// Metadata that's only applicable to tables and materialized views.
|
|
92
|
+
message TableDetails {
|
|
93
|
+
// Optional table properties.
|
|
94
|
+
map<string, string> table_properties = 1;
|
|
95
|
+
|
|
96
|
+
// Optional partition columns for the table.
|
|
97
|
+
repeated string partition_cols = 2;
|
|
98
|
+
|
|
99
|
+
// The output table format for the table.
|
|
100
|
+
optional string format = 3;
|
|
101
|
+
|
|
102
|
+
// Schema for the table. If unset, this will be inferred from incoming flows.
|
|
103
|
+
oneof schema {
|
|
104
|
+
spark.connect.DataType schema_data_type = 4;
|
|
105
|
+
string schema_string = 5;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
// Optional clustering columns for the table.
|
|
109
|
+
repeated string clustering_columns = 6;
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
// Metadata that's only applicable to sinks.
|
|
113
|
+
message SinkDetails {
|
|
114
|
+
// Streaming write options.
|
|
115
|
+
map<string, string> options = 1;
|
|
116
|
+
|
|
117
|
+
// Streaming write format.
|
|
118
|
+
optional string format = 2;
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
// Request to define a flow targeting a dataset.
|
|
123
|
+
message DefineFlow {
|
|
124
|
+
// The graph to attach this flow to.
|
|
125
|
+
optional string dataflow_graph_id = 1;
|
|
126
|
+
|
|
127
|
+
// Name of the flow. For standalone flows, this must be a single-part name.
|
|
128
|
+
optional string flow_name = 2;
|
|
129
|
+
|
|
130
|
+
// Name of the dataset this flow writes to. Can be partially or fully qualified.
|
|
131
|
+
optional string target_dataset_name = 3;
|
|
132
|
+
|
|
133
|
+
// SQL configurations set when running this flow.
|
|
134
|
+
map<string, string> sql_conf = 4;
|
|
135
|
+
|
|
136
|
+
// Identifier for the client making the request. The server uses this to determine what flow
|
|
137
|
+
// evaluation request stream to dispatch evaluation requests to for this flow.
|
|
138
|
+
optional string client_id = 5;
|
|
139
|
+
|
|
140
|
+
// The location in source code where this flow was defined.
|
|
141
|
+
optional SourceCodeLocation source_code_location = 6;
|
|
142
|
+
|
|
143
|
+
oneof details {
|
|
144
|
+
WriteRelationFlowDetails relation_flow_details = 7;
|
|
145
|
+
google.protobuf.Any extension = 999;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
// A flow that takes the contents of a relation and writes them to the target dataset.
|
|
149
|
+
message WriteRelationFlowDetails {
|
|
150
|
+
// An unresolved relation that defines the dataset's flow. Empty if the query function
|
|
151
|
+
// that defines the flow cannot be analyzed at the time of flow definition.
|
|
152
|
+
optional spark.connect.Relation relation = 1;
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
// If true, define the flow as a one-time flow, such as for backfill.
|
|
156
|
+
// Setting this to true changes the flow in two ways:
|
|
157
|
+
// - The flow is run one time by default. If the pipeline is run with a full refresh,
|
|
158
|
+
// the flow will run again.
|
|
159
|
+
// - The flow function must be a batch DataFrame, not a streaming DataFrame.
|
|
160
|
+
optional bool once = 8;
|
|
161
|
+
|
|
162
|
+
message Response {
|
|
163
|
+
// Fully qualified flow name that uniquely identifies a flow in the dataflow graph.
|
|
164
|
+
optional string flow_name = 1;
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
// Resolves all datasets and flows and starts a pipeline update. Should be called after all
|
|
169
|
+
// graph elements are registered.
|
|
170
|
+
message StartRun {
|
|
171
|
+
// The graph to start.
|
|
172
|
+
optional string dataflow_graph_id = 1;
|
|
173
|
+
|
|
174
|
+
// List of datasets to reset and recompute.
|
|
175
|
+
repeated string full_refresh_selection = 2;
|
|
176
|
+
|
|
177
|
+
// Perform a full graph reset and recompute.
|
|
178
|
+
optional bool full_refresh_all = 3;
|
|
179
|
+
|
|
180
|
+
// List of datasets to update.
|
|
181
|
+
repeated string refresh_selection = 4;
|
|
182
|
+
|
|
183
|
+
// If true, the run will not actually execute any flows, but will only validate the graph and
|
|
184
|
+
// check for any errors. This is useful for testing and validation purposes.
|
|
185
|
+
optional bool dry = 5;
|
|
186
|
+
|
|
187
|
+
// Storage location for pipeline checkpoints and metadata.
|
|
188
|
+
optional string storage = 6;
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
// Parses the SQL file and registers all datasets and flows.
|
|
192
|
+
message DefineSqlGraphElements {
|
|
193
|
+
// The graph to attach this dataset to.
|
|
194
|
+
optional string dataflow_graph_id = 1;
|
|
195
|
+
|
|
196
|
+
// The path to the SQL file. Can be relative or absolute.
|
|
197
|
+
optional string sql_file_path = 2;
|
|
198
|
+
|
|
199
|
+
// The contents of the SQL file.
|
|
200
|
+
optional string sql_text = 3;
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
// Request to get the stream of query function execution signals for a graph. Responses should
|
|
204
|
+
// be a stream of PipelineQueryFunctionExecutionSignal messages.
|
|
205
|
+
message GetQueryFunctionExecutionSignalStream {
|
|
206
|
+
// The graph to get the query function execution signal stream for.
|
|
207
|
+
optional string dataflow_graph_id = 1;
|
|
208
|
+
|
|
209
|
+
// Identifier for the client that is requesting the stream.
|
|
210
|
+
optional string client_id = 2;
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
// Request from the client to update the flow function evaluation result
|
|
214
|
+
// for a previously un-analyzed flow.
|
|
215
|
+
message DefineFlowQueryFunctionResult {
|
|
216
|
+
// The fully qualified name of the flow being updated.
|
|
217
|
+
optional string flow_name = 1;
|
|
218
|
+
|
|
219
|
+
// The ID of the graph this flow belongs to.
|
|
220
|
+
optional string dataflow_graph_id = 2;
|
|
221
|
+
|
|
222
|
+
// An unresolved relation that defines the dataset's flow.
|
|
223
|
+
optional spark.connect.Relation relation = 3;
|
|
224
|
+
}
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
// Dispatch object for pipeline command results.
|
|
228
|
+
message PipelineCommandResult {
|
|
229
|
+
oneof result_type {
|
|
230
|
+
CreateDataflowGraphResult create_dataflow_graph_result = 1;
|
|
231
|
+
DefineOutputResult define_output_result = 2;
|
|
232
|
+
DefineFlowResult define_flow_result = 3;
|
|
233
|
+
}
|
|
234
|
+
message CreateDataflowGraphResult {
|
|
235
|
+
// The ID of the created graph.
|
|
236
|
+
optional string dataflow_graph_id = 1;
|
|
237
|
+
}
|
|
238
|
+
message DefineOutputResult {
|
|
239
|
+
// Resolved identifier of the output.
|
|
240
|
+
optional ResolvedIdentifier resolved_identifier = 1;
|
|
241
|
+
}
|
|
242
|
+
message DefineFlowResult {
|
|
243
|
+
// Resolved identifier of the flow.
|
|
244
|
+
optional ResolvedIdentifier resolved_identifier = 1;
|
|
245
|
+
}
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
// The type of output. NOTE(review): only the zero value carries the OUTPUT_TYPE_ prefix; the other value names are unprefixed.
|
|
249
|
+
enum OutputType {
|
|
250
|
+
// Safe default value. Should not be used.
|
|
251
|
+
OUTPUT_TYPE_UNSPECIFIED = 0;
|
|
252
|
+
// A materialized view that is published to the catalog.
|
|
253
|
+
MATERIALIZED_VIEW = 1;
|
|
254
|
+
// A table that is published to the catalog.
|
|
255
|
+
TABLE = 2;
|
|
256
|
+
// A view that is not published to the catalog.
|
|
257
|
+
TEMPORARY_VIEW = 3;
|
|
258
|
+
// A sink that is not published to the catalog.
|
|
259
|
+
SINK = 4;
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
// A response containing an event emitted during the run of a pipeline.
|
|
263
|
+
message PipelineEventResult {
|
|
264
|
+
PipelineEvent event = 1; // The emitted event.
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
message PipelineEvent { // An event carrying a timestamp and a user-facing message.
|
|
268
|
+
// The timestamp corresponding to when the event occurred.
|
|
269
|
+
google.protobuf.Timestamp timestamp = 1;
|
|
270
|
+
// The message that should be displayed to users.
|
|
271
|
+
optional string message = 2;
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
// Source code location information associated with a particular dataset or flow.
|
|
275
|
+
message SourceCodeLocation {
|
|
276
|
+
// The file in which this pipeline source code was defined.
|
|
277
|
+
optional string file_name = 1;
|
|
278
|
+
// The specific line number at which this pipeline source code is located, if applicable.
|
|
279
|
+
optional int32 line_number = 2;
|
|
280
|
+
// The path of the top-level pipeline file determined at runtime during pipeline initialization.
|
|
281
|
+
optional string definition_path = 3;
|
|
282
|
+
|
|
283
|
+
// Reserved field for protocol extensions.
|
|
284
|
+
// Used to support forward-compatibility by carrying additional fields
|
|
285
|
+
// that are not yet defined in this version of the proto. During planning, the
|
|
286
|
+
// engine will resolve and dispatch the concrete command contained in this field. NOTE(review): this sentence repeats the PipelineCommand.extension wording; confirm "command" is the intended term here.
|
|
287
|
+
repeated google.protobuf.Any extension = 999;
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
// A signal from the server to the client to execute the query function for one or more flows, and
|
|
291
|
+
// to register their results with the server.
|
|
292
|
+
message PipelineQueryFunctionExecutionSignal {
|
|
293
|
+
repeated string flow_names = 1; // Names of the flows whose query functions should be executed.
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
// Metadata providing context about the pipeline during Spark Connect query analysis.
|
|
297
|
+
message PipelineAnalysisContext {
|
|
298
|
+
// Unique identifier of the dataflow graph associated with this pipeline.
|
|
299
|
+
optional string dataflow_graph_id = 1;
|
|
300
|
+
// The path of the top-level pipeline file determined at runtime during pipeline initialization.
|
|
301
|
+
optional string definition_path = 2;
|
|
302
|
+
// The name of the flow involved in this analysis.
|
|
303
|
+
optional string flow_name = 3;
|
|
304
|
+
|
|
305
|
+
// Reserved field for protocol extensions.
|
|
306
|
+
repeated google.protobuf.Any extension = 999;
|
|
307
|
+
}
|