spark-connect 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +82 -0
- data/LICENSE +202 -0
- data/NOTICE +16 -0
- data/README.md +166 -0
- data/lib/spark-connect.rb +5 -0
- data/lib/spark_connect/arrow.rb +115 -0
- data/lib/spark_connect/catalog.rb +190 -0
- data/lib/spark_connect/channel_builder.rb +134 -0
- data/lib/spark_connect/client.rb +264 -0
- data/lib/spark_connect/column.rb +379 -0
- data/lib/spark_connect/conf.rb +79 -0
- data/lib/spark_connect/data_frame.rb +828 -0
- data/lib/spark_connect/errors.rb +58 -0
- data/lib/spark_connect/functions.rb +903 -0
- data/lib/spark_connect/grouped_data.rb +101 -0
- data/lib/spark_connect/na_functions.rb +98 -0
- data/lib/spark_connect/observation.rb +61 -0
- data/lib/spark_connect/pipelines.rb +221 -0
- data/lib/spark_connect/plan.rb +39 -0
- data/lib/spark_connect/proto/spark/connect/base_pb.rb +118 -0
- data/lib/spark_connect/proto/spark/connect/base_services_pb.rb +82 -0
- data/lib/spark_connect/proto/spark/connect/catalog_pb.rb +46 -0
- data/lib/spark_connect/proto/spark/connect/commands_pb.rb +67 -0
- data/lib/spark_connect/proto/spark/connect/common_pb.rb +32 -0
- data/lib/spark_connect/proto/spark/connect/expressions_pb.rb +63 -0
- data/lib/spark_connect/proto/spark/connect/ml_common_pb.rb +22 -0
- data/lib/spark_connect/proto/spark/connect/ml_pb.rb +32 -0
- data/lib/spark_connect/proto/spark/connect/pipelines_pb.rb +45 -0
- data/lib/spark_connect/proto/spark/connect/relations_pb.rb +102 -0
- data/lib/spark_connect/proto/spark/connect/types_pb.rb +46 -0
- data/lib/spark_connect/proto.rb +32 -0
- data/lib/spark_connect/reader.rb +98 -0
- data/lib/spark_connect/row.rb +105 -0
- data/lib/spark_connect/session.rb +317 -0
- data/lib/spark_connect/stat_functions.rb +109 -0
- data/lib/spark_connect/streaming.rb +351 -0
- data/lib/spark_connect/types.rb +490 -0
- data/lib/spark_connect/version.rb +11 -0
- data/lib/spark_connect/window.rb +119 -0
- data/lib/spark_connect/writer.rb +208 -0
- data/lib/spark_connect.rb +58 -0
- data/proto/spark/connect/base.proto +1275 -0
- data/proto/spark/connect/catalog.proto +243 -0
- data/proto/spark/connect/commands.proto +553 -0
- data/proto/spark/connect/common.proto +179 -0
- data/proto/spark/connect/expressions.proto +557 -0
- data/proto/spark/connect/ml.proto +147 -0
- data/proto/spark/connect/ml_common.proto +64 -0
- data/proto/spark/connect/pipelines.proto +307 -0
- data/proto/spark/connect/relations.proto +1252 -0
- data/proto/spark/connect/types.proto +227 -0
- metadata +149 -0
|
@@ -0,0 +1,557 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
3
|
+
* contributor license agreements. See the NOTICE file distributed with
|
|
4
|
+
* this work for additional information regarding copyright ownership.
|
|
5
|
+
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
6
|
+
* (the "License"); you may not use this file except in compliance with
|
|
7
|
+
* the License. You may obtain a copy of the License at
|
|
8
|
+
*
|
|
9
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
*
|
|
11
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
* See the License for the specific language governing permissions and
|
|
15
|
+
* limitations under the License.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
syntax = 'proto3';
|
|
19
|
+
|
|
20
|
+
import "google/protobuf/any.proto";
|
|
21
|
+
import "spark/connect/types.proto";
|
|
22
|
+
import "spark/connect/common.proto";
|
|
23
|
+
|
|
24
|
+
package spark.connect;
|
|
25
|
+
|
|
26
|
+
option java_multiple_files = true;
|
|
27
|
+
option java_package = "org.apache.spark.connect.proto";
|
|
28
|
+
option go_package = "internal/generated";
|
|
29
|
+
|
|
30
|
+
// Expression used to refer to fields, functions and similar. This can be used everywhere
|
|
31
|
+
// expressions in SQL appear.
|
|
32
|
+
message Expression {
|
|
33
|
+
|
|
34
|
+
ExpressionCommon common = 18;
|
|
35
|
+
oneof expr_type {
|
|
36
|
+
Literal literal = 1;
|
|
37
|
+
UnresolvedAttribute unresolved_attribute = 2;
|
|
38
|
+
UnresolvedFunction unresolved_function = 3;
|
|
39
|
+
ExpressionString expression_string = 4;
|
|
40
|
+
UnresolvedStar unresolved_star = 5;
|
|
41
|
+
Alias alias = 6;
|
|
42
|
+
Cast cast = 7;
|
|
43
|
+
UnresolvedRegex unresolved_regex = 8;
|
|
44
|
+
SortOrder sort_order = 9;
|
|
45
|
+
LambdaFunction lambda_function = 10;
|
|
46
|
+
Window window = 11;
|
|
47
|
+
UnresolvedExtractValue unresolved_extract_value = 12;
|
|
48
|
+
UpdateFields update_fields = 13;
|
|
49
|
+
UnresolvedNamedLambdaVariable unresolved_named_lambda_variable = 14;
|
|
50
|
+
CommonInlineUserDefinedFunction common_inline_user_defined_function = 15;
|
|
51
|
+
CallFunction call_function = 16;
|
|
52
|
+
NamedArgumentExpression named_argument_expression = 17;
|
|
53
|
+
MergeAction merge_action = 19;
|
|
54
|
+
TypedAggregateExpression typed_aggregate_expression = 20;
|
|
55
|
+
SubqueryExpression subquery_expression = 21;
|
|
56
|
+
DirectShufflePartitionID direct_shuffle_partition_id = 22;
|
|
57
|
+
|
|
58
|
+
// This field is used to mark extensions to the protocol. When plugins generate arbitrary
|
|
59
|
+
// relations they can add them here. During the planning the correct resolution is done.
|
|
60
|
+
google.protobuf.Any extension = 999;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
// Expression for the OVER clause or WINDOW clause.
|
|
65
|
+
message Window {
|
|
66
|
+
|
|
67
|
+
// (Required) The window function.
|
|
68
|
+
Expression window_function = 1;
|
|
69
|
+
|
|
70
|
+
// (Optional) The way that input rows are partitioned.
|
|
71
|
+
repeated Expression partition_spec = 2;
|
|
72
|
+
|
|
73
|
+
// (Optional) Ordering of rows in a partition.
|
|
74
|
+
repeated SortOrder order_spec = 3;
|
|
75
|
+
|
|
76
|
+
// (Optional) Window frame in a partition.
|
|
77
|
+
//
|
|
78
|
+
// If not set, it will be treated as 'UnspecifiedFrame'.
|
|
79
|
+
WindowFrame frame_spec = 4;
|
|
80
|
+
|
|
81
|
+
// The window frame
|
|
82
|
+
message WindowFrame {
|
|
83
|
+
|
|
84
|
+
// (Required) The type of the frame.
|
|
85
|
+
FrameType frame_type = 1;
|
|
86
|
+
|
|
87
|
+
// (Required) The lower bound of the frame.
|
|
88
|
+
FrameBoundary lower = 2;
|
|
89
|
+
|
|
90
|
+
// (Required) The upper bound of the frame.
|
|
91
|
+
FrameBoundary upper = 3;
|
|
92
|
+
|
|
93
|
+
enum FrameType {
|
|
94
|
+
FRAME_TYPE_UNDEFINED = 0;
|
|
95
|
+
|
|
96
|
+
// RowFrame treats rows in a partition individually.
|
|
97
|
+
FRAME_TYPE_ROW = 1;
|
|
98
|
+
|
|
99
|
+
// RangeFrame treats rows in a partition as groups of peers.
|
|
100
|
+
// All rows having the same 'ORDER BY' ordering are considered as peers.
|
|
101
|
+
FRAME_TYPE_RANGE = 2;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
message FrameBoundary {
|
|
105
|
+
oneof boundary {
|
|
106
|
+
// CURRENT ROW boundary
|
|
107
|
+
bool current_row = 1;
|
|
108
|
+
|
|
109
|
+
// UNBOUNDED boundary.
|
|
110
|
+
// For lower bound, it will be converted to 'UnboundedPreceding'.
|
|
111
|
+
// For upper bound, it will be converted to 'UnboundedFollowing'.
|
|
112
|
+
bool unbounded = 2;
|
|
113
|
+
|
|
114
|
+
// This is an expression for future proofing. We are expecting literals on the server side.
|
|
115
|
+
Expression value = 3;
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
// SortOrder is used to specify the data ordering, it is normally used in Sort and Window.
|
|
122
|
+
// It is an unevaluable expression (it cannot be evaluated), so it cannot be used in Projection.
|
|
123
|
+
message SortOrder {
|
|
124
|
+
// (Required) The expression to be sorted.
|
|
125
|
+
Expression child = 1;
|
|
126
|
+
|
|
127
|
+
// (Required) The sort direction, should be ASCENDING or DESCENDING.
|
|
128
|
+
SortDirection direction = 2;
|
|
129
|
+
|
|
130
|
+
// (Required) How to deal with NULLs, should be NULLS_FIRST or NULLS_LAST.
|
|
131
|
+
NullOrdering null_ordering = 3;
|
|
132
|
+
|
|
133
|
+
enum SortDirection {
|
|
134
|
+
SORT_DIRECTION_UNSPECIFIED = 0;
|
|
135
|
+
SORT_DIRECTION_ASCENDING = 1;
|
|
136
|
+
SORT_DIRECTION_DESCENDING = 2;
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
enum NullOrdering {
|
|
140
|
+
SORT_NULLS_UNSPECIFIED = 0;
|
|
141
|
+
SORT_NULLS_FIRST = 1;
|
|
142
|
+
SORT_NULLS_LAST = 2;
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
// Expression that takes a partition ID value and passes it through directly for use in
|
|
147
|
+
// shuffle partitioning. This is used with RepartitionByExpression to allow users to
|
|
148
|
+
// directly specify target partition IDs.
|
|
149
|
+
message DirectShufflePartitionID {
|
|
150
|
+
// (Required) The expression that evaluates to the partition ID.
|
|
151
|
+
Expression child = 1;
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
message Cast {
|
|
155
|
+
// (Required) The expression to be cast.
|
|
156
|
+
Expression expr = 1;
|
|
157
|
+
|
|
158
|
+
// (Required) The data type that the expression is to be cast to.
|
|
159
|
+
oneof cast_to_type {
|
|
160
|
+
DataType type = 2;
|
|
161
|
+
// If this is set, Server will use Catalyst parser to parse this string to DataType.
|
|
162
|
+
string type_str = 3;
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
// (Optional) The expression evaluation mode.
|
|
166
|
+
EvalMode eval_mode = 4;
|
|
167
|
+
|
|
168
|
+
enum EvalMode {
|
|
169
|
+
EVAL_MODE_UNSPECIFIED = 0;
|
|
170
|
+
EVAL_MODE_LEGACY = 1;
|
|
171
|
+
EVAL_MODE_ANSI = 2;
|
|
172
|
+
EVAL_MODE_TRY = 3;
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
message Literal {
|
|
177
|
+
oneof literal_type {
|
|
178
|
+
DataType null = 1;
|
|
179
|
+
bytes binary = 2;
|
|
180
|
+
bool boolean = 3;
|
|
181
|
+
|
|
182
|
+
int32 byte = 4;
|
|
183
|
+
int32 short = 5;
|
|
184
|
+
int32 integer = 6;
|
|
185
|
+
int64 long = 7;
|
|
186
|
+
float float = 10;
|
|
187
|
+
double double = 11;
|
|
188
|
+
Decimal decimal = 12;
|
|
189
|
+
|
|
190
|
+
string string = 13;
|
|
191
|
+
|
|
192
|
+
// Date in units of days since the UNIX epoch.
|
|
193
|
+
int32 date = 16;
|
|
194
|
+
// Timestamp in units of microseconds since the UNIX epoch.
|
|
195
|
+
int64 timestamp = 17;
|
|
196
|
+
// Timestamp in units of microseconds since the UNIX epoch (without timezone information).
|
|
197
|
+
int64 timestamp_ntz = 18;
|
|
198
|
+
|
|
199
|
+
CalendarInterval calendar_interval = 19;
|
|
200
|
+
int32 year_month_interval = 20;
|
|
201
|
+
int64 day_time_interval = 21;
|
|
202
|
+
Array array = 22;
|
|
203
|
+
Map map = 23;
|
|
204
|
+
Struct struct = 24;
|
|
205
|
+
|
|
206
|
+
SpecializedArray specialized_array = 25;
|
|
207
|
+
Time time = 26;
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
// Reserved for Geometry and Geography.
|
|
211
|
+
reserved 27, 28;
|
|
212
|
+
|
|
213
|
+
// Data type information for the literal.
|
|
214
|
+
// This field is required only in the root literal message for null values or
|
|
215
|
+
// for data types (e.g., array, map, or struct) with non-trivial information.
|
|
216
|
+
// If the data_type field is not set at the root level, the data type will be
|
|
217
|
+
// inferred or retrieved from the deprecated data type fields using best efforts.
|
|
218
|
+
DataType data_type = 100;
|
|
219
|
+
|
|
220
|
+
message Decimal {
|
|
221
|
+
// the string representation.
|
|
222
|
+
string value = 1;
|
|
223
|
+
// The maximum number of digits allowed in the value.
|
|
224
|
+
// the maximum precision is 38.
|
|
225
|
+
optional int32 precision = 2;
|
|
226
|
+
// declared scale of decimal literal
|
|
227
|
+
optional int32 scale = 3;
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
message CalendarInterval {
|
|
231
|
+
int32 months = 1;
|
|
232
|
+
int32 days = 2;
|
|
233
|
+
int64 microseconds = 3;
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
message Array {
|
|
237
|
+
// (Deprecated) The element type of the array.
|
|
238
|
+
//
|
|
239
|
+
// This field is deprecated since Spark 4.1+. Use data_type field instead.
|
|
240
|
+
DataType element_type = 1 [deprecated = true];
|
|
241
|
+
|
|
242
|
+
// The literal values that make up the array elements.
|
|
243
|
+
repeated Literal elements = 2;
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
message Map {
|
|
247
|
+
// (Deprecated) The key type of the map.
|
|
248
|
+
//
|
|
249
|
+
// This field is deprecated since Spark 4.1+. Use data_type field instead.
|
|
250
|
+
DataType key_type = 1 [deprecated = true];
|
|
251
|
+
|
|
252
|
+
// (Deprecated) The value type of the map.
|
|
253
|
+
//
|
|
254
|
+
// This field is deprecated since Spark 4.1+ and should only be set
|
|
255
|
+
// if the data_type field is not set. Use data_type field instead.
|
|
256
|
+
DataType value_type = 2 [deprecated = true];
|
|
257
|
+
|
|
258
|
+
// The literal keys that make up the map.
|
|
259
|
+
repeated Literal keys = 3;
|
|
260
|
+
|
|
261
|
+
// The literal values that make up the map.
|
|
262
|
+
repeated Literal values = 4;
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
message Struct {
|
|
266
|
+
// (Deprecated) The type of the struct.
|
|
267
|
+
//
|
|
268
|
+
// This field is deprecated since Spark 4.1+ because using DataType as the type of a struct
|
|
269
|
+
// is ambiguous. Use data_type field instead.
|
|
270
|
+
DataType struct_type = 1 [deprecated = true];
|
|
271
|
+
|
|
272
|
+
// The literal values that make up the struct elements.
|
|
273
|
+
repeated Literal elements = 2;
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
message SpecializedArray {
|
|
277
|
+
oneof value_type {
|
|
278
|
+
Bools bools = 1;
|
|
279
|
+
Ints ints = 2;
|
|
280
|
+
Longs longs = 3;
|
|
281
|
+
Floats floats = 4;
|
|
282
|
+
Doubles doubles = 5;
|
|
283
|
+
Strings strings = 6;
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
message Time {
|
|
288
|
+
int64 nano = 1;
|
|
289
|
+
// The precision of this time, if omitted, uses the default value of MICROS_PRECISION.
|
|
290
|
+
optional int32 precision = 2;
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
// An unresolved attribute that is not explicitly bound to a specific column, but the column
|
|
295
|
+
// is resolved during analysis by name.
|
|
296
|
+
message UnresolvedAttribute {
|
|
297
|
+
// (Required) An identifier that will be parsed by Catalyst parser. This should follow the
|
|
298
|
+
// Spark SQL identifier syntax.
|
|
299
|
+
string unparsed_identifier = 1;
|
|
300
|
+
|
|
301
|
+
// (Optional) The id of corresponding connect plan.
|
|
302
|
+
optional int64 plan_id = 2;
|
|
303
|
+
|
|
304
|
+
// (Optional) The requested column is a metadata column.
|
|
305
|
+
optional bool is_metadata_column = 3;
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
// An unresolved function is not explicitly bound to one explicit function, but the function
|
|
309
|
+
// is resolved during analysis following Spark's name resolution rules.
|
|
310
|
+
message UnresolvedFunction {
|
|
311
|
+
// (Required) name (or unparsed name for user defined function) for the unresolved function.
|
|
312
|
+
string function_name = 1;
|
|
313
|
+
|
|
314
|
+
// (Optional) Function arguments. Empty arguments are allowed.
|
|
315
|
+
repeated Expression arguments = 2;
|
|
316
|
+
|
|
317
|
+
// (Required) Indicate if this function should be applied on distinct values.
|
|
318
|
+
bool is_distinct = 3;
|
|
319
|
+
|
|
320
|
+
// (Required) Indicate if this is a user defined function.
|
|
321
|
+
//
|
|
322
|
+
// When it is not a user defined function, Connect will use the function name directly.
|
|
323
|
+
// When it is a user defined function, Connect will parse the function name first.
|
|
324
|
+
bool is_user_defined_function = 4;
|
|
325
|
+
|
|
326
|
+
// (Optional) Indicate if this function is defined in the internal function registry.
|
|
327
|
+
// If not set, the server will try to look up the function in the internal function registry
|
|
328
|
+
// and decide appropriately.
|
|
329
|
+
optional bool is_internal = 5;
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
// Expression as string.
|
|
333
|
+
message ExpressionString {
|
|
334
|
+
// (Required) A SQL expression that will be parsed by Catalyst parser.
|
|
335
|
+
string expression = 1;
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
// UnresolvedStar is used to expand all the fields of a relation or struct.
|
|
339
|
+
message UnresolvedStar {
|
|
340
|
+
|
|
341
|
+
// (Optional) The target of the expansion.
|
|
342
|
+
//
|
|
343
|
+
// If set, it should end with '.*' and will be parsed by 'parseAttributeName'
|
|
344
|
+
// on the server side.
|
|
345
|
+
optional string unparsed_target = 1;
|
|
346
|
+
|
|
347
|
+
// (Optional) The id of corresponding connect plan.
|
|
348
|
+
optional int64 plan_id = 2;
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
// Represents all of the input attributes to a given relational operator, for example in
|
|
352
|
+
// "SELECT `(id)?+.+` FROM ...".
|
|
353
|
+
message UnresolvedRegex {
|
|
354
|
+
// (Required) The column name used to extract column with regex.
|
|
355
|
+
string col_name = 1;
|
|
356
|
+
|
|
357
|
+
// (Optional) The id of corresponding connect plan.
|
|
358
|
+
optional int64 plan_id = 2;
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
// Extracts a value or values from an Expression
|
|
362
|
+
message UnresolvedExtractValue {
|
|
363
|
+
// (Required) The expression to extract value from, can be
|
|
364
|
+
// Map, Array, Struct or array of Structs.
|
|
365
|
+
Expression child = 1;
|
|
366
|
+
|
|
367
|
+
// (Required) The expression to describe the extraction, can be
|
|
368
|
+
// key of Map, index of Array, field name of Struct.
|
|
369
|
+
Expression extraction = 2;
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
// Add, replace or drop a field of `StructType` expression by name.
|
|
373
|
+
message UpdateFields {
|
|
374
|
+
// (Required) The struct expression.
|
|
375
|
+
Expression struct_expression = 1;
|
|
376
|
+
|
|
377
|
+
// (Required) The field name.
|
|
378
|
+
string field_name = 2;
|
|
379
|
+
|
|
380
|
+
// (Optional) The expression to add or replace.
|
|
381
|
+
//
|
|
382
|
+
// When not set, it means this field will be dropped.
|
|
383
|
+
Expression value_expression = 3;
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
message Alias {
|
|
387
|
+
// (Required) The expression that alias will be added on.
|
|
388
|
+
Expression expr = 1;
|
|
389
|
+
|
|
390
|
+
// (Required) a list of name parts for the alias.
|
|
391
|
+
//
|
|
392
|
+
// Scalar columns have only one name present.
|
|
393
|
+
repeated string name = 2;
|
|
394
|
+
|
|
395
|
+
// (Optional) Alias metadata expressed as a JSON map.
|
|
396
|
+
optional string metadata = 3;
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
message LambdaFunction {
|
|
400
|
+
// (Required) The lambda function.
|
|
401
|
+
//
|
|
402
|
+
// The function body should use 'UnresolvedAttribute' as arguments, the server side will
|
|
403
|
+
// replace 'UnresolvedAttribute' with 'UnresolvedNamedLambdaVariable'.
|
|
404
|
+
Expression function = 1;
|
|
405
|
+
|
|
406
|
+
// (Required) Function variables. Must contain 1 to 3 variables.
|
|
407
|
+
repeated Expression.UnresolvedNamedLambdaVariable arguments = 2;
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
message UnresolvedNamedLambdaVariable {
|
|
411
|
+
|
|
412
|
+
// (Required) a list of name parts for the variable. Must not be empty.
|
|
413
|
+
repeated string name_parts = 1;
|
|
414
|
+
}
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
message ExpressionCommon {
|
|
418
|
+
// (Required) Keep the information of the origin for this expression such as stacktrace.
|
|
419
|
+
Origin origin = 1;
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
message CommonInlineUserDefinedFunction {
|
|
423
|
+
// (Required) Name of the user-defined function.
|
|
424
|
+
string function_name = 1;
|
|
425
|
+
// (Optional) Indicate if the user-defined function is deterministic.
|
|
426
|
+
bool deterministic = 2;
|
|
427
|
+
// (Optional) Function arguments. Empty arguments are allowed.
|
|
428
|
+
repeated Expression arguments = 3;
|
|
429
|
+
// (Required) Indicate the function type of the user-defined function.
|
|
430
|
+
oneof function {
|
|
431
|
+
PythonUDF python_udf = 4;
|
|
432
|
+
ScalarScalaUDF scalar_scala_udf = 5;
|
|
433
|
+
JavaUDF java_udf = 6;
|
|
434
|
+
}
|
|
435
|
+
// (Required) Indicate if this function should be applied on distinct values.
|
|
436
|
+
bool is_distinct = 7;
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
message PythonUDF {
|
|
440
|
+
// (Required) Output type of the Python UDF
|
|
441
|
+
DataType output_type = 1;
|
|
442
|
+
// (Required) EvalType of the Python UDF
|
|
443
|
+
int32 eval_type = 2;
|
|
444
|
+
// (Required) The encoded commands of the Python UDF
|
|
445
|
+
bytes command = 3;
|
|
446
|
+
// (Required) Python version being used in the client.
|
|
447
|
+
string python_ver = 4;
|
|
448
|
+
// (Optional) Additional includes for the Python UDF.
|
|
449
|
+
repeated string additional_includes = 5;
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
message ScalarScalaUDF {
|
|
453
|
+
// (Required) Serialized JVM object containing UDF definition, input encoders and output encoder
|
|
454
|
+
bytes payload = 1;
|
|
455
|
+
// (Optional) Input type(s) of the UDF
|
|
456
|
+
repeated DataType inputTypes = 2;
|
|
457
|
+
// (Required) Output type of the UDF
|
|
458
|
+
DataType outputType = 3;
|
|
459
|
+
// (Required) True if the UDF can return null value
|
|
460
|
+
bool nullable = 4;
|
|
461
|
+
// (Required) Indicate if the UDF is an aggregate function
|
|
462
|
+
bool aggregate = 5;
|
|
463
|
+
}
|
|
464
|
+
|
|
465
|
+
message JavaUDF {
|
|
466
|
+
// (Required) Fully qualified name of Java class
|
|
467
|
+
string class_name = 1;
|
|
468
|
+
|
|
469
|
+
// (Optional) Output type of the Java UDF
|
|
470
|
+
optional DataType output_type = 2;
|
|
471
|
+
|
|
472
|
+
// (Required) Indicate if the Java user-defined function is an aggregate function
|
|
473
|
+
bool aggregate = 3;
|
|
474
|
+
}
|
|
475
|
+
|
|
476
|
+
message TypedAggregateExpression {
|
|
477
|
+
// (Required) The aggregate function object packed into bytes.
|
|
478
|
+
ScalarScalaUDF scalar_scala_udf = 1;
|
|
479
|
+
}
|
|
480
|
+
|
|
481
|
+
message CallFunction {
|
|
482
|
+
// (Required) Unparsed name of the SQL function.
|
|
483
|
+
string function_name = 1;
|
|
484
|
+
|
|
485
|
+
// (Optional) Function arguments. Empty arguments are allowed.
|
|
486
|
+
repeated Expression arguments = 2;
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
message NamedArgumentExpression {
|
|
490
|
+
// (Required) The key of the named argument.
|
|
491
|
+
string key = 1;
|
|
492
|
+
|
|
493
|
+
// (Required) The value expression of the named argument.
|
|
494
|
+
Expression value = 2;
|
|
495
|
+
}
|
|
496
|
+
|
|
497
|
+
message MergeAction {
|
|
498
|
+
// (Required) The action type of the merge action.
|
|
499
|
+
ActionType action_type = 1;
|
|
500
|
+
|
|
501
|
+
// (Optional) The condition expression of the merge action.
|
|
502
|
+
optional Expression condition = 2;
|
|
503
|
+
|
|
504
|
+
// (Optional) The assignments of the merge action. Required for ActionTypes INSERT and UPDATE.
|
|
505
|
+
repeated Assignment assignments = 3;
|
|
506
|
+
|
|
507
|
+
enum ActionType {
|
|
508
|
+
ACTION_TYPE_INVALID = 0;
|
|
509
|
+
ACTION_TYPE_DELETE = 1;
|
|
510
|
+
ACTION_TYPE_INSERT = 2;
|
|
511
|
+
ACTION_TYPE_INSERT_STAR = 3;
|
|
512
|
+
ACTION_TYPE_UPDATE = 4;
|
|
513
|
+
ACTION_TYPE_UPDATE_STAR = 5;
|
|
514
|
+
}
|
|
515
|
+
|
|
516
|
+
message Assignment {
|
|
517
|
+
// (Required) The key of the assignment.
|
|
518
|
+
Expression key = 1;
|
|
519
|
+
|
|
520
|
+
// (Required) The value of the assignment.
|
|
521
|
+
Expression value = 2;
|
|
522
|
+
}
|
|
523
|
+
}
|
|
524
|
+
|
|
525
|
+
message SubqueryExpression {
|
|
526
|
+
// (Required) The ID of the corresponding connect plan.
|
|
527
|
+
int64 plan_id = 1;
|
|
528
|
+
|
|
529
|
+
// (Required) The type of the subquery.
|
|
530
|
+
SubqueryType subquery_type = 2;
|
|
531
|
+
|
|
532
|
+
// (Optional) Options specific to table arguments.
|
|
533
|
+
optional TableArgOptions table_arg_options = 3;
|
|
534
|
+
|
|
535
|
+
// (Optional) IN subquery values.
|
|
536
|
+
repeated Expression in_subquery_values = 4;
|
|
537
|
+
|
|
538
|
+
enum SubqueryType {
|
|
539
|
+
SUBQUERY_TYPE_UNKNOWN = 0;
|
|
540
|
+
SUBQUERY_TYPE_SCALAR = 1;
|
|
541
|
+
SUBQUERY_TYPE_EXISTS = 2;
|
|
542
|
+
SUBQUERY_TYPE_TABLE_ARG = 3;
|
|
543
|
+
SUBQUERY_TYPE_IN = 4;
|
|
544
|
+
}
|
|
545
|
+
|
|
546
|
+
// Nested message for table argument options.
|
|
547
|
+
message TableArgOptions {
|
|
548
|
+
// (Optional) The way that input rows are partitioned.
|
|
549
|
+
repeated Expression partition_spec = 1;
|
|
550
|
+
|
|
551
|
+
// (Optional) Ordering of rows in a partition.
|
|
552
|
+
repeated Expression.SortOrder order_spec = 2;
|
|
553
|
+
|
|
554
|
+
// (Optional) Whether this is a single partition.
|
|
555
|
+
optional bool with_single_partition = 3;
|
|
556
|
+
}
|
|
557
|
+
}
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
3
|
+
* contributor license agreements. See the NOTICE file distributed with
|
|
4
|
+
* this work for additional information regarding copyright ownership.
|
|
5
|
+
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
6
|
+
* (the "License"); you may not use this file except in compliance with
|
|
7
|
+
* the License. You may obtain a copy of the License at
|
|
8
|
+
*
|
|
9
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
*
|
|
11
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
* See the License for the specific language governing permissions and
|
|
15
|
+
* limitations under the License.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
syntax = 'proto3';
|
|
19
|
+
|
|
20
|
+
package spark.connect;
|
|
21
|
+
|
|
22
|
+
import "spark/connect/relations.proto";
|
|
23
|
+
import "spark/connect/expressions.proto";
|
|
24
|
+
import "spark/connect/ml_common.proto";
|
|
25
|
+
|
|
26
|
+
option java_multiple_files = true;
|
|
27
|
+
option java_package = "org.apache.spark.connect.proto";
|
|
28
|
+
option go_package = "internal/generated";
|
|
29
|
+
|
|
30
|
+
// Command for ML
|
|
31
|
+
message MlCommand {
|
|
32
|
+
oneof command {
|
|
33
|
+
Fit fit = 1;
|
|
34
|
+
Fetch fetch = 2;
|
|
35
|
+
Delete delete = 3;
|
|
36
|
+
Write write = 4;
|
|
37
|
+
Read read = 5;
|
|
38
|
+
Evaluate evaluate = 6;
|
|
39
|
+
CleanCache clean_cache = 7;
|
|
40
|
+
GetCacheInfo get_cache_info = 8;
|
|
41
|
+
CreateSummary create_summary = 9;
|
|
42
|
+
GetModelSize get_model_size = 10;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
// Command for estimator.fit(dataset)
|
|
46
|
+
message Fit {
|
|
47
|
+
// (Required) Estimator information (its type should be OPERATOR_TYPE_ESTIMATOR)
|
|
48
|
+
MlOperator estimator = 1;
|
|
49
|
+
// (Optional) parameters of the Estimator
|
|
50
|
+
optional MlParams params = 2;
|
|
51
|
+
// (Required) the training dataset
|
|
52
|
+
Relation dataset = 3;
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
// Command to delete the cached objects which could be a model
|
|
56
|
+
// or summary evaluated by a model
|
|
57
|
+
message Delete {
|
|
58
|
+
repeated ObjectRef obj_refs = 1;
|
|
59
|
+
// If `evict_only` is set to true, only evict the cached model from memory,
|
|
60
|
+
// but keep the offloaded model in Spark driver local disk.
|
|
61
|
+
optional bool evict_only = 2;
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
// Force to clean up all the ML cached objects
|
|
65
|
+
message CleanCache { }
|
|
66
|
+
|
|
67
|
+
// Get the information of all the ML cached objects
|
|
68
|
+
message GetCacheInfo { }
|
|
69
|
+
|
|
70
|
+
// Command to write ML operator
|
|
71
|
+
message Write {
|
|
72
|
+
// It could be an estimator/evaluator or the cached model
|
|
73
|
+
oneof type {
|
|
74
|
+
// Estimator or evaluator
|
|
75
|
+
MlOperator operator = 1;
|
|
76
|
+
// The cached model
|
|
77
|
+
ObjectRef obj_ref = 2;
|
|
78
|
+
}
|
|
79
|
+
// (Optional) The parameters of operator which could be estimator/evaluator or a cached model
|
|
80
|
+
optional MlParams params = 3;
|
|
81
|
+
// (Required) Save the ML instance to the path
|
|
82
|
+
string path = 4;
|
|
83
|
+
// (Optional) Overwrites if the output path already exists.
|
|
84
|
+
optional bool should_overwrite = 5;
|
|
85
|
+
// (Optional) The options of the writer
|
|
86
|
+
map<string, string> options = 6;
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// Command to load ML operator.
|
|
90
|
+
message Read {
|
|
91
|
+
// (Required) ML operator information
|
|
92
|
+
MlOperator operator = 1;
|
|
93
|
+
// (Required) Load the ML instance from the input path
|
|
94
|
+
string path = 2;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
// Command for evaluator.evaluate(dataset)
|
|
98
|
+
message Evaluate {
|
|
99
|
+
// (Required) Evaluator information (its type should be OPERATOR_TYPE_EVALUATOR)
|
|
100
|
+
MlOperator evaluator = 1;
|
|
101
|
+
// (Optional) parameters of the Evaluator
|
|
102
|
+
optional MlParams params = 2;
|
|
103
|
+
// (Required) the evaluating dataset
|
|
104
|
+
Relation dataset = 3;
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
// This is for re-creating the model summary when the model summary is lost
|
|
108
|
+
// (model summary is lost when the model is offloaded and then loaded back)
|
|
109
|
+
message CreateSummary {
|
|
110
|
+
ObjectRef model_ref = 1;
|
|
111
|
+
Relation dataset = 2;
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
// This is for querying the model's estimated in-memory size
|
|
115
|
+
message GetModelSize {
|
|
116
|
+
ObjectRef model_ref = 1;
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
// The result of MlCommand
|
|
121
|
+
message MlCommandResult {
|
|
122
|
+
oneof result_type {
|
|
123
|
+
// The result of the attribute
|
|
124
|
+
Expression.Literal param = 1;
|
|
125
|
+
// Evaluate a Dataset in a model and return the cached ID of summary
|
|
126
|
+
string summary = 2;
|
|
127
|
+
// Operator information
|
|
128
|
+
MlOperatorInfo operator_info = 3;
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
// Represents an operator info
|
|
132
|
+
message MlOperatorInfo {
|
|
133
|
+
oneof type {
|
|
134
|
+
// The cached object which could be a model or summary evaluated by a model
|
|
135
|
+
ObjectRef obj_ref = 1;
|
|
136
|
+
// Operator name
|
|
137
|
+
string name = 2;
|
|
138
|
+
}
|
|
139
|
+
// (Optional) the 'uid' of an ML object
|
|
140
|
+
// Note it is different from the 'id' of a cached object.
|
|
141
|
+
optional string uid = 3;
|
|
142
|
+
// (Optional) parameters
|
|
143
|
+
optional MlParams params = 4;
|
|
144
|
+
// (Optional) warning message generated during the ML command execution
|
|
145
|
+
optional string warning_message = 5;
|
|
146
|
+
}
|
|
147
|
+
}
|