spark-connect 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +82 -0
  3. data/LICENSE +202 -0
  4. data/NOTICE +16 -0
  5. data/README.md +166 -0
  6. data/lib/spark-connect.rb +5 -0
  7. data/lib/spark_connect/arrow.rb +115 -0
  8. data/lib/spark_connect/catalog.rb +190 -0
  9. data/lib/spark_connect/channel_builder.rb +134 -0
  10. data/lib/spark_connect/client.rb +264 -0
  11. data/lib/spark_connect/column.rb +379 -0
  12. data/lib/spark_connect/conf.rb +79 -0
  13. data/lib/spark_connect/data_frame.rb +828 -0
  14. data/lib/spark_connect/errors.rb +58 -0
  15. data/lib/spark_connect/functions.rb +903 -0
  16. data/lib/spark_connect/grouped_data.rb +101 -0
  17. data/lib/spark_connect/na_functions.rb +98 -0
  18. data/lib/spark_connect/observation.rb +61 -0
  19. data/lib/spark_connect/pipelines.rb +221 -0
  20. data/lib/spark_connect/plan.rb +39 -0
  21. data/lib/spark_connect/proto/spark/connect/base_pb.rb +118 -0
  22. data/lib/spark_connect/proto/spark/connect/base_services_pb.rb +82 -0
  23. data/lib/spark_connect/proto/spark/connect/catalog_pb.rb +46 -0
  24. data/lib/spark_connect/proto/spark/connect/commands_pb.rb +67 -0
  25. data/lib/spark_connect/proto/spark/connect/common_pb.rb +32 -0
  26. data/lib/spark_connect/proto/spark/connect/expressions_pb.rb +63 -0
  27. data/lib/spark_connect/proto/spark/connect/ml_common_pb.rb +22 -0
  28. data/lib/spark_connect/proto/spark/connect/ml_pb.rb +32 -0
  29. data/lib/spark_connect/proto/spark/connect/pipelines_pb.rb +45 -0
  30. data/lib/spark_connect/proto/spark/connect/relations_pb.rb +102 -0
  31. data/lib/spark_connect/proto/spark/connect/types_pb.rb +46 -0
  32. data/lib/spark_connect/proto.rb +32 -0
  33. data/lib/spark_connect/reader.rb +98 -0
  34. data/lib/spark_connect/row.rb +105 -0
  35. data/lib/spark_connect/session.rb +317 -0
  36. data/lib/spark_connect/stat_functions.rb +109 -0
  37. data/lib/spark_connect/streaming.rb +351 -0
  38. data/lib/spark_connect/types.rb +490 -0
  39. data/lib/spark_connect/version.rb +11 -0
  40. data/lib/spark_connect/window.rb +119 -0
  41. data/lib/spark_connect/writer.rb +208 -0
  42. data/lib/spark_connect.rb +58 -0
  43. data/proto/spark/connect/base.proto +1275 -0
  44. data/proto/spark/connect/catalog.proto +243 -0
  45. data/proto/spark/connect/commands.proto +553 -0
  46. data/proto/spark/connect/common.proto +179 -0
  47. data/proto/spark/connect/expressions.proto +557 -0
  48. data/proto/spark/connect/ml.proto +147 -0
  49. data/proto/spark/connect/ml_common.proto +64 -0
  50. data/proto/spark/connect/pipelines.proto +307 -0
  51. data/proto/spark/connect/relations.proto +1252 -0
  52. data/proto/spark/connect/types.proto +227 -0
  53. metadata +149 -0
@@ -0,0 +1,557 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one or more
3
+ * contributor license agreements. See the NOTICE file distributed with
4
+ * this work for additional information regarding copyright ownership.
5
+ * The ASF licenses this file to You under the Apache License, Version 2.0
6
+ * (the "License"); you may not use this file except in compliance with
7
+ * the License. You may obtain a copy of the License at
8
+ *
9
+ * http://www.apache.org/licenses/LICENSE-2.0
10
+ *
11
+ * Unless required by applicable law or agreed to in writing, software
12
+ * distributed under the License is distributed on an "AS IS" BASIS,
13
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ * See the License for the specific language governing permissions and
15
+ * limitations under the License.
16
+ */
17
+
18
+ syntax = 'proto3';
19
+
20
+ import "google/protobuf/any.proto";
21
+ import "spark/connect/types.proto";
22
+ import "spark/connect/common.proto";
23
+
24
+ package spark.connect;
25
+
26
+ option java_multiple_files = true;
27
+ option java_package = "org.apache.spark.connect.proto";
28
+ option go_package = "internal/generated";
29
+
30
+ // Expression used to refer to fields, functions and similar. This can be used everywhere
31
+ // expressions in SQL appear.
32
+ message Expression {
33
+
34
+ ExpressionCommon common = 18;
35
+ oneof expr_type {
36
+ Literal literal = 1;
37
+ UnresolvedAttribute unresolved_attribute = 2;
38
+ UnresolvedFunction unresolved_function = 3;
39
+ ExpressionString expression_string = 4;
40
+ UnresolvedStar unresolved_star = 5;
41
+ Alias alias = 6;
42
+ Cast cast = 7;
43
+ UnresolvedRegex unresolved_regex = 8;
44
+ SortOrder sort_order = 9;
45
+ LambdaFunction lambda_function = 10;
46
+ Window window = 11;
47
+ UnresolvedExtractValue unresolved_extract_value = 12;
48
+ UpdateFields update_fields = 13;
49
+ UnresolvedNamedLambdaVariable unresolved_named_lambda_variable = 14;
50
+ CommonInlineUserDefinedFunction common_inline_user_defined_function = 15;
51
+ CallFunction call_function = 16;
52
+ NamedArgumentExpression named_argument_expression = 17;
53
+ MergeAction merge_action = 19;
54
+ TypedAggregateExpression typed_aggregate_expression = 20;
55
+ SubqueryExpression subquery_expression = 21;
56
+ DirectShufflePartitionID direct_shuffle_partition_id = 22;
57
+
58
+ // This field is used to mark extensions to the protocol. When plugins generate arbitrary
59
+ // expressions they can add them here. During the planning the correct resolution is done.
60
+ google.protobuf.Any extension = 999;
61
+ }
62
+
63
+
64
+ // Expression for the OVER clause or WINDOW clause.
65
+ message Window {
66
+
67
+ // (Required) The window function.
68
+ Expression window_function = 1;
69
+
70
+ // (Optional) The way that input rows are partitioned.
71
+ repeated Expression partition_spec = 2;
72
+
73
+ // (Optional) Ordering of rows in a partition.
74
+ repeated SortOrder order_spec = 3;
75
+
76
+ // (Optional) Window frame in a partition.
77
+ //
78
+ // If not set, it will be treated as 'UnspecifiedFrame'.
79
+ WindowFrame frame_spec = 4;
80
+
81
+ // The window frame
82
+ message WindowFrame {
83
+
84
+ // (Required) The type of the frame.
85
+ FrameType frame_type = 1;
86
+
87
+ // (Required) The lower bound of the frame.
88
+ FrameBoundary lower = 2;
89
+
90
+ // (Required) The upper bound of the frame.
91
+ FrameBoundary upper = 3;
92
+
93
+ enum FrameType {
94
+ FRAME_TYPE_UNDEFINED = 0;
95
+
96
+ // RowFrame treats rows in a partition individually.
97
+ FRAME_TYPE_ROW = 1;
98
+
99
+ // RangeFrame treats rows in a partition as groups of peers.
100
+ // All rows having the same 'ORDER BY' ordering are considered as peers.
101
+ FRAME_TYPE_RANGE = 2;
102
+ }
103
+
104
+ message FrameBoundary {
105
+ oneof boundary {
106
+ // CURRENT ROW boundary
107
+ bool current_row = 1;
108
+
109
+ // UNBOUNDED boundary.
110
+ // For lower bound, it will be converted to 'UnboundedPreceding'.
111
+ // For upper bound, it will be converted to 'UnboundedFollowing'.
112
+ bool unbounded = 2;
113
+
114
+ // This is an expression for future proofing. We are expecting literals on the server side.
115
+ Expression value = 3;
116
+ }
117
+ }
118
+ }
119
+ }
120
+
121
+ // SortOrder is used to specify the data ordering, it is normally used in Sort and Window.
122
+ // It is an unevaluable expression, so it cannot be used in Projection.
123
+ message SortOrder {
124
+ // (Required) The expression to be sorted.
125
+ Expression child = 1;
126
+
127
+ // (Required) The sort direction, should be ASCENDING or DESCENDING.
128
+ SortDirection direction = 2;
129
+
130
+ // (Required) How to deal with NULLs, should be NULLS_FIRST or NULLS_LAST.
131
+ NullOrdering null_ordering = 3;
132
+
133
+ enum SortDirection {
134
+ SORT_DIRECTION_UNSPECIFIED = 0;
135
+ SORT_DIRECTION_ASCENDING = 1;
136
+ SORT_DIRECTION_DESCENDING = 2;
137
+ }
138
+
139
+ enum NullOrdering {
140
+ SORT_NULLS_UNSPECIFIED = 0;
141
+ SORT_NULLS_FIRST = 1;
142
+ SORT_NULLS_LAST = 2;
143
+ }
144
+ }
145
+
146
+ // Expression that takes a partition ID value and passes it through directly for use in
147
+ // shuffle partitioning. This is used with RepartitionByExpression to allow users to
148
+ // directly specify target partition IDs.
149
+ message DirectShufflePartitionID {
150
+ // (Required) The expression that evaluates to the partition ID.
151
+ Expression child = 1;
152
+ }
153
+
154
+ message Cast {
155
+ // (Required) The expression to be cast.
156
+ Expression expr = 1;
157
+
158
+ // (Required) The data type that the expr is to be cast to.
159
+ oneof cast_to_type {
160
+ DataType type = 2;
161
+ // If this is set, Server will use Catalyst parser to parse this string to DataType.
162
+ string type_str = 3;
163
+ }
164
+
165
+ // (Optional) The expression evaluation mode.
166
+ EvalMode eval_mode = 4;
167
+
168
+ enum EvalMode {
169
+ EVAL_MODE_UNSPECIFIED = 0;
170
+ EVAL_MODE_LEGACY = 1;
171
+ EVAL_MODE_ANSI = 2;
172
+ EVAL_MODE_TRY = 3;
173
+ }
174
+ }
175
+
176
+ message Literal {
177
+ oneof literal_type {
178
+ DataType null = 1;
179
+ bytes binary = 2;
180
+ bool boolean = 3;
181
+
182
+ int32 byte = 4;
183
+ int32 short = 5;
184
+ int32 integer = 6;
185
+ int64 long = 7;
186
+ float float = 10;
187
+ double double = 11;
188
+ Decimal decimal = 12;
189
+
190
+ string string = 13;
191
+
192
+ // Date in units of days since the UNIX epoch.
193
+ int32 date = 16;
194
+ // Timestamp in units of microseconds since the UNIX epoch.
195
+ int64 timestamp = 17;
196
+ // Timestamp in units of microseconds since the UNIX epoch (without timezone information).
197
+ int64 timestamp_ntz = 18;
198
+
199
+ CalendarInterval calendar_interval = 19;
200
+ int32 year_month_interval = 20;
201
+ int64 day_time_interval = 21;
202
+ Array array = 22;
203
+ Map map = 23;
204
+ Struct struct = 24;
205
+
206
+ SpecializedArray specialized_array = 25;
207
+ Time time = 26;
208
+ }
209
+
210
+ // Reserved for Geometry and Geography.
211
+ reserved 27, 28;
212
+
213
+ // Data type information for the literal.
214
+ // This field is required only in the root literal message for null values or
215
+ // for data types (e.g., array, map, or struct) with non-trivial information.
216
+ // If the data_type field is not set at the root level, the data type will be
217
+ // inferred or retrieved from the deprecated data type fields using best efforts.
218
+ DataType data_type = 100;
219
+
220
+ message Decimal {
221
+ // The string representation of the decimal value.
222
+ string value = 1;
223
+ // The maximum number of digits allowed in the value.
224
+ // The maximum precision is 38.
225
+ optional int32 precision = 2;
226
+ // The declared scale of the decimal literal.
227
+ optional int32 scale = 3;
228
+ }
229
+
230
+ message CalendarInterval {
231
+ int32 months = 1;
232
+ int32 days = 2;
233
+ int64 microseconds = 3;
234
+ }
235
+
236
+ message Array {
237
+ // (Deprecated) The element type of the array.
238
+ //
239
+ // This field is deprecated since Spark 4.1+. Use data_type field instead.
240
+ DataType element_type = 1 [deprecated = true];
241
+
242
+ // The literal values that make up the array elements.
243
+ repeated Literal elements = 2;
244
+ }
245
+
246
+ message Map {
247
+ // (Deprecated) The key type of the map.
248
+ //
249
+ // This field is deprecated since Spark 4.1+. Use data_type field instead.
250
+ DataType key_type = 1 [deprecated = true];
251
+
252
+ // (Deprecated) The value type of the map.
253
+ //
254
+ // This field is deprecated since Spark 4.1+ and should only be set
255
+ // if the data_type field is not set. Use data_type field instead.
256
+ DataType value_type = 2 [deprecated = true];
257
+
258
+ // The literal keys that make up the map.
259
+ repeated Literal keys = 3;
260
+
261
+ // The literal values that make up the map.
262
+ repeated Literal values = 4;
263
+ }
264
+
265
+ message Struct {
266
+ // (Deprecated) The type of the struct.
267
+ //
268
+ // This field is deprecated since Spark 4.1+ because using DataType as the type of a struct
269
+ // is ambiguous. Use data_type field instead.
270
+ DataType struct_type = 1 [deprecated = true];
271
+
272
+ // The literal values that make up the struct elements.
273
+ repeated Literal elements = 2;
274
+ }
275
+
276
+ message SpecializedArray {
277
+ oneof value_type {
278
+ Bools bools = 1;
279
+ Ints ints = 2;
280
+ Longs longs = 3;
281
+ Floats floats = 4;
282
+ Doubles doubles = 5;
283
+ Strings strings = 6;
284
+ }
285
+ }
286
+
287
+ message Time {
288
+ int64 nano = 1;
289
+ // The precision of this time. If omitted, the default value of MICROS_PRECISION is used.
290
+ optional int32 precision = 2;
291
+ }
292
+ }
293
+
294
+ // An unresolved attribute that is not explicitly bound to a specific column, but the column
295
+ // is resolved during analysis by name.
296
+ message UnresolvedAttribute {
297
+ // (Required) An identifier that will be parsed by Catalyst parser. This should follow the
298
+ // Spark SQL identifier syntax.
299
+ string unparsed_identifier = 1;
300
+
301
+ // (Optional) The id of corresponding connect plan.
302
+ optional int64 plan_id = 2;
303
+
304
+ // (Optional) The requested column is a metadata column.
305
+ optional bool is_metadata_column = 3;
306
+ }
307
+
308
+ // An unresolved function is not explicitly bound to one explicit function, but the function
309
+ // is resolved during analysis following Spark's name resolution rules.
310
+ message UnresolvedFunction {
311
+ // (Required) name (or unparsed name for user defined function) for the unresolved function.
312
+ string function_name = 1;
313
+
314
+ // (Optional) Function arguments. Empty arguments are allowed.
315
+ repeated Expression arguments = 2;
316
+
317
+ // (Required) Indicate if this function should be applied on distinct values.
318
+ bool is_distinct = 3;
319
+
320
+ // (Required) Indicate if this is a user defined function.
321
+ //
322
+ // When it is not a user defined function, Connect will use the function name directly.
323
+ // When it is a user defined function, Connect will parse the function name first.
324
+ bool is_user_defined_function = 4;
325
+
326
+ // (Optional) Indicate if this function is defined in the internal function registry.
327
+ // If not set, the server will try to look up the function in the internal function registry
328
+ // and decide appropriately.
329
+ optional bool is_internal = 5;
330
+ }
331
+
332
+ // Expression as string.
333
+ message ExpressionString {
334
+ // (Required) A SQL expression that will be parsed by Catalyst parser.
335
+ string expression = 1;
336
+ }
337
+
338
+ // UnresolvedStar is used to expand all the fields of a relation or struct.
339
+ message UnresolvedStar {
340
+
341
+ // (Optional) The target of the expansion.
342
+ //
343
+ // If set, it should end with '.*' and will be parsed by 'parseAttributeName'
344
+ // on the server side.
345
+ optional string unparsed_target = 1;
346
+
347
+ // (Optional) The id of corresponding connect plan.
348
+ optional int64 plan_id = 2;
349
+ }
350
+
351
+ // Represents all of the input attributes to a given relational operator, for example in
352
+ // "SELECT `(id)?+.+` FROM ...".
353
+ message UnresolvedRegex {
354
+ // (Required) The column name used to extract column with regex.
355
+ string col_name = 1;
356
+
357
+ // (Optional) The id of corresponding connect plan.
358
+ optional int64 plan_id = 2;
359
+ }
360
+
361
+ // Extracts a value or values from an Expression
362
+ message UnresolvedExtractValue {
363
+ // (Required) The expression to extract value from, can be
364
+ // Map, Array, Struct or array of Structs.
365
+ Expression child = 1;
366
+
367
+ // (Required) The expression to describe the extraction, can be
368
+ // key of Map, index of Array, field name of Struct.
369
+ Expression extraction = 2;
370
+ }
371
+
372
+ // Add, replace or drop a field of `StructType` expression by name.
373
+ message UpdateFields {
374
+ // (Required) The struct expression.
375
+ Expression struct_expression = 1;
376
+
377
+ // (Required) The field name.
378
+ string field_name = 2;
379
+
380
+ // (Optional) The expression to add or replace.
381
+ //
382
+ // When not set, it means this field will be dropped.
383
+ Expression value_expression = 3;
384
+ }
385
+
386
+ message Alias {
387
+ // (Required) The expression that the alias will be added to.
388
+ Expression expr = 1;
389
+
390
+ // (Required) a list of name parts for the alias.
391
+ //
392
+ // A scalar column has only one name part.
393
+ repeated string name = 2;
394
+
395
+ // (Optional) Alias metadata expressed as a JSON map.
396
+ optional string metadata = 3;
397
+ }
398
+
399
+ message LambdaFunction {
400
+ // (Required) The lambda function.
401
+ //
402
+ // The function body should use 'UnresolvedAttribute' as arguments, the server side will
403
+ // replace 'UnresolvedAttribute' with 'UnresolvedNamedLambdaVariable'.
404
+ Expression function = 1;
405
+
406
+ // (Required) Function variables. Must contain 1 to 3 variables.
407
+ repeated Expression.UnresolvedNamedLambdaVariable arguments = 2;
408
+ }
409
+
410
+ message UnresolvedNamedLambdaVariable {
411
+
412
+ // (Required) a list of name parts for the variable. Must not be empty.
413
+ repeated string name_parts = 1;
414
+ }
415
+ }
416
+
417
+ message ExpressionCommon {
418
+ // (Required) Keep the information of the origin for this expression such as stacktrace.
419
+ Origin origin = 1;
420
+ }
421
+
422
+ message CommonInlineUserDefinedFunction {
423
+ // (Required) Name of the user-defined function.
424
+ string function_name = 1;
425
+ // (Optional) Indicate if the user-defined function is deterministic.
426
+ bool deterministic = 2;
427
+ // (Optional) Function arguments. Empty arguments are allowed.
428
+ repeated Expression arguments = 3;
429
+ // (Required) Indicate the function type of the user-defined function.
430
+ oneof function {
431
+ PythonUDF python_udf = 4;
432
+ ScalarScalaUDF scalar_scala_udf = 5;
433
+ JavaUDF java_udf = 6;
434
+ }
435
+ // (Required) Indicate if this function should be applied on distinct values.
436
+ bool is_distinct = 7;
437
+ }
438
+
439
+ message PythonUDF {
440
+ // (Required) Output type of the Python UDF
441
+ DataType output_type = 1;
442
+ // (Required) EvalType of the Python UDF
443
+ int32 eval_type = 2;
444
+ // (Required) The encoded commands of the Python UDF
445
+ bytes command = 3;
446
+ // (Required) Python version being used in the client.
447
+ string python_ver = 4;
448
+ // (Optional) Additional includes for the Python UDF.
449
+ repeated string additional_includes = 5;
450
+ }
451
+
452
+ message ScalarScalaUDF {
453
+ // (Required) Serialized JVM object containing UDF definition, input encoders and output encoder
454
+ bytes payload = 1;
455
+ // (Optional) Input type(s) of the UDF
456
+ repeated DataType inputTypes = 2;
457
+ // (Required) Output type of the UDF
458
+ DataType outputType = 3;
459
+ // (Required) True if the UDF can return null value
460
+ bool nullable = 4;
461
+ // (Required) Indicate if the UDF is an aggregate function
462
+ bool aggregate = 5;
463
+ }
464
+
465
+ message JavaUDF {
466
+ // (Required) Fully qualified name of Java class
467
+ string class_name = 1;
468
+
469
+ // (Optional) Output type of the Java UDF
470
+ optional DataType output_type = 2;
471
+
472
+ // (Required) Indicate if the Java user-defined function is an aggregate function
473
+ bool aggregate = 3;
474
+ }
475
+
476
+ message TypedAggregateExpression {
477
+ // (Required) The aggregate function object packed into bytes.
478
+ ScalarScalaUDF scalar_scala_udf = 1;
479
+ }
480
+
481
+ message CallFunction {
482
+ // (Required) Unparsed name of the SQL function.
483
+ string function_name = 1;
484
+
485
+ // (Optional) Function arguments. Empty arguments are allowed.
486
+ repeated Expression arguments = 2;
487
+ }
488
+
489
+ message NamedArgumentExpression {
490
+ // (Required) The key of the named argument.
491
+ string key = 1;
492
+
493
+ // (Required) The value expression of the named argument.
494
+ Expression value = 2;
495
+ }
496
+
497
+ message MergeAction {
498
+ // (Required) The action type of the merge action.
499
+ ActionType action_type = 1;
500
+
501
+ // (Optional) The condition expression of the merge action.
502
+ optional Expression condition = 2;
503
+
504
+ // (Optional) The assignments of the merge action. Required for ActionTypes INSERT and UPDATE.
505
+ repeated Assignment assignments = 3;
506
+
507
+ enum ActionType {
508
+ ACTION_TYPE_INVALID = 0;
509
+ ACTION_TYPE_DELETE = 1;
510
+ ACTION_TYPE_INSERT = 2;
511
+ ACTION_TYPE_INSERT_STAR = 3;
512
+ ACTION_TYPE_UPDATE = 4;
513
+ ACTION_TYPE_UPDATE_STAR = 5;
514
+ }
515
+
516
+ message Assignment {
517
+ // (Required) The key of the assignment.
518
+ Expression key = 1;
519
+
520
+ // (Required) The value of the assignment.
521
+ Expression value = 2;
522
+ }
523
+ }
524
+
525
+ message SubqueryExpression {
526
+ // (Required) The ID of the corresponding connect plan.
527
+ int64 plan_id = 1;
528
+
529
+ // (Required) The type of the subquery.
530
+ SubqueryType subquery_type = 2;
531
+
532
+ // (Optional) Options specific to table arguments.
533
+ optional TableArgOptions table_arg_options = 3;
534
+
535
+ // (Optional) IN subquery values.
536
+ repeated Expression in_subquery_values = 4;
537
+
538
+ enum SubqueryType {
539
+ SUBQUERY_TYPE_UNKNOWN = 0;
540
+ SUBQUERY_TYPE_SCALAR = 1;
541
+ SUBQUERY_TYPE_EXISTS = 2;
542
+ SUBQUERY_TYPE_TABLE_ARG = 3;
543
+ SUBQUERY_TYPE_IN = 4;
544
+ }
545
+
546
+ // Nested message for table argument options.
547
+ message TableArgOptions {
548
+ // (Optional) The way that input rows are partitioned.
549
+ repeated Expression partition_spec = 1;
550
+
551
+ // (Optional) Ordering of rows in a partition.
552
+ repeated Expression.SortOrder order_spec = 2;
553
+
554
+ // (Optional) Whether this is a single partition.
555
+ optional bool with_single_partition = 3;
556
+ }
557
+ }
@@ -0,0 +1,147 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one or more
3
+ * contributor license agreements. See the NOTICE file distributed with
4
+ * this work for additional information regarding copyright ownership.
5
+ * The ASF licenses this file to You under the Apache License, Version 2.0
6
+ * (the "License"); you may not use this file except in compliance with
7
+ * the License. You may obtain a copy of the License at
8
+ *
9
+ * http://www.apache.org/licenses/LICENSE-2.0
10
+ *
11
+ * Unless required by applicable law or agreed to in writing, software
12
+ * distributed under the License is distributed on an "AS IS" BASIS,
13
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ * See the License for the specific language governing permissions and
15
+ * limitations under the License.
16
+ */
17
+
18
+ syntax = 'proto3';
19
+
20
+ package spark.connect;
21
+
22
+ import "spark/connect/relations.proto";
23
+ import "spark/connect/expressions.proto";
24
+ import "spark/connect/ml_common.proto";
25
+
26
+ option java_multiple_files = true;
27
+ option java_package = "org.apache.spark.connect.proto";
28
+ option go_package = "internal/generated";
29
+
30
+ // Command for ML
31
+ message MlCommand {
32
+ oneof command {
33
+ Fit fit = 1;
34
+ Fetch fetch = 2;
35
+ Delete delete = 3;
36
+ Write write = 4;
37
+ Read read = 5;
38
+ Evaluate evaluate = 6;
39
+ CleanCache clean_cache = 7;
40
+ GetCacheInfo get_cache_info = 8;
41
+ CreateSummary create_summary = 9;
42
+ GetModelSize get_model_size = 10;
43
+ }
44
+
45
+ // Command for estimator.fit(dataset)
46
+ message Fit {
47
+ // (Required) Estimator information (its type should be OPERATOR_TYPE_ESTIMATOR)
48
+ MlOperator estimator = 1;
49
+ // (Optional) parameters of the Estimator
50
+ optional MlParams params = 2;
51
+ // (Required) the training dataset
52
+ Relation dataset = 3;
53
+ }
54
+
55
+ // Command to delete the cached objects which could be a model
56
+ // or summary evaluated by a model
57
+ message Delete {
58
+ repeated ObjectRef obj_refs = 1;
59
+ // If `evict_only` is set to true, only evict the cached model from memory,
60
+ // but keep the offloaded model in Spark driver local disk.
61
+ optional bool evict_only = 2;
62
+ }
63
+
64
+ // Forcibly clean up all the cached ML objects
65
+ message CleanCache { }
66
+
67
+ // Get the information of all the ML cached objects
68
+ message GetCacheInfo { }
69
+
70
+ // Command to write ML operator
71
+ message Write {
72
+ // It could be an estimator/evaluator or the cached model
73
+ oneof type {
74
+ // Estimator or evaluator
75
+ MlOperator operator = 1;
76
+ // The cached model
77
+ ObjectRef obj_ref = 2;
78
+ }
79
+ // (Optional) The parameters of operator which could be estimator/evaluator or a cached model
80
+ optional MlParams params = 3;
81
+ // (Required) Save the ML instance to the path
82
+ string path = 4;
83
+ // (Optional) Overwrites if the output path already exists.
84
+ optional bool should_overwrite = 5;
85
+ // (Optional) The options of the writer
86
+ map<string, string> options = 6;
87
+ }
88
+
89
+ // Command to load ML operator.
90
+ message Read {
91
+ // (Required) ML operator information
92
+ MlOperator operator = 1;
93
+ // (Required) Load the ML instance from the input path
94
+ string path = 2;
95
+ }
96
+
97
+ // Command for evaluator.evaluate(dataset)
98
+ message Evaluate {
99
+ // (Required) Evaluator information (its type should be OPERATOR_TYPE_EVALUATOR)
100
+ MlOperator evaluator = 1;
101
+ // (Optional) parameters of the Evaluator
102
+ optional MlParams params = 2;
103
+ // (Required) the evaluating dataset
104
+ Relation dataset = 3;
105
+ }
106
+
107
+ // This is for re-creating the model summary when the model summary is lost
108
+ // (model summary is lost when the model is offloaded and then loaded back)
109
+ message CreateSummary {
110
+ ObjectRef model_ref = 1;
111
+ Relation dataset = 2;
112
+ }
113
+
114
+ // This is for querying the estimated in-memory size of the model
115
+ message GetModelSize {
116
+ ObjectRef model_ref = 1;
117
+ }
118
+ }
119
+
120
+ // The result of MlCommand
121
+ message MlCommandResult {
122
+ oneof result_type {
123
+ // The result of the attribute
124
+ Expression.Literal param = 1;
125
+ // Evaluate a Dataset in a model and return the cached ID of the summary
126
+ string summary = 2;
127
+ // Operator information
128
+ MlOperatorInfo operator_info = 3;
129
+ }
130
+
131
+ // Represents an operator info
132
+ message MlOperatorInfo {
133
+ oneof type {
134
+ // The cached object which could be a model or summary evaluated by a model
135
+ ObjectRef obj_ref = 1;
136
+ // Operator name
137
+ string name = 2;
138
+ }
139
+ // (Optional) the 'uid' of an ML object
140
+ // Note it is different from the 'id' of a cached object.
141
+ optional string uid = 3;
142
+ // (Optional) parameters
143
+ optional MlParams params = 4;
144
+ // (Optional) warning message generated during the ML command execution
145
+ optional string warning_message = 5;
146
+ }
147
+ }