spark-connect 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53):
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +82 -0
  3. data/LICENSE +202 -0
  4. data/NOTICE +16 -0
  5. data/README.md +166 -0
  6. data/lib/spark-connect.rb +5 -0
  7. data/lib/spark_connect/arrow.rb +115 -0
  8. data/lib/spark_connect/catalog.rb +190 -0
  9. data/lib/spark_connect/channel_builder.rb +134 -0
  10. data/lib/spark_connect/client.rb +264 -0
  11. data/lib/spark_connect/column.rb +379 -0
  12. data/lib/spark_connect/conf.rb +79 -0
  13. data/lib/spark_connect/data_frame.rb +828 -0
  14. data/lib/spark_connect/errors.rb +58 -0
  15. data/lib/spark_connect/functions.rb +903 -0
  16. data/lib/spark_connect/grouped_data.rb +101 -0
  17. data/lib/spark_connect/na_functions.rb +98 -0
  18. data/lib/spark_connect/observation.rb +61 -0
  19. data/lib/spark_connect/pipelines.rb +221 -0
  20. data/lib/spark_connect/plan.rb +39 -0
  21. data/lib/spark_connect/proto/spark/connect/base_pb.rb +118 -0
  22. data/lib/spark_connect/proto/spark/connect/base_services_pb.rb +82 -0
  23. data/lib/spark_connect/proto/spark/connect/catalog_pb.rb +46 -0
  24. data/lib/spark_connect/proto/spark/connect/commands_pb.rb +67 -0
  25. data/lib/spark_connect/proto/spark/connect/common_pb.rb +32 -0
  26. data/lib/spark_connect/proto/spark/connect/expressions_pb.rb +63 -0
  27. data/lib/spark_connect/proto/spark/connect/ml_common_pb.rb +22 -0
  28. data/lib/spark_connect/proto/spark/connect/ml_pb.rb +32 -0
  29. data/lib/spark_connect/proto/spark/connect/pipelines_pb.rb +45 -0
  30. data/lib/spark_connect/proto/spark/connect/relations_pb.rb +102 -0
  31. data/lib/spark_connect/proto/spark/connect/types_pb.rb +46 -0
  32. data/lib/spark_connect/proto.rb +32 -0
  33. data/lib/spark_connect/reader.rb +98 -0
  34. data/lib/spark_connect/row.rb +105 -0
  35. data/lib/spark_connect/session.rb +317 -0
  36. data/lib/spark_connect/stat_functions.rb +109 -0
  37. data/lib/spark_connect/streaming.rb +351 -0
  38. data/lib/spark_connect/types.rb +490 -0
  39. data/lib/spark_connect/version.rb +11 -0
  40. data/lib/spark_connect/window.rb +119 -0
  41. data/lib/spark_connect/writer.rb +208 -0
  42. data/lib/spark_connect.rb +58 -0
  43. data/proto/spark/connect/base.proto +1275 -0
  44. data/proto/spark/connect/catalog.proto +243 -0
  45. data/proto/spark/connect/commands.proto +553 -0
  46. data/proto/spark/connect/common.proto +179 -0
  47. data/proto/spark/connect/expressions.proto +557 -0
  48. data/proto/spark/connect/ml.proto +147 -0
  49. data/proto/spark/connect/ml_common.proto +64 -0
  50. data/proto/spark/connect/pipelines.proto +307 -0
  51. data/proto/spark/connect/relations.proto +1252 -0
  52. data/proto/spark/connect/types.proto +227 -0
  53. metadata +149 -0
@@ -0,0 +1,208 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SparkConnect
4
# Saves the contents of a {DataFrame} to external storage. Returned by
# {DataFrame#write}. Mirrors PySpark's `DataFrameWriter`.
#
# Configuration methods mutate the writer and return `self` so they can be
# chained; the terminal methods ({#save}, {#save_as_table}, {#insert_into} and
# the per-format shortcuts) build a `WriteOperation` and hand it to the
# session's client.
#
# @example
#   df.write.format("parquet").mode(:overwrite).save("out.parquet")
#   df.write.mode(:append).save_as_table("my_table")
class DataFrameWriter
  Proto = SparkConnect::Proto
  WO = Proto::WriteOperation

  # Accepted save-mode names (looked up after downcasing) mapped to the
  # `WriteOperation` save-mode enum symbols.
  SAVE_MODES = {
    append: :SAVE_MODE_APPEND,
    overwrite: :SAVE_MODE_OVERWRITE,
    error: :SAVE_MODE_ERROR_IF_EXISTS,
    errorifexists: :SAVE_MODE_ERROR_IF_EXISTS,
    error_if_exists: :SAVE_MODE_ERROR_IF_EXISTS,
    ignore: :SAVE_MODE_IGNORE,
    default: :SAVE_MODE_UNSPECIFIED,
  }.freeze

  # @param df [DataFrame] the DataFrame whose relation will be written.
  def initialize(df)
    @df = df
    @source = nil
    @mode = :SAVE_MODE_UNSPECIFIED
    @options = {}
    @partitioning_columns = []
    @sort_columns = []
    @bucket_cols = nil
    @num_buckets = nil
  end

  # Set the output format.
  # @param source [#to_s] format name, e.g. `"parquet"`.
  # @return [self]
  def format(source)
    @source = source.to_s
    self
  end

  # Set the save mode (`:append`, `:overwrite`, `:ignore`, `:error`, ...).
  # The name is matched case-insensitively against {SAVE_MODES}.
  # @param save_mode [#to_s]
  # @return [self]
  # @raise [IllegalArgumentError] if the name is not a key of {SAVE_MODES}.
  def mode(save_mode)
    @mode = SAVE_MODES.fetch(save_mode.to_s.downcase.to_sym) do
      raise IllegalArgumentError, "Unknown save mode: #{save_mode}"
    end
    self
  end

  # Set a single write option. Key and value are stored as strings.
  # @return [self]
  def option(key, value)
    @options[key.to_s] = value.to_s
    self
  end

  # Set multiple write options. Keys and values are stored as strings.
  # @param opts [#each] key/value pairs.
  # @return [self]
  def options(opts)
    opts.each { |k, v| @options[k.to_s] = v.to_s }
    self
  end

  # Partition the output by these columns (replaces any previous setting).
  # @return [self]
  def partition_by(*cols)
    @partitioning_columns = cols.flatten.map(&:to_s)
    self
  end
  alias partitionBy partition_by

  # Sort within partitions/buckets by these columns (replaces any previous
  # setting).
  # @return [self]
  def sort_by(*cols)
    @sort_columns = cols.flatten.map(&:to_s)
    self
  end
  alias sortBy sort_by

  # Bucket the output into `num_buckets` by these columns.
  # @return [self]
  def bucket_by(num_buckets, *cols)
    @num_buckets = num_buckets
    @bucket_cols = cols.flatten.map(&:to_s)
    self
  end
  alias bucketBy bucket_by

  # Save to a path.
  # @param path [#to_s, nil] destination; left unset on the operation when nil.
  # @return [void]
  def save(path = nil)
    op = base_operation
    # Coerce like every other setter so Pathname-style objects are accepted
    # by the string-typed proto field.
    op.path = path.to_s if path
    execute(op)
  end

  # Save as a managed/registered table.
  # @param name [#to_s] table name.
  # @return [void]
  def save_as_table(name)
    op = base_operation
    op.table = WO::SaveTable.new(table_name: name.to_s, save_method: :TABLE_SAVE_METHOD_SAVE_AS_TABLE)
    execute(op)
  end
  alias saveAsTable save_as_table

  # Insert into an existing table (by position).
  # @param name [#to_s] table name.
  # @param overwrite [Boolean, nil] when given, sets the save mode to
  #   `:overwrite` (true) or `:append` (false) before writing, like PySpark's
  #   `insertInto(tableName, overwrite=None)`. When nil the configured mode is
  #   left untouched.
  # @return [void]
  def insert_into(name, overwrite: nil)
    mode(overwrite ? :overwrite : :append) unless overwrite.nil?
    op = base_operation
    op.table = WO::SaveTable.new(table_name: name.to_s, save_method: :TABLE_SAVE_METHOD_INSERT_INTO)
    execute(op)
  end
  alias insertInto insert_into

  # @return [void] convenience for `format("parquet").save(path)`.
  def parquet(path) = format("parquet").save(path)
  # @return [void] convenience for `format("json").save(path)`.
  def json(path) = format("json").save(path)
  # @return [void] convenience for `format("csv").save(path)`.
  def csv(path) = format("csv").save(path)
  # @return [void] convenience for `format("orc").save(path)`.
  def orc(path) = format("orc").save(path)
  # @return [void] convenience for `format("text").save(path)`.
  def text(path) = format("text").save(path)

  private

  # Build a `WriteOperation` from the accumulated writer state. `source` and
  # `bucket_by` are only assigned when they were configured.
  def base_operation
    op = WO.new(
      input: @df.relation, mode: @mode, options: @options,
      partitioning_columns: @partitioning_columns, sort_column_names: @sort_columns
    )
    op.source = @source if @source
    op.bucket_by = WO::BucketBy.new(bucket_column_names: @bucket_cols, num_buckets: @num_buckets) if @num_buckets
    op
  end

  # Wrap the operation in a `Command`, run it through the session's client,
  # and discard the result.
  def execute(op)
    @df.session.client.execute_command(Proto::Command.new(write_operation: op))
    nil
  end
end
136
+
137
# The DataSourceV2 write interface, returned by {DataFrame#write_to}. Mirrors
# PySpark's `DataFrameWriterV2`.
#
# Configuration methods mutate the writer and return `self`; the terminal
# methods ({#create}, {#replace}, {#create_or_replace}, {#append},
# {#overwrite}, {#overwrite_partitions}) build a `WriteOperationV2` and hand it
# to the session's client.
#
# @example
#   df.write_to("catalog.db.table").using("parquet").create
#   df.write_to("catalog.db.table").append
class DataFrameWriterV2
  Proto = SparkConnect::Proto
  WO2 = Proto::WriteOperationV2

  # @param df [DataFrame] the DataFrame whose relation will be written.
  # @param table [String] target table name (stored via `to_s`).
  def initialize(df, table)
    @df = df
    @table = table.to_s
    @provider = nil
    @options = {}
    @table_properties = {}
    @partitioning = []
  end

  # Set the table provider/format.
  # @return [self]
  def using(provider)
    @provider = provider.to_s
    self
  end

  # Set a single write option. Key and value are stored as strings.
  # @return [self]
  def option(key, value)
    @options[key.to_s] = value.to_s
    self
  end

  # Set multiple write options, matching {DataFrameWriter#options}. Keys and
  # values are stored as strings.
  # @param opts [#each] key/value pairs.
  # @return [self]
  def options(opts)
    opts.each { |k, v| @options[k.to_s] = v.to_s }
    self
  end

  # Set a table property. Key and value are stored as strings.
  # @return [self]
  def table_property(key, value)
    @table_properties[key.to_s] = value.to_s
    self
  end
  alias tableProperty table_property

  # Partition by the given expressions/columns (replaces any previous
  # setting). {Column} arguments are used as-is; anything else is treated as a
  # column name.
  # @return [self]
  def partition_by(*cols)
    @partitioning = cols.flatten.map { |c| (c.is_a?(Column) ? c : Functions.col(c.to_s)).to_expr }
    self
  end
  # PySpark names this operation `partitionedBy`; accept that spelling too.
  alias partitioned_by partition_by
  alias partitionedBy partition_by

  # Create the table. @return [void]
  def create = run(:MODE_CREATE)
  # Replace the table. @return [void]
  def replace = run(:MODE_REPLACE)
  # Create or replace the table. @return [void]
  def create_or_replace = run(:MODE_CREATE_OR_REPLACE)
  alias createOrReplace create_or_replace
  # Append rows. @return [void]
  def append = run(:MODE_APPEND)
  # Overwrite rows matching `condition`. @return [void]
  def overwrite(condition) = run(:MODE_OVERWRITE, overwrite_condition: Column.to_col(condition).to_expr)
  # Dynamically overwrite partitions. @return [void]
  def overwrite_partitions = run(:MODE_OVERWRITE_PARTITIONS)
  alias overwritePartitions overwrite_partitions

  private

  # Build a `WriteOperationV2` for `mode` from the accumulated writer state and
  # run it through the session's client. `provider` and `overwrite_condition`
  # are only assigned when present.
  def run(mode, overwrite_condition: nil)
    op = WO2.new(
      input: @df.relation, table_name: @table, mode: mode,
      options: @options, table_properties: @table_properties, partitioning_columns: @partitioning
    )
    op.provider = @provider if @provider
    op.overwrite_condition = overwrite_condition if overwrite_condition
    @df.session.client.execute_command(Proto::Command.new(write_operation_v2: op))
    nil
  end
end
208
+ end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "securerandom"
4
+
5
+ require_relative "spark_connect/version"
6
+ require_relative "spark_connect/proto"
7
+ require_relative "spark_connect/errors"
8
+ require_relative "spark_connect/types"
9
+ require_relative "spark_connect/row"
10
+ require_relative "spark_connect/plan"
11
+ require_relative "spark_connect/column"
12
+ require_relative "spark_connect/window"
13
+ require_relative "spark_connect/functions"
14
+ require_relative "spark_connect/arrow"
15
+ require_relative "spark_connect/channel_builder"
16
+ require_relative "spark_connect/client"
17
+ require_relative "spark_connect/conf"
18
+ require_relative "spark_connect/observation"
19
+ require_relative "spark_connect/grouped_data"
20
+ require_relative "spark_connect/na_functions"
21
+ require_relative "spark_connect/stat_functions"
22
+ require_relative "spark_connect/reader"
23
+ require_relative "spark_connect/writer"
24
+ require_relative "spark_connect/streaming"
25
+ require_relative "spark_connect/pipelines"
26
+ require_relative "spark_connect/catalog"
27
+ require_relative "spark_connect/data_frame"
28
+ require_relative "spark_connect/session"
29
+
30
# spark-connect is a pure-Ruby client for {https://spark.apache.org/docs/latest/spark-connect-overview.html
# Apache Spark Connect}, the gRPC-based decoupled client-server protocol for
# Apache Spark.
#
# The API is modelled closely on PySpark:
#
# * {SparkConnect::SparkSession} is the entry point,
# * {SparkConnect::DataFrame} is the lazy, immutable relation builder,
# * {SparkConnect::Column} represents column expressions, and
# * {SparkConnect::Functions} (aliased as {SparkConnect::F}) provides the
#   standard function library.
#
# @example Connect and run a query
#   require "spark-connect"
#
#   spark = SparkConnect::SparkSession.builder
#     .remote("sc://localhost:15002")
#     .get_or_create
#   df = spark.range(10).select(SparkConnect::F.col("id") * 2)
#   df.show
#   spark.stop
module SparkConnect
  # Module-level shortcut that forwards to {SparkConnect::SparkSession.builder}.
  #
  # @return [SparkConnect::SparkSession::Builder]
  def self.builder = SparkSession.builder
end
+ end