spark-connect 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +82 -0
- data/LICENSE +202 -0
- data/NOTICE +16 -0
- data/README.md +166 -0
- data/lib/spark-connect.rb +5 -0
- data/lib/spark_connect/arrow.rb +115 -0
- data/lib/spark_connect/catalog.rb +190 -0
- data/lib/spark_connect/channel_builder.rb +134 -0
- data/lib/spark_connect/client.rb +264 -0
- data/lib/spark_connect/column.rb +379 -0
- data/lib/spark_connect/conf.rb +79 -0
- data/lib/spark_connect/data_frame.rb +828 -0
- data/lib/spark_connect/errors.rb +58 -0
- data/lib/spark_connect/functions.rb +903 -0
- data/lib/spark_connect/grouped_data.rb +101 -0
- data/lib/spark_connect/na_functions.rb +98 -0
- data/lib/spark_connect/observation.rb +61 -0
- data/lib/spark_connect/pipelines.rb +221 -0
- data/lib/spark_connect/plan.rb +39 -0
- data/lib/spark_connect/proto/spark/connect/base_pb.rb +118 -0
- data/lib/spark_connect/proto/spark/connect/base_services_pb.rb +82 -0
- data/lib/spark_connect/proto/spark/connect/catalog_pb.rb +46 -0
- data/lib/spark_connect/proto/spark/connect/commands_pb.rb +67 -0
- data/lib/spark_connect/proto/spark/connect/common_pb.rb +32 -0
- data/lib/spark_connect/proto/spark/connect/expressions_pb.rb +63 -0
- data/lib/spark_connect/proto/spark/connect/ml_common_pb.rb +22 -0
- data/lib/spark_connect/proto/spark/connect/ml_pb.rb +32 -0
- data/lib/spark_connect/proto/spark/connect/pipelines_pb.rb +45 -0
- data/lib/spark_connect/proto/spark/connect/relations_pb.rb +102 -0
- data/lib/spark_connect/proto/spark/connect/types_pb.rb +46 -0
- data/lib/spark_connect/proto.rb +32 -0
- data/lib/spark_connect/reader.rb +98 -0
- data/lib/spark_connect/row.rb +105 -0
- data/lib/spark_connect/session.rb +317 -0
- data/lib/spark_connect/stat_functions.rb +109 -0
- data/lib/spark_connect/streaming.rb +351 -0
- data/lib/spark_connect/types.rb +490 -0
- data/lib/spark_connect/version.rb +11 -0
- data/lib/spark_connect/window.rb +119 -0
- data/lib/spark_connect/writer.rb +208 -0
- data/lib/spark_connect.rb +58 -0
- data/proto/spark/connect/base.proto +1275 -0
- data/proto/spark/connect/catalog.proto +243 -0
- data/proto/spark/connect/commands.proto +553 -0
- data/proto/spark/connect/common.proto +179 -0
- data/proto/spark/connect/expressions.proto +557 -0
- data/proto/spark/connect/ml.proto +147 -0
- data/proto/spark/connect/ml_common.proto +64 -0
- data/proto/spark/connect/pipelines.proto +307 -0
- data/proto/spark/connect/relations.proto +1252 -0
- data/proto/spark/connect/types.proto +227 -0
- metadata +149 -0
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SparkConnect
|
|
4
|
+
# Saves the contents of a {DataFrame} to external storage. Returned by
|
|
5
|
+
# {DataFrame#write}. Mirrors PySpark's `DataFrameWriter`.
|
|
6
|
+
#
|
|
7
|
+
# @example
|
|
8
|
+
# df.write.format("parquet").mode(:overwrite).save("out.parquet")
|
|
9
|
+
# df.write.mode(:append).save_as_table("my_table")
|
|
10
|
+
class DataFrameWriter
  Proto = SparkConnect::Proto
  WO = Proto::WriteOperation

  # Accepted save-mode names (as lower-case symbols) mapped to the enum
  # symbol placed on the write operation.
  SAVE_MODES = {
    append: :SAVE_MODE_APPEND,
    overwrite: :SAVE_MODE_OVERWRITE,
    error: :SAVE_MODE_ERROR_IF_EXISTS,
    errorifexists: :SAVE_MODE_ERROR_IF_EXISTS,
    error_if_exists: :SAVE_MODE_ERROR_IF_EXISTS,
    ignore: :SAVE_MODE_IGNORE,
    default: :SAVE_MODE_UNSPECIFIED,
  }.freeze

  # Starts with no format, an unspecified save mode, no options and no
  # partitioning, sorting or bucketing.
  #
  # @param df [DataFrame] the frame whose relation will be written
  def initialize(df)
    @df = df
    @mode = :SAVE_MODE_UNSPECIFIED
    @source = nil
    @options = {}
    @partitioning_columns = []
    @sort_columns = []
    @num_buckets = nil
    @bucket_cols = nil
  end

  # Set the output format; the value is stored as a string.
  #
  # @param source [#to_s] format name, e.g. "parquet"
  # @return [self]
  def format(source)
    tap { @source = source.to_s }
  end

  # Set the save mode. The name is stringified and down-cased before being
  # looked up in {SAVE_MODES}, so strings and symbols in any case work.
  #
  # @param save_mode [#to_s] one of the {SAVE_MODES} keys
  # @return [self]
  # @raise [IllegalArgumentError] when the name is not a {SAVE_MODES} key
  def mode(save_mode)
    lookup_key = save_mode.to_s.downcase.to_sym
    raise IllegalArgumentError, "Unknown save mode: #{save_mode}" unless SAVE_MODES.key?(lookup_key)

    @mode = SAVE_MODES[lookup_key]
    self
  end

  # Set one write option; key and value are both stored as strings.
  #
  # @return [self]
  def option(key, value)
    @options.store(key.to_s, value.to_s)
    self
  end

  # Set several write options at once; every key and value is stringified.
  #
  # @param opts [#each] pairs of option name and value
  # @return [self]
  def options(opts)
    opts.each do |name, setting|
      @options[name.to_s] = setting.to_s
    end
    self
  end

  # Partition the output by these columns (replaces any earlier selection).
  # Nested arrays are flattened and each entry is stringified.
  #
  # @return [self]
  def partition_by(*cols)
    @partitioning_columns = column_names(cols)
    self
  end
  alias_method :partitionBy, :partition_by

  # Sort within partitions/buckets by these columns (replaces any earlier
  # selection).
  #
  # @return [self]
  def sort_by(*cols)
    @sort_columns = column_names(cols)
    self
  end
  alias_method :sortBy, :sort_by

  # Bucket the output into `num_buckets` by these columns.
  #
  # @return [self]
  def bucket_by(num_buckets, *cols)
    @bucket_cols = column_names(cols)
    @num_buckets = num_buckets
    self
  end
  alias_method :bucketBy, :bucket_by

  # Save to a path. The path is only set on the operation when one is given.
  #
  # @param path [String, nil]
  # @return [void]
  def save(path = nil)
    operation = base_operation
    operation.path = path if path
    execute(operation)
  end

  # Save as a managed/registered table.
  #
  # @param name [#to_s] table name
  # @return [void]
  def save_as_table(name)
    write_table(name, :TABLE_SAVE_METHOD_SAVE_AS_TABLE)
  end
  alias_method :saveAsTable, :save_as_table

  # Insert into an existing table (by position).
  #
  # @param name [#to_s] table name
  # @return [void]
  def insert_into(name)
    write_table(name, :TABLE_SAVE_METHOD_INSERT_INTO)
  end
  alias_method :insertInto, :insert_into

  # @return [void] convenience for `format("parquet").save(path)`.
  def parquet(path)
    format("parquet").save(path)
  end

  # @return [void] convenience for `format("json").save(path)`.
  def json(path)
    format("json").save(path)
  end

  # @return [void] convenience for `format("csv").save(path)`.
  def csv(path)
    format("csv").save(path)
  end

  # @return [void] convenience for `format("orc").save(path)`.
  def orc(path)
    format("orc").save(path)
  end

  # @return [void] convenience for `format("text").save(path)`.
  def text(path)
    format("text").save(path)
  end

  private

  # Flatten a splat of column arguments into a flat list of strings.
  def column_names(cols)
    cols.flatten.map { |col| col.to_s }
  end

  # Shared body of the table-targeting writes: build the operation, attach
  # the table target with the requested save method, and run it.
  def write_table(name, save_method)
    operation = base_operation
    operation.table = WO::SaveTable.new(table_name: name.to_s, save_method: save_method)
    execute(operation)
  end

  # Build a write operation from the accumulated builder state. The source
  # and bucketing fields are only set when they were configured.
  def base_operation
    WO.new(
      input: @df.relation,
      mode: @mode,
      options: @options,
      partitioning_columns: @partitioning_columns,
      sort_column_names: @sort_columns
    ).tap do |operation|
      operation.source = @source if @source
      if @num_buckets
        operation.bucket_by = WO::BucketBy.new(bucket_column_names: @bucket_cols, num_buckets: @num_buckets)
      end
    end
  end

  # Wrap the operation in a command and hand it to the session's client.
  #
  # @return [nil]
  def execute(op)
    client = @df.session.client
    client.execute_command(Proto::Command.new(write_operation: op))
    nil
  end
end
|
|
136
|
+
|
|
137
|
+
# The DataSourceV2 write interface, returned by {DataFrame#write_to}. Mirrors
|
|
138
|
+
# PySpark's `DataFrameWriterV2`.
|
|
139
|
+
#
|
|
140
|
+
# @example
|
|
141
|
+
# df.write_to("catalog.db.table").using("parquet").create
|
|
142
|
+
# df.write_to("catalog.db.table").append
|
|
143
|
+
class DataFrameWriterV2
  Proto = SparkConnect::Proto
  WO2 = Proto::WriteOperationV2

  # Starts with no provider, no options, no table properties and no
  # partitioning.
  #
  # @param df [DataFrame] the frame whose relation will be written
  # @param table [String] target table name (stored as a string)
  def initialize(df, table)
    @df = df
    @table = table.to_s
    @provider = nil
    @partitioning = []
    @options = {}
    @table_properties = {}
  end

  # Set the table provider/format; the value is stored as a string.
  #
  # @return [self]
  def using(provider)
    tap { @provider = provider.to_s }
  end

  # Set one write option; key and value are both stored as strings.
  #
  # @return [self]
  def option(key, value)
    @options.store(key.to_s, value.to_s)
    self
  end

  # Set one table property; key and value are both stored as strings.
  #
  # @return [self]
  def table_property(key, value)
    @table_properties.store(key.to_s, value.to_s)
    self
  end

  # Partition by the given expressions/columns (replaces any earlier
  # selection). Entries that are not already a {Column} are stringified and
  # wrapped with `Functions.col`; every entry is then converted with
  # `to_expr`.
  #
  # @return [self]
  def partition_by(*cols)
    @partitioning = cols.flatten.map do |entry|
      column = entry.is_a?(Column) ? entry : Functions.col(entry.to_s)
      column.to_expr
    end
    self
  end

  # Create the table.
  # @return [void]
  def create
    run(:MODE_CREATE)
  end

  # Replace the table.
  # @return [void]
  def replace
    run(:MODE_REPLACE)
  end

  # Create or replace the table.
  # @return [void]
  def create_or_replace
    run(:MODE_CREATE_OR_REPLACE)
  end

  # Append rows.
  # @return [void]
  def append
    run(:MODE_APPEND)
  end

  # Overwrite rows matching `condition`. The condition is passed through
  # `Column.to_col` and then converted with `to_expr`.
  # @return [void]
  def overwrite(condition)
    predicate = Column.to_col(condition).to_expr
    run(:MODE_OVERWRITE, overwrite_condition: predicate)
  end

  # Dynamically overwrite partitions.
  # @return [void]
  def overwrite_partitions
    run(:MODE_OVERWRITE_PARTITIONS)
  end

  private

  # Build the V2 write operation from the accumulated builder state, wrap it
  # in a command and hand it to the session's client. The provider and the
  # overwrite condition are only set when present.
  #
  # @return [nil]
  def run(mode, overwrite_condition: nil)
    operation = WO2.new(
      input: @df.relation,
      table_name: @table,
      mode: mode,
      options: @options,
      table_properties: @table_properties,
      partitioning_columns: @partitioning
    )
    operation.provider = @provider if @provider
    operation.overwrite_condition = overwrite_condition if overwrite_condition

    client = @df.session.client
    client.execute_command(Proto::Command.new(write_operation_v2: operation))
    nil
  end
end
|
|
208
|
+
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "securerandom"
|
|
4
|
+
|
|
5
|
+
require_relative "spark_connect/version"
|
|
6
|
+
require_relative "spark_connect/proto"
|
|
7
|
+
require_relative "spark_connect/errors"
|
|
8
|
+
require_relative "spark_connect/types"
|
|
9
|
+
require_relative "spark_connect/row"
|
|
10
|
+
require_relative "spark_connect/plan"
|
|
11
|
+
require_relative "spark_connect/column"
|
|
12
|
+
require_relative "spark_connect/window"
|
|
13
|
+
require_relative "spark_connect/functions"
|
|
14
|
+
require_relative "spark_connect/arrow"
|
|
15
|
+
require_relative "spark_connect/channel_builder"
|
|
16
|
+
require_relative "spark_connect/client"
|
|
17
|
+
require_relative "spark_connect/conf"
|
|
18
|
+
require_relative "spark_connect/observation"
|
|
19
|
+
require_relative "spark_connect/grouped_data"
|
|
20
|
+
require_relative "spark_connect/na_functions"
|
|
21
|
+
require_relative "spark_connect/stat_functions"
|
|
22
|
+
require_relative "spark_connect/reader"
|
|
23
|
+
require_relative "spark_connect/writer"
|
|
24
|
+
require_relative "spark_connect/streaming"
|
|
25
|
+
require_relative "spark_connect/pipelines"
|
|
26
|
+
require_relative "spark_connect/catalog"
|
|
27
|
+
require_relative "spark_connect/data_frame"
|
|
28
|
+
require_relative "spark_connect/session"
|
|
29
|
+
|
|
30
|
+
# spark-connect is a pure-Ruby client for {https://spark.apache.org/docs/latest/spark-connect-overview.html
|
|
31
|
+
# Apache Spark Connect}, the gRPC-based decoupled client-server protocol for
|
|
32
|
+
# Apache Spark.
|
|
33
|
+
#
|
|
34
|
+
# The public surface mirrors PySpark closely: a {SparkConnect::SparkSession}
|
|
35
|
+
# is the entry point, {SparkConnect::DataFrame} is the lazy, immutable relation
|
|
36
|
+
# builder, {SparkConnect::Column} represents column expressions, and
|
|
37
|
+
# {SparkConnect::Functions} (aliased as {SparkConnect::F}) provides the standard
|
|
38
|
+
# function library.
|
|
39
|
+
#
|
|
40
|
+
# @example Connect and run a query
|
|
41
|
+
# require "spark-connect"
|
|
42
|
+
#
|
|
43
|
+
# spark = SparkConnect::SparkSession.builder
|
|
44
|
+
# .remote("sc://localhost:15002")
|
|
45
|
+
# .get_or_create
|
|
46
|
+
# df = spark.range(10).select(SparkConnect::F.col("id") * 2)
|
|
47
|
+
# df.show
|
|
48
|
+
# spark.stop
|
|
49
|
+
module SparkConnect
  # Module-level shortcut that forwards to
  # {SparkConnect::SparkSession.builder}.
  #
  # @return [SparkConnect::SparkSession::Builder]
  def self.builder
    SparkSession.builder
  end
end
|