spark-connect 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +56 -1
- data/README.md +8 -2
- data/lib/spark_connect/arrow.rb +5 -1
- data/lib/spark_connect/client.rb +7 -3
- data/lib/spark_connect/conf.rb +1 -1
- data/lib/spark_connect/data_frame.rb +19 -2
- data/lib/spark_connect/functions.rb +1 -1
- data/lib/spark_connect/session.rb +1 -1
- data/lib/spark_connect/version.rb +1 -1
- metadata +6 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 27eea75da0f1e6659b7371732fc56d2f103e0e79f60662760046303b0af5671f
|
|
4
|
+
data.tar.gz: 20676c87b08bd17e425722cc74b8fdbd2a2e4fb97e4c0bb15b63a9f2a1602e45
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 3ca57c3c06de5909d836c9f3da9fd52eb70016262e8cbf5be90bf0082ff579c63c49b7586592eb6e465a090f16126a2e47fdac4f47c21402e4af2da356b2dfea
|
|
7
|
+
data.tar.gz: e68ec46d9a26c71201a456191847825c85d842bcfb76d2001eab07f6c5bbf7147f43feea68eeb85472f2d9b5a91a9ea5dcd43a80a70225fa48edf4e1a35deace
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,59 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.3.0] - 2026-06-15
|
|
11
|
+
|
|
12
|
+
Minor-version release. Contains the same fixes as 0.2.1 (listed below); there
|
|
13
|
+
are no functional changes beyond 0.2.1.
|
|
14
|
+
|
|
15
|
+
### Fixed
|
|
16
|
+
|
|
17
|
+
- **Correct `TimestampType` instants in `create_data_frame`.** Timestamp columns
|
|
18
|
+
were shipped as zone-less Arrow timestamps, so the server interpreted the epoch
|
|
19
|
+
micros as session-local wall-clock and shifted the value by the session time
|
|
20
|
+
zone. They are now tagged UTC; `TimestampNTZType` remains zone-less.
|
|
21
|
+
- **`RuntimeConfig#get` with a non-String default no longer raises.** A non-String
|
|
22
|
+
default (e.g. `conf.get(key, 8)`) was passed straight into a protobuf string
|
|
23
|
+
field, raising `Google::Protobuf::TypeError`. The default is now coerced to a
|
|
24
|
+
String, matching `#set`.
|
|
25
|
+
- **No duplicate rows when an execute stream is retried.** The result accumulator
|
|
26
|
+
was created outside the retry loop, so a mid-stream gRPC failure replayed
|
|
27
|
+
already-consumed Arrow batches and duplicated rows on retry. The accumulator is
|
|
28
|
+
now reset per attempt.
|
|
29
|
+
- **`DataFrame#drop_duplicates_within_watermark` is now watermark-aware.** It was a
|
|
30
|
+
plain alias of `#drop_duplicates` and never set the `within_watermark` flag, so
|
|
31
|
+
it silently performed an ordinary deduplication. Added a `dropDuplicatesWithinWatermark`
|
|
32
|
+
alias.
|
|
33
|
+
- **`SparkSession::Builder#app_name` is now applied.** `create` explicitly skipped
|
|
34
|
+
`spark.app.name`, making `#app_name` a no-op; all builder options are now
|
|
35
|
+
forwarded to the new session.
|
|
36
|
+
- Corrected misleading doc comments for `Functions#nanvl` and `DataFrame#except_all`.
|
|
37
|
+
|
|
38
|
+
## [0.2.1] - 2026-06-15
|
|
39
|
+
|
|
40
|
+
### Fixed
|
|
41
|
+
|
|
42
|
+
- **Correct `TimestampType` instants in `create_data_frame`.** Timestamp columns
|
|
43
|
+
were shipped as zone-less Arrow timestamps, so the server interpreted the epoch
|
|
44
|
+
micros as session-local wall-clock and shifted the value by the session time
|
|
45
|
+
zone. They are now tagged UTC; `TimestampNTZType` remains zone-less.
|
|
46
|
+
- **`RuntimeConfig#get` with a non-String default no longer raises.** A non-String
|
|
47
|
+
default (e.g. `conf.get(key, 8)`) was passed straight into a protobuf string
|
|
48
|
+
field, raising `Google::Protobuf::TypeError`. The default is now coerced to a
|
|
49
|
+
String, matching `#set`.
|
|
50
|
+
- **No duplicate rows when an execute stream is retried.** The result accumulator
|
|
51
|
+
was created outside the retry loop, so a mid-stream gRPC failure replayed
|
|
52
|
+
already-consumed Arrow batches and duplicated rows on retry. The accumulator is
|
|
53
|
+
now reset per attempt.
|
|
54
|
+
- **`DataFrame#drop_duplicates_within_watermark` is now watermark-aware.** It was a
|
|
55
|
+
plain alias of `#drop_duplicates` and never set the `within_watermark` flag, so
|
|
56
|
+
it silently performed an ordinary deduplication. Added a `dropDuplicatesWithinWatermark`
|
|
57
|
+
alias.
|
|
58
|
+
- **`SparkSession::Builder#app_name` is now applied.** `create` explicitly skipped
|
|
59
|
+
`spark.app.name`, making `#app_name` a no-op; all builder options are now
|
|
60
|
+
forwarded to the new session.
|
|
61
|
+
- Corrected misleading doc comments for `Functions#nanvl` and `DataFrame#except_all`.
|
|
62
|
+
|
|
10
63
|
## [0.2.0] - 2026-06-10
|
|
11
64
|
|
|
12
65
|
### Added
|
|
@@ -77,6 +130,8 @@ Initial release.
|
|
|
77
130
|
- Vendored Spark Connect 4.1 protobuf/gRPC definitions and a regeneration script
|
|
78
131
|
(`bin/generate-protos`).
|
|
79
132
|
|
|
80
|
-
[Unreleased]: https://github.com/HyukjinKwon/spark-connect-ruby/compare/v0.2.0...HEAD
|
|
133
|
+
[Unreleased]: https://github.com/HyukjinKwon/spark-connect-ruby/compare/v0.3.0...HEAD
|
|
134
|
+
[0.3.0]: https://github.com/HyukjinKwon/spark-connect-ruby/compare/v0.2.1...v0.3.0
|
|
135
|
+
[0.2.1]: https://github.com/HyukjinKwon/spark-connect-ruby/compare/v0.2.0...v0.2.1
|
|
81
136
|
[0.2.0]: https://github.com/HyukjinKwon/spark-connect-ruby/compare/v0.1.0...v0.2.0
|
|
82
137
|
[0.1.0]: https://github.com/HyukjinKwon/spark-connect-ruby/releases/tag/v0.1.0
|
data/README.md
CHANGED
|
@@ -64,12 +64,14 @@ See the [installation guide](https://hyukjinkwon.github.io/spark-connect-ruby/in
|
|
|
64
64
|
## Installation
|
|
65
65
|
|
|
66
66
|
```bash
|
|
67
|
+
gem install rubygems-requirements-system
|
|
67
68
|
gem install spark-connect
|
|
68
69
|
```
|
|
69
70
|
|
|
70
71
|
Or in a `Gemfile`:
|
|
71
72
|
|
|
72
73
|
```ruby
|
|
74
|
+
plugin "rubygems-requirements-system"
|
|
73
75
|
gem "spark-connect"
|
|
74
76
|
```
|
|
75
77
|
|
|
@@ -80,10 +82,14 @@ gem "spark-connect"
|
|
|
80
82
|
curl -fsSL https://archive.apache.org/dist/spark/spark-4.1.0/spark-4.1.0-bin-hadoop3.tgz | tar xz
|
|
81
83
|
cd spark-4.1.0-bin-hadoop3
|
|
82
84
|
|
|
83
|
-
# Start the Connect server (requires Java 17+)
|
|
84
|
-
|
|
85
|
+
# Start the Connect server (requires Java 17+).
|
|
86
|
+
# Spark 4.0.0+ bundles the Connect server, so no extra packages are needed.
|
|
87
|
+
./sbin/start-connect-server.sh
|
|
85
88
|
```
|
|
86
89
|
|
|
90
|
+
On **Spark 3.5.x** the Connect server is not bundled; pull it in with
|
|
91
|
+
`--packages "org.apache.spark:spark-connect_2.13:3.5.5"` (use a Scala 2.13 distribution).
|
|
92
|
+
|
|
87
93
|
The server listens on `sc://localhost:15002` by default.
|
|
88
94
|
|
|
89
95
|
## Connecting
|
data/lib/spark_connect/arrow.rb
CHANGED
|
@@ -103,7 +103,11 @@ module SparkConnect
|
|
|
103
103
|
when Types::StringType, Types::CharType, Types::VarcharType then :string
|
|
104
104
|
when Types::BinaryType then :binary
|
|
105
105
|
when Types::DateType then :date32
|
|
106
|
-
|
|
106
|
+
# TimestampType is an instant: tag it UTC so the server reads the epoch
|
|
107
|
+
# micros as a point in time rather than session-local wall-clock. The NTZ
|
|
108
|
+
# variant stays zone-less (wall-clock) to match its semantics.
|
|
109
|
+
when Types::TimestampType then ::Arrow::TimestampDataType.new(:micro, GLib::TimeZone.new("UTC"))
|
|
110
|
+
when Types::TimestampNTZType then { type: :timestamp, unit: :micro }
|
|
107
111
|
when Types::ArrayType
|
|
108
112
|
{ type: :list, field: { name: "element", type: arrow_field_type(data_type.element_type) } }
|
|
109
113
|
when Types::StructType
|
data/lib/spark_connect/client.rb
CHANGED
|
@@ -171,16 +171,20 @@ module SparkConnect
|
|
|
171
171
|
tags: @tags
|
|
172
172
|
)
|
|
173
173
|
|
|
174
|
-
|
|
175
|
-
|
|
174
|
+
# Build the accumulator *inside* the retry block so that a mid-stream
|
|
175
|
+
# failure (which restarts the gRPC stream from the beginning) starts from
|
|
176
|
+
# a clean slate. Accumulating into a result created outside the block
|
|
177
|
+
# would re-append already-seen batches and duplicate rows on retry.
|
|
176
178
|
with_retries do
|
|
179
|
+
result = ExecuteResult.new([], nil, nil, [], nil, 0)
|
|
180
|
+
result.pipeline_events = []
|
|
177
181
|
responses = @stub.execute_plan(req, metadata: @metadata)
|
|
178
182
|
responses.each do |resp|
|
|
179
183
|
@server_side_session_id = resp.server_side_session_id unless resp.server_side_session_id.empty?
|
|
180
184
|
accumulate(result, resp)
|
|
181
185
|
end
|
|
186
|
+
result
|
|
182
187
|
end
|
|
183
|
-
result
|
|
184
188
|
end
|
|
185
189
|
|
|
186
190
|
def accumulate(result, resp)
|
data/lib/spark_connect/conf.rb
CHANGED
|
@@ -39,7 +39,7 @@ module SparkConnect
|
|
|
39
39
|
Op.new(get: CR::Get.new(keys: [key.to_s]))
|
|
40
40
|
else
|
|
41
41
|
Op.new(get_with_default: CR::GetWithDefault.new(
|
|
42
|
-
pairs: [Proto::KeyValue.new(key: key.to_s, value: default)]
|
|
42
|
+
pairs: [Proto::KeyValue.new(key: key.to_s, value: default.to_s)]
|
|
43
43
|
))
|
|
44
44
|
end
|
|
45
45
|
resp = @client.config(op)
|
data/lib/spark_connect/data_frame.rb
CHANGED
|
@@ -182,7 +182,24 @@ module SparkConnect
|
|
|
182
182
|
build(deduplicate: dedup)
|
|
183
183
|
end
|
|
184
184
|
alias dropDuplicates drop_duplicates
|
|
185
|
-
|
|
185
|
+
|
|
186
|
+
# Drop duplicate rows within the event-time watermark, optionally restricted
|
|
187
|
+
# to a subset of columns. Unlike {#drop_duplicates}, this is watermark-aware
|
|
188
|
+
# and is intended for streaming DataFrames (mirrors PySpark's
|
|
189
|
+
# `dropDuplicatesWithinWatermark`).
|
|
190
|
+
#
|
|
191
|
+
# @param subset [Array<String>, nil]
|
|
192
|
+
# @return [DataFrame]
|
|
193
|
+
def drop_duplicates_within_watermark(subset = nil)
|
|
194
|
+
dedup =
|
|
195
|
+
if subset.nil? || subset.empty?
|
|
196
|
+
Proto::Deduplicate.new(input: @relation, all_columns_as_keys: true, within_watermark: true)
|
|
197
|
+
else
|
|
198
|
+
Proto::Deduplicate.new(input: @relation, column_names: Array(subset).map(&:to_s), within_watermark: true)
|
|
199
|
+
end
|
|
200
|
+
build(deduplicate: dedup)
|
|
201
|
+
end
|
|
202
|
+
alias dropDuplicatesWithinWatermark drop_duplicates_within_watermark
|
|
186
203
|
|
|
187
204
|
# ---- Ordering ----------------------------------------------------------
|
|
188
205
|
|
|
@@ -311,7 +328,7 @@ module SparkConnect
|
|
|
311
328
|
end
|
|
312
329
|
alias intersectAll intersect_all
|
|
313
330
|
|
|
314
|
-
# Rows in this DataFrame not in `other`
|
|
331
|
+
# Rows in this DataFrame not in `other`, keeping duplicates - Spark's `EXCEPT ALL`.
|
|
315
332
|
# @return [DataFrame]
|
|
316
333
|
def except_all(other)
|
|
317
334
|
set_op(other, :SET_OP_TYPE_EXCEPT, is_all: true)
|
data/lib/spark_connect/functions.rb
CHANGED
|
@@ -85,7 +85,7 @@ module SparkConnect
|
|
|
85
85
|
|
|
86
86
|
# @return [Column] first non-null among the given columns.
|
|
87
87
|
def coalesce(*cols) = Column.invoke("coalesce", *cols.map { |c| _col(c) })
|
|
88
|
-
# @return [Column] `
|
|
88
|
+
# @return [Column] `col1` if it is not NaN, else `col2`.
|
|
89
89
|
def nanvl(col1, col2) = Column.invoke("nanvl", _col(col1), _col(col2))
|
|
90
90
|
|
|
91
91
|
# ---- Constructors of complex types ------------------------------------
|
data/lib/spark_connect/session.rb
CHANGED
|
@@ -308,7 +308,7 @@ module SparkConnect
|
|
|
308
308
|
url = @remote || ENV["SPARK_REMOTE"] || "sc://localhost:15002"
|
|
309
309
|
client = SparkConnectClient.new(ChannelBuilder.new(url))
|
|
310
310
|
session = SparkSession.new(client)
|
|
311
|
-
@options.each { |k, v| session.conf.set(k, v)
|
|
311
|
+
@options.each { |k, v| session.conf.set(k, v) }
|
|
312
312
|
session
|
|
313
313
|
end
|
|
314
314
|
alias build create
|
metadata
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: spark-connect
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.0
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Hyukjin Kwon
|
|
8
|
+
autorequire:
|
|
8
9
|
bindir: bin
|
|
9
10
|
cert_chain: []
|
|
10
|
-
date:
|
|
11
|
+
date: 2026-06-15 00:00:00.000000000 Z
|
|
11
12
|
dependencies:
|
|
12
13
|
- !ruby/object:Gem::Dependency
|
|
13
14
|
name: google-protobuf
|
|
@@ -129,6 +130,7 @@ metadata:
|
|
|
129
130
|
documentation_uri: https://hyukjinkwon.github.io/spark-connect-ruby/
|
|
130
131
|
bug_tracker_uri: https://github.com/HyukjinKwon/spark-connect-ruby/issues
|
|
131
132
|
changelog_uri: https://github.com/HyukjinKwon/spark-connect-ruby/blob/main/CHANGELOG.md
|
|
133
|
+
post_install_message:
|
|
132
134
|
rdoc_options: []
|
|
133
135
|
require_paths:
|
|
134
136
|
- lib
|
|
@@ -143,7 +145,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
143
145
|
- !ruby/object:Gem::Version
|
|
144
146
|
version: '0'
|
|
145
147
|
requirements: []
|
|
146
|
-
rubygems_version:
|
|
148
|
+
rubygems_version: 3.5.22
|
|
149
|
+
signing_key:
|
|
147
150
|
specification_version: 4
|
|
148
151
|
summary: A pure-Ruby client for Apache Spark Connect.
|
|
149
152
|
test_files: []
|