spark-connect-cli 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spark_connect_cli-0.2.0/.github/workflows/publish.yml +43 -0
- spark_connect_cli-0.2.0/.gitignore +9 -0
- spark_connect_cli-0.2.0/LICENSE +21 -0
- spark_connect_cli-0.2.0/PKG-INFO +156 -0
- spark_connect_cli-0.2.0/README.md +136 -0
- spark_connect_cli-0.2.0/SKILL.md +151 -0
- spark_connect_cli-0.2.0/pyproject.toml +43 -0
- spark_connect_cli-0.2.0/src/spark_connect_cli/__init__.py +2 -0
- spark_connect_cli-0.2.0/src/spark_connect_cli/__main__.py +4 -0
- spark_connect_cli-0.2.0/src/spark_connect_cli/cli.py +138 -0
- spark_connect_cli-0.2.0/src/spark_connect_cli/jobs.py +221 -0
- spark_connect_cli-0.2.0/src/spark_connect_cli/meta.py +95 -0
- spark_connect_cli-0.2.0/src/spark_connect_cli/query.py +57 -0
- spark_connect_cli-0.2.0/src/spark_connect_cli/rest.py +53 -0
- spark_connect_cli-0.2.0/src/spark_connect_cli/session.py +72 -0
- spark_connect_cli-0.2.0/src/spark_connect_cli/sync.py +146 -0
- spark_connect_cli-0.2.0/tests/test_guard.py +15 -0
- spark_connect_cli-0.2.0/tests/test_jobs.py +71 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
# Publishes to PyPI via OIDC Trusted Publishing (no API token stored).
|
|
4
|
+
# Triggered when you publish a GitHub Release (or manually via workflow_dispatch). The version comes from
|
|
5
|
+
# pyproject.toml — bump it before tagging, since PyPI rejects re-uploads.
|
|
6
|
+
on:
|
|
7
|
+
release:
|
|
8
|
+
types: [published]
|
|
9
|
+
workflow_dispatch: {}
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
build:
|
|
13
|
+
name: Build distribution
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: "3.x"
|
|
20
|
+
- name: Build sdist + wheel
|
|
21
|
+
run: |
|
|
22
|
+
python -m pip install --upgrade build
|
|
23
|
+
python -m build
|
|
24
|
+
- uses: actions/upload-artifact@v4
|
|
25
|
+
with:
|
|
26
|
+
name: dist
|
|
27
|
+
path: dist/
|
|
28
|
+
|
|
29
|
+
publish:
|
|
30
|
+
name: Publish to PyPI
|
|
31
|
+
needs: build
|
|
32
|
+
runs-on: ubuntu-latest
|
|
33
|
+
environment:
|
|
34
|
+
name: pypi
|
|
35
|
+
url: https://pypi.org/p/spark-connect-cli
|
|
36
|
+
permissions:
|
|
37
|
+
id-token: write # required for OIDC trusted publishing
|
|
38
|
+
steps:
|
|
39
|
+
- uses: actions/download-artifact@v4
|
|
40
|
+
with:
|
|
41
|
+
name: dist
|
|
42
|
+
path: dist/
|
|
43
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 dengshu
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: spark-connect-cli
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Agent-friendly Spark Connect CLI: read-only querying + async long-job control. No JVM, no Kerberos on the client.
|
|
5
|
+
Project-URL: Homepage, https://github.com/dengshu2/spark-connect-cli
|
|
6
|
+
Project-URL: Issues, https://github.com/dengshu2/spark-connect-cli/issues
|
|
7
|
+
Author: dengshu
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: agent,cli,clickhouse,hive,llm,spark,spark-connect
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Database :: Front-Ends
|
|
15
|
+
Requires-Python: >=3.9
|
|
16
|
+
Requires-Dist: pyspark[connect]<4,>=3.5
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: pytest>=7; extra == 'dev'
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# spark-connect-cli (`scq`)
|
|
22
|
+
|
|
23
|
+
An agent-friendly [Spark Connect](https://spark.apache.org/spark-connect/) CLI —
|
|
24
|
+
**read-only querying** plus **async control for long-running jobs**.
|
|
25
|
+
|
|
26
|
+
Built for LLM agents and humans who live in a shell. Unlike `spark-sql` /
|
|
27
|
+
`spark-submit`, the client is a thin **pure-Python gRPC client**: no JVM, and
|
|
28
|
+
**no Kerberos on the client side** — the Spark Connect server authenticates with
|
|
29
|
+
its own keytab, so you just point at `sc://host:15002` and go.
|
|
30
|
+
|
|
31
|
+
## Why
|
|
32
|
+
|
|
33
|
+
- **JSON-first, read-only by default.** Safe for an agent to call for
|
|
34
|
+
exploration; writes/DDL are blocked unless you opt in (`--allow-ddl`).
|
|
35
|
+
- **Long jobs don't block you.** A multi-minute Spark job shouldn't trap an agent
|
|
36
|
+
in a 30-minute tool call. `scq` submits the job, hands back a durable **job
|
|
37
|
+
id**, and returns immediately. Poll it whenever you like; the handle survives a
|
|
38
|
+
client/container restart because it lives in an on-disk registry.
|
|
39
|
+
- **Stable exit codes** so a caller can branch without scraping text.
|
|
40
|
+
|
|
41
|
+
## Install
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install spark-connect-cli # from PyPI
|
|
45
|
+
# or, from source:
|
|
46
|
+
pip install -e .
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Quick start
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
export SPARK_REMOTE=sc://localhost:15002 # your Spark Connect endpoint
|
|
53
|
+
|
|
54
|
+
scq databases
|
|
55
|
+
scq tables mydb --like '%orders%'
|
|
56
|
+
scq describe mydb.orders
|
|
57
|
+
scq query "SELECT id, name FROM mydb.orders LIMIT 10"
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Output is **JSONEachRow** (one JSON object per line) by default; pick another with
|
|
61
|
+
`--format json|csv|tsv|table`.
|
|
62
|
+
|
|
63
|
+
### Read-only guard
|
|
64
|
+
|
|
65
|
+
`scq query` allows only `SELECT/SHOW/DESCRIBE/EXPLAIN/WITH`. Anything else exits
|
|
66
|
+
with code **3** unless you pass `--allow-ddl`.
|
|
67
|
+
|
|
68
|
+
| exit | meaning |
|
|
69
|
+
|------|---------|
|
|
70
|
+
| 0 | success |
|
|
71
|
+
| 1 | query error (bad SQL) |
|
|
72
|
+
| 2 | connection error |
|
|
73
|
+
| 3 | blocked by the read-only guard |
|
|
74
|
+
| 4 | job-control error (no such job, …) |
|
|
75
|
+
|
|
76
|
+
## Async jobs (Layer A)
|
|
77
|
+
|
|
78
|
+
Long work runs detached and is tracked by a file-based registry under
|
|
79
|
+
`$SCQ_JOBS_DIR` (default `~/.spark-connect-cli/jobs`).
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
# submit — returns a job id immediately, does NOT block
|
|
83
|
+
scq sync ods.orders --to clickhouse
|
|
84
|
+
# {"job_id": "j-20260625-...", "state": "running", "message": "... poll with ..."}
|
|
85
|
+
|
|
86
|
+
scq jobs list # all jobs + state
|
|
87
|
+
scq jobs status j-20260625-... # full status (rows, timings, pid, exit code)
|
|
88
|
+
scq jobs logs j-20260625-... --tail 40
|
|
89
|
+
scq jobs cancel j-20260625-... # kills the whole process group
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Design: each job is a directory with `meta.json` (state machine:
|
|
93
|
+
`submitted → running → succeeded|failed|cancelled`) and `out.log`. The worker
|
|
94
|
+
runs in its **own process group**, so cancel kills the entire tree (no orphans).
|
|
95
|
+
A `running` job whose process has vanished is reconciled to `failed` on the next
|
|
96
|
+
status read, so status never lies.
|
|
97
|
+
|
|
98
|
+
## Hive → ClickHouse sync
|
|
99
|
+
|
|
100
|
+
`scq sync` is one job kind built on the async subsystem. It uses **Spark direct
|
|
101
|
+
write**: a Spark Connect job reads the Hive table and writes to ClickHouse over
|
|
102
|
+
JDBC. The write runs on the executors, so rows never pass through this process or
|
|
103
|
+
the agent.
|
|
104
|
+
|
|
105
|
+
Modes control write parallelism — `single` (one connection, small tables),
|
|
106
|
+
`parallel` (N partitions, large tables), `auto` (picks by row count).
|
|
107
|
+
|
|
108
|
+
Requires:
|
|
109
|
+
- `clickhouse-jdbc` on the Spark Connect server classpath (`/opt/spark/jars/`),
|
|
110
|
+
- cluster→ClickHouse network egress,
|
|
111
|
+
- a JDBC URL with credentials via `--ch-jdbc` / `$SCQ_CH_JDBC`,
|
|
112
|
+
- the **target ClickHouse table created beforehand** with a suitable engine
|
|
113
|
+
(Spark `append` won't build a usable MergeTree table for you — create it first,
|
|
114
|
+
e.g. with the `chsql` skill).
|
|
115
|
+
|
|
116
|
+
## Introspection
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
scq meta db.table # one JSON: schema, created time, location,
|
|
120
|
+
# partitions, file count/size, mtime range
|
|
121
|
+
scq meta db.table --count # also run an exact count(*)
|
|
122
|
+
|
|
123
|
+
scq exec stages?status=active # read-only Spark REST passthrough
|
|
124
|
+
scq exec executors
|
|
125
|
+
scq exec stages/<id>/<attempt>/taskSummary?quantiles=0.5,0.95,1.0 # skew: max/median
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
`scq exec` auto-discovers the running Spark app via the YARN ResourceManager and
|
|
129
|
+
proxies its monitoring REST API (GET-only). Set the RM base with `$SCQ_YARN_RM`.
|
|
130
|
+
|
|
131
|
+
**Reading `scq exec executors`** — the `maxMemory` field is Spark's
|
|
132
|
+
**storage/cache pool** (`(heap − 300 MB reserved) × 0.6`), *not* the executor's
|
|
133
|
+
total memory: a 512 MB executor reports ~93 MB, a 1536 MB driver ~741 MB. The
|
|
134
|
+
real heap is `spark.executor.memory` (+ off-heap overhead). The `driver` row has
|
|
135
|
+
0 cores and runs no tasks. With dynamic allocation, idle executors are released —
|
|
136
|
+
so the list may show only the driver when nothing is running.
|
|
137
|
+
|
|
138
|
+
## Configuration
|
|
139
|
+
|
|
140
|
+
| env | default | meaning |
|
|
141
|
+
|-----|---------|---------|
|
|
142
|
+
| `SPARK_REMOTE` | `sc://localhost:15002` | Spark Connect endpoint |
|
|
143
|
+
| `SCQ_JOBS_DIR` | `~/.spark-connect-cli/jobs` | job registry (put on a persistent volume) |
|
|
144
|
+
| `SCQ_MAX_ROWS` | `10000` | default row cap for `query` |
|
|
145
|
+
| `SCQ_CH_JDBC` | — | ClickHouse JDBC URL for `sync` (Spark direct write); same as `--ch-jdbc` |
|
|
146
|
+
| `SCQ_YARN_RM` | `http://namenode.hive-net:8088` | YARN RM base for `scq exec` |
|
|
147
|
+
|
|
148
|
+
## Use with an LLM agent
|
|
149
|
+
|
|
150
|
+
`SKILL.md` ships a ready-made skill (discover-before-query workflow, async-job
|
|
151
|
+
etiquette, type-mapping table). Drop it into your agent's skills directory and
|
|
152
|
+
the agent drives `scq` through a shell/Bash tool.
|
|
153
|
+
|
|
154
|
+
## License
|
|
155
|
+
|
|
156
|
+
MIT
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# spark-connect-cli (`scq`)
|
|
2
|
+
|
|
3
|
+
An agent-friendly [Spark Connect](https://spark.apache.org/spark-connect/) CLI —
|
|
4
|
+
**read-only querying** plus **async control for long-running jobs**.
|
|
5
|
+
|
|
6
|
+
Built for LLM agents and humans who live in a shell. Unlike `spark-sql` /
|
|
7
|
+
`spark-submit`, the client is a thin **pure-Python gRPC client**: no JVM, and
|
|
8
|
+
**no Kerberos on the client side** — the Spark Connect server authenticates with
|
|
9
|
+
its own keytab, so you just point at `sc://host:15002` and go.
|
|
10
|
+
|
|
11
|
+
## Why
|
|
12
|
+
|
|
13
|
+
- **JSON-first, read-only by default.** Safe for an agent to call for
|
|
14
|
+
exploration; writes/DDL are blocked unless you opt in (`--allow-ddl`).
|
|
15
|
+
- **Long jobs don't block you.** A multi-minute Spark job shouldn't trap an agent
|
|
16
|
+
in a 30-minute tool call. `scq` submits the job, hands back a durable **job
|
|
17
|
+
id**, and returns immediately. Poll it whenever you like; the handle survives a
|
|
18
|
+
client/container restart because it lives in an on-disk registry.
|
|
19
|
+
- **Stable exit codes** so a caller can branch without scraping text.
|
|
20
|
+
|
|
21
|
+
## Install
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install spark-connect-cli # from PyPI
|
|
25
|
+
# or, from source:
|
|
26
|
+
pip install -e .
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Quick start
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
export SPARK_REMOTE=sc://localhost:15002 # your Spark Connect endpoint
|
|
33
|
+
|
|
34
|
+
scq databases
|
|
35
|
+
scq tables mydb --like '%orders%'
|
|
36
|
+
scq describe mydb.orders
|
|
37
|
+
scq query "SELECT id, name FROM mydb.orders LIMIT 10"
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Output is **JSONEachRow** (one JSON object per line) by default; pick another with
|
|
41
|
+
`--format json|csv|tsv|table`.
|
|
42
|
+
|
|
43
|
+
### Read-only guard
|
|
44
|
+
|
|
45
|
+
`scq query` allows only `SELECT/SHOW/DESCRIBE/EXPLAIN/WITH`. Anything else exits
|
|
46
|
+
with code **3** unless you pass `--allow-ddl`.
|
|
47
|
+
|
|
48
|
+
| exit | meaning |
|
|
49
|
+
|------|---------|
|
|
50
|
+
| 0 | success |
|
|
51
|
+
| 1 | query error (bad SQL) |
|
|
52
|
+
| 2 | connection error |
|
|
53
|
+
| 3 | blocked by the read-only guard |
|
|
54
|
+
| 4 | job-control error (no such job, …) |
|
|
55
|
+
|
|
56
|
+
## Async jobs (Layer A)
|
|
57
|
+
|
|
58
|
+
Long work runs detached and is tracked by a file-based registry under
|
|
59
|
+
`$SCQ_JOBS_DIR` (default `~/.spark-connect-cli/jobs`).
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# submit — returns a job id immediately, does NOT block
|
|
63
|
+
scq sync ods.orders --to clickhouse
|
|
64
|
+
# {"job_id": "j-20260625-...", "state": "running", "message": "... poll with ..."}
|
|
65
|
+
|
|
66
|
+
scq jobs list # all jobs + state
|
|
67
|
+
scq jobs status j-20260625-... # full status (rows, timings, pid, exit code)
|
|
68
|
+
scq jobs logs j-20260625-... --tail 40
|
|
69
|
+
scq jobs cancel j-20260625-... # kills the whole process group
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Design: each job is a directory with `meta.json` (state machine:
|
|
73
|
+
`submitted → running → succeeded|failed|cancelled`) and `out.log`. The worker
|
|
74
|
+
runs in its **own process group**, so cancel kills the entire tree (no orphans).
|
|
75
|
+
A `running` job whose process has vanished is reconciled to `failed` on the next
|
|
76
|
+
status read, so status never lies.
|
|
77
|
+
|
|
78
|
+
## Hive → ClickHouse sync
|
|
79
|
+
|
|
80
|
+
`scq sync` is one job kind built on the async subsystem. It uses **Spark direct
|
|
81
|
+
write**: a Spark Connect job reads the Hive table and writes to ClickHouse over
|
|
82
|
+
JDBC. The write runs on the executors, so rows never pass through this process or
|
|
83
|
+
the agent.
|
|
84
|
+
|
|
85
|
+
Modes control write parallelism — `single` (one connection, small tables),
|
|
86
|
+
`parallel` (N partitions, large tables), `auto` (picks by row count).
|
|
87
|
+
|
|
88
|
+
Requires:
|
|
89
|
+
- `clickhouse-jdbc` on the Spark Connect server classpath (`/opt/spark/jars/`),
|
|
90
|
+
- cluster→ClickHouse network egress,
|
|
91
|
+
- a JDBC URL with credentials via `--ch-jdbc` / `$SCQ_CH_JDBC`,
|
|
92
|
+
- the **target ClickHouse table created beforehand** with a suitable engine
|
|
93
|
+
(Spark `append` won't build a usable MergeTree table for you — create it first,
|
|
94
|
+
e.g. with the `chsql` skill).
|
|
95
|
+
|
|
96
|
+
## Introspection
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
scq meta db.table # one JSON: schema, created time, location,
|
|
100
|
+
# partitions, file count/size, mtime range
|
|
101
|
+
scq meta db.table --count # also run an exact count(*)
|
|
102
|
+
|
|
103
|
+
scq exec stages?status=active # read-only Spark REST passthrough
|
|
104
|
+
scq exec executors
|
|
105
|
+
scq exec stages/<id>/<attempt>/taskSummary?quantiles=0.5,0.95,1.0 # skew: max/median
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
`scq exec` auto-discovers the running Spark app via the YARN ResourceManager and
|
|
109
|
+
proxies its monitoring REST API (GET-only). Set the RM base with `$SCQ_YARN_RM`.
|
|
110
|
+
|
|
111
|
+
**Reading `scq exec executors`** — the `maxMemory` field is Spark's
|
|
112
|
+
**storage/cache pool** (`(heap − 300 MB reserved) × 0.6`), *not* the executor's
|
|
113
|
+
total memory: a 512 MB executor reports ~93 MB, a 1536 MB driver ~741 MB. The
|
|
114
|
+
real heap is `spark.executor.memory` (+ off-heap overhead). The `driver` row has
|
|
115
|
+
0 cores and runs no tasks. With dynamic allocation, idle executors are released —
|
|
116
|
+
so the list may show only the driver when nothing is running.
|
|
117
|
+
|
|
118
|
+
## Configuration
|
|
119
|
+
|
|
120
|
+
| env | default | meaning |
|
|
121
|
+
|-----|---------|---------|
|
|
122
|
+
| `SPARK_REMOTE` | `sc://localhost:15002` | Spark Connect endpoint |
|
|
123
|
+
| `SCQ_JOBS_DIR` | `~/.spark-connect-cli/jobs` | job registry (put on a persistent volume) |
|
|
124
|
+
| `SCQ_MAX_ROWS` | `10000` | default row cap for `query` |
|
|
125
|
+
| `SCQ_CH_JDBC` | — | ClickHouse JDBC URL for `sync` (Spark direct write); same as `--ch-jdbc` |
|
|
126
|
+
| `SCQ_YARN_RM` | `http://namenode.hive-net:8088` | YARN RM base for `scq exec` |
|
|
127
|
+
|
|
128
|
+
## Use with an LLM agent
|
|
129
|
+
|
|
130
|
+
`SKILL.md` ships a ready-made skill (discover-before-query workflow, async-job
|
|
131
|
+
etiquette, type-mapping table). Drop it into your agent's skills directory and
|
|
132
|
+
the agent drives `scq` through a shell/Bash tool.
|
|
133
|
+
|
|
134
|
+
## License
|
|
135
|
+
|
|
136
|
+
MIT
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: spark-connect-cli
|
|
3
|
+
description: >-
|
|
4
|
+
Query Spark / Hive from the shell with the `scq` CLI over Spark Connect, and
|
|
5
|
+
run long Spark jobs (e.g. Hive->ClickHouse syncs) without blocking. Use
|
|
6
|
+
whenever the user wants to read Hive/Spark data, explore databases/tables/
|
|
7
|
+
schema, run a Spark SQL analysis, or sync a Hive table somewhere. Triggers:
|
|
8
|
+
Hive, Spark, Spark SQL, 查 Hive, 跑个 Spark SQL, 看下这个表, 同步到 ClickHouse,
|
|
9
|
+
sync table.
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
# scq — Spark Connect from the shell
|
|
13
|
+
|
|
14
|
+
`scq` queries a Spark Connect server (JSON-first, **read-only by default**) and
|
|
15
|
+
manages **async long jobs** so you never sit in a blocking tool call.
|
|
16
|
+
|
|
17
|
+
## Discover before you query
|
|
18
|
+
|
|
19
|
+
Don't guess names. Discover them first:
|
|
20
|
+
|
|
21
|
+
1. `scq databases` — list databases.
|
|
22
|
+
2. `scq tables [DB] --like '%keyword%'` — list tables.
|
|
23
|
+
3. `scq describe <db.table>` — list columns (name, type, comment).
|
|
24
|
+
4. `scq query "SELECT ..."` — run it once you know the schema.
|
|
25
|
+
|
|
26
|
+
## Reading output
|
|
27
|
+
|
|
28
|
+
- **stdout** carries data. Default is **JSONEachRow** (NDJSON — one JSON object
|
|
29
|
+
per line). Other formats: `--format json|csv|tsv|table`.
|
|
30
|
+
- **stderr** carries errors as one JSON object `{"error": ..., "code": ...}`.
|
|
31
|
+
- `query` caps at `SCQ_MAX_ROWS` (default 10k); a `{"warning": ...}` on stderr
|
|
32
|
+
means it was truncated — add `LIMIT`/filters or raise `--max-rows`.
|
|
33
|
+
|
|
34
|
+
## Branch on the exit code
|
|
35
|
+
|
|
36
|
+
`0` ok · `1` query error (fix the SQL) · `2` connection error (check
|
|
37
|
+
`$SPARK_REMOTE`) · `3` read-only guard blocked it · `4` job-control error.
|
|
38
|
+
|
|
39
|
+
## Read-only by default
|
|
40
|
+
|
|
41
|
+
`scq query` allows only SELECT/SHOW/DESCRIBE/EXPLAIN/WITH. Writes and DDL exit
|
|
42
|
+
with code `3` unless you pass `--allow-ddl`. **Only** add `--allow-ddl` when the
|
|
43
|
+
user explicitly asked to modify data or schema.
|
|
44
|
+
|
|
45
|
+
## Long jobs — submit, then poll. NEVER block.
|
|
46
|
+
|
|
47
|
+
A full-table sync is a multi-minute Spark job. **Do not** run it in the
|
|
48
|
+
foreground and wait. Submit it, tell the user the job id, and hand control back:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
scq sync ods.orders --to clickhouse # prints {"job_id": "...", "state": "running"}
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Then, *only when the user asks* "how's it going" / after a natural pause:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
scq jobs status j-20260625-... # state, source_rows, written_rows, exit_code
|
|
58
|
+
scq jobs logs j-20260625-... --tail 40 # recent progress
|
|
59
|
+
scq jobs cancel j-20260625-... # stop it
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Etiquette:
|
|
63
|
+
- After submitting, reply with the job id and a one-line "I'll check when you
|
|
64
|
+
want." Don't loop on `status` in a tight wait — let the user drive, or poll on
|
|
65
|
+
a relaxed cadence.
|
|
66
|
+
- Report terminal state plainly: `succeeded` with `written_rows`, or `failed`
|
|
67
|
+
with the tail of the log.
|
|
68
|
+
|
|
69
|
+
## Hive → ClickHouse sync workflow
|
|
70
|
+
|
|
71
|
+
When the user says "同步 X 表到 ClickHouse" ("sync table X to ClickHouse"):
|
|
72
|
+
|
|
73
|
+
1. `scq describe <src>` — get the Hive schema.
|
|
74
|
+
2. Decide the **target database and table**. `--target` takes `db.table`:
|
|
75
|
+
- If the user names a database (e.g. `class_db`), pass it **qualified**:
|
|
76
|
+
`--target class_db.class`. A **bare table name lands in the connection's
|
|
77
|
+
default database** (`default`) — don't let data silently go there.
|
|
78
|
+
- The **database must already exist** (auto-create makes the table, not the
|
|
79
|
+
database). Ensure it first: `chsql query --allow-ddl "CREATE DATABASE IF NOT
|
|
80
|
+
EXISTS class_db"`.
|
|
81
|
+
3. Make sure the target table is good:
|
|
82
|
+
- For a quick/one-off sync, let `scq sync` auto-create it — but pass
|
|
83
|
+
`--order-by <key>` so it gets a real sort key (otherwise it is created with
|
|
84
|
+
`ORDER BY tuple()`, no primary index).
|
|
85
|
+
- For a production table, **pre-create it** with `chsql query --allow-ddl
|
|
86
|
+
"CREATE TABLE class_db.class (...) ENGINE = MergeTree ORDER BY (...)"` (full
|
|
87
|
+
control over engine, keys, partitioning), then sync.
|
|
88
|
+
4. Submit: `scq sync <src> --target db.table [--order-by key] [--where ...]`.
|
|
89
|
+
5. Hand back the job id. Verify with row counts when it finishes.
|
|
90
|
+
|
|
91
|
+
The ClickHouse JDBC connection (`$SCQ_CH_JDBC`) is preconfigured — you do **not**
|
|
92
|
+
pass credentials; just choose the `db.table` with `--target`.
|
|
93
|
+
|
|
94
|
+
### Spark/Hive → ClickHouse type mapping
|
|
95
|
+
|
|
96
|
+
| Spark/Hive | ClickHouse |
|
|
97
|
+
|------------|------------|
|
|
98
|
+
| boolean | Bool |
|
|
99
|
+
| tinyint / smallint / int / bigint | Int8 / Int16 / Int32 / Int64 |
|
|
100
|
+
| float / double | Float32 / Float64 |
|
|
101
|
+
| decimal(p,s) | Decimal(p,s) |
|
|
102
|
+
| string / varchar / char / binary | String |
|
|
103
|
+
| date | Date32 |
|
|
104
|
+
| timestamp | DateTime64(3) |
|
|
105
|
+
|
|
106
|
+
Nullable columns map to `Nullable(T)`. Nested/complex types default to `String`
|
|
107
|
+
(JSON) — confirm with the user before relying on them.
|
|
108
|
+
|
|
109
|
+
## Metadata & execution introspection
|
|
110
|
+
|
|
111
|
+
Two general primitives — don't hand-stitch many queries.
|
|
112
|
+
|
|
113
|
+
**Table metadata → `scq meta db.table`** — one JSON: schema (+ ClickHouse type
|
|
114
|
+
mapping), created time, owner, format, HDFS location, partition columns +
|
|
115
|
+
partition list/count, file count/total size, and min/max file modification time
|
|
116
|
+
(i.e. "when did the data arrive"). Add `--count` for an exact row count (runs a
|
|
117
|
+
`count(*)`, so only when asked). For ad-hoc bits you can still use
|
|
118
|
+
`scq query "DESCRIBE EXTENDED t"` / `"SHOW PARTITIONS t"`.
|
|
119
|
+
|
|
120
|
+
**Execution metadata → `scq exec <path>`** — read-only passthrough to the Spark
|
|
121
|
+
REST API (auto-discovers the app, GET-only). The model reads the JSON, so any
|
|
122
|
+
runtime question is the same command with a different path:
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
scq exec stages?status=active # what's running now
|
|
126
|
+
scq exec sql # each query's plan + metrics
|
|
127
|
+
scq exec executors # cores / memory / GC / shuffle
|
|
128
|
+
scq exec jobs
|
|
129
|
+
scq exec stages/<id>/<attempt>/taskSummary?quantiles=0.5,0.95,1.0
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
- **Data skew**: pull a stage's `taskSummary` and compare a metric's **max vs
|
|
133
|
+
median** (`executorRunTime`, `shuffleReadBytes`, `shuffleReadRecords`). A large
|
|
134
|
+
`max/median` ratio = a straggler / skewed partition. `…?details=true` on a
|
|
135
|
+
stage lists every task to find the hot one.
|
|
136
|
+
- Stage/job lists can be long — filter (`?status=active`) or fetch one id.
|
|
137
|
+
- For the *plan before running*, use `scq query "EXPLAIN FORMATTED SELECT ..."`.
|
|
138
|
+
|
|
139
|
+
## Connection
|
|
140
|
+
|
|
141
|
+
`scq --remote sc://host:15002 ...` or set `$SPARK_REMOTE`. No Kerberos or JVM is
|
|
142
|
+
needed on this side — the Spark Connect server does the auth.
|
|
143
|
+
|
|
144
|
+
## Recipes
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
scq --format table tables analytics --like '%event%'
|
|
148
|
+
scq query --format table "SELECT count(*) FROM analytics.events"
|
|
149
|
+
scq query --max-rows 0 "SELECT * FROM small_dim" # no cap
|
|
150
|
+
scq sync analytics.events --to clickhouse --where "dt='2026-06-25'"
|
|
151
|
+
```
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "spark-connect-cli"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Agent-friendly Spark Connect CLI: read-only querying + async long-job control. No JVM, no Kerberos on the client."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "dengshu" }]
|
|
13
|
+
keywords = ["spark", "spark-connect", "cli", "hive", "clickhouse", "llm", "agent"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Topic :: Database :: Front-Ends",
|
|
18
|
+
"Environment :: Console",
|
|
19
|
+
]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"pyspark[connect]>=3.5,<4",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[project.optional-dependencies]
|
|
25
|
+
dev = ["pytest>=7"]
|
|
26
|
+
|
|
27
|
+
[project.urls]
|
|
28
|
+
Homepage = "https://github.com/dengshu2/spark-connect-cli"
|
|
29
|
+
Issues = "https://github.com/dengshu2/spark-connect-cli/issues"
|
|
30
|
+
|
|
31
|
+
[project.scripts]
|
|
32
|
+
scq = "spark_connect_cli.cli:main"
|
|
33
|
+
spark-connect-cli = "spark_connect_cli.cli:main"
|
|
34
|
+
|
|
35
|
+
[tool.hatch.build.targets.wheel]
|
|
36
|
+
packages = ["src/spark_connect_cli"]
|
|
37
|
+
|
|
38
|
+
# Ship SKILL.md inside the package so `scq skill install` can write it out.
|
|
39
|
+
[tool.hatch.build.targets.wheel.force-include]
|
|
40
|
+
"SKILL.md" = "spark_connect_cli/SKILL.md"
|
|
41
|
+
|
|
42
|
+
[tool.pytest.ini_options]
|
|
43
|
+
testpaths = ["tests"]
|