spark-connect-cli 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,43 @@
1
+ name: Publish to PyPI
2
+
3
+ # Publishes to PyPI via OIDC Trusted Publishing (no API token stored).
4
+ # Triggered when you publish a GitHub Release (or manually via workflow_dispatch). The version comes from
5
+ # pyproject.toml — bump it before tagging, since PyPI rejects re-uploads.
6
+ on:
7
+ release:
8
+ types: [published]
9
+ workflow_dispatch: {}
10
+
11
+ jobs:
12
+ build:
13
+ name: Build distribution
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - uses: actions/setup-python@v5
18
+ with:
19
+ python-version: "3.x"
20
+ - name: Build sdist + wheel
21
+ run: |
22
+ python -m pip install --upgrade build
23
+ python -m build
24
+ - uses: actions/upload-artifact@v4
25
+ with:
26
+ name: dist
27
+ path: dist/
28
+
29
+ publish:
30
+ name: Publish to PyPI
31
+ needs: build
32
+ runs-on: ubuntu-latest
33
+ environment:
34
+ name: pypi
35
+ url: https://pypi.org/p/spark-connect-cli
36
+ permissions:
37
+ id-token: write # required for OIDC trusted publishing
38
+ steps:
39
+ - uses: actions/download-artifact@v4
40
+ with:
41
+ name: dist
42
+ path: dist/
43
+ - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,9 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ build/
5
+ dist/
6
+ .venv/
7
+ venv/
8
+ .pytest_cache/
9
+ .spark-connect-cli/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 dengshu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,156 @@
1
+ Metadata-Version: 2.4
2
+ Name: spark-connect-cli
3
+ Version: 0.2.0
4
+ Summary: Agent-friendly Spark Connect CLI: read-only querying + async long-job control. No JVM, no Kerberos on the client.
5
+ Project-URL: Homepage, https://github.com/dengshu2/spark-connect-cli
6
+ Project-URL: Issues, https://github.com/dengshu2/spark-connect-cli/issues
7
+ Author: dengshu
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: agent,cli,clickhouse,hive,llm,spark,spark-connect
11
+ Classifier: Environment :: Console
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Topic :: Database :: Front-Ends
15
+ Requires-Python: >=3.9
16
+ Requires-Dist: pyspark[connect]<4,>=3.5
17
+ Provides-Extra: dev
18
+ Requires-Dist: pytest>=7; extra == 'dev'
19
+ Description-Content-Type: text/markdown
20
+
21
+ # spark-connect-cli (`scq`)
22
+
23
+ An agent-friendly [Spark Connect](https://spark.apache.org/spark-connect/) CLI —
24
+ **read-only querying** plus **async control for long-running jobs**.
25
+
26
+ Built for LLM agents and humans who live in a shell. Unlike `spark-sql` /
27
+ `spark-submit`, the client is a thin **pure-Python gRPC client**: no JVM, and
28
+ **no Kerberos on the client side** — the Spark Connect server authenticates with
29
+ its own keytab, so you just point at `sc://host:15002` and go.
30
+
31
+ ## Why
32
+
33
+ - **JSON-first, read-only by default.** Safe for an agent to call for
34
+ exploration; writes/DDL are blocked unless you opt in (`--allow-ddl`).
35
+ - **Long jobs don't block you.** A multi-minute Spark job shouldn't trap an agent
36
+ in a 30-minute tool call. `scq` submits the job, hands back a durable **job
37
+ id**, and returns immediately. Poll it whenever you like; the handle survives a
38
+ client/container restart because it lives in an on-disk registry.
39
+ - **Stable exit codes** so a caller can branch without scraping text.
40
+
41
+ ## Install
42
+
43
+ ```bash
44
+ pip install spark-connect-cli            # from PyPI
45
+ # or, from source:
46
+ pip install -e .
47
+ ```
48
+
49
+ ## Quick start
50
+
51
+ ```bash
52
+ export SPARK_REMOTE=sc://localhost:15002 # your Spark Connect endpoint
53
+
54
+ scq databases
55
+ scq tables mydb --like '%orders%'
56
+ scq describe mydb.orders
57
+ scq query "SELECT id, name FROM mydb.orders LIMIT 10"
58
+ ```
59
+
60
+ Output is **JSONEachRow** (one JSON object per line) by default; pick another with
61
+ `--format json|csv|tsv|table`.
62
+
63
+ ### Read-only guard
64
+
65
+ `scq query` allows only `SELECT/SHOW/DESCRIBE/EXPLAIN/WITH`. Anything else exits
66
+ with code **3** unless you pass `--allow-ddl`.
67
+
68
+ | exit | meaning |
69
+ |------|---------|
70
+ | 0 | success |
71
+ | 1 | query error (bad SQL) |
72
+ | 2 | connection error |
73
+ | 3 | blocked by the read-only guard |
74
+ | 4 | job-control error (no such job, …) |
75
+
76
+ ## Async jobs (Layer A)
77
+
78
+ Long work runs detached and is tracked by a file-based registry under
79
+ `$SCQ_JOBS_DIR` (default `~/.spark-connect-cli/jobs`).
80
+
81
+ ```bash
82
+ # submit — returns a job id immediately, does NOT block
83
+ scq sync ods.orders --to clickhouse
84
+ # {"job_id": "j-20260625-...", "state": "running", "message": "... poll with ..."}
85
+
86
+ scq jobs list # all jobs + state
87
+ scq jobs status j-20260625-... # full status (rows, timings, pid, exit code)
88
+ scq jobs logs j-20260625-... --tail 40
89
+ scq jobs cancel j-20260625-... # kills the whole process group
90
+ ```
91
+
92
+ Design: each job is a directory with `meta.json` (state machine:
93
+ `submitted → running → succeeded|failed|cancelled`) and `out.log`. The worker
94
+ runs in its **own process group**, so cancel kills the entire tree (no orphans).
95
+ A `running` job whose process has vanished is reconciled to `failed` on the next
96
+ status read, so status never lies.
97
+
98
+ ## Hive → ClickHouse sync
99
+
100
+ `scq sync` is one job kind built on the async subsystem. It uses **Spark direct
101
+ write**: a Spark Connect job reads the Hive table and writes to ClickHouse over
102
+ JDBC. The write runs on the executors, so rows never pass through this process or
103
+ the agent.
104
+
105
+ Modes control write parallelism — `single` (one connection, small tables),
106
+ `parallel` (N partitions, large tables), `auto` (picks by row count).
107
+
108
+ Requires:
109
+ - `clickhouse-jdbc` on the Spark Connect server classpath (`/opt/spark/jars/`),
110
+ - cluster→ClickHouse network egress,
111
+ - a JDBC URL with credentials via `--ch-jdbc` / `$SCQ_CH_JDBC`,
112
+ - the **target ClickHouse table created beforehand** with a suitable engine
113
+ (Spark `append` won't build a usable MergeTree table for you — create it first,
114
+ e.g. with the `chsql` skill).
115
+
116
+ ## Introspection
117
+
118
+ ```bash
119
+ scq meta db.table # one JSON: schema, created time, location,
120
+ # partitions, file count/size, mtime range
121
+ scq meta db.table --count # also run an exact count(*)
122
+
123
+ scq exec 'stages?status=active'             # read-only Spark REST passthrough
124
+ scq exec executors
125
+ scq exec 'stages/<id>/<attempt>/taskSummary?quantiles=0.5,0.95,1.0'   # skew: max/median
126
+ ```
127
+
128
+ `scq exec` auto-discovers the running Spark app via the YARN ResourceManager and
129
+ proxies its monitoring REST API (GET-only). Set the RM base with `$SCQ_YARN_RM`.
130
+
131
+ **Reading `scq exec executors`** — the `maxMemory` field is Spark's
132
+ **storage/cache pool** (`(heap − 300 MB reserved) × 0.6`), *not* the executor's
133
+ total memory: a 512 MB executor reports ~93 MB, a 1536 MB driver ~741 MB. The
134
+ real heap is `spark.executor.memory` (+ off-heap overhead). The `driver` row has
135
+ 0 cores and runs no tasks. With dynamic allocation, idle executors are released —
136
+ so the list may show only the driver when nothing is running.
137
+
138
+ ## Configuration
139
+
140
+ | env | default | meaning |
141
+ |-----|---------|---------|
142
+ | `SPARK_REMOTE` | `sc://localhost:15002` | Spark Connect endpoint |
143
+ | `SCQ_JOBS_DIR` | `~/.spark-connect-cli/jobs` | job registry (put on a persistent volume) |
144
+ | `SCQ_MAX_ROWS` | `10000` | default row cap for `query` |
145
+ | `SCQ_CH_JDBC` | — | ClickHouse JDBC URL (with credentials) used by `scq sync` for its Spark direct write |
146
+ | `SCQ_YARN_RM` | `http://namenode.hive-net:8088` | YARN RM base for `scq exec` |
147
+
148
+ ## Use with an LLM agent
149
+
150
+ `SKILL.md` ships a ready-made skill (discover-before-query workflow, async-job
151
+ etiquette, type-mapping table). Drop it into your agent's skills directory and
152
+ the agent drives `scq` through a shell/Bash tool.
153
+
154
+ ## License
155
+
156
+ MIT
@@ -0,0 +1,136 @@
1
+ # spark-connect-cli (`scq`)
2
+
3
+ An agent-friendly [Spark Connect](https://spark.apache.org/spark-connect/) CLI —
4
+ **read-only querying** plus **async control for long-running jobs**.
5
+
6
+ Built for LLM agents and humans who live in a shell. Unlike `spark-sql` /
7
+ `spark-submit`, the client is a thin **pure-Python gRPC client**: no JVM, and
8
+ **no Kerberos on the client side** — the Spark Connect server authenticates with
9
+ its own keytab, so you just point at `sc://host:15002` and go.
10
+
11
+ ## Why
12
+
13
+ - **JSON-first, read-only by default.** Safe for an agent to call for
14
+ exploration; writes/DDL are blocked unless you opt in (`--allow-ddl`).
15
+ - **Long jobs don't block you.** A multi-minute Spark job shouldn't trap an agent
16
+ in a 30-minute tool call. `scq` submits the job, hands back a durable **job
17
+ id**, and returns immediately. Poll it whenever you like; the handle survives a
18
+ client/container restart because it lives in an on-disk registry.
19
+ - **Stable exit codes** so a caller can branch without scraping text.
20
+
21
+ ## Install
22
+
23
+ ```bash
24
+ pip install spark-connect-cli            # from PyPI
25
+ # or, from source:
26
+ pip install -e .
27
+ ```
28
+
29
+ ## Quick start
30
+
31
+ ```bash
32
+ export SPARK_REMOTE=sc://localhost:15002 # your Spark Connect endpoint
33
+
34
+ scq databases
35
+ scq tables mydb --like '%orders%'
36
+ scq describe mydb.orders
37
+ scq query "SELECT id, name FROM mydb.orders LIMIT 10"
38
+ ```
39
+
40
+ Output is **JSONEachRow** (one JSON object per line) by default; pick another with
41
+ `--format json|csv|tsv|table`.
42
+
43
+ ### Read-only guard
44
+
45
+ `scq query` allows only `SELECT/SHOW/DESCRIBE/EXPLAIN/WITH`. Anything else exits
46
+ with code **3** unless you pass `--allow-ddl`.
47
+
48
+ | exit | meaning |
49
+ |------|---------|
50
+ | 0 | success |
51
+ | 1 | query error (bad SQL) |
52
+ | 2 | connection error |
53
+ | 3 | blocked by the read-only guard |
54
+ | 4 | job-control error (no such job, …) |
55
+
56
+ ## Async jobs (Layer A)
57
+
58
+ Long work runs detached and is tracked by a file-based registry under
59
+ `$SCQ_JOBS_DIR` (default `~/.spark-connect-cli/jobs`).
60
+
61
+ ```bash
62
+ # submit — returns a job id immediately, does NOT block
63
+ scq sync ods.orders --to clickhouse
64
+ # {"job_id": "j-20260625-...", "state": "running", "message": "... poll with ..."}
65
+
66
+ scq jobs list # all jobs + state
67
+ scq jobs status j-20260625-... # full status (rows, timings, pid, exit code)
68
+ scq jobs logs j-20260625-... --tail 40
69
+ scq jobs cancel j-20260625-... # kills the whole process group
70
+ ```
71
+
72
+ Design: each job is a directory with `meta.json` (state machine:
73
+ `submitted → running → succeeded|failed|cancelled`) and `out.log`. The worker
74
+ runs in its **own process group**, so cancel kills the entire tree (no orphans).
75
+ A `running` job whose process has vanished is reconciled to `failed` on the next
76
+ status read, so status never lies.
77
+
78
+ ## Hive → ClickHouse sync
79
+
80
+ `scq sync` is one job kind built on the async subsystem. It uses **Spark direct
81
+ write**: a Spark Connect job reads the Hive table and writes to ClickHouse over
82
+ JDBC. The write runs on the executors, so rows never pass through this process or
83
+ the agent.
84
+
85
+ Modes control write parallelism — `single` (one connection, small tables),
86
+ `parallel` (N partitions, large tables), `auto` (picks by row count).
87
+
88
+ Requires:
89
+ - `clickhouse-jdbc` on the Spark Connect server classpath (`/opt/spark/jars/`),
90
+ - cluster→ClickHouse network egress,
91
+ - a JDBC URL with credentials via `--ch-jdbc` / `$SCQ_CH_JDBC`,
92
+ - the **target ClickHouse table created beforehand** with a suitable engine
93
+ (Spark `append` won't build a usable MergeTree table for you — create it first,
94
+ e.g. with the `chsql` skill).
95
+
96
+ ## Introspection
97
+
98
+ ```bash
99
+ scq meta db.table # one JSON: schema, created time, location,
100
+ # partitions, file count/size, mtime range
101
+ scq meta db.table --count # also run an exact count(*)
102
+
103
+ scq exec 'stages?status=active'             # read-only Spark REST passthrough
104
+ scq exec executors
105
+ scq exec 'stages/<id>/<attempt>/taskSummary?quantiles=0.5,0.95,1.0'   # skew: max/median
106
+ ```
107
+
108
+ `scq exec` auto-discovers the running Spark app via the YARN ResourceManager and
109
+ proxies its monitoring REST API (GET-only). Set the RM base with `$SCQ_YARN_RM`.
110
+
111
+ **Reading `scq exec executors`** — the `maxMemory` field is Spark's
112
+ **storage/cache pool** (`(heap − 300 MB reserved) × 0.6`), *not* the executor's
113
+ total memory: a 512 MB executor reports ~93 MB, a 1536 MB driver ~741 MB. The
114
+ real heap is `spark.executor.memory` (+ off-heap overhead). The `driver` row has
115
+ 0 cores and runs no tasks. With dynamic allocation, idle executors are released —
116
+ so the list may show only the driver when nothing is running.
117
+
118
+ ## Configuration
119
+
120
+ | env | default | meaning |
121
+ |-----|---------|---------|
122
+ | `SPARK_REMOTE` | `sc://localhost:15002` | Spark Connect endpoint |
123
+ | `SCQ_JOBS_DIR` | `~/.spark-connect-cli/jobs` | job registry (put on a persistent volume) |
124
+ | `SCQ_MAX_ROWS` | `10000` | default row cap for `query` |
125
+ | `SCQ_CH_JDBC` | — | ClickHouse JDBC URL (with credentials) used by `scq sync` for its Spark direct write |
126
+ | `SCQ_YARN_RM` | `http://namenode.hive-net:8088` | YARN RM base for `scq exec` |
127
+
128
+ ## Use with an LLM agent
129
+
130
+ `SKILL.md` ships a ready-made skill (discover-before-query workflow, async-job
131
+ etiquette, type-mapping table). Drop it into your agent's skills directory and
132
+ the agent drives `scq` through a shell/Bash tool.
133
+
134
+ ## License
135
+
136
+ MIT
@@ -0,0 +1,151 @@
1
+ ---
2
+ name: spark-connect-cli
3
+ description: >-
4
+ Query Spark / Hive from the shell with the `scq` CLI over Spark Connect, and
5
+ run long Spark jobs (e.g. Hive->ClickHouse syncs) without blocking. Use
6
+ whenever the user wants to read Hive/Spark data, explore databases/tables/
7
+ schema, run a Spark SQL analysis, or sync a Hive table somewhere. Triggers:
8
+ Hive, Spark, Spark SQL, 查 Hive, 跑个 Spark SQL, 看下这个表, 同步到 ClickHouse,
9
+ sync table.
10
+ ---
11
+
12
+ # scq — Spark Connect from the shell
13
+
14
+ `scq` queries a Spark Connect server (JSON-first, **read-only by default**) and
15
+ manages **async long jobs** so you never sit in a blocking tool call.
16
+
17
+ ## Discover before you query
18
+
19
+ Don't guess names. Discover them first:
20
+
21
+ 1. `scq databases` — list databases.
22
+ 2. `scq tables [DB] --like '%keyword%'` — list tables.
23
+ 3. `scq describe <db.table>` — list columns (name, type, comment).
24
+ 4. `scq query "SELECT ..."` — run it once you know the schema.
25
+
26
+ ## Reading output
27
+
28
+ - **stdout** carries data. Default is **JSONEachRow** (NDJSON — one JSON object
29
+ per line). Other formats: `--format json|csv|tsv|table`.
30
+ - **stderr** carries errors as one JSON object `{"error": ..., "code": ...}`.
31
+ - `query` caps at `SCQ_MAX_ROWS` (default 10k); a `{"warning": ...}` on stderr
32
+ means it was truncated — add `LIMIT`/filters or raise `--max-rows`.
33
+
34
+ ## Branch on the exit code
35
+
36
+ `0` ok · `1` query error (fix the SQL) · `2` connection error (check
37
+ `$SPARK_REMOTE`) · `3` read-only guard blocked it · `4` job-control error.
38
+
39
+ ## Read-only by default
40
+
41
+ `scq query` allows only SELECT/SHOW/DESCRIBE/EXPLAIN/WITH. Writes and DDL exit
42
+ with code `3` unless you pass `--allow-ddl`. **Only** add `--allow-ddl` when the
43
+ user explicitly asked to modify data or schema.
44
+
45
+ ## Long jobs — submit, then poll. NEVER block.
46
+
47
+ A full-table sync is a multi-minute Spark job. **Do not** run it in the
48
+ foreground and wait. Submit it, tell the user the job id, and hand control back:
49
+
50
+ ```bash
51
+ scq sync ods.orders --to clickhouse # prints {"job_id": "...", "state": "running"}
52
+ ```
53
+
54
+ Then, *only when the user asks* "how's it going" / after a natural pause:
55
+
56
+ ```bash
57
+ scq jobs status j-20260625-... # state, source_rows, written_rows, exit_code
58
+ scq jobs logs j-20260625-... --tail 40 # recent progress
59
+ scq jobs cancel j-20260625-... # stop it
60
+ ```
61
+
62
+ Etiquette:
63
+ - After submitting, reply with the job id and a one-line "I'll check when you
64
+ want." Don't loop on `status` in a tight wait — let the user drive, or poll on
65
+ a relaxed cadence.
66
+ - Report terminal state plainly: `succeeded` with `written_rows`, or `failed`
67
+ with the tail of the log.
68
+
69
+ ## Hive → ClickHouse sync workflow
70
+
71
+ When the user says "同步 X 表到 ClickHouse":
72
+
73
+ 1. `scq describe <src>` — get the Hive schema.
74
+ 2. Decide the **target database and table**. `--target` takes `db.table`:
75
+ - If the user names a database (e.g. `class_db`), pass it **qualified**:
76
+ `--target class_db.class`. A **bare table name lands in the connection's
77
+ default database** (`default`) — don't let data silently go there.
78
+ - The **database must already exist** (auto-create makes the table, not the
79
+ database). Ensure it first: `chsql query --allow-ddl "CREATE DATABASE IF NOT
80
+ EXISTS class_db"`.
81
+ 3. Make sure the target table is good:
82
+ - For a quick/one-off sync, let `scq sync` auto-create it — but pass
83
+ `--order-by <key>` so it gets a real sort key (otherwise it is created with
84
+ `ORDER BY tuple()`, no primary index).
85
+ - For a production table, **pre-create it** with `chsql query --allow-ddl
86
+ "CREATE TABLE class_db.class (...) ENGINE = MergeTree ORDER BY (...)"` (full
87
+ control over engine, keys, partitioning), then sync.
88
+ 4. Submit: `scq sync <src> --target db.table [--order-by key] [--where ...]`.
89
+ 5. Hand back the job id. Verify with row counts when it finishes.
90
+
91
+ The ClickHouse JDBC connection (`$SCQ_CH_JDBC`) is preconfigured — you do **not**
92
+ pass credentials; just choose the `db.table` with `--target`.
93
+
94
+ ### Spark/Hive → ClickHouse type mapping
95
+
96
+ | Spark/Hive | ClickHouse |
97
+ |------------|------------|
98
+ | boolean | Bool |
99
+ | tinyint / smallint / int / bigint | Int8 / Int16 / Int32 / Int64 |
100
+ | float / double | Float32 / Float64 |
101
+ | decimal(p,s) | Decimal(p,s) |
102
+ | string / varchar / char / binary | String |
103
+ | date | Date32 |
104
+ | timestamp | DateTime64(3) |
105
+
106
+ Nullable columns map to `Nullable(T)`. Nested/complex types default to `String`
107
+ (JSON) — confirm with the user before relying on them.
108
+
109
+ ## Metadata & execution introspection
110
+
111
+ Two general primitives — don't hand-stitch many queries.
112
+
113
+ **Table metadata → `scq meta db.table`** — one JSON: schema (+ ClickHouse type
114
+ mapping), created time, owner, format, HDFS location, partition columns +
115
+ partition list/count, file count/total size, and min/max file modification time
116
+ (i.e. "when did the data arrive"). Add `--count` for an exact row count (runs a
117
+ `count(*)`, so only when asked). For ad-hoc bits you can still use
118
+ `scq query "DESCRIBE EXTENDED t"` / `"SHOW PARTITIONS t"`.
119
+
120
+ **Execution metadata → `scq exec <path>`** — read-only passthrough to the Spark
121
+ REST API (auto-discovers the app, GET-only). The model reads the JSON, so any
122
+ runtime question is the same command with a different path:
123
+
124
+ ```bash
125
+ scq exec 'stages?status=active'        # what's running now
126
+ scq exec sql # each query's plan + metrics
127
+ scq exec executors # cores / memory / GC / shuffle
128
+ scq exec jobs
129
+ scq exec 'stages/<id>/<attempt>/taskSummary?quantiles=0.5,0.95,1.0'
130
+ ```
131
+
132
+ - **Data skew**: pull a stage's `taskSummary` and compare a metric's **max vs
133
+ median** (`executorRunTime`, `shuffleReadBytes`, `shuffleReadRecords`). A large
134
+ `max/median` ratio = a straggler / skewed partition. `…?details=true` on a
135
+ stage lists every task to find the hot one.
136
+ - Stage/job lists can be long — filter (`?status=active`) or fetch one id.
137
+ - For the *plan before running*, use `scq query "EXPLAIN FORMATTED SELECT ..."`.
138
+
139
+ ## Connection
140
+
141
+ `scq --remote sc://host:15002 ...` or set `$SPARK_REMOTE`. No Kerberos or JVM is
142
+ needed on this side — the Spark Connect server does the auth.
143
+
144
+ ## Recipes
145
+
146
+ ```bash
147
+ scq --format table tables analytics --like '%event%'
148
+ scq query --format table "SELECT count(*) FROM analytics.events"
149
+ scq query --max-rows 0 "SELECT * FROM small_dim" # no cap
150
+ scq sync analytics.events --to clickhouse --where "dt='2026-06-25'"
151
+ ```
@@ -0,0 +1,43 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "spark-connect-cli"
7
+ version = "0.2.0"
8
+ description = "Agent-friendly Spark Connect CLI: read-only querying + async long-job control. No JVM, no Kerberos on the client."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "dengshu" }]
13
+ keywords = ["spark", "spark-connect", "cli", "hive", "clickhouse", "llm", "agent"]
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Topic :: Database :: Front-Ends",
18
+ "Environment :: Console",
19
+ ]
20
+ dependencies = [
21
+ "pyspark[connect]>=3.5,<4",
22
+ ]
23
+
24
+ [project.optional-dependencies]
25
+ dev = ["pytest>=7"]
26
+
27
+ [project.urls]
28
+ Homepage = "https://github.com/dengshu2/spark-connect-cli"
29
+ Issues = "https://github.com/dengshu2/spark-connect-cli/issues"
30
+
31
+ [project.scripts]
32
+ scq = "spark_connect_cli.cli:main"
33
+ spark-connect-cli = "spark_connect_cli.cli:main"
34
+
35
+ [tool.hatch.build.targets.wheel]
36
+ packages = ["src/spark_connect_cli"]
37
+
38
+ # Ship SKILL.md inside the package so `scq skill install` can write it out.
39
+ [tool.hatch.build.targets.wheel.force-include]
40
+ "SKILL.md" = "spark_connect_cli/SKILL.md"
41
+
42
+ [tool.pytest.ini_options]
43
+ testpaths = ["tests"]
@@ -0,0 +1,2 @@
1
+ """spark-connect-cli — an agent-friendly Spark Connect CLI."""
2
+ __version__ = "0.2.0"
@@ -0,0 +1,4 @@
1
+ from .cli import main
2
+
3
+ if __name__ == "__main__":
4
+ main()