clusop 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. clusop-0.0.1/.github/workflows/release-pypi.yml +118 -0
  2. clusop-0.0.1/.gitignore +11 -0
  3. clusop-0.0.1/PKG-INFO +107 -0
  4. clusop-0.0.1/README.md +91 -0
  5. clusop-0.0.1/SPEC.md +117 -0
  6. clusop-0.0.1/clusop_autoload.pth +1 -0
  7. clusop-0.0.1/harness/certify.py +69 -0
  8. clusop-0.0.1/pyproject.toml +42 -0
  9. clusop-0.0.1/scala/build.sbt +20 -0
  10. clusop-0.0.1/scala/project/build.properties +1 -0
  11. clusop-0.0.1/scala/src/main/scala/com/clusop/listener/PhotonFallbackListener.scala +36 -0
  12. clusop-0.0.1/scala/src/main/scala/com/clusop/listener/PlanParser.scala +65 -0
  13. clusop-0.0.1/scala/src/main/scala/com/clusop/listener/SignalDispatcher.scala +37 -0
  14. clusop-0.0.1/scala/src/main/scala/com/clusop/listener/WasteCalculator.scala +35 -0
  15. clusop-0.0.1/src/clusop/__init__.py +23 -0
  16. clusop-0.0.1/src/clusop/analysis/__init__.py +1 -0
  17. clusop-0.0.1/src/clusop/analysis/confidence.py +71 -0
  18. clusop-0.0.1/src/clusop/analysis/parser.py +110 -0
  19. clusop-0.0.1/src/clusop/analysis/signal.py +38 -0
  20. clusop-0.0.1/src/clusop/analysis/waste.py +36 -0
  21. clusop-0.0.1/src/clusop/jars/.gitkeep +1 -0
  22. clusop-0.0.1/src/clusop/jars/photon_listener.jar +0 -0
  23. clusop-0.0.1/src/clusop/pricing/__init__.py +1 -0
  24. clusop-0.0.1/src/clusop/pricing/price_table.json +34 -0
  25. clusop-0.0.1/src/clusop/pricing/provider.py +61 -0
  26. clusop-0.0.1/src/clusop/runtime/__init__.py +1 -0
  27. clusop-0.0.1/src/clusop/runtime/bootstrap.py +117 -0
  28. clusop-0.0.1/src/clusop/runtime/detect.py +106 -0
  29. clusop-0.0.1/src/clusop/service/__init__.py +1 -0
  30. clusop-0.0.1/src/clusop/service/aggregator.py +69 -0
  31. clusop-0.0.1/src/clusop/service/onboard.py +34 -0
  32. clusop-0.0.1/src/clusop/service/resolver.py +59 -0
  33. clusop-0.0.1/src/clusop/service/suppression.py +40 -0
  34. clusop-0.0.1/src/clusop/service/teams.py +44 -0
  35. clusop-0.0.1/tests/test_parser.py +61 -0
  36. clusop-0.0.1/tests/test_waste_and_confidence.py +52 -0
@@ -0,0 +1,118 @@
1
+ name: Publish clusop to PyPI
2
+
3
+ # One-click release — auto-versioned (next = max(PyPI, tags) + 1 patch). Builds the
4
+ # Scala listener JAR (sbt), bakes it into the wheel, lints/tests, publishes via
5
+ # Trusted Publishing (OIDC, no stored token), then commits the bump + tag. Mirrors
6
+ # the proven self-healing-kit workflow, with an added JVM build step.
7
+ on:
8
+ workflow_dispatch: {}
9
+
10
+ permissions:
11
+ contents: read
12
+
13
+ jobs:
14
+ release:
15
+ runs-on: ubuntu-latest
16
+ environment: pypi
17
+ permissions:
18
+ contents: write # commit the version bump + tag (after publish)
19
+ id-token: write # OIDC for PyPI Trusted Publishing
20
+ steps:
21
+ - uses: actions/checkout@v6
22
+ with: { ref: main, fetch-depth: 0, fetch-tags: true }
23
+
24
+ - uses: actions/setup-python@v6
25
+ with: { python-version: "3.11" }
26
+
27
+ - name: Set up JDK (for the listener JAR)
28
+ uses: actions/setup-java@v4
29
+ with: { distribution: temurin, java-version: "17", cache: sbt }
30
+
31
+ - name: Install sbt (not preinstalled on ubuntu-latest)
32
+ uses: sbt/setup-sbt@v1
33
+
34
+ - name: Compute next version (latest published + 1 patch)
35
+ id: ver
36
+ run: |
37
+ python -m pip install --quiet packaging
38
+ python - <<'PY' >> "$GITHUB_OUTPUT"
39
+ import json, subprocess, sys, urllib.request
40
+ from packaging.version import Version, InvalidVersion
41
+ PKG = "clusop"; cand = []
42
+ try:
43
+ with urllib.request.urlopen(f"https://pypi.org/pypi/{PKG}/json", timeout=20) as r:
44
+ cand += list(json.load(r).get("releases", {}).keys())
45
+ except Exception as e:
46
+ # stdout is redirected into $GITHUB_OUTPUT — diagnostics MUST go to stderr,
47
+ # else a 404 (first publish, package not yet on PyPI) breaks output parsing.
48
+ print(f"pypi read failed ({e}); tags only", file=sys.stderr)
49
+ tags = subprocess.run(["git","tag","--list","v*"], capture_output=True, text=True).stdout.split()
50
+ cand += [t[1:] for t in tags if t.startswith("v")]
51
+ parsed = []
52
+ for c in cand:
53
+ try: parsed.append(Version(c))
54
+ except InvalidVersion: pass
55
+ base = max(parsed) if parsed else Version("0.0.0")
56
+ nxt = f"{base.major}.{base.minor}.{base.micro + 1}"
57
+ print(f"version={nxt}"); print(f"previous={base}")
58
+ PY
59
+
60
+ - name: Show version
61
+ run: echo "▶ releasing clusop ${{ steps.ver.outputs.version }} (prev ${{ steps.ver.outputs.previous }})"
62
+
63
+ - name: Bump pyproject + __init__ (working tree only)
64
+ run: |
65
+ V="${{ steps.ver.outputs.version }}"
66
+ python - "$V" <<'PY'
67
+ import re, sys, pathlib
68
+ v = sys.argv[1]
69
+ for path, key in (("pyproject.toml", "version"), ("src/clusop/__init__.py", "__version__")):
70
+ p = pathlib.Path(path); t = p.read_text()
71
+ pat = rf'(?m)^{re.escape(key)} = ".*"'
72
+ if not re.search(pat, t): raise SystemExit(f"no {key} in {path}")
73
+ p.write_text(re.sub(pat, f'{key} = "{v}"', t, count=1))
74
+ print("bumped to", v)
75
+ PY
76
+
77
+ - name: Build the listener JAR (sbt) and bake it into the package
78
+ run: |
79
+ cd scala && sbt -batch package && cd ..
80
+ mkdir -p src/clusop/jars
81
+ JAR=$(ls scala/target/*.jar scala/target/scala-*/*.jar 2>/dev/null | head -1)
82
+ cp "$JAR" src/clusop/jars/photon_listener.jar
83
+ echo "bundled JAR: $(ls -la src/clusop/jars/*.jar)"
84
+
85
+ - name: Lint + unit tests
86
+ run: |
87
+ pip install -e ".[dev]"
88
+ ruff check src/ tests/ --ignore E501
89
+ pytest tests/ -q
90
+
91
+ - name: Build sdist + wheel
92
+ run: pip install build && python -m build && ls -l dist/
93
+
94
+ - name: Gate — .pth at wheel root + JAR bundled
95
+ run: |
96
+ WHEEL=$(ls dist/*.whl | head -1)
97
+ python -m zipfile -l "$WHEEL" | awk '{print $1}' | grep -qxE 'clusop_autoload\.pth' || { echo "FAIL: .pth not at wheel root"; exit 1; }
98
+ python -m zipfile -l "$WHEEL" | grep -q 'clusop/jars/.*\.jar' || { echo "FAIL: listener JAR not bundled"; exit 1; }
99
+ echo "OK: .pth at root + JAR bundled"
100
+
101
+ - name: Publish to PyPI (Trusted Publishing / OIDC)
102
+ uses: pypa/gh-action-pypi-publish@release/v1
103
+ with: { skip-existing: true }
104
+
105
+ - name: Commit bump + tag (after a successful publish)
106
+ run: |
107
+ V="${{ steps.ver.outputs.version }}"
108
+ git config user.name "clusop-release-bot"
109
+ git config user.email "clusop-release-bot@users.noreply.github.com"
110
+ git commit -am "release v$V" || echo "nothing to commit"
111
+ git tag "v$V"; git push origin HEAD:main; git push origin "v$V"
112
+
113
+ - name: Summary
114
+ if: always()
115
+ run: |
116
+ echo "## 📦 clusop publish" >> "$GITHUB_STEP_SUMMARY"
117
+ echo "Published \`${{ steps.ver.outputs.version }}\` (prev \`${{ steps.ver.outputs.previous }}\`)" >> "$GITHUB_STEP_SUMMARY"
118
+ echo "https://pypi.org/project/clusop/${{ steps.ver.outputs.version }}/" >> "$GITHUB_STEP_SUMMARY"
@@ -0,0 +1,11 @@
1
+ __pycache__/
2
+ *.pyc
3
+ .pytest_cache/
4
+ .ruff_cache/
5
+ dist/
6
+ build/
7
+ *.egg-info/
8
+ .venv/
9
+ scala/target/
10
+ scala/project/target/
11
+ src/clso/jars/*.jar
clusop-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,107 @@
1
+ Metadata-Version: 2.4
2
+ Name: clusop
3
+ Version: 0.0.1
4
+ Summary: Photon Fallback Analyzer — zero-touch Databricks cost forensics (.pth self-arm + bundled JVM listener)
5
+ Author: yogasathyandrun
6
+ License: Proprietary
7
+ Keywords: cost,databricks,finops,photon,spark
8
+ Requires-Python: >=3.10
9
+ Provides-Extra: dev
10
+ Requires-Dist: pytest>=7; extra == 'dev'
11
+ Requires-Dist: ruff>=0.5; extra == 'dev'
12
+ Provides-Extra: service
13
+ Requires-Dist: databricks-sdk>=0.30; extra == 'service'
14
+ Requires-Dist: requests>=2.31; extra == 'service'
15
+ Description-Content-Type: text/markdown
16
+
17
+ # clusop — Photon Fallback Analyzer
18
+
19
+ `pip install clusop`
20
+
21
+ clusop finds the money Databricks Photon quietly burns. When Photon hits an operator it
22
+ can't run (a Python UDF, a struct-IN filter, an unsupported Delta feature), it silently
23
+ **falls back** to the JVM — you keep paying the 2–2.9× Photon DBU premium while getting
24
+ JVM speed. clusop detects those fallbacks from the executed plan, estimates the wasted
25
+ spend, and proposes a fix. It never touches your job.
26
+
27
+ ## One install, two halves, zero config
28
+
29
+ A single `pip install clusop` ships both halves and arms itself:
30
+
31
+ - a **Python `.pth`** (`clusop_autoload.pth`) that Python's `site` machinery runs at
32
+ interpreter startup → imports `clusop.runtime.bootstrap` → arms automatically. You do
33
+ not `import clusop` anywhere in your job.
34
+ - a bundled **Scala JAR** (`clusop/jars/photon_listener.jar`) — the actual
35
+ `QueryExecutionListener`. The bootstrap `addJar`s it and registers it over Py4J.
36
+
37
+ Install it as a **cluster/job library** (production path) so every interpreter that
38
+ starts already has it. `%pip install clusop` + `dbutils.library.restartPython()` works for
39
+ dev.
40
+
41
+ Everything is **auto-detected** at runtime — cloud (from the instance type), DBR (from
42
+ `DATABRICKS_RUNTIME_VERSION`), cluster shape, and whether the JVM is reachable. No
43
+ per-user setup; a new customer pip-installs and it works.
44
+
45
+ ## What it costs you: nothing if it can't run safely
46
+
47
+ - **Fail-open everywhere.** Any error in arming, listening, parsing, or dispatch is
48
+ swallowed. clusop never breaks, slows, or blocks a customer query.
49
+ - **Needs a SINGLE_USER / dedicated cluster** for the JVM listener. On USER_ISOLATION
50
+ (Shared) clusters the JVM is sealed behind Spark Connect — clusop detects this and stays
51
+ dormant rather than failing.
52
+ - **Propose-never-apply.** clusop emits a signal and a Teams card. A human decides.
53
+
54
+ ## How a signal is born
55
+
56
+ 1. The JVM listener sees a finished query and reads its **executed plan**.
57
+ 2. The **structural parser** ([src/clusop/analysis/parser.py](src/clusop/analysis/parser.py),
58
+ mirrored in Scala) decides if this is a *real* fallback. The hard part: a clean Photon
59
+ query always ends in one **terminal** `ColumnarToRow` (the normal result boundary) —
60
+ that is **not** a fallback. Real fallback is **mid-plan** `ColumnarToRow`,
61
+ `RowToColumnar` round-trips, or `BatchEvalPython`/`ArrowEvalPython`. Counting raw
62
+ occurrences produces false positives; clusop doesn't.
63
+ 3. The **waste model** ([waste.py](src/clusop/analysis/waste.py)) sizes the loss:
64
+ `runtime_hours × Σ(node DBU/hr) × $rate × (photon_premium−1) × fallback_weight`.
65
+ 4. **Confidence** ([confidence.py](src/clusop/analysis/confidence.py)) is decomposed into
66
+ four legs — parse / diagnosis / cost / recommendation. A *fix* (rewrite the UDF) needs
67
+ only parse+diagnosis; a *disable-Photon* recommendation is a dollar decision and is
68
+ capped by the cost leg.
69
+ 5. The signal flows to a **driver batcher** (dedup by signature) → **central aggregator**
70
+ (idempotent upsert, suppression, prioritization) → Teams card.
71
+
72
+ ## Cost: modeled now, billed if granted
73
+
74
+ clusop always works with a **modeled** cost (public price table → MEDIUM cost confidence).
75
+ If the workspace can read `system.billing.usage`, the
76
+ [CostTierResolver](src/clusop/service/resolver.py) reconciles the estimate to **billed**
77
+ dollars (HIGH confidence). Detection never depends on system tables — they only upgrade
78
+ the cost layer. Prices are reference data, not something clusop invents per-row.
79
+
80
+ ## Certifying it's real
81
+
82
+ [harness/certify.py](harness/certify.py) runs on a dedicated cluster, captures real
83
+ executed plans for known clean / known-fallback fixtures, and reports catch-rate plus a
84
+ **DBR-stamped Delta feature-support matrix** — derived, never hand-maintained.
85
+
86
+ ## Layout
87
+
88
+ ```
89
+ src/clusop/
90
+ runtime/ bootstrap (arming), detect (cloud/DBR/shape/jvm)
91
+ analysis/ parser · waste · confidence · signal
92
+ pricing/ price_table.json · provider
93
+ service/ aggregator · resolver · suppression · teams · onboard
94
+ jars/ photon_listener.jar (baked in by the release workflow)
95
+ scala/ the QueryExecutionListener (sbt; mirror of the Python parser)
96
+ harness/ certify.py (catch-rate + Delta matrix, run on a dedicated cluster)
97
+ tests/ parser / waste / confidence
98
+ ```
99
+
100
+ See [SPEC.md](SPEC.md) for the full design and invariants.
101
+
102
+ ## Release
103
+
104
+ Push-button: the [Publish workflow](.github/workflows/release-pypi.yml) auto-versions
105
+ (`max(PyPI, tags)+1`), builds the JAR with sbt, bakes it into the wheel, lints+tests,
106
+ gates on *(.pth at wheel root AND JAR bundled)*, then publishes via PyPI Trusted
107
+ Publishing (OIDC — no stored token) and tags the bump.
clusop-0.0.1/README.md ADDED
@@ -0,0 +1,91 @@
1
+ # clusop — Photon Fallback Analyzer
2
+
3
+ `pip install clusop`
4
+
5
+ clusop finds the money Databricks Photon quietly burns. When Photon hits an operator it
6
+ can't run (a Python UDF, a struct-IN filter, an unsupported Delta feature), it silently
7
+ **falls back** to the JVM — you keep paying the 2–2.9× Photon DBU premium while getting
8
+ JVM speed. clusop detects those fallbacks from the executed plan, estimates the wasted
9
+ spend, and proposes a fix. It never touches your job.
10
+
11
+ ## One install, two halves, zero config
12
+
13
+ A single `pip install clusop` ships both halves and arms itself:
14
+
15
+ - a **Python `.pth`** (`clusop_autoload.pth`) that Python's `site` machinery runs at
16
+ interpreter startup → imports `clusop.runtime.bootstrap` → arms automatically. You do
17
+ not `import clusop` anywhere in your job.
18
+ - a bundled **Scala JAR** (`clusop/jars/photon_listener.jar`) — the actual
19
+ `QueryExecutionListener`. The bootstrap `addJar`s it and registers it over Py4J.
20
+
21
+ Install it as a **cluster/job library** (production path) so every interpreter that
22
+ starts already has it. `%pip install clusop` + `dbutils.library.restartPython()` works for
23
+ dev.
24
+
25
+ Everything is **auto-detected** at runtime — cloud (from the instance type), DBR (from
26
+ `DATABRICKS_RUNTIME_VERSION`), cluster shape, and whether the JVM is reachable. No
27
+ per-user setup; a new customer pip-installs and it works.
28
+
29
+ ## What it costs you: nothing if it can't run safely
30
+
31
+ - **Fail-open everywhere.** Any error in arming, listening, parsing, or dispatch is
32
+ swallowed. clusop never breaks, slows, or blocks a customer query.
33
+ - **Needs a SINGLE_USER / dedicated cluster** for the JVM listener. On USER_ISOLATION
34
+ (Shared) clusters the JVM is sealed behind Spark Connect — clusop detects this and stays
35
+ dormant rather than failing.
36
+ - **Propose-never-apply.** clusop emits a signal and a Teams card. A human decides.
37
+
38
+ ## How a signal is born
39
+
40
+ 1. The JVM listener sees a finished query and reads its **executed plan**.
41
+ 2. The **structural parser** ([src/clusop/analysis/parser.py](src/clusop/analysis/parser.py),
42
+ mirrored in Scala) decides if this is a *real* fallback. The hard part: a clean Photon
43
+ query always ends in one **terminal** `ColumnarToRow` (the normal result boundary) —
44
+ that is **not** a fallback. Real fallback is **mid-plan** `ColumnarToRow`,
45
+ `RowToColumnar` round-trips, or `BatchEvalPython`/`ArrowEvalPython`. Counting raw
46
+ occurrences produces false positives; clusop doesn't.
47
+ 3. The **waste model** ([waste.py](src/clusop/analysis/waste.py)) sizes the loss:
48
+ `runtime_hours × Σ(node DBU/hr) × $rate × (photon_premium−1) × fallback_weight`.
49
+ 4. **Confidence** ([confidence.py](src/clusop/analysis/confidence.py)) is decomposed into
50
+ four legs — parse / diagnosis / cost / recommendation. A *fix* (rewrite the UDF) needs
51
+ only parse+diagnosis; a *disable-Photon* recommendation is a dollar decision and is
52
+ capped by the cost leg.
53
+ 5. The signal flows to a **driver batcher** (dedup by signature) → **central aggregator**
54
+ (idempotent upsert, suppression, prioritization) → Teams card.
55
+
56
+ ## Cost: modeled now, billed if granted
57
+
58
+ clusop always works with a **modeled** cost (public price table → MEDIUM cost confidence).
59
+ If the workspace can read `system.billing.usage`, the
60
+ [CostTierResolver](src/clusop/service/resolver.py) reconciles the estimate to **billed**
61
+ dollars (HIGH confidence). Detection never depends on system tables — they only upgrade
62
+ the cost layer. Prices are reference data, not something clusop invents per-row.
63
+
64
+ ## Certifying it's real
65
+
66
+ [harness/certify.py](harness/certify.py) runs on a dedicated cluster, captures real
67
+ executed plans for known clean / known-fallback fixtures, and reports catch-rate plus a
68
+ **DBR-stamped Delta feature-support matrix** — derived, never hand-maintained.
69
+
70
+ ## Layout
71
+
72
+ ```
73
+ src/clusop/
74
+ runtime/ bootstrap (arming), detect (cloud/DBR/shape/jvm)
75
+ analysis/ parser · waste · confidence · signal
76
+ pricing/ price_table.json · provider
77
+ service/ aggregator · resolver · suppression · teams · onboard
78
+ jars/ photon_listener.jar (baked in by the release workflow)
79
+ scala/ the QueryExecutionListener (sbt; mirror of the Python parser)
80
+ harness/ certify.py (catch-rate + Delta matrix, run on a dedicated cluster)
81
+ tests/ parser / waste / confidence
82
+ ```
83
+
84
+ See [SPEC.md](SPEC.md) for the full design and invariants.
85
+
86
+ ## Release
87
+
88
+ Push-button: the [Publish workflow](.github/workflows/release-pypi.yml) auto-versions
89
+ (`max(PyPI, tags)+1`), builds the JAR with sbt, bakes it into the wheel, lints+tests,
90
+ gates on *(.pth at wheel root AND JAR bundled)*, then publishes via PyPI Trusted
91
+ Publishing (OIDC — no stored token) and tags the bump.
clusop-0.0.1/SPEC.md ADDED
@@ -0,0 +1,117 @@
1
+ # clusop — Design Specification
2
+
3
+ The Photon Fallback Analyzer. Detect silent Photon→JVM fallbacks, price the waste,
4
+ propose a fix. In-process, fail-open, propose-never-apply.
5
+
6
+ ## 1. Problem
7
+
8
+ Photon bills a 2–2.9× DBU premium. When it meets an operator it can't vectorize it
9
+ inserts a `ColumnarToRow` transition and runs that subtree on the JVM — a **fallback**.
10
+ The customer keeps paying the premium for JVM work. Fallbacks are invisible in the UI and
11
+ common: Python/Pandas UDFs, struct/array predicates, some Delta features per DBR, RDD
12
+ APIs. clusop surfaces them with a dollar figure and a confidence.
13
+
14
+ ## 2. Delivery — one pip, self-arming
15
+
16
+ `pip install clusop` ships:
17
+ - `clusop_autoload.pth` force-included at the **wheel root**. Python's `site` module
18
+ executes it at interpreter startup → `import clusop.runtime.bootstrap` → `activate()`.
19
+ Importing the package in user code is **not** required and does **not** arm it.
20
+ - `clusop/jars/photon_listener.jar`, baked in by the release workflow and shipped as a
21
+ package artifact.
22
+
23
+ Arming path matters: only interpreters that **start** with clusop installed arm. Cluster/job
24
+ **library** = production (every interpreter). `%pip install` + `restartPython()` = dev.
25
+
26
+ `bootstrap.activate()`: locate the active `SparkSession` → `jvm_available()` (False on
27
+ Spark Connect / USER_ISOLATION) → `sparkContext.addJar(bundled_jar)` →
28
+ `spark._jvm.com.clusop.listener.PhotonFallbackListener(premium, endpoint)` →
29
+ `listenerManager().register(...)`. A watcher polls for a late session. Driver-only
30
+ (`DB_IS_DRIVER`). Every step is wrapped — failure leaves the customer job untouched.
31
+
32
+ Native alternative: `spark.sql.queryExecutionListeners` conf pointing at the listener
33
+ class (no Py4J), for conf-managed fleets.
34
+
35
+ ## 3. Auto-detection (no per-user config)
36
+
37
+ [detect.py](src/clusop/runtime/detect.py):
38
+ - **cloud** from instance-type prefix (`m5`→aws, `Standard_D`→azure, `n2`→gcp) + host.
39
+ - **DBR** from `DATABRICKS_RUNTIME_VERSION`.
40
+ - **shape** (driver/worker node types, num_workers, photon_enabled, data_security_mode,
41
+ compute_type) → `ClusterShape`.
42
+ - **jvm_available(spark)** → False under Spark Connect.
43
+
44
+ Defaults when unknown: AWS + DBR 14.3/15.4/16.x, all overridable.
45
+
46
+ ## 4. The parser — the moat
47
+
48
+ [parser.py](src/clusop/analysis/parser.py) / `PlanParser.scala` (kept identical).
49
+ `analyze_plan(plan_text, runtime_seconds, photon_enabled) → PlanAnalysis`.
50
+
51
+ Invariant: **terminal vs mid-plan `ColumnarToRow`.** A clean Photon plan ends in exactly
52
+ one root `ColumnarToRow` — the result boundary, **not** a fallback. Fallback =
53
+ - `mid_plan_col2row` ≥ 1, or
54
+ - `RowToColumnar` round-trips, or
55
+ - `BatchEvalPython`/`ArrowEvalPython` (UDF), or
56
+ - zero Photon nodes on a Photon-enabled cluster (`fallback_weight = 1.0`).
57
+
58
+ Outputs `fallback_weight` (0..1, share of the plan off Photon), `photon_ratio`, and a
59
+ `Verdict(kind, cause)`: `fix_fallback` (e.g. cause `python_udf`) vs `disable_photon`
60
+ (round-trip / whole-plan JVM where Photon is pure overhead). Node-count capped
61
+ (`_NODE_CAP`). Empty/garbage plan → safe no-fallback.
62
+
63
+ ## 5. Waste model
64
+
65
+ [waste.py](src/clusop/analysis/waste.py): `modeled_waste(analysis, shape, provider,
66
+ runtime_seconds) → WasteEstimate`. Dimensionally correct:
67
+
68
+ ```
69
+ waste$ = runtime_hours × Σ(node DBU/hr) × dbu_$rate × (photon_premium − 1) × fallback_weight
70
+ ```
71
+
72
+ Unknown node type → 0 (never crash, never guess). `basis="modeled"` until reconciled.
73
+
74
+ ## 6. Confidence — four legs
75
+
76
+ [confidence.py](src/clusop/analysis/confidence.py): `parse`, `diagnosis`, `cost`,
77
+ `recommendation`. The recommendation leg never exceeds the legs it depends on:
78
+ - `fix_fallback` depends on parse+diagnosis (not cost) — a UDF rewrite is right whatever
79
+ the dollar figure.
80
+ - `disable_photon` is a dollar decision — capped by the **cost** leg. Modeled cost
81
+ (MEDIUM/LOW) caps a disable recommendation accordingly.
82
+
83
+ ## 7. Cost tiers — adaptive, never a dependency
84
+
85
+ [resolver.py](src/clusop/service/resolver.py): `probe()` tests
86
+ `SELECT 1 FROM system.billing.usage LIMIT 1`.
87
+ - readable → **Tier-1 billed**, cost leg HIGH; `reconcile(signal)` upgrades modeled→billed.
88
+ - not readable → **Tier-0 modeled**, cost leg MEDIUM.
89
+
90
+ Detection is **always** in-process and never depends on system tables. Fail-open. Never
91
+ present modeled cost as billed.
92
+
93
+ ## 8. Aggregation & action
94
+
95
+ [aggregator.py](src/clusop/service/aggregator.py): `DriverBatcher` (dedup by
96
+ `fallback_signature`, accumulate, evict, flush) → `CentralAggregator` (idempotent upsert,
97
+ resolver reconcile). [suppression.py](src/clusop/service/suppression.py): `passes_gate`
98
+ (min waste / min confidence / cooldown) + `priority(waste, conf)`.
99
+ [teams.py](src/clusop/service/teams.py): propose-never-apply Adaptive Card.
100
+ [onboard.py](src/clusop/service/onboard.py): creates `clusop_signals` in the customer's own
101
+ schema and probes cost tier. clusop writes only to its own schema; reads system tables
102
+ read-only.
103
+
104
+ ## 9. Certification
105
+
106
+ [harness/certify.py](harness/certify.py) on a dedicated cluster: catch-rate over known
107
+ clean/fallback fixtures + a **DBR-stamped Delta feature-support matrix**, derived from
108
+ real executed plans through the shipped parser — never hand-maintained.
109
+
110
+ ## 10. Invariants (non-negotiable)
111
+
112
+ 1. Fail-open — clusop never breaks/slows/blocks a customer query.
113
+ 2. Propose-never-apply — never mutates production compute or jobs.
114
+ 3. Detection in-process; system tables only upgrade cost.
115
+ 4. Modeled cost is never labeled billed.
116
+ 5. Writes confined to clusop's own schema; system tables read-only.
117
+ 6. Terminal `ColumnarToRow` is not a fallback.
@@ -0,0 +1 @@
1
+ import clusop.runtime.bootstrap
@@ -0,0 +1,69 @@
1
+ """clusop certification harness — run on a DEDICATED (single-user) cluster.
2
+
3
+ Produces two artifacts the product depends on (no week-by-week phasing — run it on
4
+ each target DBR):
5
+ 1. catch-rate baseline — does the parser flag known-fallback fixtures and stay
6
+ quiet on clean Photon? (the only question that matters: is the signal real)
7
+ 2. the Delta feature-support matrix — for each (DBR, Delta feature), run a fixture
8
+ and record whether Photon fell back. The matrix is DERIVED here, never
9
+ hand-maintained, and stamped with the DBR it was observed on.
10
+
11
+ Captures the real executed plan via `df._jdf.queryExecution().executedPlan().toString()`
12
+ (needs JVM access — hence dedicated clusters) and feeds it to the SAME structural
13
+ parser the product uses (clusop.analysis.parser), so the harness validates the shipped logic.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import json
19
+
20
+ from clusop.analysis.parser import analyze_plan
21
+ from clusop.runtime.detect import detect_dbr
22
+
23
+
24
+ def _plan(df) -> str:
25
+ # JVM path — dedicated cluster only; this is the certification environment.
26
+ return df._jdf.queryExecution().executedPlan().toString()
27
+
28
+
29
def run(spark) -> dict:
    """Run the certification fixtures on ``spark`` and return the results.

    Args:
        spark: an active SparkSession with JVM access (``_plan`` reads the
            executed plan through ``df._jdf``).

    Returns:
        A dict with three keys:
          * ``"dbr"`` — whatever ``detect_dbr()`` reports;
          * ``"catch_rate"`` — one entry per fixture, either
            ``{"expected", "detected", "correct", "weight", "cause"}`` or
            ``{"error": <message truncated to 160 chars>}`` if the fixture failed;
          * ``"delta_matrix"`` — one ``"<dbr>::<feature>"`` entry per Delta
            feature, currently all marked ``"untested_stub"``.

    The results are also printed as indented JSON. Fixture failures are recorded,
    never raised, so one broken fixture cannot hide the others.
    """
    dbr = detect_dbr()
    results = {"dbr": dbr, "catch_rate": {}, "delta_matrix": {}}

    # ---- catch-rate fixtures -------------------------------------------------
    def _clean_sql():
        # Group on a column of the SAME DataFrame. Grouping on
        # `spark.range(1).id` references an attribute of a different DataFrame,
        # which fails analysis — the clean baseline would then always be
        # recorded as an error instead of being measured.
        base = spark.range(0, 100_000)
        return base.groupBy((base["id"] % 10).alias("k")).count()

    # name -> (builder, expect_fallback)
    fixtures = {"clean_sql": (_clean_sql, False)}

    # a Python-UDF fixture (expected to fall back to BatchEvalPython)
    try:
        from pyspark.sql.functions import udf
        from pyspark.sql.types import StringType

        bucket = udf(lambda v: "hi" if v % 2 else "lo", StringType())
        fixtures["python_udf"] = (lambda: spark.range(0, 100_000).withColumn("b", bucket("id")), True)
    except Exception as e:
        # Record why the fixture is missing rather than dropping it silently;
        # otherwise the report shows only clean fixtures and looks healthy.
        results["catch_rate"]["python_udf"] = {"error": str(e)[:160]}

    for name, (build, expect) in fixtures.items():
        try:
            df = build()
            df.collect()  # execute first so the executed plan reflects a real run
            a = analyze_plan(_plan(df), photon_enabled=True)
            results["catch_rate"][name] = {
                "expected": expect,
                "detected": a.has_fallback,
                "correct": a.has_fallback == expect,
                "weight": a.fallback_weight,
                "cause": a.verdict.cause,
            }
        except Exception as e:
            results["catch_rate"][name] = {"error": str(e)[:160]}

    # ---- Delta feature matrix (stub fixtures; expand per feature) -------------
    for feature in ("deletionVectors", "CDF", "generatedColumns", "schemaEvolution"):
        # production: build a MERGE/UPDATE exercising `feature`, parse, record fellBack.
        results["delta_matrix"][f"{dbr}::{feature}"] = {"fellBack": None, "status": "untested_stub"}

    # default=str: the parser/detector field types are not visible from this
    # module; stringify anything json cannot encode natively so printing the
    # report cannot discard results that were already collected.
    print(json.dumps(results, indent=2, default=str))
    return results
65
+
66
+
67
if __name__ == "__main__":  # pragma: no cover — run inside a Databricks notebook
    from pyspark.sql import SparkSession

    # Reuse the session that is already running (or create one) and certify it.
    session = SparkSession.builder.getOrCreate()
    run(session)
@@ -0,0 +1,42 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "clusop"
7
+ version = "0.0.1"
8
+ description = "Photon Fallback Analyzer — zero-touch Databricks cost forensics (.pth self-arm + bundled JVM listener)"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "Proprietary" }
12
+ authors = [{ name = "yogasathyandrun" }]
13
+ keywords = ["databricks", "photon", "finops", "spark", "cost"]
14
+ dependencies = [] # in-cluster agent stays dependency-free; service extras below
15
+
16
+ [project.optional-dependencies]
17
+ # the off-cluster central service (aggregation + cost-tier resolver + Teams)
18
+ service = ["requests>=2.31", "databricks-sdk>=0.30"]
19
+ dev = ["pytest>=7", "ruff>=0.5"]
20
+
21
+ [tool.hatch.build.targets.wheel]
22
+ packages = ["src/clusop"]
23
+ # ship the compiled Scala listener JAR inside the wheel so `pip install clusop`
24
+ # delivers everything in one artifact (CI's sbt step drops it into src/clusop/jars/).
25
+ artifacts = ["src/clusop/jars/*.jar"]
26
+
27
+ # Ship the self-arming hook at the WHEEL ROOT so pip drops it straight into
28
+ # site-packages, where Python's `site` module runs .pth files at interpreter
29
+ # startup -> `import clusop.runtime.bootstrap` arms the listener. See README.
30
+ [tool.hatch.build.targets.wheel.force-include]
31
+ "clusop_autoload.pth" = "clusop_autoload.pth"
32
+
33
+ [tool.ruff]
34
+ target-version = "py310"
35
+ line-length = 120
36
+
37
+ [tool.ruff.lint]
38
+ select = ["E", "F", "I", "W"]
39
+
40
+ [tool.pytest.ini_options]
41
+ testpaths = ["tests"]
42
+ pythonpath = ["src"]
@@ -0,0 +1,20 @@
1
+ // clusop — Photon fallback listener (the bundled JAR).
2
+ // Built by CI (`sbt package`) and copied into src/clusop/jars/ before the wheel
3
+ // is packed, so `pip install clusop` ships everything in one artifact.
4
+ //
5
+ // Spark/Scala are PROVIDED — supplied by the Databricks Runtime at execution time;
6
+ // we never bundle Spark into the JAR.
7
+
8
+ ThisBuild / scalaVersion := "2.12.18"
9
+
10
+ lazy val clusopListener = (project in file("."))
11
+ .settings(
12
+ name := "clusop-photon-listener",
13
+ version := "0.1.0",
14
+ libraryDependencies ++= Seq(
15
+ "org.apache.spark" %% "spark-sql" % "3.5.0" % Provided,
16
+ "org.apache.spark" %% "spark-core" % "3.5.0" % Provided
17
+ ),
18
+ // a thin JAR — no shading needed since deps are Provided
19
+ Compile / packageBin / artifactPath := baseDirectory.value / "target" / "clusop-photon-listener.jar"
20
+ )
@@ -0,0 +1 @@
1
+ sbt.version=1.9.9
@@ -0,0 +1,36 @@
1
+ package com.clusop.listener
2
+
3
+ import org.apache.spark.sql.util.QueryExecutionListener
4
+ import org.apache.spark.sql.execution.QueryExecution
5
+
6
+ /**
7
+ * Passive QueryExecutionListener — captures the executed physical plan of every
8
+ * query, with zero query wrappers and zero code change for data engineers. Native
9
+ * JVM parsing avoids per-query Py4J callback overhead.
10
+ *
11
+ * CARDINAL RULE: best-effort, non-blocking, must never slow or break the job.
12
+ * Everything is wrapped; the webhook/dispatch is async (see SignalDispatcher).
13
+ *
14
+ * Registered by the clusop `.pth` via:
15
+ * spark._jvm.com.clusop.listener.PhotonFallbackListener(premium, endpoint)
16
+ * spark._jsparkSession.listenerManager().register(listener)
17
+ */
18
class PhotonFallbackListener(photonPremiumMultiplier: Double, signalEndpoint: String)
    extends QueryExecutionListener {

  /**
   * Called after a query succeeds: render its executed plan, run it through
   * PlanParser, and hand any detected fallback to WasteCalculator.
   *
   * Every Throwable is swallowed on purpose — nothing raised here may reach the
   * customer's job.
   */
  override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit =
    try {
      // The executed (physical) plan, rendered as a tree string.
      val planText = qe.executedPlan.treeString
      val analysis = PlanParser.analyze(planText)
      if (analysis.hasFallback)
        WasteCalculator.emit(funcName, durationNs, analysis, photonPremiumMultiplier, signalEndpoint, planText)
    } catch {
      case _: Throwable => () // never propagate into the customer's job
    }

  /** Failed queries are ignored: they did not complete, so no waste is attributed. */
  override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = ()
}