hotdata-ibis 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hotdata_ibis-0.1.1.dist-info/METADATA +176 -0
- hotdata_ibis-0.1.1.dist-info/RECORD +10 -0
- hotdata_ibis-0.1.1.dist-info/WHEEL +5 -0
- hotdata_ibis-0.1.1.dist-info/entry_points.txt +2 -0
- hotdata_ibis-0.1.1.dist-info/top_level.txt +1 -0
- ibis_hotdata/__init__.py +10 -0
- ibis_hotdata/backend.py +673 -0
- ibis_hotdata/http.py +290 -0
- ibis_hotdata/managed.py +19 -0
- ibis_hotdata/types.py +16 -0
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hotdata-ibis
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Ibis backend for Hotdata federated SQL API (depends on the hotdata SDK only; not hotdata-runtime)
|
|
5
|
+
Author: Hotdata Ibis contributors
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Documentation, https://www.hotdata.dev/docs/api-reference
|
|
8
|
+
Project-URL: Ibis, https://ibis-project.org/
|
|
9
|
+
Keywords: ibis,hotdata,sql,dataframe
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
Requires-Dist: ibis-framework<11,>=10.0
|
|
22
|
+
Requires-Dist: hotdata>=0.2.0
|
|
23
|
+
Requires-Dist: pyarrow>=15
|
|
24
|
+
Requires-Dist: pyarrow-hotfix>=0.6
|
|
25
|
+
Requires-Dist: pandas>=2
|
|
26
|
+
Requires-Dist: sqlglot>=24
|
|
27
|
+
|
|
28
|
+
# hotdata-ibis
|
|
29
|
+
|
|
30
|
+
Experimental [Ibis](https://ibis-project.org/) backend for [Hotdata](https://www.hotdata.dev/docs/api-reference): compile expressions with Ibis, run federated SQL over the Hotdata API. REST calls use the official **[hotdata](https://github.com/hotdata-dev/sdk-python)** Python SDK. Repo examples use **httpx** (listed under the **dev** dependency group).
|
|
31
|
+
|
|
32
|
+
**Requirements:** Python 3.10+, **ibis-framework** 10.x, **hotdata** ≥0.2.
|
|
33
|
+
|
|
34
|
+
## Install
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
uv pip install hotdata-ibis
|
|
38
|
+
# or: python -m pip install hotdata-ibis
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Features
|
|
42
|
+
|
|
43
|
+
- **Ibis connection API** — connect with `ibis.hotdata.connect(...)` or `ibis.connect("hotdata://...")`.
|
|
44
|
+
- **Hotdata catalog mapping** — expose Hotdata connections, schemas, and tables through Ibis catalogs, databases, and tables.
|
|
45
|
+
- **SQL-backed expression execution** — compile Ibis expressions with the Postgres SQLGlot compiler and execute them through Hotdata query APIs.
|
|
46
|
+
- **Typed table discovery** — load schema metadata from Hotdata information schema and map SQL types into Ibis types.
|
|
47
|
+
- **Arrow and pandas results** — materialize expressions as pandas DataFrames, PyArrow tables, or local Arrow record batches.
|
|
48
|
+
- **Raw SQL escape hatch** — use `con.sql(..., dialect="postgres")` when Hotdata-specific federated SQL is clearer than modeled Ibis expressions.
|
|
49
|
+
- **Managed database writes** — create managed connections with `create_database`, load local pandas or PyArrow data through `create_table`, and clean up with `drop_table` / `drop_database`.
|
|
50
|
+
|
|
51
|
+
## Connect
|
|
52
|
+
|
|
53
|
+
Programmatic API:
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
import ibis
|
|
57
|
+
|
|
58
|
+
con = ibis.hotdata.connect(
|
|
59
|
+
api_url="https://api.hotdata.dev",
|
|
60
|
+
token="YOUR_API_TOKEN",
|
|
61
|
+
workspace_id="ws_…",
|
|
62
|
+
session_id=None, # optional: X-Session-Id (sandbox)
|
|
63
|
+
verify_ssl=True,
|
|
64
|
+
timeout=120.0,
|
|
65
|
+
default_connection=None, # Hotdata connection id → Ibis catalog
|
|
66
|
+
default_schema=None, # remote schema → Ibis database
|
|
67
|
+
poll_interval_s=0.25,
|
|
68
|
+
poll_timeout_s=600.0,
|
|
69
|
+
)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
URL style (token may live in the query string or the URL “password” segment):
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
con = ibis.connect(
|
|
76
|
+
"hotdata://api.hotdata.dev/?token=…&workspace_id=ws_…&verify_ssl=true"
|
|
77
|
+
)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
**Mapping:** Ibis **catalog** = Hotdata connection id; **database** = remote schema; **table** = table name. SQL references look like `connection.schema.table`. With a single connection and schema, defaults are inferred; otherwise set `default_connection` / `default_schema` or qualify `con.table(..., database=(conn_id, schema))`.
|
|
81
|
+
|
|
82
|
+
**Execution:** SQL is compiled with Ibis’s **Postgres** SQLGlot compiler. The client submits queries asynchronously with `POST /v1/query`, polls `GET /v1/query-runs/{id}`, then downloads ready results as Arrow IPC from `GET /v1/results/{id}`. Tuning: `poll_interval_s`, `poll_timeout_s` on `connect()`.
|
|
83
|
+
|
|
84
|
+
**Types:** Typed tables come from Hotdata’s information schema. `con.sql(...)` types are inferred from a small preview query and Arrow schema; see [Hotdata SQL](https://www.hotdata.dev/docs/sql) for server behavior.
|
|
85
|
+
|
|
86
|
+
## Ibis Support Overview
|
|
87
|
+
|
|
88
|
+
`hotdata-ibis` is a read-oriented SQL backend. It is useful for exploring Hotdata workspaces with Ibis expressions, running federated SQL, and materializing results locally, but it is not a full mutable database backend.
|
|
89
|
+
|
|
90
|
+
Supported today:
|
|
91
|
+
|
|
92
|
+
- **Connection setup:** `ibis.hotdata.connect(...)` and `ibis.connect("hotdata://...")` with token, workspace, optional sandbox session, TLS, timeout, and polling settings.
|
|
93
|
+
- **Catalog discovery:** `list_catalogs`, `list_databases`, `list_tables`, `current_catalog`, and `current_database` map Hotdata connections and remote schemas into Ibis' catalog/database/table hierarchy.
|
|
94
|
+
- **Table schemas:** `con.table(...)` uses Hotdata information schema column metadata and maps SQL types through Ibis' Postgres type parser.
|
|
95
|
+
- **SQL-backed expressions:** Ibis expressions compile with the Postgres SQLGlot compiler and execute through Hotdata. Common `SELECT` workloads such as projection, filtering, joins, grouping, aggregation, ordering, limits, scalar expressions, and `con.sql(...)` work when the generated SQL is accepted by Hotdata.
|
|
96
|
+
- **Result materialization:** `.execute()` returns pandas objects. `.to_pyarrow()` and `.to_pyarrow_batches()` use the Arrow IPC result data exposed by Hotdata without converting through JSON rows; batches are split locally after the result is downloaded.
|
|
97
|
+
- **Raw SQL escape hatch:** `con.sql("SELECT ...", dialect="postgres")` is the most reliable way to use Hotdata-specific federated table names or SQL that Ibis does not model directly.
|
|
98
|
+
- **Managed database lifecycle:** `create_database("sales", schema="public", tables=["orders"])` registers a managed connection (Ibis catalog). `create_table("orders", pandas_df, database=("sales", "public"))` uploads Parquet and loads it with replace mode. Query as `sales.public.orders` in SQL. `drop_table` clears a managed table; `drop_database` deletes the connection.
|
|
99
|
+
- **Parquet uploads:** `create_table` accepts pandas DataFrames, PyArrow tables, or schema-only empty tables. Tables must live in a managed connection — declare them with `create_database(..., tables=[...])` first. Loads always use replace mode; pass `overwrite=True` to replace an existing synced table (the default `overwrite=False` raises if the table already exists).
|
|
100
|
+
|
|
101
|
+
Not supported as full Ibis backend features:
|
|
102
|
+
|
|
103
|
+
- **General DDL and mutations:** Arbitrary remote DDL, inserts, updates, deletes, and schema-altering operations on external connections are not implemented. Managed-database writes are limited to `create_database`, `create_table`, `drop_table`, and `drop_database` as described above.
|
|
104
|
+
- **Temporary tables and in-memory registration:** `supports_temporary_tables` is false, and in-memory tables are not uploaded automatically for joins.
|
|
105
|
+
- **Python UDFs:** `supports_python_udfs` is false.
|
|
106
|
+
- **Transactions and sessions as database state:** Hotdata sandbox sessions can be passed as `session_id`, but the backend does not expose transaction APIs.
|
|
107
|
+
- **Backend-native SQL dialect:** Compilation uses Ibis' Postgres dialect as the closest fit. Hotdata SQL and federation rules are authoritative, so not every Ibis expression that compiles is guaranteed to execute remotely.
|
|
108
|
+
- **Complete Ibis compliance:** The backend is experimental and has focused test coverage for connection, discovery, schema mapping, execution, uploads, and Arrow results. It has not yet been validated against the full Ibis backend test suite.
|
|
109
|
+
- **Hotdata platform APIs beyond SQL and managed databases:** embeddings, indexes, query history management, sandbox lifecycle management, and other Hotdata-specific APIs are outside the Ibis backend surface.
|
|
110
|
+
|
|
111
|
+
## Development
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
uv sync # installs dev group by default (pytest, ruff, httpx for examples)
|
|
115
|
+
uv run pytest
|
|
116
|
+
uv run ruff check src tests examples
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Lockfile CI: `uv sync --locked && uv run pytest`.
|
|
120
|
+
|
|
121
|
+
## TPC-H for the examples
|
|
122
|
+
|
|
123
|
+
Examples assume something like **`tpch.tpch_sf1.customer`**. Provision TPC-H in your workspace — commonly by creating a **DuckDB** connection, loading DuckDB’s `tpch` extension, and running `CALL dbgen(sf = 1)` (see [DuckDB TPC-H](https://www.duckdb.org/docs/current/core_extensions/tpch.html) and [Hotdata Quick Start](https://www.hotdata.dev/docs/quick-start)). If your data lives under `main` instead, pass `--default-schema` / `--default-connection` or set `HOTDATA_DEFAULT_*` (see `examples/_helpers.py`).
|
|
124
|
+
|
|
125
|
+
## Examples
|
|
126
|
+
|
|
127
|
+
Needs `HOTDATA_API_KEY` and `HOTDATA_WORKSPACE`.
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
uv sync
|
|
131
|
+
export HOTDATA_API_KEY=…
|
|
132
|
+
export HOTDATA_WORKSPACE=…
|
|
133
|
+
uv run python examples/01_catalog_introspection.py
|
|
134
|
+
uv run python examples/02_execute_sql.py 'SELECT COUNT(*) AS n FROM tpch.tpch_sf1.customer'
|
|
135
|
+
uv run python examples/03_connect_via_url.py
|
|
136
|
+
uv run python examples/04_ibis_table_workflows.py
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Ibis tables → pandas DataFrames
|
|
140
|
+
|
|
141
|
+
Calling **`.execute()`** on a table expression runs the compiled SQL on Hotdata and returns a **pandas** `DataFrame` (Ibis’s default for this backend).
|
|
142
|
+
|
|
143
|
+
Hotdata’s SQL often uses a **federated prefix** (for example `tpch.tpch_sf1`) that may not match the Ibis **catalog** string (the connection id). A reliable pattern is to start from **`con.sql("SELECT * FROM tpch.tpch_sf1.mytable", dialect="postgres")`**, then chain filters and aggregates—see **`examples/04_ibis_table_workflows.py`**.
|
|
144
|
+
|
|
145
|
+
When **`con.table("mytable")`** is enough (single connection/schema and names align with compiled SQL), the same operations apply:
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
t = con.table("customer") # or con.table("customer", database=(conn_id, "tpch_sf1"))
|
|
149
|
+
|
|
150
|
+
df = (
|
|
151
|
+
t.filter(t.c_mktsegment == "AUTOMOBILE")
|
|
152
|
+
.select("c_custkey", "c_name")
|
|
153
|
+
.limit(100)
|
|
154
|
+
.execute()
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
by_seg = t.group_by(t.c_mktsegment).agg(n=t.count()).execute()
|
|
158
|
+
|
|
159
|
+
o = con.table("orders")
|
|
160
|
+
orders_with_names = (
|
|
161
|
+
t.join(o, t.c_custkey == o.o_custkey)
|
|
162
|
+
.select(t.c_name, o.o_totalprice)
|
|
163
|
+
.limit(50)
|
|
164
|
+
.execute()
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
total = t.c_acctbal.sum().execute()
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
Other useful paths: **`.to_pyarrow()`** / **`.to_pyarrow_batches()`** for Arrow; **`con.sql("SELECT …", dialect="postgres")`** then chain the returned table expression.
|
|
171
|
+
|
|
172
|
+
## References
|
|
173
|
+
|
|
174
|
+
- [Hotdata Python SDK](https://github.com/hotdata-dev/sdk-python)
|
|
175
|
+
- [Hotdata API](https://www.hotdata.dev/docs/api-reference) · [Hotdata SQL](https://www.hotdata.dev/docs/sql)
|
|
176
|
+
- [Ibis](https://ibis-project.org/) · [Ibis backend hierarchy](https://ibis-project.org/concepts/backend-table-hierarchy)
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
ibis_hotdata/__init__.py,sha256=oFalV5QK1XhHut0zCNaADMTR7eioubYVTJPpVqhFrng,252
|
|
2
|
+
ibis_hotdata/backend.py,sha256=DlzQpE76iBJElDpKIJliytk6iqw9eB8k5g2Fi8_VyVw,24394
|
|
3
|
+
ibis_hotdata/http.py,sha256=W2UuMq56RFV3GA13oNOSuJNHeczV3MlenqMt3sYoksI,10251
|
|
4
|
+
ibis_hotdata/managed.py,sha256=B289vMWtPdrEZ5U4JCGi7qb953SSOBmz4fErtVYzbhk,428
|
|
5
|
+
ibis_hotdata/types.py,sha256=_vuVB2gooJ9BCbvxjswp-PqMCraAYVji744EP_wb2qI,636
|
|
6
|
+
hotdata_ibis-0.1.1.dist-info/METADATA,sha256=m3hn7g9ectyDTsT1z88dT333bTtPXMR3E4BKaelyXbE,10538
|
|
7
|
+
hotdata_ibis-0.1.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
8
|
+
hotdata_ibis-0.1.1.dist-info/entry_points.txt,sha256=tPFpKl3RYEJj2XvxtUMMcvL_Bq1f-HlTn_drbKFECpg,39
|
|
9
|
+
hotdata_ibis-0.1.1.dist-info/top_level.txt,sha256=zV4cow300tmsMSssOOfS3nrlS35lhG_ZX3aNsFJ5hvY,13
|
|
10
|
+
hotdata_ibis-0.1.1.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ibis_hotdata
|
ibis_hotdata/__init__.py
ADDED
ibis_hotdata/backend.py
ADDED
|
@@ -0,0 +1,673 @@
|
|
|
1
|
+
"""Ibis backend for Hotdata (federated SQL over HTTP).
|
|
2
|
+
|
|
3
|
+
Identifier mapping:
|
|
4
|
+
|
|
5
|
+
* **Ibis catalog** ↔ Hotdata ``connection_id`` returned by ``GET /v1/connections`` /
|
|
6
|
+
``connection`` on ``GET /v1/information_schema`` rows.
|
|
7
|
+
* **Ibis database** ↔ remote **schema name** surfaced by Hotdata (`schema` column).
|
|
8
|
+
* **Ibis table name** ↔ remote table name (`table` column).
|
|
9
|
+
|
|
10
|
+
Use fully qualified SQL in Hotdata exactly as documented for federated Postgres-style
|
|
11
|
+
references (typically ``connection.schema.table``, quoting if needed).
|
|
12
|
+
|
|
13
|
+
See the README for dialect and typing limitations.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import contextlib
|
|
19
|
+
import io
|
|
20
|
+
from collections.abc import Iterable, Mapping, Sequence
|
|
21
|
+
from functools import cached_property
|
|
22
|
+
from importlib.metadata import PackageNotFoundError
|
|
23
|
+
from importlib.metadata import version as pkg_version
|
|
24
|
+
from typing import TYPE_CHECKING, Any
|
|
25
|
+
from urllib.parse import ParseResult, parse_qsl, unquote_plus
|
|
26
|
+
|
|
27
|
+
import ibis.backends.sql.compilers as sc
|
|
28
|
+
import ibis.common.exceptions as com
|
|
29
|
+
import ibis.expr.datatypes as dt
|
|
30
|
+
import ibis.expr.operations as ops
|
|
31
|
+
import ibis.expr.schema as sch
|
|
32
|
+
import ibis.expr.types as ir
|
|
33
|
+
import sqlglot as sg
|
|
34
|
+
import sqlglot.expressions as sge
|
|
35
|
+
from ibis.backends import (
|
|
36
|
+
CanCreateDatabase,
|
|
37
|
+
CanListCatalog,
|
|
38
|
+
CanListDatabase,
|
|
39
|
+
HasCurrentCatalog,
|
|
40
|
+
HasCurrentDatabase,
|
|
41
|
+
NoExampleLoader,
|
|
42
|
+
)
|
|
43
|
+
from ibis.backends.sql import SQLBackend
|
|
44
|
+
|
|
45
|
+
from ibis_hotdata.http import HotdataAPIError, HotdataClient
|
|
46
|
+
from ibis_hotdata.managed import DEFAULT_SCHEMA, MANAGED_SOURCE_TYPE
|
|
47
|
+
from ibis_hotdata.types import dtype_from_hotdata_sql_type
|
|
48
|
+
|
|
49
|
+
_INFORMATION_SCHEMA_PAGE_SIZE = 500
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _ibis_err_from_hotdata(exc: HotdataAPIError) -> com.IbisError:
    """Build a generic Ibis error carrying the message of a Hotdata API error.

    The returned exception is not raised here; callers raise it themselves
    (chaining the original with ``from exc``).
    """
    message = str(exc)
    return com.IbisError(message)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
if TYPE_CHECKING:
|
|
57
|
+
from collections.abc import Iterator
|
|
58
|
+
|
|
59
|
+
import pandas as pd
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class Backend(
|
|
63
|
+
SQLBackend,
|
|
64
|
+
CanCreateDatabase,
|
|
65
|
+
CanListCatalog,
|
|
66
|
+
CanListDatabase,
|
|
67
|
+
HasCurrentCatalog,
|
|
68
|
+
HasCurrentDatabase,
|
|
69
|
+
NoExampleLoader,
|
|
70
|
+
):
|
|
71
|
+
"""Hotdata-backed Ibis client."""
|
|
72
|
+
|
|
73
|
+
name = "hotdata"
|
|
74
|
+
compiler = sc.postgres.compiler
|
|
75
|
+
supports_python_udfs = False
|
|
76
|
+
supports_temporary_tables = False
|
|
77
|
+
|
|
78
|
+
_http: HotdataClient
|
|
79
|
+
_default_connection: str | None
|
|
80
|
+
_default_schema: str | None
|
|
81
|
+
|
|
82
|
+
def _from_url(self, url: ParseResult, **kwarg_overrides: Any):
|
|
83
|
+
"""Connect using ``hotdata://host`` or ``hotdata://host/path`` URLs.
|
|
84
|
+
|
|
85
|
+
* Base URL defaults to ``https://{host}`` plus optional leading ``path``.
|
|
86
|
+
* Query string may include ``token``, ``workspace_id``, ``session_id``,
|
|
87
|
+
``timeout``, ``verify_ssl`` (``true`` / ``false``), ``default_connection``,
|
|
88
|
+
``default_schema``, ``poll_interval_s``, ``poll_timeout_s``.
|
|
89
|
+
* If ``token`` is omitted, ``urlparse`` password (`user:TOKEN@`) is accepted.
|
|
90
|
+
"""
|
|
91
|
+
q = dict(parse_qsl(url.query, keep_blank_values=True))
|
|
92
|
+
q.update(kwarg_overrides)
|
|
93
|
+
|
|
94
|
+
netloc = url.netloc
|
|
95
|
+
path_prefix = url.path.rstrip("/")
|
|
96
|
+
if not netloc:
|
|
97
|
+
raise com.IbisError(
|
|
98
|
+
"hotdata:// URL requires a network location, e.g. hotdata://api.hotdata.dev/"
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
verify = q.pop("verify_ssl", None)
|
|
102
|
+
if verify is None:
|
|
103
|
+
verify_ssl: bool | str = True
|
|
104
|
+
elif isinstance(verify, str):
|
|
105
|
+
verify_ssl = verify.lower() in ("true", "1", "yes")
|
|
106
|
+
else:
|
|
107
|
+
verify_ssl = bool(verify)
|
|
108
|
+
|
|
109
|
+
timeout = float(q.pop("timeout", "120"))
|
|
110
|
+
api_url = q.pop("api_url", None) or ("https://" + netloc + path_prefix)
|
|
111
|
+
|
|
112
|
+
token = q.pop("token", None) or (unquote_plus(url.password) if url.password else None)
|
|
113
|
+
workspace_id = q.pop("workspace_id", None)
|
|
114
|
+
|
|
115
|
+
kwargs = dict(
|
|
116
|
+
api_url=api_url,
|
|
117
|
+
token=token,
|
|
118
|
+
workspace_id=workspace_id,
|
|
119
|
+
session_id=q.pop("session_id", None),
|
|
120
|
+
timeout=timeout,
|
|
121
|
+
verify_ssl=verify_ssl,
|
|
122
|
+
default_connection=q.pop("default_connection", None),
|
|
123
|
+
default_schema=q.pop("default_schema", None),
|
|
124
|
+
poll_interval_s=float(q.pop("poll_interval_s", "0.25")),
|
|
125
|
+
poll_timeout_s=float(q.pop("poll_timeout_s", "600")),
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
self._convert_kwargs(kwargs)
|
|
129
|
+
if not kwargs.get("token"):
|
|
130
|
+
raise com.IbisError("Hotdata URL missing token (query parameter or password segment)")
|
|
131
|
+
if not kwargs.get("workspace_id"):
|
|
132
|
+
raise com.IbisError("Hotdata URL missing workspace_id query parameter")
|
|
133
|
+
|
|
134
|
+
return self.connect(**kwargs)
|
|
135
|
+
|
|
136
|
+
def do_connect(
    self,
    *,
    api_url: str,
    token: str,
    workspace_id: str,
    session_id: str | None = None,
    timeout: float = 120.0,
    verify_ssl: bool | str = True,
    default_connection: str | None = None,
    default_schema: str | None = None,
    poll_interval_s: float = 0.25,
    poll_timeout_s: float = 600.0,
) -> None:
    """Set up this backend against a Hotdata workspace.

    Any previously attached HTTP client is closed first, then the default
    catalog / schema and the polling settings are stored and a fresh
    ``HotdataClient`` is created. Queries run through Hotdata's async path and
    ready results are downloaded as Arrow IPC from ``GET /v1/results/{id}``.

    Parameters
    ----------
    api_url
        Base URL of the Hotdata API (e.g. ``https://api.hotdata.dev``).
    token
        Bearer token for the API (``Authorization`` header).
    workspace_id
        Public id of the workspace (``X-Workspace-Id`` header).
    session_id
        Sandbox id, if any (``X-Session-Id`` header).
    timeout
        Per-request HTTP timeout, in seconds.
    verify_ssl
        Forwarded to the Hotdata SDK configuration (boolean or path to a CA
        bundle).
    default_connection
        Default **catalog** (Hotdata connection id). When omitted and the
        workspace exposes exactly one connection, that one is picked
        automatically; otherwise set it or use fully qualified ``database=``.
    default_schema
        Default **database** (remote schema name). When omitted and the
        default connection has a single schema, that one is picked
        automatically.
    poll_interval_s
        Pause between ``GET /v1/query-runs/{id}`` polls.
    poll_timeout_s
        Upper bound on wall-clock time spent waiting for async submissions.
    """
    self.disconnect()

    self._default_connection, self._default_schema = default_connection, default_schema
    self._poll_interval_s, self._poll_timeout_s = poll_interval_s, poll_timeout_s

    client_options = {
        "api_url": api_url,
        "token": token,
        "workspace_id": workspace_id,
        "session_id": session_id,
        "timeout": timeout,
        "verify_ssl": verify_ssl,
    }
    self._http = HotdataClient(**client_options)
|
|
195
|
+
|
|
196
|
+
def disconnect(self) -> None:
    """Close the attached HTTP client, if there is one.

    Safe to call before ``do_connect`` has ever run: a missing or ``None``
    ``_http`` attribute is simply ignored.
    """
    client = getattr(self, "_http", None)
    if client is None:
        return
    client.close()
|
|
199
|
+
|
|
200
|
+
# --- hierarchy ---------------------------------------------------------
|
|
201
|
+
|
|
202
|
+
def _infer_default_connection(self) -> str:
|
|
203
|
+
ids = self._connection_ids()
|
|
204
|
+
if self._default_connection is not None:
|
|
205
|
+
return self._default_connection
|
|
206
|
+
if len(ids) == 1:
|
|
207
|
+
self._default_connection = ids[0]
|
|
208
|
+
return self._default_connection
|
|
209
|
+
raise com.IbisInputError(
|
|
210
|
+
"Multiple Hotdata connections in this workspace — pass default_connection="
|
|
211
|
+
"... when connecting or qualify tables with database=('conn_id','schema')."
|
|
212
|
+
)
|
|
213
|
+
|
|
214
|
+
def _infer_default_schema(self, connection_id: str) -> str:
|
|
215
|
+
schemas = sorted(
|
|
216
|
+
{
|
|
217
|
+
row["schema"]
|
|
218
|
+
for row in self._iterate_information_schema(
|
|
219
|
+
{"connection_id": connection_id}, include_columns=False
|
|
220
|
+
)
|
|
221
|
+
}
|
|
222
|
+
)
|
|
223
|
+
if self._default_schema is not None:
|
|
224
|
+
if self._default_schema not in schemas:
|
|
225
|
+
raise com.IbisInputError(
|
|
226
|
+
f"Unknown schema {self._default_schema!r} for connection {connection_id!r}"
|
|
227
|
+
)
|
|
228
|
+
return self._default_schema
|
|
229
|
+
if len(schemas) == 1:
|
|
230
|
+
self._default_schema = schemas[0]
|
|
231
|
+
return self._default_schema
|
|
232
|
+
raise com.IbisInputError(
|
|
233
|
+
"Could not infer default schema — pass default_schema=... when connecting"
|
|
234
|
+
" or qualify with database=('connection_id','schema')."
|
|
235
|
+
)
|
|
236
|
+
|
|
237
|
+
@property
def current_catalog(self) -> str:
    """Default catalog (Hotdata connection id), as resolved by ``_infer_default_connection``."""
    catalog = self._infer_default_connection()
    return catalog
|
|
240
|
+
|
|
241
|
+
@property
def current_database(self) -> str:
    """Default database (remote schema) of the current catalog, via ``_infer_default_schema``."""
    catalog = self.current_catalog
    return self._infer_default_schema(catalog)
|
|
244
|
+
|
|
245
|
+
# --- catalogs / databases ----------------------------------------------
|
|
246
|
+
|
|
247
|
+
def _to_catalog_db_tuple(self, table_loc: sge.Table):
|
|
248
|
+
"""Use the compiler SQL dialect when stringifying qualifiers (backend name is not a dialect)."""
|
|
249
|
+
|
|
250
|
+
dialect = self.dialect
|
|
251
|
+
if (sg_cat := table_loc.args["catalog"]) is not None:
|
|
252
|
+
sg_cat.args["quoted"] = False
|
|
253
|
+
sg_cat = sg_cat.sql(dialect=dialect)
|
|
254
|
+
if (sg_db := table_loc.args["db"]) is not None:
|
|
255
|
+
sg_db.args["quoted"] = False
|
|
256
|
+
sg_db = sg_db.sql(dialect=dialect)
|
|
257
|
+
|
|
258
|
+
return sg_cat, sg_db
|
|
259
|
+
|
|
260
|
+
def _connection_ids(self) -> list[str]:
|
|
261
|
+
data = self._http.list_connections()
|
|
262
|
+
return [c["id"] for c in data["connections"]]
|
|
263
|
+
|
|
264
|
+
def list_catalogs(self, *, like: str | None = None) -> list[str]:
    """List catalogs (Hotdata connection ids), filtered by ``like`` via ``_filter_with_like``."""
    return self._filter_with_like(self._connection_ids(), like)
|
|
267
|
+
|
|
268
|
+
def list_databases(
    self,
    *,
    like: str | None = None,
    catalog: str | None = None,
) -> list[str]:
    """List databases (remote schema names) of a catalog.

    Parameters
    ----------
    like
        Pattern handed to ``_filter_with_like``.
    catalog
        Connection to inspect; a falsy value falls back to
        ``current_catalog``.

    Returns
    -------
    list[str]
        Distinct schema names in sorted order, after filtering.
    """
    connection_id = catalog if catalog else self.current_catalog
    distinct = set()
    for row in self._iterate_information_schema(
        {"connection_id": connection_id}, include_columns=False
    ):
        distinct.add(row["schema"])
    return self._filter_with_like(sorted(distinct), like)
|
|
284
|
+
|
|
285
|
+
def list_tables(
    self,
    *,
    like: str | None = None,
    database: tuple[str, str] | str | None = None,
) -> list[str]:
    """List table names in a database.

    Parameters
    ----------
    like
        Pattern handed to ``_filter_with_like``.
    database
        Location to inspect. ``None`` means
        ``(current_catalog, current_database)``; any other value is parsed
        with ``_to_sqlglot_table``.

    Returns
    -------
    list[str]
        Distinct table names in sorted order, after filtering.
    """
    if database is None:
        database = (self.current_catalog, self.current_database)
    catalog_part, schema_part = self._to_catalog_db_tuple(self._to_sqlglot_table(database))

    # Only qualifiers that are actually present become information-schema filters.
    candidates = (("connection_id", catalog_part), ("schema", schema_part))
    params: dict[str, Any] = {key: value for key, value in candidates if value}

    names = set()
    for row in self._iterate_information_schema(params, include_columns=False):
        names.add(row["table"])
    return self._filter_with_like(sorted(names), like)
|
|
307
|
+
|
|
308
|
+
def _iterate_information_schema(
|
|
309
|
+
self, filters: Mapping[str, Any], *, include_columns: bool
|
|
310
|
+
) -> Iterable[dict[str, Any]]:
|
|
311
|
+
cursor: str | None = None
|
|
312
|
+
while True:
|
|
313
|
+
params: dict[str, Any] = dict(filters)
|
|
314
|
+
params["limit"] = _INFORMATION_SCHEMA_PAGE_SIZE
|
|
315
|
+
params["include_columns"] = include_columns
|
|
316
|
+
if cursor:
|
|
317
|
+
params["cursor"] = cursor
|
|
318
|
+
chunk = self._http.get_information_schema(params)
|
|
319
|
+
yield from chunk["tables"]
|
|
320
|
+
if not chunk.get("has_more"):
|
|
321
|
+
break
|
|
322
|
+
cursor = chunk.get("next_cursor")
|
|
323
|
+
if cursor is None:
|
|
324
|
+
break
|
|
325
|
+
|
|
326
|
+
def _resolve_connection(self, name_or_id: str) -> dict[str, Any]:
|
|
327
|
+
data = self._http.list_connections()
|
|
328
|
+
for conn in data["connections"]:
|
|
329
|
+
if conn["id"] == name_or_id or conn.get("name") == name_or_id:
|
|
330
|
+
return conn
|
|
331
|
+
raise com.IbisError(f"Unknown Hotdata connection {name_or_id!r}")
|
|
332
|
+
|
|
333
|
+
def _resolve_managed_connection(self, name_or_id: str) -> dict[str, Any]:
    """Resolve a connection and require its ``source_type`` to be ``MANAGED_SOURCE_TYPE``.

    Raises
    ------
    ibis.common.exceptions.IbisInputError
        If the resolved connection has a different ``source_type``.
        (Lookup failures propagate from ``_resolve_connection``.)
    """
    conn = self._resolve_connection(name_or_id)
    if conn.get("source_type") == MANAGED_SOURCE_TYPE:
        return conn
    raise com.IbisInputError(
        f"{name_or_id!r} is not a managed database "
        f"(source_type={conn.get('source_type')!r})"
    )
|
|
341
|
+
|
|
342
|
+
def _managed_table_synced(
|
|
343
|
+
self,
|
|
344
|
+
connection_id: str,
|
|
345
|
+
schema_name: str,
|
|
346
|
+
table_name: str,
|
|
347
|
+
) -> bool:
|
|
348
|
+
"""Return True only if the table exists and its last load has completed.
|
|
349
|
+
|
|
350
|
+
A table whose ``synced`` flag is False is still being loaded; we treat
|
|
351
|
+
it as writable (returns False) so that an in-progress load can be
|
|
352
|
+
retried without requiring ``overwrite=True``. Tables not present in the
|
|
353
|
+
information schema also return False (not yet created).
|
|
354
|
+
"""
|
|
355
|
+
for row in self._iterate_information_schema(
|
|
356
|
+
{"connection_id": connection_id, "schema": schema_name, "table": table_name},
|
|
357
|
+
include_columns=False,
|
|
358
|
+
):
|
|
359
|
+
if row["table"] == table_name and row["schema"] == schema_name:
|
|
360
|
+
return bool(row.get("synced", True))
|
|
361
|
+
return False
|
|
362
|
+
|
|
363
|
+
def _table_location(
    self,
    database: tuple[str, str] | str | None,
) -> tuple[str, str]:
    """Resolve ``database`` to a ``(managed connection id, schema)`` pair.

    Parameters
    ----------
    database
        ``None`` uses the configured ``default_connection`` and
        ``default_schema`` (both must be set; nothing is inferred here). A
        tuple is unpacked as ``(connection, schema)``. Any other value is
        taken as a schema name, with the connection coming from the
        configured default or ``current_catalog``.

    Returns
    -------
    tuple[str, str]
        The ``id`` of the connection as resolved by
        ``_resolve_managed_connection``, and the schema name.

    Raises
    ------
    ibis.common.exceptions.IbisInputError
        If ``database`` is ``None`` and a default is missing, or no
        connection can be determined for a schema-only ``database``.
    """
    if database is None:
        if self._default_connection is None or self._default_schema is None:
            raise com.IbisInputError(
                "Requires database=(catalog, schema) or default_connection and default_schema"
            )
        conn = self._default_connection
        schema = self._default_schema
    elif isinstance(database, tuple):
        conn, schema = database
    else:
        conn = self._default_connection or self.current_catalog
        schema = database
        # NOTE(review): nesting of this guard was reconstructed from a
        # de-indented listing; the message suggests it belongs to the
        # schema-only branch — confirm against the upstream file.
        if conn is None:
            raise com.IbisInputError(
                "create_table with database=schema requires default_connection or current catalog"
            )
    # The caller may pass a connection name; the record's id is what is returned.
    conn_record = self._resolve_managed_connection(conn)
    return conn_record["id"], schema
|
|
385
|
+
|
|
386
|
+
# --- schema / sql execution --------------------------------------------
|
|
387
|
+
|
|
388
|
+
def get_schema(
    self,
    table_name: str,
    *,
    catalog: str | None = None,
    database: str | None = None,
) -> sch.Schema:
    """Build the Ibis schema of a table from Hotdata information-schema columns.

    Parameters
    ----------
    table_name
        Remote table name.
    catalog
        Connection to look in; a falsy value falls back to
        ``current_catalog``.
    database
        Remote schema to look in; a falsy value falls back to
        ``current_database``.

    Returns
    -------
    ibis.expr.schema.Schema
        One entry per column of the first matching row, typed through
        ``dtype_from_hotdata_sql_type`` (columns default to nullable).

    Raises
    ------
    ibis.common.exceptions.TableNotFound
        If no information-schema row matches the table and schema name.
    """
    conn = catalog or self.current_catalog
    schema_name = database or self.current_database

    lookup = {"connection_id": conn, "schema": schema_name, "table": table_name}
    # The listing is consumed in full; only the first match is used below.
    matches: list[dict[str, Any]] = [
        row
        for row in self._iterate_information_schema(lookup, include_columns=True)
        if row["table"] == table_name and row["schema"] == schema_name
    ]
    if not matches:
        missing = sg.table(table_name, db=schema_name, catalog=conn)
        raise com.TableNotFound(missing.sql(self.compiler.dialect))

    fields: dict[str, dt.DataType] = {
        col["name"]: dtype_from_hotdata_sql_type(
            col.get("data_type"), nullable=bool(col.get("nullable", True))
        )
        for col in (matches[0].get("columns") or [])
    }
    return sch.Schema(fields)
|
|
416
|
+
|
|
417
|
+
def _get_schema_using_query(self, query: str) -> sch.Schema:
    """Infer the schema of an arbitrary query from a one-row preview.

    The query (surrounding whitespace and trailing semicolons removed) is
    wrapped in ``SELECT * FROM (...) AS ibis_hotdata_preview LIMIT 1``,
    executed through the HTTP client, and the Arrow schema of the returned
    table is converted to an Ibis schema.

    Raises
    ------
    ibis.common.exceptions.IbisError
        If the Hotdata API reports an error for the preview query.
    """
    stripped = query.strip().rstrip(";")
    preview_sql = f"SELECT * FROM ({stripped}) AS ibis_hotdata_preview LIMIT 1"

    try:
        polling = {
            "poll_interval_s": self._poll_interval_s,
            "poll_timeout_s": self._poll_timeout_s,
        }
        data = self._http.execute_query(preview_sql, **polling)
    except HotdataAPIError as exc:
        raise _ibis_err_from_hotdata(exc) from exc

    from ibis.formats.pyarrow import PyArrowSchema

    arrow_schema = data["pa_table"].schema
    return PyArrowSchema.to_ibis(arrow_schema)
|
|
432
|
+
|
|
433
|
+
@contextlib.contextmanager
def _safe_raw_sql(
    self,
    query: str | sge.Expression,
) -> Iterator[Any]:
    """Execute ``query`` and yield the resulting Arrow table.

    A non-string ``query`` is first rendered to pretty-printed SQL in the
    compiler's dialect. ``HotdataAPIError`` is re-raised as the matching Ibis
    error before anything is yielded.
    """
    sql_text = (
        query
        if isinstance(query, str)
        else query.sql(dialect=self.compiler.dialect, pretty=True)
    )

    try:
        result = self._http.execute_query(
            sql_text,
            poll_interval_s=self._poll_interval_s,
            poll_timeout_s=self._poll_timeout_s,
        )
    except HotdataAPIError as exc:
        raise _ibis_err_from_hotdata(exc) from exc

    yield result["pa_table"]
|
|
451
|
+
|
|
452
|
+
def _fetch_from_cursor(self, cursor, schema: sch.Schema) -> pd.DataFrame:
    """Convert ``cursor`` to pandas and coerce the frame to ``schema``.

    ``cursor`` only needs a ``to_pandas()`` method.
    """
    from ibis.formats.pandas import PandasData

    return PandasData.convert_table(cursor.to_pandas(), schema)
|
|
457
|
+
|
|
458
|
+
def to_pyarrow(
    self,
    expr: ir.Expr,
    /,
    *,
    params: Mapping[ir.Scalar, Any] | None = None,
    limit: int | str | None = None,
    **kwargs: Any,
):
    """Execute ``expr`` on Hotdata and return the result as PyArrow.

    The expression is compiled as a table expression and sent as a single
    query. The returned Arrow table is renamed to the expression's column
    names and cast to its schema, then handed to ``expr.__pyarrow_result__``.
    ``HotdataAPIError`` is re-raised as the matching Ibis error.
    """
    self._run_pre_execute_hooks(expr)
    table_expr = expr.as_table()
    sql = self.compile(table_expr, params=params, limit=limit, **kwargs)
    try:
        payload = self._http.execute_query(
            sql,
            poll_interval_s=self._poll_interval_s,
            poll_timeout_s=self._poll_timeout_s,
        )
    except HotdataAPIError as exc:
        raise _ibis_err_from_hotdata(exc) from exc

    raw_table = payload["pa_table"]
    target_schema = table_expr.schema().to_pyarrow()
    # Align names first so the cast matches fields against the expression schema.
    renamed = raw_table.rename_columns(list(table_expr.columns))
    return expr.__pyarrow_result__(renamed.cast(target_schema))
|
|
482
|
+
|
|
483
|
+
def to_pyarrow_batches(
    self,
    expr: ir.Expr,
    /,
    *,
    params: Mapping[ir.Scalar, Any] | None = None,
    limit: int | str | None = None,
    chunk_size: int = 1_000_000,
    **kwargs: Any,
):
    """Execute to Arrow and expose the result as local record batches.

    Hotdata currently returns one Arrow IPC result for the full query, so the
    whole result is downloaded through :meth:`to_pyarrow` first and only then
    split locally into batches of at most ``chunk_size`` rows.
    """
    import pyarrow as pa

    full_table = self.to_pyarrow(expr.as_table(), params=params, limit=limit, **kwargs)
    result_schema = full_table.schema
    batches = full_table.to_batches(max_chunksize=chunk_size)
    return pa.ipc.RecordBatchReader.from_batches(result_schema, batches)
|
|
505
|
+
|
|
506
|
+
def upload_file(self, data: bytes, *, content_type: str | None = None) -> dict[str, Any]:
    """POST ``/v1/files``; returns the upload record (use ``id`` with managed table loads).

    ``HotdataAPIError`` is re-raised as the matching Ibis error.
    """
    try:
        record = self._http.upload_file(data, content_type=content_type)
    except HotdataAPIError as exc:
        raise _ibis_err_from_hotdata(exc) from exc
    return record
|
|
512
|
+
|
|
513
|
+
def create_database(
    self,
    name: str,
    /,
    *,
    catalog: str | None = None,
    schema: str = DEFAULT_SCHEMA,
    tables: Sequence[str] | None = None,
    force: bool = False,
) -> None:
    """Create a managed Hotdata connection (Ibis catalog) with optional declared tables.

    ``catalog=`` is rejected with ``UnsupportedOperationError``. If a
    connection called ``name`` can already be resolved, ``force=False`` raises
    ``IbisInputError``; ``force=True`` returns without changes when the
    existing connection is managed and raises ``IbisInputError`` otherwise.
    Any ``IbisError`` raised while resolving ``name`` is treated as "does not
    exist yet".
    """
    if catalog is not None:
        raise com.UnsupportedOperationError(
            "Hotdata create_database creates a managed connection (catalog); catalog= is not supported"
        )

    try:
        existing = self._resolve_connection(name)
    except com.IbisError:
        existing = None

    if existing is None:
        declared_tables = list(tables or ())
        try:
            self._http.create_managed_database(name, schema=schema, tables=declared_tables)
        except HotdataAPIError as exc:
            raise _ibis_err_from_hotdata(exc) from exc
        return

    if not force:
        raise com.IbisInputError(f"Managed database {name!r} already exists")

    source_type = existing.get("source_type")
    if source_type != MANAGED_SOURCE_TYPE:
        raise com.IbisInputError(
            f"{name!r} is not a managed database (source_type={source_type!r})"
        )
|
|
545
|
+
|
|
546
|
+
def drop_database(
    self,
    name: str,
    /,
    *,
    catalog: str | None = None,
    force: bool = False,
) -> None:
    """Delete a managed Hotdata connection (Ibis catalog).

    ``catalog=`` is rejected with ``UnsupportedOperationError``. With
    ``force=True`` the call returns silently when resolving ``name`` fails
    with an ``IbisError`` other than ``IbisInputError``, or when the delete
    request answers 404. ``IbisInputError`` always propagates.
    """
    if catalog is not None:
        raise com.UnsupportedOperationError(
            "Hotdata drop_database deletes a managed connection (catalog); catalog= is not supported"
        )

    try:
        conn = self._resolve_managed_connection(name)
    except com.IbisError as exc:
        # Input errors are never suppressed; other lookup failures only when forced.
        if isinstance(exc, com.IbisInputError) or not force:
            raise
        return

    try:
        self._http.delete_connection(conn["id"])
    except HotdataAPIError as exc:
        already_gone = force and exc.status_code == 404
        if not already_gone:
            raise _ibis_err_from_hotdata(exc) from exc
|
|
573
|
+
|
|
574
|
+
def _local_table_to_parquet(self, obj: Any, schema: sch.Schema | None):
    """Serialize local data, or an empty table for ``schema``, to Parquet bytes.

    Exactly one of ``obj`` and ``schema`` must be given. ``obj`` may be a
    ``pyarrow.Table`` or a ``pandas.DataFrame`` (converted without its index).
    Anything else raises ``IbisInputError``.
    """
    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    has_obj = obj is not None
    has_schema = schema is not None
    if has_obj and has_schema:
        raise com.IbisInputError("create_table accepts only one of obj or schema")
    if not has_obj and not has_schema:
        raise com.IbisInputError("create_table requires a pandas/pyarrow object or schema")

    if not has_obj:
        # Schema only: build a zero-row table so the column types still reach the file.
        arrow_schema = schema.to_pyarrow()
        empty_columns = [pa.array([], type=field.type) for field in arrow_schema]
        table = pa.Table.from_arrays(empty_columns, schema=arrow_schema)
    elif isinstance(obj, pa.Table):
        table = obj
    elif isinstance(obj, pd.DataFrame):
        table = pa.Table.from_pandas(obj, preserve_index=False)
    else:
        raise com.IbisInputError(
            "create_table currently accepts pandas.DataFrame or pyarrow.Table"
        )

    buffer = io.BytesIO()
    pq.write_table(table, buffer)
    return buffer.getvalue()
|
|
602
|
+
|
|
603
|
+
def create_table(
    self,
    name: str,
    /,
    obj: Any = None,
    *,
    schema: sch.Schema | None = None,
    database: tuple[str, str] | str | None = None,
    temp: bool = False,
    overwrite: bool = False,
) -> ir.Table:
    """Upload local data into a declared managed table.

    Hotdata loads always use ``replace`` mode (the only API option). When
    ``overwrite=False`` (the Ibis default), an existing synced table raises
    :class:`~ibis.common.exceptions.IbisInputError` instead of replacing it.

    The data is serialized to Parquet, uploaded, and loaded into the table.
    The returned table expression points at the loaded table. ``temp=True``
    raises ``NotImplementedError``.
    """
    if temp:
        raise NotImplementedError("Hotdata does not support temporary tables.")

    parquet_bytes = self._local_table_to_parquet(obj, schema)
    connection_id, schema_name = self._table_location(database)

    if not overwrite:
        # The load below would silently replace the table, so refuse up front.
        if self._managed_table_synced(connection_id, schema_name, name):
            raise com.IbisInputError(
                f"Table {name!r} already exists; pass overwrite=True to replace"
            )

    upload = self.upload_file(parquet_bytes, content_type="application/parquet")
    try:
        self._http.load_managed_table(
            connection_id,
            schema_name,
            name,
            upload_id=upload["id"],
        )
    except HotdataAPIError as exc:
        raise _ibis_err_from_hotdata(exc) from exc

    location = (connection_id, schema_name)
    return self.table(name, database=location)
|
|
640
|
+
|
|
641
|
+
def drop_table(
    self,
    name: str,
    /,
    *,
    database: tuple[str, str] | str | None = None,
    force: bool = False,
) -> None:
    """Delete a managed table.

    With ``force=True`` the call returns silently when resolving ``database``
    fails with an ``IbisError`` other than ``IbisInputError``, or when the
    delete request answers 404. ``IbisInputError`` always propagates.
    """
    try:
        connection_id, schema_name = self._table_location(database)
    except com.IbisError as exc:
        # Input errors are never suppressed; other lookup failures only when forced.
        if isinstance(exc, com.IbisInputError) or not force:
            raise
        return

    try:
        self._http.delete_managed_table(connection_id, schema_name, name)
    except HotdataAPIError as exc:
        already_gone = force and exc.status_code == 404
        if not already_gone:
            raise _ibis_err_from_hotdata(exc) from exc
|
|
663
|
+
|
|
664
|
+
def _register_in_memory_table(self, _op: ops.InMemoryTable) -> None:
|
|
665
|
+
return
|
|
666
|
+
|
|
667
|
+
@cached_property
def version(self) -> str:
    """Version banner built from the installed ``hotdata-ibis`` distribution.

    Falls back to ``0.0.0`` when the distribution metadata cannot be found.
    """
    try:
        installed = pkg_version("hotdata-ibis")
    except PackageNotFoundError:
        return "hotdata-ibis 0.0.0 (Hotdata /v1/query)"
    return f"hotdata-ibis {installed} (Hotdata /v1/query)"
|
ibis_hotdata/http.py
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
"""HTTP access to Hotdata via the official ``hotdata`` Python SDK (OpenAPI client)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import json
|
|
7
|
+
import time
|
|
8
|
+
from collections.abc import Callable, Mapping, Sequence
|
|
9
|
+
from typing import Any, TypeVar
|
|
10
|
+
|
|
11
|
+
import pyarrow as pa
|
|
12
|
+
import pyarrow_hotfix # noqa: F401
|
|
13
|
+
import pyarrow.ipc as pa_ipc
|
|
14
|
+
|
|
15
|
+
from hotdata import ApiClient, Configuration
|
|
16
|
+
from hotdata.api import (
|
|
17
|
+
ConnectionsApi,
|
|
18
|
+
InformationSchemaApi,
|
|
19
|
+
QueryApi,
|
|
20
|
+
QueryRunsApi,
|
|
21
|
+
ResultsApi,
|
|
22
|
+
UploadsApi,
|
|
23
|
+
)
|
|
24
|
+
from hotdata.exceptions import ApiException
|
|
25
|
+
from hotdata.models import CreateConnectionRequest, QueryRequest
|
|
26
|
+
from hotdata.models.async_query_response import AsyncQueryResponse
|
|
27
|
+
from hotdata.models.load_managed_table_request import LoadManagedTableRequest
|
|
28
|
+
|
|
29
|
+
from ibis_hotdata.managed import DEFAULT_SCHEMA, MANAGED_SOURCE_TYPE, build_managed_config
|
|
30
|
+
|
|
31
|
+
# Return type of the SDK callable wrapped by ``HotdataClient._safe_call``.
T = TypeVar("T")

# Matches Hotdata / runtimedb ``GET /v1/results/{id}`` Arrow responses: sent as the
# ``Accept`` header and compared against the response ``Content-Type``.
APPLICATION_ARROW_STREAM = "application/vnd.apache.arrow.stream"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _sleep_until(deadline: float, interval: float) -> None:
|
|
38
|
+
"""Sleep up to ``interval`` s but never past ``deadline`` (cleaner timeout behavior)."""
|
|
39
|
+
remaining = deadline - time.monotonic()
|
|
40
|
+
if remaining > 0:
|
|
41
|
+
time.sleep(min(interval, remaining))
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class HotdataAPIError(Exception):
    """Error raised for a failed Hotdata API interaction.

    ``status_code`` is the HTTP status when one is known, and ``body`` is
    whatever response payload the raiser attached. Both default to ``None``.
    """

    def __init__(self, message: str, *, status_code: int | None = None, body: Any = None):
        super().__init__(message)
        self.status_code, self.body = status_code, body
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _from_api_exception(exc: ApiException) -> HotdataAPIError:
    """Translate an SDK ``ApiException`` into a :class:`HotdataAPIError`.

    The message is ``"Hotdata API error: <reason>"`` followed by the response
    body when there is one. Byte bodies are decoded as UTF-8 with replacement
    for the message only; the error carries the original ``exc.body``.
    """
    raw_body = exc.body
    if isinstance(raw_body, (bytes, bytearray)):
        body_text = raw_body.decode("utf-8", errors="replace")
    else:
        body_text = raw_body

    parts = [f"Hotdata API error: {exc.reason}"]
    if body_text:
        parts.append(f"{body_text}")
    message = " ".join(parts).strip()
    return HotdataAPIError(message, status_code=exc.status, body=raw_body)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _ipc_stream_bytes_to_table(data: bytes) -> pa.Table:
    """Read all of an Arrow IPC stream held in ``data`` into one table."""
    buffer = io.BytesIO(data)
    with pa_ipc.open_stream(buffer) as reader:
        table = reader.read_all()
    return table
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _json_utf8(obj: bytes) -> Any:
|
|
67
|
+
return json.loads(obj.decode("utf-8"))
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class HotdataClient:
    """Thin wrapper around the SDK used by the Ibis backend.

    SDK calls are issued with the configured request timeout, and SDK
    ``ApiException`` errors are re-raised as :class:`HotdataAPIError`.
    """

    def __init__(
        self,
        *,
        api_url: str,
        token: str,
        workspace_id: str,
        session_id: str | None = None,
        timeout: float = 120.0,
        verify_ssl: bool | str = True,
    ) -> None:
        """Configure the SDK client and one API facade per endpoint group.

        ``api_url`` has trailing slashes removed. ``verify_ssl=False`` turns
        certificate verification off; a string is used as the CA bundle path.
        ``timeout`` is passed as ``_request_timeout`` on every request.
        """
        host = api_url.rstrip("/")
        conf = Configuration(
            host=host, api_key=token, workspace_id=workspace_id, session_id=session_id
        )
        if verify_ssl is False:
            conf.verify_ssl = False
        elif isinstance(verify_ssl, str):
            conf.ssl_ca_cert = verify_ssl
        self._timeout = timeout
        self._client = ApiClient(conf)
        self._query = QueryApi(self._client)
        self._query_runs = QueryRunsApi(self._client)
        self._results = ResultsApi(self._client)
        self._connections = ConnectionsApi(self._client)
        self._information_schema = InformationSchemaApi(self._client)
        self._uploads = UploadsApi(self._client)

    def close(self) -> None:
        """Close the SDK client, or clear its connection pool if it has no ``close``."""
        client = self._client
        close_fn = getattr(client, "close", None)
        if callable(close_fn):
            close_fn()
            return
        pool = getattr(getattr(client, "rest_client", None), "pool_manager", None)
        if pool is not None:
            pool.clear()

    def _safe_call(self, fn: Callable[..., T], /, *args: Any, **kwargs: Any) -> T:
        """Call ``fn`` with the request timeout, mapping ``ApiException`` to ``HotdataAPIError``."""
        try:
            return fn(*args, _request_timeout=self._timeout, **kwargs)
        except ApiException as exc:
            raise _from_api_exception(exc) from exc

    def list_connections(self) -> dict[str, Any]:
        """GET ``/v1/connections``."""
        out = self._safe_call(self._connections.list_connections)
        return out.model_dump(by_alias=True, mode="json")

    def get_information_schema(self, params: Mapping[str, Any]) -> dict[str, Any]:
        """GET ``/v1/information_schema`` — ``params`` uses REST names (``schema`` not ``var_schema``)."""
        out = self._safe_call(
            self._information_schema.information_schema,
            connection_id=params.get("connection_id"),
            var_schema=params.get("schema"),
            table=params.get("table"),
            include_columns=params.get("include_columns"),
            limit=params.get("limit"),
            cursor=params.get("cursor"),
        )
        return out.model_dump(by_alias=True, mode="json")

    def execute_query(
        self,
        sql: str,
        *,
        async_after_ms: int | None = None,
        poll_interval_s: float = 0.25,
        poll_timeout_s: float = 600.0,
    ) -> dict[str, Any]:
        """Run ``sql`` as an asynchronous query and return its Arrow payload.

        The query run is polled every ``poll_interval_s`` seconds until it
        reports ``succeeded`` or ``failed``; on success the Arrow result is
        then fetched. Both phases share one deadline of ``poll_timeout_s``
        seconds.

        Returns the dict built by :meth:`_arrow_payload_from_table` (the Arrow
        table is under ``"pa_table"``), with ``"query_run_id"`` filled in.

        Raises :class:`HotdataAPIError` when the run fails, the deadline
        passes, or the submit call returns something other than an
        ``AsyncQueryResponse``.
        """
        req = QueryRequest(sql=sql, var_async=True, async_after_ms=async_after_ms)
        out = self._safe_call(self._query.query, req)
        if not isinstance(out, AsyncQueryResponse):
            raise HotdataAPIError("Unexpected query response type")

        query_run_id = out.query_run_id
        deadline = time.monotonic() + poll_timeout_s
        while time.monotonic() < deadline:
            qr = self._safe_call(self._query_runs.get_query_run, query_run_id)
            status = qr.status
            if status == "failed":
                raise HotdataAPIError(qr.error_message or "Query run failed")
            if status == "succeeded":
                result_id = qr.result_id
                if result_id is None:
                    raise HotdataAPIError("succeeded query run missing result_id")
                payload = self._poll_result_arrow(
                    result_id,
                    deadline=deadline,
                    poll_interval_s=poll_interval_s,
                )
                # The result endpoint does not know which run produced it; record
                # the id here instead of leaving the payload field empty.
                if payload.get("query_run_id") is None:
                    payload["query_run_id"] = query_run_id
                return payload
            _sleep_until(deadline, poll_interval_s)
        raise HotdataAPIError("Timeout waiting for asynchronous query")

    def upload_file(self, data: bytes, *, content_type: str | None = None) -> dict[str, Any]:
        """Upload ``data`` through the SDK uploads API and return the upload record as a dict."""
        kwargs: dict[str, Any] = {}
        if content_type is not None:
            kwargs["_content_type"] = content_type
        resp = self._safe_call(self._uploads.upload_file, data, **kwargs)
        return resp.model_dump(by_alias=True, mode="json")

    def create_managed_database(
        self,
        name: str,
        *,
        schema: str = DEFAULT_SCHEMA,
        tables: Sequence[str] = (),
    ) -> dict[str, Any]:
        """Create a managed connection ``name`` declaring ``tables`` under ``schema``.

        Discovery is skipped on creation. Returns the created connection as a dict.
        """
        req = CreateConnectionRequest(
            name=name,
            source_type=MANAGED_SOURCE_TYPE,
            config=build_managed_config(schema, list(tables)),
            skip_discovery=True,
        )
        resp = self._safe_call(self._connections.create_connection, req)
        return resp.model_dump(by_alias=True, mode="json")

    def delete_connection(self, connection_id: str) -> None:
        """Delete the connection identified by ``connection_id``."""
        self._safe_call(self._connections.delete_connection, connection_id)

    def load_managed_table(
        self,
        connection_id: str,
        schema: str,
        table: str,
        *,
        upload_id: str,
    ) -> dict[str, Any]:
        """Load upload ``upload_id`` into a managed table using ``replace`` mode."""
        req = LoadManagedTableRequest(mode="replace", upload_id=upload_id)
        resp = self._safe_call(
            self._connections.load_managed_table,
            connection_id,
            schema,
            table,
            req,
        )
        return resp.model_dump(by_alias=True, mode="json")

    def delete_managed_table(self, connection_id: str, schema: str, table: str) -> None:
        """Delete the managed table ``schema``.``table`` of ``connection_id``."""
        self._safe_call(
            self._connections.delete_managed_table,
            connection_id,
            schema,
            table,
        )

    @staticmethod
    def _decode_error_body(body: bytes) -> tuple[dict[str, Any], Any]:
        """Best-effort decode of an error response body.

        Returns ``(details, error_body)``. ``details`` is the parsed JSON
        object, or ``{}`` when the body is empty, is not valid UTF-8 JSON, or
        is JSON but not an object. ``error_body`` is what gets attached to the
        raised error: the parsed value when parsing worked, the raw bytes
        otherwise.
        """
        if not body:
            return {}, {}
        try:
            parsed = _json_utf8(body)
        except ValueError:
            # Covers both UnicodeDecodeError and json.JSONDecodeError, so a
            # non-JSON error page still surfaces as a HotdataAPIError.
            return {}, body
        if isinstance(parsed, dict):
            return parsed, parsed
        return {}, parsed

    def _poll_result_arrow(
        self,
        result_id: str,
        *,
        deadline: float,
        poll_interval_s: float,
    ) -> dict[str, Any]:
        """Poll ``GET /v1/results/{id}`` with ``Accept: application/vnd.apache.arrow.stream``.

        Status 200 with an Arrow stream body returns the payload, 202 waits and
        retries, and any other status raises :class:`HotdataAPIError`. Reaching
        ``deadline`` (a ``time.monotonic()`` timestamp) raises as well.
        """
        while time.monotonic() < deadline:
            try:
                raw = self._results.get_result_without_preload_content(
                    result_id,
                    _headers={"Accept": APPLICATION_ARROW_STREAM},
                    _request_timeout=self._timeout,
                )
            except ApiException as exc:
                raise _from_api_exception(exc) from exc
            body = raw.read()
            status = raw.status
            # Drop any media-type parameters (e.g. charset) before comparing.
            ctype = (raw.headers.get("Content-Type") or "").split(";")[0].strip().lower()

            if status == 200 and ctype == APPLICATION_ARROW_STREAM.lower():
                table = _ipc_stream_bytes_to_table(body)
                return self._arrow_payload_from_table(table, result_id=result_id)

            if status == 202:
                _sleep_until(deadline, poll_interval_s)
                continue

            if status == 409:
                details, error_body = self._decode_error_body(body)
                raise HotdataAPIError(
                    details.get("error_message") or "Result failed",
                    status_code=409,
                    body=error_body,
                )

            if status == 404:
                details, error_body = self._decode_error_body(body)
                raise HotdataAPIError(
                    details.get("detail") or f"Result {result_id!r} not found",
                    status_code=404,
                    body=error_body,
                )

            raise HotdataAPIError(
                f"Unexpected GET /v1/results/{result_id} status {status}",
                status_code=status,
                body=body,
            )

        raise HotdataAPIError("Timeout waiting for Arrow query result")

    def _arrow_payload_from_table(
        self,
        table: pa.Table,
        *,
        result_id: str,
    ) -> dict[str, Any]:
        """Wrap ``table`` in the payload dict returned by :meth:`execute_query`.

        ``columns`` and ``nullable`` come from the Arrow schema, ``row_count``
        from the table. ``rows`` is always empty because the data lives in
        ``pa_table``.
        """
        sch = table.schema
        return {
            "format": "arrow",
            "pa_table": table,
            "columns": sch.names,
            "nullable": [field.nullable for field in sch],
            "rows": [],
            "result_id": result_id,
            "row_count": table.num_rows,
            "execution_time_ms": None,
            "query_run_id": None,
            "warning": None,
        }
|
ibis_hotdata/managed.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Helpers for Hotdata managed database connections."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
# ``source_type`` value that marks a connection as a Hotdata managed database.
MANAGED_SOURCE_TYPE = "managed"
# Schema name used when a caller does not pass one explicitly.
DEFAULT_SCHEMA = "public"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def build_managed_config(schema: str, tables: list[str]) -> dict[str, Any]:
    """Build a managed-connection config declaring ``tables`` under one ``schema``.

    The result has a single entry under ``"schemas"``, and each table is
    declared by name only, in the order given.
    """
    table_entries = [{"name": table_name} for table_name in tables]
    schema_entry = {"name": schema, "tables": table_entries}
    return {"schemas": [schema_entry]}
|
ibis_hotdata/types.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Map Hotdata metadata to Ibis dtypes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import ibis.expr.datatypes as dt
|
|
6
|
+
from ibis.backends.sql.datatypes import PostgresType
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def dtype_from_hotdata_sql_type(sql_type: str | None, *, nullable: bool) -> dt.DataType:
    """Best-effort mapping from Hotdata `/information_schema` column `data_type` strings.

    The type string is parsed with the Postgres type mapper after stripping
    whitespace. A missing, empty, or unparseable type maps to ``String``.
    ``nullable`` is applied to whichever dtype is returned.
    """
    fallback = dt.String(nullable=nullable)
    if not sql_type:
        return fallback
    try:
        parsed = PostgresType.from_string(sql_type.strip(), nullable=nullable)
    except Exception:  # ibis/sqlglot raise a variety of parse errors; fall back to String
        return fallback
    return parsed
|