laketower 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of laketower might be problematic. Click here for more details.
- laketower/__about__.py +1 -1
- laketower/cli.py +181 -87
- laketower/config.py +82 -16
- laketower/tables.py +166 -12
- laketower/templates/queries/view.html +6 -0
- laketower/templates/tables/_macros.html +3 -0
- laketower/templates/tables/history.html +6 -0
- laketower/templates/tables/import.html +71 -0
- laketower/templates/tables/index.html +6 -0
- laketower/templates/tables/query.html +6 -0
- laketower/templates/tables/statistics.html +6 -0
- laketower/templates/tables/view.html +6 -0
- laketower/web.py +144 -29
- {laketower-0.5.0.dist-info → laketower-0.6.0.dist-info}/METADATA +145 -10
- laketower-0.6.0.dist-info/RECORD +23 -0
- laketower-0.6.0.dist-info/entry_points.txt +2 -0
- laketower-0.5.0.dist-info/RECORD +0 -22
- laketower-0.5.0.dist-info/entry_points.txt +0 -2
- {laketower-0.5.0.dist-info → laketower-0.6.0.dist-info}/WHEEL +0 -0
- {laketower-0.5.0.dist-info → laketower-0.6.0.dist-info}/licenses/LICENSE +0 -0
laketower/tables.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
|
+
import enum
|
|
1
2
|
from datetime import datetime, timezone
|
|
2
|
-
from typing import Any, Protocol
|
|
3
|
+
from typing import Any, BinaryIO, Protocol, TextIO
|
|
3
4
|
|
|
4
5
|
import deltalake
|
|
5
6
|
import duckdb
|
|
@@ -17,6 +18,15 @@ from laketower.config import ConfigTable, TableFormats
|
|
|
17
18
|
DEFAULT_LIMIT = 10
|
|
18
19
|
|
|
19
20
|
|
|
21
|
+
class ImportModeEnum(str, enum.Enum):
    """Write modes accepted when importing data into a table.

    The member value is forwarded verbatim as the `mode` argument of the
    underlying table writer (see `DeltaTable.import_data`).
    """

    append = "append"
    overwrite = "overwrite"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ImportFileFormatEnum(str, enum.Enum):
    """Input file formats that can be imported into a table."""

    csv = "csv"
|
|
28
|
+
|
|
29
|
+
|
|
20
30
|
class TableMetadata(pydantic.BaseModel):
|
|
21
31
|
table_format: TableFormats
|
|
22
32
|
name: str | None = None
|
|
@@ -43,17 +53,112 @@ class TableHistory(pydantic.BaseModel):
|
|
|
43
53
|
|
|
44
54
|
|
|
45
55
|
class TableProtocol(Protocol):  # pragma: no cover
    """Structural interface every table-format handler must satisfy.

    `load_table` looks handlers up by table format, calls `is_valid` on the
    handler class, and only then instantiates it with the table config.
    """

    # Check that the configured location can be handled, without
    # constructing a handler instance.
    @classmethod
    def is_valid(cls, table_config: ConfigTable) -> bool: ...

    def __init__(self, table_config: ConfigTable) -> None: ...

    def metadata(self) -> TableMetadata: ...

    def schema(self) -> pa.Schema: ...

    def history(self) -> TableHistory: ...

    def dataset(self, version: int | str | None = None) -> padataset.Dataset: ...

    # Write `data` into the table; `mode` defaults to appending.
    def import_data(
        self, data: pd.DataFrame, mode: ImportModeEnum = ImportModeEnum.append
    ) -> None: ...
|
|
50
66
|
|
|
51
67
|
|
|
52
68
|
class DeltaTable:
|
|
53
69
|
def __init__(self, table_config: ConfigTable):
    """Open the Delta table located at `table_config.uri`.

    Storage options (credentials, endpoints) are derived from the table's
    connection settings, so remote S3/ADLS tables open with the configured
    access parameters.
    """
    super().__init__()
    self.table_config = table_config
    storage_options = self._generate_storage_options(table_config)
    self._impl = deltalake.DeltaTable(
        table_config.uri, storage_options=storage_options
    )
|
|
76
|
+
|
|
77
|
+
@classmethod
def _generate_storage_options(
    cls, table_config: ConfigTable
) -> dict[str, str] | None:
    """Translate the table's connection settings into deltalake storage options.

    Returns:
        A mapping of `object-store` configuration keys to string values, or
        `None` when the table has neither an S3 nor an ADLS connection.
        When both are configured, the S3 connection takes precedence.
    """
    # documentation from `object-store` Rust crate:
    # - s3: https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html
    # - adls: https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html
    connection = table_config.connection
    conn_s3 = connection.s3 if connection and connection.s3 else None
    conn_adls = connection.adls if connection and connection.adls else None

    if conn_s3:
        s3_options = {
            "aws_access_key_id": conn_s3.s3_access_key_id,
            "aws_secret_access_key": conn_s3.s3_secret_access_key.get_secret_value(),
            # booleans are passed as lowercase strings ("true"/"false")
            "aws_allow_http": str(conn_s3.s3_allow_http).lower(),
        }
        # optional settings are only emitted when they are set
        if conn_s3.s3_region:
            s3_options["aws_region"] = conn_s3.s3_region
        if conn_s3.s3_endpoint_url:
            # strip the trailing slash from the rendered URL
            s3_options["aws_endpoint_url"] = str(conn_s3.s3_endpoint_url).rstrip("/")
        return s3_options

    if conn_adls:
        adls_options = {
            "azure_storage_account_name": conn_adls.adls_account_name,
            "azure_use_azure_cli": str(conn_adls.use_azure_cli).lower(),
        }
        if conn_adls.adls_access_key:
            adls_options["azure_storage_access_key"] = (
                conn_adls.adls_access_key.get_secret_value()
            )
        if conn_adls.adls_sas_key:
            adls_options["azure_storage_sas_key"] = (
                conn_adls.adls_sas_key.get_secret_value()
            )
        if conn_adls.adls_tenant_id:
            adls_options["azure_storage_tenant_id"] = conn_adls.adls_tenant_id
        if conn_adls.adls_client_id:
            adls_options["azure_storage_client_id"] = conn_adls.adls_client_id
        if conn_adls.adls_client_secret:
            adls_options["azure_storage_client_secret"] = (
                conn_adls.adls_client_secret.get_secret_value()
            )
        if conn_adls.azure_msi_endpoint:
            adls_options["azure_msi_endpoint"] = str(
                conn_adls.azure_msi_endpoint
            ).rstrip("/")
        return adls_options

    return None
|
|
155
|
+
|
|
156
|
+
@classmethod
def is_valid(cls, table_config: ConfigTable) -> bool:
    """Return whether `table_config.uri` points to a Delta table.

    The check uses the same storage options as `__init__`, so remote
    locations are probed with the configured connection settings.
    """
    return deltalake.DeltaTable.is_deltatable(
        table_config.uri,
        storage_options=cls._generate_storage_options(table_config),
    )
|
|
57
162
|
|
|
58
163
|
def metadata(self) -> TableMetadata:
|
|
59
164
|
metadata = self._impl.metadata()
|
|
@@ -72,7 +177,7 @@ class DeltaTable:
|
|
|
72
177
|
)
|
|
73
178
|
|
|
74
179
|
def schema(self) -> pa.Schema:
    """Return the table schema as a `pyarrow.Schema`."""
    delta_schema = self._impl.schema()
    # NOTE(review): `to_arrow()` presumably may return a non-pyarrow schema
    # object depending on the deltalake release, hence the explicit
    # `pa.schema(...)` conversion and the type-ignore — confirm.
    return pa.schema(delta_schema.to_arrow())  # type: ignore[arg-type]
|
|
76
181
|
|
|
77
182
|
def history(self) -> TableHistory:
|
|
78
183
|
delta_history = self._impl.history()
|
|
@@ -96,10 +201,32 @@ class DeltaTable:
|
|
|
96
201
|
self._impl.load_as_version(version)
|
|
97
202
|
return self._impl.to_pyarrow_dataset()
|
|
98
203
|
|
|
204
|
+
def import_data(
    self, data: pd.DataFrame, mode: ImportModeEnum = ImportModeEnum.append
) -> None:
    """Write `data` into the Delta table.

    Args:
        data: Rows to write.
        mode: Write mode; its value is forwarded as the `mode` argument of
            `deltalake.write_deltalake`.

    The write uses `schema_mode="merge"` and the same storage options the
    table was opened with.
    """
    deltalake.write_deltalake(
        self.table_config.uri,
        data,
        mode=mode.value,
        schema_mode="merge",
        # Writing by URI does not inherit the options used in `__init__`;
        # pass them explicitly so S3/ADLS credentials and endpoints apply.
        storage_options=self._generate_storage_options(self.table_config),
    )
|
|
210
|
+
|
|
99
211
|
|
|
100
212
|
def load_table(table_config: ConfigTable) -> TableProtocol:
    """Instantiate the handler matching the table's configured format.

    Raises:
        ValueError: If the handler reports that `table_config.uri` is not a
            valid table of that format.
    """
    format_handler: dict[TableFormats, type[TableProtocol]] = {
        TableFormats.delta: DeltaTable
    }
    table_handler = format_handler[table_config.table_format]
    # validate before constructing the handler, so the failure is a
    # ValueError naming the URI
    if not table_handler.is_valid(table_config):
        raise ValueError(f"Invalid table: {table_config.uri}")
    return table_handler(table_config)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def load_datasets(table_configs: list[ConfigTable]) -> dict[str, padataset.Dataset]:
    """Load the dataset of every table that can be opened.

    Tables whose loading raises `ValueError` (e.g. an invalid table
    location) are skipped, so one broken table does not prevent the others
    from being returned. Each skip is logged as a warning.

    Returns:
        A mapping from table name to its dataset, for the tables that
        loaded successfully.
    """
    import logging

    logger = logging.getLogger(__name__)
    tables_dataset = {}
    for table_config in table_configs:
        try:
            tables_dataset[table_config.name] = load_table(table_config).dataset()
        except ValueError as exc:
            # best-effort: keep going, but leave a trace of what was skipped
            logger.warning("Skipping table %r: %s", table_config.name, exc)
    return tables_dataset
|
|
103
230
|
|
|
104
231
|
|
|
105
232
|
def generate_table_query(
|
|
@@ -110,21 +237,26 @@ def generate_table_query(
|
|
|
110
237
|
sort_desc: str | None = None,
|
|
111
238
|
) -> str:
|
|
112
239
|
query_expr = (
|
|
113
|
-
sqlglot.select(*(cols
|
|
240
|
+
sqlglot.select(*([f'"{col}"' for col in cols] if cols else ["*"]))
|
|
241
|
+
.from_(f'"{table_name}"')
|
|
242
|
+
.limit(limit or DEFAULT_LIMIT)
|
|
114
243
|
)
|
|
115
244
|
if sort_asc:
|
|
116
245
|
query_expr = query_expr.order_by(f"{sort_asc} asc")
|
|
117
246
|
elif sort_desc:
|
|
118
247
|
query_expr = query_expr.order_by(f"{sort_desc} desc")
|
|
119
|
-
return
|
|
120
|
-
query_expr
|
|
121
|
-
)
|
|
248
|
+
return query_expr.sql(dialect=sqlglot.dialects.duckdb.DuckDB, identify="always")
|
|
122
249
|
|
|
123
250
|
|
|
124
251
|
def generate_table_statistics_query(table_name: str) -> str:
    """Build a DuckDB query returning summary statistics for `table_name`.

    The query selects the `column_name`, `count`, `avg`, `std`, `min` and
    `max` columns from a `SUMMARIZE` of the table, and is rendered with
    every identifier quoted.
    """
    summarize_expr = sqlglot.expressions.Summarize(
        this=sqlglot.expressions.Table(this=f'"{table_name}"')
    )
    subquery_expr = sqlglot.expressions.Subquery(this=summarize_expr)
    query_expr = sqlglot.select(
        "column_name", "count", "avg", "std", "min", "max"
    ).from_(subquery_expr)
    return query_expr.sql(dialect=sqlglot.dialects.duckdb.DuckDB, identify="always")
|
|
128
260
|
|
|
129
261
|
|
|
130
262
|
def execute_query(
|
|
@@ -133,9 +265,31 @@ def execute_query(
|
|
|
133
265
|
try:
|
|
134
266
|
conn = duckdb.connect()
|
|
135
267
|
for table_name, table_dataset in tables_datasets.items():
|
|
268
|
+
# ATTACH IF NOT EXISTS ':memory:' AS {catalog.name};
|
|
269
|
+
# CREATE SCHEMA IF NOT EXISTS {catalog.name}.{database.name};
|
|
270
|
+
# USE {catalog.name}.{database.name};
|
|
271
|
+
# CREATE VIEW IF NOT EXISTS {table.name} AS FROM {table.name}_dataset;
|
|
272
|
+
|
|
136
273
|
view_name = f"{table_name}_view"
|
|
137
274
|
conn.register(view_name, table_dataset)
|
|
138
|
-
conn.execute(f
|
|
275
|
+
conn.execute(f'create table "{table_name}" as select * from "{view_name}"') # nosec B608
|
|
139
276
|
return conn.execute(sql_query).df()
|
|
140
277
|
except duckdb.Error as e:
|
|
141
278
|
raise ValueError(str(e)) from e
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def import_file_to_table(
    table_config: ConfigTable,
    file_path: BinaryIO | TextIO,
    mode: ImportModeEnum = ImportModeEnum.append,
    file_format: ImportFileFormatEnum = ImportFileFormatEnum.csv,
    delimiter: str = ",",
    encoding: str = "utf-8",
) -> int:
    """Parse an input file and import its rows into a table.

    Args:
        table_config: Configuration of the destination table.
        file_path: Open binary or text file object to read from (despite
            the name, it is annotated as a file object, not a path).
        mode: Write mode passed to the table's `import_data`.
        file_format: Format of the input file; selects the parser.
        delimiter: Field separator passed to the CSV parser.
        encoding: Text encoding passed to the CSV parser.

    Returns:
        The number of rows read from the file.

    Raises:
        ValueError: Propagated from `load_table` when the destination table
            is invalid.
    """
    file_format_handler = {
        ImportFileFormatEnum.csv: lambda f, d, e: pd.read_csv(f, sep=d, encoding=e)
    }
    # load (and validate) the destination first, so an invalid table fails
    # before the input file is parsed
    table = load_table(table_config)
    df = file_format_handler[file_format](file_path, delimiter, encoding)
    table.import_data(df, mode=mode)
    return len(df)
|
|
@@ -34,6 +34,12 @@
|
|
|
34
34
|
{{ error.message }}
|
|
35
35
|
</div>
|
|
36
36
|
{% else %}
|
|
37
|
+
<div class="d-flex justify-content-between align-items-center mb-2">
|
|
38
|
+
<h3>Results</h3>
|
|
39
|
+
<a href="/tables/query/csv?sql={{ query.sql | urlencode }}" class="btn btn-outline-secondary btn-sm">
|
|
40
|
+
<i class="bi-download" aria-hidden="true"></i> Export CSV
|
|
41
|
+
</a>
|
|
42
|
+
</div>
|
|
37
43
|
<div class="table-responsive">
|
|
38
44
|
<table class="table table-sm table-bordered table-striped table-hover">
|
|
39
45
|
<thead>
|
|
@@ -12,5 +12,8 @@
|
|
|
12
12
|
<li class="nav-item">
|
|
13
13
|
<a class="nav-link{% if current == 'history' %} active{% endif %}"{% if current == 'history' %} aria-current="true"{% endif %} href="/tables/{{ table_id }}/history">History</a>
|
|
14
14
|
</li>
|
|
15
|
+
<li class="nav-item">
|
|
16
|
+
<a class="nav-link{% if current == 'import' %} active{% endif %}"{% if current == 'import' %} aria-current="true"{% endif %} href="/tables/{{ table_id }}/import">Import</a>
|
|
17
|
+
</li>
|
|
15
18
|
</ul>
|
|
16
19
|
{%- endmacro %}
|
|
@@ -2,6 +2,11 @@
|
|
|
2
2
|
{% import 'tables/_macros.html' as table_macros %}
|
|
3
3
|
|
|
4
4
|
{% block body %}
|
|
5
|
+
{% if error %}
|
|
6
|
+
<div class="alert alert-danger" role="alert">
|
|
7
|
+
{{ error.message }}
|
|
8
|
+
</div>
|
|
9
|
+
{% else %}
|
|
5
10
|
{{ table_macros.table_nav(table_id, 'history') }}
|
|
6
11
|
|
|
7
12
|
<div class="row">
|
|
@@ -39,4 +44,5 @@
|
|
|
39
44
|
{% endfor %}
|
|
40
45
|
</div>
|
|
41
46
|
</div>
|
|
47
|
+
{% endif %}
|
|
42
48
|
{% endblock %}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
{% extends "_base.html" %}
|
|
2
|
+
{% import 'tables/_macros.html' as table_macros %}
|
|
3
|
+
|
|
4
|
+
{% block body %}
|
|
5
|
+
{{ table_macros.table_nav(table_id, 'import') }}
|
|
6
|
+
|
|
7
|
+
{% if message %}
|
|
8
|
+
{% if message.type == 'success' %}{% set alert_type = 'success' %}
|
|
9
|
+
{% elif message.type == 'error' %}{% set alert_type = 'danger' %}
|
|
10
|
+
{% else %}{% set alert_type = 'primary' %}
|
|
11
|
+
{% endif %}
|
|
12
|
+
|
|
13
|
+
<div class="alert alert-{{ alert_type }}" role="alert">
|
|
14
|
+
{{ message.body }}
|
|
15
|
+
</div>
|
|
16
|
+
{% endif %}
|
|
17
|
+
|
|
18
|
+
<div class="row justify-content-center">
|
|
19
|
+
<div class="col-12 col-md-8 col-lg-4">
|
|
20
|
+
<form action="{{ request.url.path }}" method="post" enctype="multipart/form-data">
|
|
21
|
+
<div class="mb-3">
|
|
22
|
+
<label for="import-file-input" class="form-label">Input file</label>
|
|
23
|
+
<input id="import-file-input" class="form-control" name="input_file" type="file" accept=".csv" required>
|
|
24
|
+
</div>
|
|
25
|
+
|
|
26
|
+
<div class="mb-3">
|
|
27
|
+
<label class="form-label">Mode</label>
|
|
28
|
+
|
|
29
|
+
<div class="form-check">
|
|
30
|
+
<input id="import-mode-append" class="form-check-input" name="mode" type="radio" value="append" checked>
|
|
31
|
+
<label for="import-mode-append" class="form-check-label">Append</label>
|
|
32
|
+
</div>
|
|
33
|
+
<div class="form-check">
|
|
34
|
+
<input id="import-mode-overwrite" class="form-check-input" name="mode" type="radio" value="overwrite">
|
|
35
|
+
<label for="import-mode-overwrite" class="form-check-label">Overwrite</label>
|
|
36
|
+
</div>
|
|
37
|
+
</div>
|
|
38
|
+
|
|
39
|
+
<div class="mb-3">
|
|
40
|
+
<label for="import-file-format" class="form-label">File format</label>
|
|
41
|
+
<select id="import-file-format" class="form-select" name="file_format">
|
|
42
|
+
<option value="csv" selected>CSV</option>
|
|
43
|
+
</select>
|
|
44
|
+
</div>
|
|
45
|
+
|
|
46
|
+
<div class="mb-3">
|
|
47
|
+
<label for="import-delimiter" class="form-label">Delimiter</label>
|
|
48
|
+
<input id="import-delimiter" class="form-control" name="delimiter" value="," required>
|
|
49
|
+
</div>
|
|
50
|
+
|
|
51
|
+
<div class="mb-3">
|
|
52
|
+
<label for="import-encoding" class="form-label">Encoding</label>
|
|
53
|
+
<select id="import-encoding" class="form-select" name="encoding">
|
|
54
|
+
<option value="utf-8" selected>UTF-8</option>
|
|
55
|
+
<option value="utf-16">UTF-16</option>
|
|
56
|
+
<option value="utf-32">UTF-32</option>
|
|
57
|
+
<option value="latin-1">Latin-1</option>
|
|
58
|
+
</select>
|
|
59
|
+
</div>
|
|
60
|
+
|
|
61
|
+
<div class="mb-3">
|
|
62
|
+
<div class="d-flex justify-content-end">
|
|
63
|
+
<button type="submit" class="btn btn-primary">
|
|
64
|
+
<i class="bi-upload" aria-hidden="true"></i> Import Data
|
|
65
|
+
</button>
|
|
66
|
+
</div>
|
|
67
|
+
</div>
|
|
68
|
+
</form>
|
|
69
|
+
</div>
|
|
70
|
+
</div>
|
|
71
|
+
{% endblock %}
|
|
@@ -2,6 +2,11 @@
|
|
|
2
2
|
{% import 'tables/_macros.html' as table_macros %}
|
|
3
3
|
|
|
4
4
|
{% block body %}
|
|
5
|
+
{% if error %}
|
|
6
|
+
<div class="alert alert-danger" role="alert">
|
|
7
|
+
{{ error.message }}
|
|
8
|
+
</div>
|
|
9
|
+
{% else %}
|
|
5
10
|
{{ table_macros.table_nav(table_id, 'overview') }}
|
|
6
11
|
|
|
7
12
|
<div class="row row-cols-1 row-cols-md-2 g-4">
|
|
@@ -81,4 +86,5 @@
|
|
|
81
86
|
</div>
|
|
82
87
|
</div>
|
|
83
88
|
</div>
|
|
89
|
+
{% endif %}
|
|
84
90
|
{% endblock %}
|
|
@@ -25,6 +25,12 @@
|
|
|
25
25
|
{{ error.message }}
|
|
26
26
|
</div>
|
|
27
27
|
{% else %}
|
|
28
|
+
<div class="d-flex justify-content-between align-items-center mb-2">
|
|
29
|
+
<h3>Results</h3>
|
|
30
|
+
<a href="/tables/query/csv?sql={{ sql_query | urlencode }}" class="btn btn-outline-secondary btn-sm">
|
|
31
|
+
<i class="bi-download" aria-hidden="true"></i> Export CSV
|
|
32
|
+
</a>
|
|
33
|
+
</div>
|
|
28
34
|
<div class="table-responsive">
|
|
29
35
|
<table class="table table-sm table-bordered table-striped table-hover">
|
|
30
36
|
<thead>
|
|
@@ -2,6 +2,11 @@
|
|
|
2
2
|
{% import 'tables/_macros.html' as table_macros %}
|
|
3
3
|
|
|
4
4
|
{% block body %}
|
|
5
|
+
{% if error %}
|
|
6
|
+
<div class="alert alert-danger" role="alert">
|
|
7
|
+
{{ error.message }}
|
|
8
|
+
</div>
|
|
9
|
+
{% else %}
|
|
5
10
|
{{ table_macros.table_nav(table_id, 'statistics') }}
|
|
6
11
|
|
|
7
12
|
<div class="row">
|
|
@@ -53,4 +58,5 @@
|
|
|
53
58
|
</div>
|
|
54
59
|
</div>
|
|
55
60
|
</div>
|
|
61
|
+
{% endif %}
|
|
56
62
|
{% endblock %}
|
|
@@ -2,6 +2,11 @@
|
|
|
2
2
|
{% import 'tables/_macros.html' as table_macros %}
|
|
3
3
|
|
|
4
4
|
{% block body %}
|
|
5
|
+
{% if error %}
|
|
6
|
+
<div class="alert alert-danger" role="alert">
|
|
7
|
+
{{ error.message }}
|
|
8
|
+
</div>
|
|
9
|
+
{% else %}
|
|
5
10
|
{{ table_macros.table_nav(table_id, 'view') }}
|
|
6
11
|
|
|
7
12
|
<div class="row">
|
|
@@ -93,4 +98,5 @@
|
|
|
93
98
|
</div>
|
|
94
99
|
</div>
|
|
95
100
|
</div>
|
|
101
|
+
{% endif %}
|
|
96
102
|
{% endblock %}
|