laketower 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of laketower might be problematic. Click here for more details.

laketower/tables.py CHANGED
@@ -1,5 +1,6 @@
1
+ import enum
1
2
  from datetime import datetime, timezone
2
- from typing import Any, Protocol
3
+ from typing import Any, BinaryIO, Protocol, TextIO
3
4
 
4
5
  import deltalake
5
6
  import duckdb
@@ -17,6 +18,15 @@ from laketower.config import ConfigTable, TableFormats
17
18
  DEFAULT_LIMIT = 10
18
19
 
19
20
 
21
class ImportModeEnum(str, enum.Enum):
    """Write mode for a table import.

    Members subclass ``str``; the member value is what
    ``DeltaTable.import_data`` passes to ``deltalake.write_deltalake`` as
    its ``mode`` argument.
    """

    append = "append"
    overwrite = "overwrite"
24
+
25
+
26
class ImportFileFormatEnum(str, enum.Enum):
    """File formats that ``import_file_to_table`` knows how to read.

    Members subclass ``str``, so each one compares equal to its value.
    """

    csv = "csv"
28
+
29
+
20
30
  class TableMetadata(pydantic.BaseModel):
21
31
  table_format: TableFormats
22
32
  name: str | None = None
@@ -43,17 +53,112 @@ class TableHistory(pydantic.BaseModel):
43
53
 
44
54
 
45
55
class TableProtocol(Protocol):  # pragma: no cover
    """Structural interface every table-format handler must provide.

    ``load_table`` looks a handler class up by table format, calls
    ``is_valid`` on it and, when that succeeds, instantiates it with the
    table configuration.
    """

    @classmethod
    def is_valid(cls, table_config: ConfigTable) -> bool:
        """Return whether ``table_config`` can be opened by this handler.

        ``load_table`` raises ``ValueError`` when this returns ``False``.
        """

    def __init__(self, table_config: ConfigTable) -> None:
        """Build the handler from a table configuration."""

    def metadata(self) -> TableMetadata:
        """Return the table metadata."""

    def schema(self) -> pa.Schema:
        """Return the table schema as a pyarrow schema."""

    def history(self) -> TableHistory:
        """Return the table history."""

    def dataset(self, version: int | str | None = None) -> padataset.Dataset:
        """Return the table as a pyarrow dataset.

        ``version`` is an optional version selector; ``None`` is the default.
        """

    def import_data(
        self, data: pd.DataFrame, mode: ImportModeEnum = ImportModeEnum.append
    ) -> None:
        """Write ``data`` into the table using the given import ``mode``."""
50
66
 
51
67
 
52
68
  class DeltaTable:
53
69
  def __init__(self, table_config: ConfigTable):
54
70
  super().__init__()
55
71
  self.table_config = table_config
56
- self._impl = deltalake.DeltaTable(table_config.uri)
72
+ storage_options = self._generate_storage_options(table_config)
73
+ self._impl = deltalake.DeltaTable(
74
+ table_config.uri, storage_options=storage_options
75
+ )
76
+
77
+ @classmethod
78
+ def _generate_storage_options(
79
+ cls, table_config: ConfigTable
80
+ ) -> dict[str, str] | None:
81
+ # documentation from `object-store` Rust crate:
82
+ # - s3: https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html
83
+ # - adls: https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html
84
+ storage_options = None
85
+ conn_s3 = (
86
+ table_config.connection.s3
87
+ if table_config.connection and table_config.connection.s3
88
+ else None
89
+ )
90
+ conn_adls = (
91
+ table_config.connection.adls
92
+ if table_config.connection and table_config.connection.adls
93
+ else None
94
+ )
95
+ if conn_s3:
96
+ storage_options = (
97
+ {
98
+ "aws_access_key_id": conn_s3.s3_access_key_id,
99
+ "aws_secret_access_key": conn_s3.s3_secret_access_key.get_secret_value(),
100
+ "aws_allow_http": str(conn_s3.s3_allow_http).lower(),
101
+ }
102
+ | ({"aws_region": conn_s3.s3_region} if conn_s3.s3_region else {})
103
+ | (
104
+ {"aws_endpoint_url": str(conn_s3.s3_endpoint_url).rstrip("/")}
105
+ if conn_s3.s3_endpoint_url
106
+ else {}
107
+ )
108
+ )
109
+ elif conn_adls:
110
+ storage_options = (
111
+ {
112
+ "azure_storage_account_name": conn_adls.adls_account_name,
113
+ "azure_use_azure_cli": str(conn_adls.use_azure_cli).lower(),
114
+ }
115
+ | (
116
+ {
117
+ "azure_storage_access_key": conn_adls.adls_access_key.get_secret_value()
118
+ }
119
+ if conn_adls.adls_access_key
120
+ else {}
121
+ )
122
+ | (
123
+ {"azure_storage_sas_key": conn_adls.adls_sas_key.get_secret_value()}
124
+ if conn_adls.adls_sas_key
125
+ else {}
126
+ )
127
+ | (
128
+ {"azure_storage_tenant_id": conn_adls.adls_tenant_id}
129
+ if conn_adls.adls_tenant_id
130
+ else {}
131
+ )
132
+ | (
133
+ {"azure_storage_client_id": conn_adls.adls_client_id}
134
+ if conn_adls.adls_client_id
135
+ else {}
136
+ )
137
+ | (
138
+ {
139
+ "azure_storage_client_secret": conn_adls.adls_client_secret.get_secret_value()
140
+ }
141
+ if conn_adls.adls_client_secret
142
+ else {}
143
+ )
144
+ | (
145
+ {
146
+ "azure_msi_endpoint": str(conn_adls.azure_msi_endpoint).rstrip(
147
+ "/"
148
+ )
149
+ }
150
+ if conn_adls.azure_msi_endpoint
151
+ else {}
152
+ )
153
+ )
154
+ return storage_options
155
+
156
+ @classmethod
157
+ def is_valid(cls, table_config: ConfigTable) -> bool:
158
+ storage_options = cls._generate_storage_options(table_config)
159
+ return deltalake.DeltaTable.is_deltatable(
160
+ table_config.uri, storage_options=storage_options
161
+ )
57
162
 
58
163
  def metadata(self) -> TableMetadata:
59
164
  metadata = self._impl.metadata()
@@ -72,7 +177,7 @@ class DeltaTable:
72
177
  )
73
178
 
74
179
  def schema(self) -> pa.Schema:
75
- return self._impl.schema().to_pyarrow()
180
+ return pa.schema(self._impl.schema().to_arrow()) # type: ignore[arg-type]
76
181
 
77
182
  def history(self) -> TableHistory:
78
183
  delta_history = self._impl.history()
@@ -96,10 +201,32 @@ class DeltaTable:
96
201
  self._impl.load_as_version(version)
97
202
  return self._impl.to_pyarrow_dataset()
98
203
 
204
+ def import_data(
205
+ self, data: pd.DataFrame, mode: ImportModeEnum = ImportModeEnum.append
206
+ ) -> None:
207
+ deltalake.write_deltalake(
208
+ self.table_config.uri, data, mode=mode.value, schema_mode="merge"
209
+ )
210
+
99
211
 
100
212
def load_table(table_config: ConfigTable) -> TableProtocol:
    """Instantiate the handler matching ``table_config.table_format``.

    Raises:
        ValueError: If the handler's ``is_valid`` check rejects the
            configuration.
    """
    handlers: dict[TableFormats, type[TableProtocol]] = {
        TableFormats.delta: DeltaTable,
    }
    handler_cls = handlers[table_config.table_format]
    if handler_cls.is_valid(table_config):
        return handler_cls(table_config)
    raise ValueError(f"Invalid table: {table_config.uri}")
220
+
221
+
222
def load_datasets(table_configs: list[ConfigTable]) -> dict[str, padataset.Dataset]:
    """Load the dataset of every table that can be opened, keyed by table name.

    Loading is best-effort: a table whose loading raises ``ValueError`` is
    left out of the result. Each skipped table is logged as a warning so a
    misconfigured entry does not disappear without a trace.
    """
    # Function-scope import: the module's import block is only partly visible.
    import logging

    logger = logging.getLogger(__name__)
    tables_dataset: dict[str, padataset.Dataset] = {}
    for table_config in table_configs:
        try:
            tables_dataset[table_config.name] = load_table(table_config).dataset()
        except ValueError as exc:
            logger.warning("Skipping table %r: %s", table_config.name, exc)
    return tables_dataset
103
230
 
104
231
 
105
232
  def generate_table_query(
@@ -110,21 +237,26 @@ def generate_table_query(
110
237
  sort_desc: str | None = None,
111
238
  ) -> str:
112
239
  query_expr = (
113
- sqlglot.select(*(cols or ["*"])).from_(table_name).limit(limit or DEFAULT_LIMIT)
240
+ sqlglot.select(*([f'"{col}"' for col in cols] if cols else ["*"]))
241
+ .from_(f'"{table_name}"')
242
+ .limit(limit or DEFAULT_LIMIT)
114
243
  )
115
244
  if sort_asc:
116
245
  query_expr = query_expr.order_by(f"{sort_asc} asc")
117
246
  elif sort_desc:
118
247
  query_expr = query_expr.order_by(f"{sort_desc} desc")
119
- return sqlglot.Generator(dialect=sqlglot.dialects.duckdb.DuckDB).generate(
120
- query_expr
121
- )
248
+ return query_expr.sql(dialect=sqlglot.dialects.duckdb.DuckDB, identify="always")
122
249
 
123
250
 
124
251
def generate_table_statistics_query(table_name: str) -> str:
    """Build the DuckDB query returning summary statistics for ``table_name``.

    The query selects ``column_name, count, avg, std, min, max`` from a
    ``SUMMARIZE`` subquery over the table, with every identifier quoted.

    Args:
        table_name: Name of the table to summarize. It is emitted as a single
            quoted identifier.

    Returns:
        The SQL text in the DuckDB dialect.
    """
    exp = sqlglot.expressions
    # Build a real quoted identifier instead of wrapping the name in '"' by
    # string formatting, so the SQL generator escapes any embedded quote.
    table_expr = exp.Table(this=exp.to_identifier(table_name, quoted=True))
    summarize_expr = exp.Summarize(this=table_expr)
    subquery_expr = exp.Subquery(this=summarize_expr)
    query_expr = sqlglot.select(
        "column_name", "count", "avg", "std", "min", "max"
    ).from_(subquery_expr)
    return query_expr.sql(dialect=sqlglot.dialects.duckdb.DuckDB, identify="always")
128
260
 
129
261
 
130
262
  def execute_query(
@@ -133,9 +265,31 @@ def execute_query(
133
265
  try:
134
266
  conn = duckdb.connect()
135
267
  for table_name, table_dataset in tables_datasets.items():
268
+ # ATTACH IF NOT EXISTS ':memory:' AS {catalog.name};
269
+ # CREATE SCHEMA IF NOT EXISTS {catalog.name}.{database.name};
270
+ # USE {catalog.name}.{database.name};
271
+ # CREATE VIEW IF NOT EXISTS {table.name} AS FROM {table.name}_dataset;
272
+
136
273
  view_name = f"{table_name}_view"
137
274
  conn.register(view_name, table_dataset)
138
- conn.execute(f"create table {table_name} as select * from {view_name}") # nosec B608
275
+ conn.execute(f'create table "{table_name}" as select * from "{view_name}"') # nosec B608
139
276
  return conn.execute(sql_query).df()
140
277
  except duckdb.Error as e:
141
278
  raise ValueError(str(e)) from e
279
+
280
+
281
def import_file_to_table(
    table_config: ConfigTable,
    file_path: BinaryIO | TextIO,
    mode: ImportModeEnum = ImportModeEnum.append,
    file_format: ImportFileFormatEnum = ImportFileFormatEnum.csv,
    delimiter: str = ",",
    encoding: str = "utf-8",
) -> int:
    """Read a file and import its rows into the configured table.

    Args:
        table_config: Configuration of the destination table.
        file_path: Open binary or text file object to read from (despite the
            name, the annotation is a file object, not a path).
        mode: Import mode forwarded to the table's ``import_data``.
        file_format: Format of the input; selects the reader.
        delimiter: Field separator handed to the reader.
        encoding: Text encoding handed to the reader.

    Returns:
        The number of rows read from the file.
    """

    def _read_csv(handle, sep, enc):
        return pd.read_csv(handle, sep=sep, encoding=enc)

    readers = {ImportFileFormatEnum.csv: _read_csv}
    # The table is loaded before the file is parsed, so an invalid table is
    # reported before any parsing happens.
    table = load_table(table_config)
    frame = readers[file_format](file_path, delimiter, encoding)
    table.import_data(frame, mode=mode)
    return len(frame)
@@ -34,6 +34,12 @@
34
34
  {{ error.message }}
35
35
  </div>
36
36
  {% else %}
37
+ <div class="d-flex justify-content-between align-items-center mb-2">
38
+ <h3>Results</h3>
39
+ <a href="/tables/query/csv?sql={{ query.sql | urlencode }}" class="btn btn-outline-secondary btn-sm">
40
+ <i class="bi-download" aria-hidden="true"></i> Export CSV
41
+ </a>
42
+ </div>
37
43
  <div class="table-responsive">
38
44
  <table class="table table-sm table-bordered table-striped table-hover">
39
45
  <thead>
@@ -12,5 +12,8 @@
12
12
  <li class="nav-item">
13
13
  <a class="nav-link{% if current == 'history' %} active{% endif %}"{% if current == 'history' %} aria-current="true"{% endif %} href="/tables/{{ table_id }}/history">History</a>
14
14
  </li>
15
+ <li class="nav-item">
16
+ <a class="nav-link{% if current == 'import' %} active{% endif %}"{% if current == 'import' %} aria-current="true"{% endif %} href="/tables/{{ table_id }}/import">Import</a>
17
+ </li>
15
18
  </ul>
16
19
  {%- endmacro %}
@@ -2,6 +2,11 @@
2
2
  {% import 'tables/_macros.html' as table_macros %}
3
3
 
4
4
  {% block body %}
5
+ {% if error %}
6
+ <div class="alert alert-danger" role="alert">
7
+ {{ error.message }}
8
+ </div>
9
+ {% else %}
5
10
  {{ table_macros.table_nav(table_id, 'history') }}
6
11
 
7
12
  <div class="row">
@@ -39,4 +44,5 @@
39
44
  {% endfor %}
40
45
  </div>
41
46
  </div>
47
+ {% endif %}
42
48
  {% endblock %}
@@ -0,0 +1,71 @@
1
+ {% extends "_base.html" %}
2
+ {% import 'tables/_macros.html' as table_macros %}
3
+
4
+ {% block body %}
5
+ {{ table_macros.table_nav(table_id, 'import') }}
6
+
7
+ {% if message %}
8
+ {% if message.type == 'success' %}{% set alert_type = 'success' %}
9
+ {% elif message.type == 'error' %}{% set alert_type = 'danger' %}
10
+ {% else %}{% set alert_type = 'primary' %}
11
+ {% endif %}
12
+
13
+ <div class="alert alert-{{ alert_type }}" role="alert">
14
+ {{ message.body }}
15
+ </div>
16
+ {% endif %}
17
+
18
+ <div class="row justify-content-center">
19
+ <div class="col-12 col-md-8 col-lg-4">
20
+ <form action="{{ request.url.path }}" method="post" enctype="multipart/form-data">
21
+ <div class="mb-3">
22
+ <label for="import-file-input" class="form-label">Input file</label>
23
+ <input id="import-file-input" class="form-control" name="input_file" type="file" accept=".csv" required>
24
+ </div>
25
+
26
+ <div class="mb-3">
27
+ <label class="form-label">Mode</label>
28
+
29
+ <div class="form-check">
30
+ <input id="import-mode-append" class="form-check-input" name="mode" type="radio" value="append" checked>
31
+ <label for="import-mode-append" class="form-check-label">Append</label>
32
+ </div>
33
+ <div class="form-check">
34
+ <input id="import-mode-overwrite" class="form-check-input" name="mode" type="radio" value="overwrite">
35
+ <label for="import-mode-overwrite" class="form-check-label">Overwrite</label>
36
+ </div>
37
+ </div>
38
+
39
+ <div class="mb-3">
40
+ <label for="import-file-format" class="form-label">File format</label>
41
+ <select id="import-file-format" class="form-select" name="file_format">
42
+ <option value="csv" selected>CSV</option>
43
+ </select>
44
+ </div>
45
+
46
+ <div class="mb-3">
47
+ <label for="import-delimiter" class="form-label">Delimiter</label>
48
+ <input id="import-delimiter" class="form-control" name="delimiter" value="," required>
49
+ </div>
50
+
51
+ <div class="mb-3">
52
+ <label for="import-encoding" class="form-label">Encoding</label>
53
+ <select id="import-encoding" class="form-select" name="encoding">
54
+ <option value="utf-8" selected>UTF-8</option>
55
+ <option value="utf-16">UTF-16</option>
56
+ <option value="utf-32">UTF-32</option>
57
+ <option value="latin-1">Latin-1</option>
58
+ </select>
59
+ </div>
60
+
61
+ <div class="mb-3">
62
+ <div class="d-flex justify-content-end">
63
+ <button type="submit" class="btn btn-primary">
64
+ <i class="bi-upload" aria-hidden="true"></i> Import Data
65
+ </button>
66
+ </div>
67
+ </div>
68
+ </form>
69
+ </div>
70
+ </div>
71
+ {% endblock %}
@@ -2,6 +2,11 @@
2
2
  {% import 'tables/_macros.html' as table_macros %}
3
3
 
4
4
  {% block body %}
5
+ {% if error %}
6
+ <div class="alert alert-danger" role="alert">
7
+ {{ error.message }}
8
+ </div>
9
+ {% else %}
5
10
  {{ table_macros.table_nav(table_id, 'overview') }}
6
11
 
7
12
  <div class="row row-cols-1 row-cols-md-2 g-4">
@@ -81,4 +86,5 @@
81
86
  </div>
82
87
  </div>
83
88
  </div>
89
+ {% endif %}
84
90
  {% endblock %}
@@ -25,6 +25,12 @@
25
25
  {{ error.message }}
26
26
  </div>
27
27
  {% else %}
28
+ <div class="d-flex justify-content-between align-items-center mb-2">
29
+ <h3>Results</h3>
30
+ <a href="/tables/query/csv?sql={{ sql_query | urlencode }}" class="btn btn-outline-secondary btn-sm">
31
+ <i class="bi-download" aria-hidden="true"></i> Export CSV
32
+ </a>
33
+ </div>
28
34
  <div class="table-responsive">
29
35
  <table class="table table-sm table-bordered table-striped table-hover">
30
36
  <thead>
@@ -2,6 +2,11 @@
2
2
  {% import 'tables/_macros.html' as table_macros %}
3
3
 
4
4
  {% block body %}
5
+ {% if error %}
6
+ <div class="alert alert-danger" role="alert">
7
+ {{ error.message }}
8
+ </div>
9
+ {% else %}
5
10
  {{ table_macros.table_nav(table_id, 'statistics') }}
6
11
 
7
12
  <div class="row">
@@ -53,4 +58,5 @@
53
58
  </div>
54
59
  </div>
55
60
  </div>
61
+ {% endif %}
56
62
  {% endblock %}
@@ -2,6 +2,11 @@
2
2
  {% import 'tables/_macros.html' as table_macros %}
3
3
 
4
4
  {% block body %}
5
+ {% if error %}
6
+ <div class="alert alert-danger" role="alert">
7
+ {{ error.message }}
8
+ </div>
9
+ {% else %}
5
10
  {{ table_macros.table_nav(table_id, 'view') }}
6
11
 
7
12
  <div class="row">
@@ -93,4 +98,5 @@
93
98
  </div>
94
99
  </div>
95
100
  </div>
101
+ {% endif %}
96
102
  {% endblock %}