moose-lib 0.6.90__py3-none-any.whl → 0.6.283__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- moose_lib/__init__.py +38 -3
- moose_lib/blocks.py +497 -37
- moose_lib/clients/redis_client.py +26 -14
- moose_lib/commons.py +94 -5
- moose_lib/config/config_file.py +44 -2
- moose_lib/config/runtime.py +137 -5
- moose_lib/data_models.py +451 -46
- moose_lib/dmv2/__init__.py +88 -60
- moose_lib/dmv2/_registry.py +3 -1
- moose_lib/dmv2/_source_capture.py +37 -0
- moose_lib/dmv2/consumption.py +55 -32
- moose_lib/dmv2/ingest_api.py +9 -2
- moose_lib/dmv2/ingest_pipeline.py +56 -13
- moose_lib/dmv2/life_cycle.py +3 -1
- moose_lib/dmv2/materialized_view.py +24 -14
- moose_lib/dmv2/moose_model.py +165 -0
- moose_lib/dmv2/olap_table.py +304 -119
- moose_lib/dmv2/registry.py +28 -3
- moose_lib/dmv2/sql_resource.py +16 -8
- moose_lib/dmv2/stream.py +241 -21
- moose_lib/dmv2/types.py +14 -8
- moose_lib/dmv2/view.py +13 -6
- moose_lib/dmv2/web_app.py +175 -0
- moose_lib/dmv2/web_app_helpers.py +96 -0
- moose_lib/dmv2/workflow.py +37 -9
- moose_lib/internal.py +537 -68
- moose_lib/main.py +87 -56
- moose_lib/query_builder.py +18 -5
- moose_lib/query_param.py +54 -20
- moose_lib/secrets.py +122 -0
- moose_lib/streaming/streaming_function_runner.py +266 -156
- moose_lib/utilities/sql.py +0 -1
- {moose_lib-0.6.90.dist-info → moose_lib-0.6.283.dist-info}/METADATA +19 -1
- moose_lib-0.6.283.dist-info/RECORD +63 -0
- tests/__init__.py +1 -1
- tests/conftest.py +38 -1
- tests/test_backward_compatibility.py +85 -0
- tests/test_cluster_validation.py +85 -0
- tests/test_codec.py +75 -0
- tests/test_column_formatting.py +80 -0
- tests/test_fixedstring.py +43 -0
- tests/test_iceberg_config.py +105 -0
- tests/test_int_types.py +211 -0
- tests/test_kafka_config.py +141 -0
- tests/test_materialized.py +74 -0
- tests/test_metadata.py +37 -0
- tests/test_moose.py +21 -30
- tests/test_moose_model.py +153 -0
- tests/test_olap_table_moosemodel.py +89 -0
- tests/test_olap_table_versioning.py +210 -0
- tests/test_query_builder.py +97 -9
- tests/test_redis_client.py +10 -3
- tests/test_s3queue_config.py +211 -110
- tests/test_secrets.py +239 -0
- tests/test_simple_aggregate.py +114 -0
- tests/test_web_app.py +227 -0
- moose_lib-0.6.90.dist-info/RECORD +0 -42
- {moose_lib-0.6.90.dist-info → moose_lib-0.6.283.dist-info}/WHEEL +0 -0
- {moose_lib-0.6.90.dist-info → moose_lib-0.6.283.dist-info}/top_level.txt +0 -0
moose_lib/blocks.py
CHANGED
@@ -14,66 +14,297 @@ class ClickHouseEngines(Enum):
     VersionedCollapsingMergeTree = "VersionedCollapsingMergeTree"
     GraphiteMergeTree = "GraphiteMergeTree"
     S3Queue = "S3Queue"
+    S3 = "S3"
+    Buffer = "Buffer"
+    Distributed = "Distributed"
+    IcebergS3 = "IcebergS3"
+    Kafka = "Kafka"
+    ReplicatedMergeTree = "ReplicatedMergeTree"
+    ReplicatedReplacingMergeTree = "ReplicatedReplacingMergeTree"
+    ReplicatedAggregatingMergeTree = "ReplicatedAggregatingMergeTree"
+    ReplicatedSummingMergeTree = "ReplicatedSummingMergeTree"
+    ReplicatedCollapsingMergeTree = "ReplicatedCollapsingMergeTree"
+    ReplicatedVersionedCollapsingMergeTree = "ReplicatedVersionedCollapsingMergeTree"
+

 # ==========================
 # New Engine Configuration Classes
 # ==========================

+
 @dataclass
 class EngineConfig(ABC):
     """Base class for engine configurations"""
+
     pass

+
 @dataclass
 class MergeTreeEngine(EngineConfig):
     """Configuration for MergeTree engine"""
+
     pass

-@dataclass
+
+@dataclass
 class ReplacingMergeTreeEngine(EngineConfig):
     """Configuration for ReplacingMergeTree engine (with deduplication)
-
+
     Args:
         ver: Optional column name for version tracking
         is_deleted: Optional column name for deletion marking (requires ver)
     """
+
     ver: Optional[str] = None
     is_deleted: Optional[str] = None
-
+
     def __post_init__(self):
         if self.is_deleted and not self.ver:
             raise ValueError("is_deleted requires ver to be specified")

+
 @dataclass
 class AggregatingMergeTreeEngine(EngineConfig):
     """Configuration for AggregatingMergeTree engine"""
+
     pass

+
 @dataclass
 class SummingMergeTreeEngine(EngineConfig):
-    """Configuration for SummingMergeTree engine
-
+    """Configuration for SummingMergeTree engine
+
+    Args:
+        columns: Optional list of column names to sum
+    """
+
+    columns: Optional[List[str]] = None
+
+
+@dataclass
+class CollapsingMergeTreeEngine(EngineConfig):
+    """Configuration for CollapsingMergeTree engine
+
+    Args:
+        sign: Column name indicating row type (1 = state, -1 = cancel)
+    """
+
+    sign: str
+
+    def __post_init__(self):
+        if not self.sign:
+            raise ValueError("sign column is required for CollapsingMergeTree")
+
+
+@dataclass
+class VersionedCollapsingMergeTreeEngine(EngineConfig):
+    """Configuration for VersionedCollapsingMergeTree engine
+
+    Args:
+        sign: Column name indicating row type (1 = state, -1 = cancel)
+        ver: Column name for object state versioning
+    """
+
+    sign: str
+    ver: str
+
+    def __post_init__(self):
+        if not self.sign:
+            raise ValueError("sign column is required for VersionedCollapsingMergeTree")
+        if not self.ver:
+            raise ValueError("ver column is required for VersionedCollapsingMergeTree")
+
+
+@dataclass
+class ReplicatedMergeTreeEngine(EngineConfig):
+    """Configuration for ReplicatedMergeTree engine (replicated version of MergeTree)
+
+    Args:
+        keeper_path: Keeper path for replication (e.g., '/clickhouse/tables/{database}/{shard}/table_name')
+            Optional: omit for ClickHouse Cloud which manages replication automatically
+        replica_name: Replica name (e.g., '{replica}')
+            Optional: omit for ClickHouse Cloud which manages replication automatically
+
+    Note: Both keeper_path and replica_name must be provided together, or both omitted.
+    """
+
+    keeper_path: Optional[str] = None
+    replica_name: Optional[str] = None
+
+    def __post_init__(self):
+        # Both must be provided or both must be None
+        if (self.keeper_path is None) != (self.replica_name is None):
+            raise ValueError(
+                "keeper_path and replica_name must both be provided or both be None"
+            )
+
+
+@dataclass
+class ReplicatedReplacingMergeTreeEngine(EngineConfig):
+    """Configuration for ReplicatedReplacingMergeTree engine (replicated version with deduplication)
+
+    Args:
+        keeper_path: Keeper path for replication (e.g., '/clickhouse/tables/{database}/{shard}/table_name')
+            Optional: omit for ClickHouse Cloud which manages replication automatically
+        replica_name: Replica name (e.g., '{replica}')
+            Optional: omit for ClickHouse Cloud which manages replication automatically
+        ver: Optional column name for version tracking
+        is_deleted: Optional column name for deletion marking (requires ver)
+
+    Note: Both keeper_path and replica_name must be provided together, or both omitted.
+    """
+
+    keeper_path: Optional[str] = None
+    replica_name: Optional[str] = None
+    ver: Optional[str] = None
+    is_deleted: Optional[str] = None
+
+    def __post_init__(self):
+        # Both must be provided or both must be None
+        if (self.keeper_path is None) != (self.replica_name is None):
+            raise ValueError(
+                "keeper_path and replica_name must both be provided or both be None"
+            )
+        if self.is_deleted and not self.ver:
+            raise ValueError("is_deleted requires ver to be specified")
+
+
+@dataclass
+class ReplicatedAggregatingMergeTreeEngine(EngineConfig):
+    """Configuration for ReplicatedAggregatingMergeTree engine (replicated version for aggregations)
+
+    Args:
+        keeper_path: Keeper path for replication (e.g., '/clickhouse/tables/{database}/{shard}/table_name')
+            Optional: omit for ClickHouse Cloud which manages replication automatically
+        replica_name: Replica name (e.g., '{replica}')
+            Optional: omit for ClickHouse Cloud which manages replication automatically
+
+    Note: Both keeper_path and replica_name must be provided together, or both omitted.
+    """
+
+    keeper_path: Optional[str] = None
+    replica_name: Optional[str] = None
+
+    def __post_init__(self):
+        # Both must be provided or both must be None
+        if (self.keeper_path is None) != (self.replica_name is None):
+            raise ValueError(
+                "keeper_path and replica_name must both be provided or both be None"
+            )
+
+
+@dataclass
+class ReplicatedSummingMergeTreeEngine(EngineConfig):
+    """Configuration for ReplicatedSummingMergeTree engine (replicated version for summation)
+
+    Args:
+        keeper_path: Keeper path for replication (e.g., '/clickhouse/tables/{database}/{shard}/table_name')
+            Optional: omit for ClickHouse Cloud which manages replication automatically
+        replica_name: Replica name (e.g., '{replica}')
+            Optional: omit for ClickHouse Cloud which manages replication automatically
+        columns: Optional list of column names to sum
+
+    Note: Both keeper_path and replica_name must be provided together, or both omitted.
+    """
+
+    keeper_path: Optional[str] = None
+    replica_name: Optional[str] = None
+    columns: Optional[List[str]] = None
+
+    def __post_init__(self):
+        # Both must be provided or both must be None
+        if (self.keeper_path is None) != (self.replica_name is None):
+            raise ValueError(
+                "keeper_path and replica_name must both be provided or both be None"
+            )
+
+
+@dataclass
+class ReplicatedCollapsingMergeTreeEngine(EngineConfig):
+    """Configuration for ReplicatedCollapsingMergeTree engine (replicated version with collapsing)
+
+    Args:
+        keeper_path: Keeper path for replication (e.g., '/clickhouse/tables/{database}/{shard}/table_name')
+            Optional: omit for ClickHouse Cloud which manages replication automatically
+        replica_name: Replica name (e.g., '{replica}')
+            Optional: omit for ClickHouse Cloud which manages replication automatically
+        sign: Column name indicating row type (1 = state, -1 = cancel)
+
+    Note: Both keeper_path and replica_name must be provided together, or both omitted.
+    """
+
+    keeper_path: Optional[str] = None
+    replica_name: Optional[str] = None
+    sign: str = field(default=None)
+
+    def __post_init__(self):
+        # Both must be provided or both must be None
+        if (self.keeper_path is None) != (self.replica_name is None):
+            raise ValueError(
+                "keeper_path and replica_name must both be provided or both be None"
+            )
+        if not self.sign:
+            raise ValueError(
+                "sign column is required for ReplicatedCollapsingMergeTree"
+            )
+
+
+@dataclass
+class ReplicatedVersionedCollapsingMergeTreeEngine(EngineConfig):
+    """Configuration for ReplicatedVersionedCollapsingMergeTree engine (replicated version with versioned collapsing)
+
+    Args:
+        keeper_path: Keeper path for replication (e.g., '/clickhouse/tables/{database}/{shard}/table_name')
+            Optional: omit for ClickHouse Cloud which manages replication automatically
+        replica_name: Replica name (e.g., '{replica}')
+            Optional: omit for ClickHouse Cloud which manages replication automatically
+        sign: Column name indicating row type (1 = state, -1 = cancel)
+        ver: Column name for object state versioning
+
+    Note: Both keeper_path and replica_name must be provided together, or both omitted.
+    """
+
+    keeper_path: Optional[str] = None
+    replica_name: Optional[str] = None
+    sign: str = field(default=None)
+    ver: str = field(default=None)
+
+    def __post_init__(self):
+        # Both must be provided or both must be None
+        if (self.keeper_path is None) != (self.replica_name is None):
+            raise ValueError(
+                "keeper_path and replica_name must both be provided or both be None"
+            )
+        if not self.sign:
+            raise ValueError(
+                "sign column is required for ReplicatedVersionedCollapsingMergeTree"
+            )
+        if not self.ver:
+            raise ValueError(
+                "ver column is required for ReplicatedVersionedCollapsingMergeTree"
+            )
+

 @dataclass
 class S3QueueEngine(EngineConfig):
     """Configuration for S3Queue engine - only non-alterable constructor parameters.
-
+
     S3Queue-specific settings like 'mode', 'keeper_path', etc. should be specified
     in the settings field of OlapConfig, not here.
     """
-
+
     # Required fields
     s3_path: str  # S3 bucket path with wildcards (e.g., 's3://bucket/prefix/*.json')
-    format: str
-
+    format: str  # Data format (e.g., 'JSONEachRow', 'CSV', 'Parquet')
+
     # Optional AWS credentials
     aws_access_key_id: Optional[str] = None
     aws_secret_access_key: Optional[str] = None
-
+
     # Optional configuration
     compression: Optional[str] = None  # e.g., 'gzip', 'zstd'
     headers: Optional[Dict[str, str]] = None
-
+
     def __post_init__(self):
         """Validate required fields"""
         if not self.s3_path:
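A minimal sketch of the validation rules these new engine dataclasses enforce, assuming the classes import from moose_lib.blocks as defined above (the column names are illustrative):

# Sketch only: exercises the __post_init__ validation shown in the hunk above.
from moose_lib.blocks import ReplacingMergeTreeEngine, ReplicatedMergeTreeEngine

# ver alone is valid; is_deleted without ver is rejected.
dedup = ReplacingMergeTreeEngine(ver="updated_at", is_deleted="deleted")
try:
    ReplacingMergeTreeEngine(is_deleted="deleted")
except ValueError as err:
    print(err)  # "is_deleted requires ver to be specified"

# keeper_path and replica_name must be given together, or both omitted
# (e.g., on ClickHouse Cloud, which manages replication automatically).
ReplicatedMergeTreeEngine(
    keeper_path="/clickhouse/tables/{database}/{shard}/events",
    replica_name="{replica}",
)
try:
    ReplicatedMergeTreeEngine(replica_name="{replica}")  # missing keeper_path
except ValueError as err:
    print(err)  # "keeper_path and replica_name must both be provided or both be None"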
@@ -81,34 +312,233 @@ class S3QueueEngine(EngineConfig):
         if not self.format:
             raise ValueError("S3Queue engine requires 'format'")

+
+@dataclass
+class S3Engine(EngineConfig):
+    """Configuration for S3 engine - direct read/write from S3 storage.
+
+    Args:
+        path: S3 path to the data file(s) (e.g., 's3://bucket/path/file.json')
+        format: Data format (e.g., 'JSONEachRow', 'CSV', 'Parquet')
+        aws_access_key_id: AWS access key ID (optional, omit for public buckets)
+        aws_secret_access_key: AWS secret access key (optional, omit for public buckets)
+        compression: Compression type (e.g., 'gzip', 'zstd', 'auto')
+        partition_strategy: Optional partition strategy
+        partition_columns_in_data_file: Optional partition columns in data file
+    """
+
+    # Required fields
+    path: str
+    format: str
+
+    # Optional fields
+    aws_access_key_id: Optional[str] = None
+    aws_secret_access_key: Optional[str] = None
+    compression: Optional[str] = None
+    partition_strategy: Optional[str] = None
+    partition_columns_in_data_file: Optional[str] = None
+
+    def __post_init__(self):
+        """Validate required fields"""
+        if not self.path:
+            raise ValueError("S3 engine requires 'path'")
+        if not self.format:
+            raise ValueError("S3 engine requires 'format'")
+
+
+@dataclass
+class BufferEngine(EngineConfig):
+    """Configuration for Buffer engine - in-memory buffer that flushes to a destination table.
+
+    Args:
+        target_database: Target database name for the destination table
+        target_table: Target table name where data will be flushed
+        num_layers: Number of buffer layers (typically 16)
+        min_time: Minimum time in seconds before flushing
+        max_time: Maximum time in seconds before flushing
+        min_rows: Minimum number of rows before flushing
+        max_rows: Maximum number of rows before flushing
+        min_bytes: Minimum bytes before flushing
+        max_bytes: Maximum bytes before flushing
+        flush_time: Optional flush time in seconds
+        flush_rows: Optional flush number of rows
+        flush_bytes: Optional flush number of bytes
+    """
+
+    # Required fields
+    target_database: str
+    target_table: str
+    num_layers: int
+    min_time: int
+    max_time: int
+    min_rows: int
+    max_rows: int
+    min_bytes: int
+    max_bytes: int
+
+    # Optional fields
+    flush_time: Optional[int] = None
+    flush_rows: Optional[int] = None
+    flush_bytes: Optional[int] = None
+
+    def __post_init__(self):
+        """Validate required fields"""
+        if not self.target_database:
+            raise ValueError("Buffer engine requires 'target_database'")
+        if not self.target_table:
+            raise ValueError("Buffer engine requires 'target_table'")
+
+
+@dataclass
+class DistributedEngine(EngineConfig):
+    """Configuration for Distributed engine - distributed table across a cluster.
+
+    Args:
+        cluster: Cluster name from the ClickHouse configuration
+        target_database: Database name on the cluster
+        target_table: Table name on the cluster
+        sharding_key: Optional sharding key expression for data distribution
+        policy_name: Optional policy name for data distribution
+    """
+
+    # Required fields
+    cluster: str
+    target_database: str
+    target_table: str
+
+    # Optional fields
+    sharding_key: Optional[str] = None
+    policy_name: Optional[str] = None
+
+    def __post_init__(self):
+        """Validate required fields"""
+        if not self.cluster:
+            raise ValueError("Distributed engine requires 'cluster'")
+        if not self.target_database:
+            raise ValueError("Distributed engine requires 'target_database'")
+        if not self.target_table:
+            raise ValueError("Distributed engine requires 'target_table'")
+
+
+@dataclass
+class IcebergS3Engine(EngineConfig):
+    """Configuration for IcebergS3 engine - read-only Iceberg table access.
+
+    Provides direct querying of Apache Iceberg tables stored on S3.
+    Data is not copied; queries stream directly from Parquet/ORC files.
+
+    Args:
+        path: S3 path to Iceberg table root (e.g., 's3://bucket/warehouse/events/')
+        format: Data format - 'Parquet' or 'ORC'
+        aws_access_key_id: AWS access key ID (optional, omit for public buckets or IAM roles)
+        aws_secret_access_key: AWS secret access key (optional)
+        compression: Compression type (optional: 'gzip', 'zstd', 'auto')
+
+    Example:
+        >>> from moose_lib import OlapTable, OlapConfig, moose_runtime_env
+        >>> from moose_lib.blocks import IcebergS3Engine
+        >>>
+        >>> lake_events = OlapTable[Event](
+        ...     "lake_events",
+        ...     OlapConfig(
+        ...         engine=IcebergS3Engine(
+        ...             path="s3://datalake/events/",
+        ...             format="Parquet",
+        ...             aws_access_key_id=moose_runtime_env.get("AWS_ACCESS_KEY_ID"),
+        ...             aws_secret_access_key=moose_runtime_env.get("AWS_SECRET_ACCESS_KEY")
+        ...         )
+        ...     )
+        ... )
+
+    Note:
+        - IcebergS3 engine is read-only
+        - Does not support ORDER BY, PARTITION BY, or SAMPLE BY clauses
+        - Queries always see the latest Iceberg snapshot (with metadata cache)
+    """
+
+    # Required fields
+    path: str
+    format: str
+
+    # Optional fields
+    aws_access_key_id: Optional[str] = None
+    aws_secret_access_key: Optional[str] = None
+    compression: Optional[str] = None
+
+    def __post_init__(self):
+        """Validate required fields"""
+        if not self.path:
+            raise ValueError("IcebergS3 engine requires 'path'")
+        if not self.format:
+            raise ValueError("IcebergS3 engine requires 'format'")
+        if self.format not in ["Parquet", "ORC"]:
+            raise ValueError(
+                f"IcebergS3 format must be 'Parquet' or 'ORC', got '{self.format}'"
+            )
+
+
+@dataclass
+class KafkaEngine(EngineConfig):
+    """Kafka engine for streaming data from Kafka topics.
+
+    Args:
+        broker_list: Kafka broker addresses (e.g., 'kafka:9092')
+        topic_list: Topics to consume from
+        group_name: Consumer group identifier
+        format: Message format (e.g., 'JSONEachRow')
+
+    Additional settings (kafka_num_consumers, security) go in OlapConfig.settings.
+    """
+
+    broker_list: str
+    topic_list: str
+    group_name: str
+    format: str
+
+    def __post_init__(self):
+        """Validate required fields"""
+        if not self.broker_list:
+            raise ValueError("Kafka engine requires 'broker_list'")
+        if not self.topic_list:
+            raise ValueError("Kafka engine requires 'topic_list'")
+        if not self.group_name:
+            raise ValueError("Kafka engine requires 'group_name'")
+        if not self.format:
+            raise ValueError("Kafka engine requires 'format'")
+
+
 # ==========================
 # New Table Configuration (Recommended API)
 # ==========================

+
 @dataclass
 class TableConfig:
     """Modern table configuration with engine-specific settings"""
-
+
     # Engine configuration (required in new API)
     engine: EngineConfig
-
+
     # Common settings
     name: str
     columns: Dict[str, str]
     order_by: Optional[str] = None
-
+
     # Note: Factory methods (with_s3_queue, with_merge_tree, with_replacing_merge_tree)
     # were removed in ENG-856. Use direct configuration instead, e.g.:
     # TableConfig(name="table", columns={...}, engine=S3QueueEngine(s3_path="...", format="..."))
     # TableConfig(name="table", columns={...}, engine=ReplacingMergeTreeEngine(ver="updated_at"))

+
 # ==========================
 # Legacy API Support (Deprecated)
 # ==========================

+
 @dataclass
 class S3QueueEngineConfig:
     """Legacy S3Queue configuration (deprecated - use S3QueueEngine instead)"""
+
     path: str  # S3 path pattern (e.g., 's3://bucket/data/*.json')
     format: str  # Data format (e.g., 'JSONEachRow', 'CSV', etc.)
     # Optional S3 access credentials - can be NOSIGN for public buckets
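The TableConfig comments above replace the removed factory methods with direct engine configuration. A minimal sketch, assuming these names import from moose_lib.blocks as defined in this file (bucket path and columns are illustrative):

# Sketch only: direct new-API configuration, per the comments in TableConfig.
from moose_lib.blocks import TableConfig, S3QueueEngine, ReplacingMergeTreeEngine

queue_table = TableConfig(
    name="raw_events",
    columns={"id": "String", "ts": "DateTime"},
    engine=S3QueueEngine(s3_path="s3://bucket/prefix/*.json", format="JSONEachRow"),
)

dedup_table = TableConfig(
    name="events_dedup",
    columns={"id": "String", "updated_at": "DateTime"},
    engine=ReplacingMergeTreeEngine(ver="updated_at"),
    order_by="id",
)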
@@ -119,37 +549,48 @@ class S3QueueEngineConfig:
     # Optional headers
     headers: Optional[Dict[str, str]] = None

+
 @dataclass
 class TableCreateOptions:
     name: str
     columns: Dict[str, str]
     engine: Optional[ClickHouseEngines] = ClickHouseEngines.MergeTree
     order_by: Optional[str] = None
-    s3_queue_engine_config: Optional[S3QueueEngineConfig] = None  # Required when engine is S3Queue
+    s3_queue_engine_config: Optional[S3QueueEngineConfig] = (
+        None  # Required when engine is S3Queue
+    )

     def __post_init__(self):
         """Validate S3Queue configuration"""
-        if self.engine == ClickHouseEngines.S3Queue and self.s3_queue_engine_config is None:
+        if (
+            self.engine == ClickHouseEngines.S3Queue
+            and self.s3_queue_engine_config is None
+        ):
             raise ValueError(
                 "s3_queue_engine_config is required when using ClickHouseEngines.S3Queue engine. "
                 "Please provide s3_queue_engine_config with path, format, and optional settings."
             )

+
 # ==========================
 # Backward Compatibility Layer
 # ==========================

+
 def is_new_config(config: Any) -> bool:
     """Check if configuration uses new API"""
     if isinstance(config, TableConfig):
         return True
-    if hasattr(config, "engine") and isinstance(getattr(config, "engine"), EngineConfig):
+    if hasattr(config, "engine") and isinstance(
+        getattr(config, "engine"), EngineConfig
+    ):
         return True
     return False

+
 def migrate_legacy_config(legacy: TableCreateOptions) -> TableConfig:
     """Convert legacy configuration to new format"""
-
+
     # Show deprecation warning
     warnings.warn(
         "Using deprecated TableCreateOptions. Please migrate to TableConfig:\n"
@@ -157,9 +598,9 @@ def migrate_legacy_config(legacy: TableCreateOptions) -> TableConfig:
         "- For deduplication: Use TableConfig(name='table', columns={...}, engine=ReplacingMergeTreeEngine())\n"
         "See documentation for examples.",
         DeprecationWarning,
-        stacklevel=2
+        stacklevel=2,
     )
-
+
     # Handle S3Queue with separate config
     if legacy.engine == ClickHouseEngines.S3Queue and legacy.s3_queue_engine_config:
         s3_config = legacy.s3_queue_engine_config
@@ -172,11 +613,11 @@ def migrate_legacy_config(legacy: TableCreateOptions) -> TableConfig:
                 aws_access_key_id=s3_config.aws_access_key_id,
                 aws_secret_access_key=s3_config.aws_secret_access_key,
                 compression=s3_config.compression,
-                headers=s3_config.headers
+                headers=s3_config.headers,
             ),
-            order_by=legacy.order_by
+            order_by=legacy.order_by,
         )
-
+
     # Map legacy engine enum to new engine classes
     engine_map = {
         ClickHouseEngines.MergeTree: MergeTreeEngine(),
@@ -184,97 +625,115 @@ def migrate_legacy_config(legacy: TableCreateOptions) -> TableConfig:
         ClickHouseEngines.AggregatingMergeTree: AggregatingMergeTreeEngine(),
         ClickHouseEngines.SummingMergeTree: SummingMergeTreeEngine(),
     }
-
+
     engine = engine_map.get(legacy.engine) if legacy.engine else MergeTreeEngine()
     if engine is None:
         engine = MergeTreeEngine()
-
+
     return TableConfig(
         name=legacy.name,
         columns=legacy.columns,
         engine=engine,
-        order_by=legacy.order_by
+        order_by=legacy.order_by,
     )

+
 def normalize_config(config: Union[TableConfig, TableCreateOptions]) -> TableConfig:
     """Normalize any configuration format to new API"""
     if is_new_config(config):
         return config  # type: ignore
     return migrate_legacy_config(config)  # type: ignore

+
 @dataclass
 class AggregationCreateOptions:
     table_create_options: TableCreateOptions
     materialized_view_name: str
     select: str

+
 @dataclass
 class AggregationDropOptions:
     view_name: str
     table_name: str

+
 @dataclass
 class MaterializedViewCreateOptions:
     name: str
     destination_table: str
     select: str

+
 @dataclass
 class PopulateTableOptions:
     destination_table: str
     select: str

+
 @dataclass
 class Blocks:
     teardown: list[str]
     setup: list[str]

+
 def drop_aggregation(options: AggregationDropOptions) -> list[str]:
     """
     Drops an aggregation's view & underlying table.
     """
     return [drop_view(options.view_name), drop_table(options.table_name)]

+
 def drop_table(name: str) -> str:
     """
     Drops an existing table if it exists.
     """
     return f"DROP TABLE IF EXISTS {name}".strip()

+
 def drop_view(name: str) -> str:
     """
     Drops an existing view if it exists.
     """
     return f"DROP VIEW IF EXISTS {name}".strip()

+
 def create_aggregation(options: AggregationCreateOptions) -> list[str]:
     """
     Creates an aggregation which includes a table, materialized view, and initial data load.
     """
     return [
         create_table(options.table_create_options),
-        create_materialized_view(MaterializedViewCreateOptions(
-            name=options.materialized_view_name,
-            destination_table=options.table_create_options.name,
-            select=options.select
-        )),
-        populate_table(PopulateTableOptions(
-            destination_table=options.table_create_options.name,
-            select=options.select
-        )),
+        create_materialized_view(
+            MaterializedViewCreateOptions(
+                name=options.materialized_view_name,
+                destination_table=options.table_create_options.name,
+                select=options.select,
+            )
+        ),
+        populate_table(
+            PopulateTableOptions(
+                destination_table=options.table_create_options.name,
+                select=options.select,
+            )
+        ),
     ]

+
 def create_materialized_view(options: MaterializedViewCreateOptions) -> str:
     """
     Creates a materialized view.
     """
     return f"CREATE MATERIALIZED VIEW IF NOT EXISTS {options.name} \nTO {options.destination_table}\nAS {options.select}".strip()

+
 def create_table(options: TableCreateOptions) -> str:
     """
     Creates a new table with default MergeTree engine.
     """
-    column_definitions = ",\n".join([f"{name} {type}" for name, type in options.columns.items()])
+    column_definitions = ",\n".join(
+        [f"{name} {type}" for name, type in options.columns.items()]
+    )
     order_by_clause = f"ORDER BY {options.order_by}" if options.order_by else ""
     engine = options.engine.value if options.engine else "MergeTree"

@@ -287,8 +746,9 @@ def create_table(options: TableCreateOptions) -> str:
     {order_by_clause}
     """.strip()

+
 def populate_table(options: PopulateTableOptions) -> str:
     """
     Populates a table with data.
     """
-    return f"INSERT INTO {options.destination_table}\n{options.select}".strip()
+    return f"INSERT INTO {options.destination_table}\n{options.select}".strip()