moose-lib 0.6.148.dev3442438466__py3-none-any.whl → 0.6.283__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- moose_lib/__init__.py +34 -3
- moose_lib/blocks.py +416 -52
- moose_lib/clients/redis_client.py +26 -14
- moose_lib/commons.py +37 -30
- moose_lib/config/config_file.py +5 -1
- moose_lib/config/runtime.py +73 -34
- moose_lib/data_models.py +331 -61
- moose_lib/dmv2/__init__.py +69 -73
- moose_lib/dmv2/_registry.py +2 -1
- moose_lib/dmv2/_source_capture.py +37 -0
- moose_lib/dmv2/consumption.py +55 -32
- moose_lib/dmv2/ingest_api.py +9 -2
- moose_lib/dmv2/ingest_pipeline.py +35 -16
- moose_lib/dmv2/life_cycle.py +3 -1
- moose_lib/dmv2/materialized_view.py +24 -14
- moose_lib/dmv2/moose_model.py +165 -0
- moose_lib/dmv2/olap_table.py +299 -151
- moose_lib/dmv2/registry.py +18 -3
- moose_lib/dmv2/sql_resource.py +16 -8
- moose_lib/dmv2/stream.py +75 -23
- moose_lib/dmv2/types.py +14 -8
- moose_lib/dmv2/view.py +13 -6
- moose_lib/dmv2/web_app.py +11 -6
- moose_lib/dmv2/web_app_helpers.py +5 -1
- moose_lib/dmv2/workflow.py +37 -9
- moose_lib/internal.py +340 -56
- moose_lib/main.py +87 -56
- moose_lib/query_builder.py +18 -5
- moose_lib/query_param.py +54 -20
- moose_lib/secrets.py +122 -0
- moose_lib/streaming/streaming_function_runner.py +233 -117
- moose_lib/utilities/sql.py +0 -1
- {moose_lib-0.6.148.dev3442438466.dist-info → moose_lib-0.6.283.dist-info}/METADATA +18 -1
- moose_lib-0.6.283.dist-info/RECORD +63 -0
- tests/__init__.py +1 -1
- tests/conftest.py +6 -5
- tests/test_backward_compatibility.py +85 -0
- tests/test_cluster_validation.py +85 -0
- tests/test_codec.py +75 -0
- tests/test_column_formatting.py +80 -0
- tests/test_fixedstring.py +43 -0
- tests/test_iceberg_config.py +105 -0
- tests/test_int_types.py +211 -0
- tests/test_kafka_config.py +141 -0
- tests/test_materialized.py +74 -0
- tests/test_metadata.py +37 -0
- tests/test_moose.py +21 -30
- tests/test_moose_model.py +153 -0
- tests/test_olap_table_moosemodel.py +89 -0
- tests/test_olap_table_versioning.py +52 -58
- tests/test_query_builder.py +97 -9
- tests/test_redis_client.py +10 -3
- tests/test_s3queue_config.py +211 -110
- tests/test_secrets.py +239 -0
- tests/test_simple_aggregate.py +42 -40
- tests/test_web_app.py +11 -5
- moose_lib-0.6.148.dev3442438466.dist-info/RECORD +0 -47
- {moose_lib-0.6.148.dev3442438466.dist-info → moose_lib-0.6.283.dist-info}/WHEEL +0 -0
- {moose_lib-0.6.148.dev3442438466.dist-info → moose_lib-0.6.283.dist-info}/top_level.txt +0 -0
moose_lib/blocks.py
CHANGED
|
@@ -14,78 +14,135 @@ class ClickHouseEngines(Enum):
|
|
|
14
14
|
VersionedCollapsingMergeTree = "VersionedCollapsingMergeTree"
|
|
15
15
|
GraphiteMergeTree = "GraphiteMergeTree"
|
|
16
16
|
S3Queue = "S3Queue"
|
|
17
|
+
S3 = "S3"
|
|
18
|
+
Buffer = "Buffer"
|
|
19
|
+
Distributed = "Distributed"
|
|
20
|
+
IcebergS3 = "IcebergS3"
|
|
21
|
+
Kafka = "Kafka"
|
|
17
22
|
ReplicatedMergeTree = "ReplicatedMergeTree"
|
|
18
23
|
ReplicatedReplacingMergeTree = "ReplicatedReplacingMergeTree"
|
|
19
24
|
ReplicatedAggregatingMergeTree = "ReplicatedAggregatingMergeTree"
|
|
20
25
|
ReplicatedSummingMergeTree = "ReplicatedSummingMergeTree"
|
|
26
|
+
ReplicatedCollapsingMergeTree = "ReplicatedCollapsingMergeTree"
|
|
27
|
+
ReplicatedVersionedCollapsingMergeTree = "ReplicatedVersionedCollapsingMergeTree"
|
|
28
|
+
|
|
21
29
|
|
|
22
30
|
# ==========================
|
|
23
31
|
# New Engine Configuration Classes
|
|
24
32
|
# ==========================
|
|
25
33
|
|
|
34
|
+
|
|
26
35
|
@dataclass
|
|
27
36
|
class EngineConfig(ABC):
|
|
28
37
|
"""Base class for engine configurations"""
|
|
38
|
+
|
|
29
39
|
pass
|
|
30
40
|
|
|
41
|
+
|
|
31
42
|
@dataclass
|
|
32
43
|
class MergeTreeEngine(EngineConfig):
|
|
33
44
|
"""Configuration for MergeTree engine"""
|
|
45
|
+
|
|
34
46
|
pass
|
|
35
47
|
|
|
36
|
-
|
|
48
|
+
|
|
49
|
+
@dataclass
|
|
37
50
|
class ReplacingMergeTreeEngine(EngineConfig):
|
|
38
51
|
"""Configuration for ReplacingMergeTree engine (with deduplication)
|
|
39
|
-
|
|
52
|
+
|
|
40
53
|
Args:
|
|
41
54
|
ver: Optional column name for version tracking
|
|
42
55
|
is_deleted: Optional column name for deletion marking (requires ver)
|
|
43
56
|
"""
|
|
57
|
+
|
|
44
58
|
ver: Optional[str] = None
|
|
45
59
|
is_deleted: Optional[str] = None
|
|
46
|
-
|
|
60
|
+
|
|
47
61
|
def __post_init__(self):
|
|
48
62
|
if self.is_deleted and not self.ver:
|
|
49
63
|
raise ValueError("is_deleted requires ver to be specified")
|
|
50
64
|
|
|
65
|
+
|
|
51
66
|
@dataclass
|
|
52
67
|
class AggregatingMergeTreeEngine(EngineConfig):
|
|
53
68
|
"""Configuration for AggregatingMergeTree engine"""
|
|
69
|
+
|
|
54
70
|
pass
|
|
55
71
|
|
|
72
|
+
|
|
56
73
|
@dataclass
|
|
57
74
|
class SummingMergeTreeEngine(EngineConfig):
|
|
58
75
|
"""Configuration for SummingMergeTree engine
|
|
59
|
-
|
|
76
|
+
|
|
60
77
|
Args:
|
|
61
78
|
columns: Optional list of column names to sum
|
|
62
79
|
"""
|
|
80
|
+
|
|
63
81
|
columns: Optional[List[str]] = None
|
|
64
82
|
|
|
83
|
+
|
|
84
|
+
@dataclass
|
|
85
|
+
class CollapsingMergeTreeEngine(EngineConfig):
|
|
86
|
+
"""Configuration for CollapsingMergeTree engine
|
|
87
|
+
|
|
88
|
+
Args:
|
|
89
|
+
sign: Column name indicating row type (1 = state, -1 = cancel)
|
|
90
|
+
"""
|
|
91
|
+
|
|
92
|
+
sign: str
|
|
93
|
+
|
|
94
|
+
def __post_init__(self):
|
|
95
|
+
if not self.sign:
|
|
96
|
+
raise ValueError("sign column is required for CollapsingMergeTree")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@dataclass
|
|
100
|
+
class VersionedCollapsingMergeTreeEngine(EngineConfig):
|
|
101
|
+
"""Configuration for VersionedCollapsingMergeTree engine
|
|
102
|
+
|
|
103
|
+
Args:
|
|
104
|
+
sign: Column name indicating row type (1 = state, -1 = cancel)
|
|
105
|
+
ver: Column name for object state versioning
|
|
106
|
+
"""
|
|
107
|
+
|
|
108
|
+
sign: str
|
|
109
|
+
ver: str
|
|
110
|
+
|
|
111
|
+
def __post_init__(self):
|
|
112
|
+
if not self.sign:
|
|
113
|
+
raise ValueError("sign column is required for VersionedCollapsingMergeTree")
|
|
114
|
+
if not self.ver:
|
|
115
|
+
raise ValueError("ver column is required for VersionedCollapsingMergeTree")
|
|
116
|
+
|
|
117
|
+
|
|
65
118
|
@dataclass
|
|
66
119
|
class ReplicatedMergeTreeEngine(EngineConfig):
|
|
67
120
|
"""Configuration for ReplicatedMergeTree engine (replicated version of MergeTree)
|
|
68
|
-
|
|
121
|
+
|
|
69
122
|
Args:
|
|
70
123
|
keeper_path: Keeper path for replication (e.g., '/clickhouse/tables/{database}/{shard}/table_name')
|
|
71
124
|
Optional: omit for ClickHouse Cloud which manages replication automatically
|
|
72
125
|
replica_name: Replica name (e.g., '{replica}')
|
|
73
126
|
Optional: omit for ClickHouse Cloud which manages replication automatically
|
|
74
|
-
|
|
127
|
+
|
|
75
128
|
Note: Both keeper_path and replica_name must be provided together, or both omitted.
|
|
76
129
|
"""
|
|
130
|
+
|
|
77
131
|
keeper_path: Optional[str] = None
|
|
78
132
|
replica_name: Optional[str] = None
|
|
79
|
-
|
|
133
|
+
|
|
80
134
|
def __post_init__(self):
|
|
81
135
|
# Both must be provided or both must be None
|
|
82
136
|
if (self.keeper_path is None) != (self.replica_name is None):
|
|
83
|
-
raise ValueError(
|
|
137
|
+
raise ValueError(
|
|
138
|
+
"keeper_path and replica_name must both be provided or both be None"
|
|
139
|
+
)
|
|
140
|
+
|
|
84
141
|
|
|
85
142
|
@dataclass
|
|
86
143
|
class ReplicatedReplacingMergeTreeEngine(EngineConfig):
|
|
87
144
|
"""Configuration for ReplicatedReplacingMergeTree engine (replicated version with deduplication)
|
|
88
|
-
|
|
145
|
+
|
|
89
146
|
Args:
|
|
90
147
|
keeper_path: Keeper path for replication (e.g., '/clickhouse/tables/{database}/{shard}/table_name')
|
|
91
148
|
Optional: omit for ClickHouse Cloud which manages replication automatically
|
|
@@ -93,83 +150,161 @@ class ReplicatedReplacingMergeTreeEngine(EngineConfig):
|
|
|
93
150
|
Optional: omit for ClickHouse Cloud which manages replication automatically
|
|
94
151
|
ver: Optional column name for version tracking
|
|
95
152
|
is_deleted: Optional column name for deletion marking (requires ver)
|
|
96
|
-
|
|
153
|
+
|
|
97
154
|
Note: Both keeper_path and replica_name must be provided together, or both omitted.
|
|
98
155
|
"""
|
|
156
|
+
|
|
99
157
|
keeper_path: Optional[str] = None
|
|
100
158
|
replica_name: Optional[str] = None
|
|
101
159
|
ver: Optional[str] = None
|
|
102
160
|
is_deleted: Optional[str] = None
|
|
103
|
-
|
|
161
|
+
|
|
104
162
|
def __post_init__(self):
|
|
105
163
|
# Both must be provided or both must be None
|
|
106
164
|
if (self.keeper_path is None) != (self.replica_name is None):
|
|
107
|
-
raise ValueError(
|
|
165
|
+
raise ValueError(
|
|
166
|
+
"keeper_path and replica_name must both be provided or both be None"
|
|
167
|
+
)
|
|
108
168
|
if self.is_deleted and not self.ver:
|
|
109
169
|
raise ValueError("is_deleted requires ver to be specified")
|
|
110
170
|
|
|
171
|
+
|
|
111
172
|
@dataclass
|
|
112
173
|
class ReplicatedAggregatingMergeTreeEngine(EngineConfig):
|
|
113
174
|
"""Configuration for ReplicatedAggregatingMergeTree engine (replicated version for aggregations)
|
|
114
|
-
|
|
175
|
+
|
|
115
176
|
Args:
|
|
116
177
|
keeper_path: Keeper path for replication (e.g., '/clickhouse/tables/{database}/{shard}/table_name')
|
|
117
178
|
Optional: omit for ClickHouse Cloud which manages replication automatically
|
|
118
179
|
replica_name: Replica name (e.g., '{replica}')
|
|
119
180
|
Optional: omit for ClickHouse Cloud which manages replication automatically
|
|
120
|
-
|
|
181
|
+
|
|
121
182
|
Note: Both keeper_path and replica_name must be provided together, or both omitted.
|
|
122
183
|
"""
|
|
184
|
+
|
|
123
185
|
keeper_path: Optional[str] = None
|
|
124
186
|
replica_name: Optional[str] = None
|
|
125
|
-
|
|
187
|
+
|
|
126
188
|
def __post_init__(self):
|
|
127
189
|
# Both must be provided or both must be None
|
|
128
190
|
if (self.keeper_path is None) != (self.replica_name is None):
|
|
129
|
-
raise ValueError(
|
|
191
|
+
raise ValueError(
|
|
192
|
+
"keeper_path and replica_name must both be provided or both be None"
|
|
193
|
+
)
|
|
194
|
+
|
|
130
195
|
|
|
131
196
|
@dataclass
|
|
132
197
|
class ReplicatedSummingMergeTreeEngine(EngineConfig):
|
|
133
198
|
"""Configuration for ReplicatedSummingMergeTree engine (replicated version for summation)
|
|
134
|
-
|
|
199
|
+
|
|
135
200
|
Args:
|
|
136
201
|
keeper_path: Keeper path for replication (e.g., '/clickhouse/tables/{database}/{shard}/table_name')
|
|
137
202
|
Optional: omit for ClickHouse Cloud which manages replication automatically
|
|
138
203
|
replica_name: Replica name (e.g., '{replica}')
|
|
139
204
|
Optional: omit for ClickHouse Cloud which manages replication automatically
|
|
140
205
|
columns: Optional list of column names to sum
|
|
141
|
-
|
|
206
|
+
|
|
142
207
|
Note: Both keeper_path and replica_name must be provided together, or both omitted.
|
|
143
208
|
"""
|
|
209
|
+
|
|
144
210
|
keeper_path: Optional[str] = None
|
|
145
211
|
replica_name: Optional[str] = None
|
|
146
212
|
columns: Optional[List[str]] = None
|
|
147
|
-
|
|
213
|
+
|
|
214
|
+
def __post_init__(self):
|
|
215
|
+
# Both must be provided or both must be None
|
|
216
|
+
if (self.keeper_path is None) != (self.replica_name is None):
|
|
217
|
+
raise ValueError(
|
|
218
|
+
"keeper_path and replica_name must both be provided or both be None"
|
|
219
|
+
)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
@dataclass
|
|
223
|
+
class ReplicatedCollapsingMergeTreeEngine(EngineConfig):
|
|
224
|
+
"""Configuration for ReplicatedCollapsingMergeTree engine (replicated version with collapsing)
|
|
225
|
+
|
|
226
|
+
Args:
|
|
227
|
+
keeper_path: Keeper path for replication (e.g., '/clickhouse/tables/{database}/{shard}/table_name')
|
|
228
|
+
Optional: omit for ClickHouse Cloud which manages replication automatically
|
|
229
|
+
replica_name: Replica name (e.g., '{replica}')
|
|
230
|
+
Optional: omit for ClickHouse Cloud which manages replication automatically
|
|
231
|
+
sign: Column name indicating row type (1 = state, -1 = cancel)
|
|
232
|
+
|
|
233
|
+
Note: Both keeper_path and replica_name must be provided together, or both omitted.
|
|
234
|
+
"""
|
|
235
|
+
|
|
236
|
+
keeper_path: Optional[str] = None
|
|
237
|
+
replica_name: Optional[str] = None
|
|
238
|
+
sign: str = field(default=None)
|
|
239
|
+
|
|
148
240
|
def __post_init__(self):
|
|
149
241
|
# Both must be provided or both must be None
|
|
150
242
|
if (self.keeper_path is None) != (self.replica_name is None):
|
|
151
|
-
raise ValueError(
|
|
243
|
+
raise ValueError(
|
|
244
|
+
"keeper_path and replica_name must both be provided or both be None"
|
|
245
|
+
)
|
|
246
|
+
if not self.sign:
|
|
247
|
+
raise ValueError(
|
|
248
|
+
"sign column is required for ReplicatedCollapsingMergeTree"
|
|
249
|
+
)
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
@dataclass
|
|
253
|
+
class ReplicatedVersionedCollapsingMergeTreeEngine(EngineConfig):
|
|
254
|
+
"""Configuration for ReplicatedVersionedCollapsingMergeTree engine (replicated version with versioned collapsing)
|
|
255
|
+
|
|
256
|
+
Args:
|
|
257
|
+
keeper_path: Keeper path for replication (e.g., '/clickhouse/tables/{database}/{shard}/table_name')
|
|
258
|
+
Optional: omit for ClickHouse Cloud which manages replication automatically
|
|
259
|
+
replica_name: Replica name (e.g., '{replica}')
|
|
260
|
+
Optional: omit for ClickHouse Cloud which manages replication automatically
|
|
261
|
+
sign: Column name indicating row type (1 = state, -1 = cancel)
|
|
262
|
+
ver: Column name for object state versioning
|
|
263
|
+
|
|
264
|
+
Note: Both keeper_path and replica_name must be provided together, or both omitted.
|
|
265
|
+
"""
|
|
266
|
+
|
|
267
|
+
keeper_path: Optional[str] = None
|
|
268
|
+
replica_name: Optional[str] = None
|
|
269
|
+
sign: str = field(default=None)
|
|
270
|
+
ver: str = field(default=None)
|
|
271
|
+
|
|
272
|
+
def __post_init__(self):
|
|
273
|
+
# Both must be provided or both must be None
|
|
274
|
+
if (self.keeper_path is None) != (self.replica_name is None):
|
|
275
|
+
raise ValueError(
|
|
276
|
+
"keeper_path and replica_name must both be provided or both be None"
|
|
277
|
+
)
|
|
278
|
+
if not self.sign:
|
|
279
|
+
raise ValueError(
|
|
280
|
+
"sign column is required for ReplicatedVersionedCollapsingMergeTree"
|
|
281
|
+
)
|
|
282
|
+
if not self.ver:
|
|
283
|
+
raise ValueError(
|
|
284
|
+
"ver column is required for ReplicatedVersionedCollapsingMergeTree"
|
|
285
|
+
)
|
|
286
|
+
|
|
152
287
|
|
|
153
288
|
@dataclass
|
|
154
289
|
class S3QueueEngine(EngineConfig):
|
|
155
290
|
"""Configuration for S3Queue engine - only non-alterable constructor parameters.
|
|
156
|
-
|
|
291
|
+
|
|
157
292
|
S3Queue-specific settings like 'mode', 'keeper_path', etc. should be specified
|
|
158
293
|
in the settings field of OlapConfig, not here.
|
|
159
294
|
"""
|
|
160
|
-
|
|
295
|
+
|
|
161
296
|
# Required fields
|
|
162
297
|
s3_path: str # S3 bucket path with wildcards (e.g., 's3://bucket/prefix/*.json')
|
|
163
|
-
format: str
|
|
164
|
-
|
|
298
|
+
format: str # Data format (e.g., 'JSONEachRow', 'CSV', 'Parquet')
|
|
299
|
+
|
|
165
300
|
# Optional AWS credentials
|
|
166
301
|
aws_access_key_id: Optional[str] = None
|
|
167
302
|
aws_secret_access_key: Optional[str] = None
|
|
168
|
-
|
|
303
|
+
|
|
169
304
|
# Optional configuration
|
|
170
305
|
compression: Optional[str] = None # e.g., 'gzip', 'zstd'
|
|
171
306
|
headers: Optional[Dict[str, str]] = None
|
|
172
|
-
|
|
307
|
+
|
|
173
308
|
def __post_init__(self):
|
|
174
309
|
"""Validate required fields"""
|
|
175
310
|
if not self.s3_path:
|
|
@@ -177,34 +312,233 @@ class S3QueueEngine(EngineConfig):
|
|
|
177
312
|
if not self.format:
|
|
178
313
|
raise ValueError("S3Queue engine requires 'format'")
|
|
179
314
|
|
|
315
|
+
|
|
316
|
+
@dataclass
|
|
317
|
+
class S3Engine(EngineConfig):
|
|
318
|
+
"""Configuration for S3 engine - direct read/write from S3 storage.
|
|
319
|
+
|
|
320
|
+
Args:
|
|
321
|
+
path: S3 path to the data file(s) (e.g., 's3://bucket/path/file.json')
|
|
322
|
+
format: Data format (e.g., 'JSONEachRow', 'CSV', 'Parquet')
|
|
323
|
+
aws_access_key_id: AWS access key ID (optional, omit for public buckets)
|
|
324
|
+
aws_secret_access_key: AWS secret access key (optional, omit for public buckets)
|
|
325
|
+
compression: Compression type (e.g., 'gzip', 'zstd', 'auto')
|
|
326
|
+
partition_strategy: Optional partition strategy
|
|
327
|
+
partition_columns_in_data_file: Optional partition columns in data file
|
|
328
|
+
"""
|
|
329
|
+
|
|
330
|
+
# Required fields
|
|
331
|
+
path: str
|
|
332
|
+
format: str
|
|
333
|
+
|
|
334
|
+
# Optional fields
|
|
335
|
+
aws_access_key_id: Optional[str] = None
|
|
336
|
+
aws_secret_access_key: Optional[str] = None
|
|
337
|
+
compression: Optional[str] = None
|
|
338
|
+
partition_strategy: Optional[str] = None
|
|
339
|
+
partition_columns_in_data_file: Optional[str] = None
|
|
340
|
+
|
|
341
|
+
def __post_init__(self):
|
|
342
|
+
"""Validate required fields"""
|
|
343
|
+
if not self.path:
|
|
344
|
+
raise ValueError("S3 engine requires 'path'")
|
|
345
|
+
if not self.format:
|
|
346
|
+
raise ValueError("S3 engine requires 'format'")
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
@dataclass
|
|
350
|
+
class BufferEngine(EngineConfig):
|
|
351
|
+
"""Configuration for Buffer engine - in-memory buffer that flushes to a destination table.
|
|
352
|
+
|
|
353
|
+
Args:
|
|
354
|
+
target_database: Target database name for the destination table
|
|
355
|
+
target_table: Target table name where data will be flushed
|
|
356
|
+
num_layers: Number of buffer layers (typically 16)
|
|
357
|
+
min_time: Minimum time in seconds before flushing
|
|
358
|
+
max_time: Maximum time in seconds before flushing
|
|
359
|
+
min_rows: Minimum number of rows before flushing
|
|
360
|
+
max_rows: Maximum number of rows before flushing
|
|
361
|
+
min_bytes: Minimum bytes before flushing
|
|
362
|
+
max_bytes: Maximum bytes before flushing
|
|
363
|
+
flush_time: Optional flush time in seconds
|
|
364
|
+
flush_rows: Optional flush number of rows
|
|
365
|
+
flush_bytes: Optional flush number of bytes
|
|
366
|
+
"""
|
|
367
|
+
|
|
368
|
+
# Required fields
|
|
369
|
+
target_database: str
|
|
370
|
+
target_table: str
|
|
371
|
+
num_layers: int
|
|
372
|
+
min_time: int
|
|
373
|
+
max_time: int
|
|
374
|
+
min_rows: int
|
|
375
|
+
max_rows: int
|
|
376
|
+
min_bytes: int
|
|
377
|
+
max_bytes: int
|
|
378
|
+
|
|
379
|
+
# Optional fields
|
|
380
|
+
flush_time: Optional[int] = None
|
|
381
|
+
flush_rows: Optional[int] = None
|
|
382
|
+
flush_bytes: Optional[int] = None
|
|
383
|
+
|
|
384
|
+
def __post_init__(self):
|
|
385
|
+
"""Validate required fields"""
|
|
386
|
+
if not self.target_database:
|
|
387
|
+
raise ValueError("Buffer engine requires 'target_database'")
|
|
388
|
+
if not self.target_table:
|
|
389
|
+
raise ValueError("Buffer engine requires 'target_table'")
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
@dataclass
|
|
393
|
+
class DistributedEngine(EngineConfig):
|
|
394
|
+
"""Configuration for Distributed engine - distributed table across a cluster.
|
|
395
|
+
|
|
396
|
+
Args:
|
|
397
|
+
cluster: Cluster name from the ClickHouse configuration
|
|
398
|
+
target_database: Database name on the cluster
|
|
399
|
+
target_table: Table name on the cluster
|
|
400
|
+
sharding_key: Optional sharding key expression for data distribution
|
|
401
|
+
policy_name: Optional policy name for data distribution
|
|
402
|
+
"""
|
|
403
|
+
|
|
404
|
+
# Required fields
|
|
405
|
+
cluster: str
|
|
406
|
+
target_database: str
|
|
407
|
+
target_table: str
|
|
408
|
+
|
|
409
|
+
# Optional fields
|
|
410
|
+
sharding_key: Optional[str] = None
|
|
411
|
+
policy_name: Optional[str] = None
|
|
412
|
+
|
|
413
|
+
def __post_init__(self):
|
|
414
|
+
"""Validate required fields"""
|
|
415
|
+
if not self.cluster:
|
|
416
|
+
raise ValueError("Distributed engine requires 'cluster'")
|
|
417
|
+
if not self.target_database:
|
|
418
|
+
raise ValueError("Distributed engine requires 'target_database'")
|
|
419
|
+
if not self.target_table:
|
|
420
|
+
raise ValueError("Distributed engine requires 'target_table'")
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
@dataclass
|
|
424
|
+
class IcebergS3Engine(EngineConfig):
|
|
425
|
+
"""Configuration for IcebergS3 engine - read-only Iceberg table access.
|
|
426
|
+
|
|
427
|
+
Provides direct querying of Apache Iceberg tables stored on S3.
|
|
428
|
+
Data is not copied; queries stream directly from Parquet/ORC files.
|
|
429
|
+
|
|
430
|
+
Args:
|
|
431
|
+
path: S3 path to Iceberg table root (e.g., 's3://bucket/warehouse/events/')
|
|
432
|
+
format: Data format - 'Parquet' or 'ORC'
|
|
433
|
+
aws_access_key_id: AWS access key ID (optional, omit for public buckets or IAM roles)
|
|
434
|
+
aws_secret_access_key: AWS secret access key (optional)
|
|
435
|
+
compression: Compression type (optional: 'gzip', 'zstd', 'auto')
|
|
436
|
+
|
|
437
|
+
Example:
|
|
438
|
+
>>> from moose_lib import OlapTable, OlapConfig, moose_runtime_env
|
|
439
|
+
>>> from moose_lib.blocks import IcebergS3Engine
|
|
440
|
+
>>>
|
|
441
|
+
>>> lake_events = OlapTable[Event](
|
|
442
|
+
... "lake_events",
|
|
443
|
+
... OlapConfig(
|
|
444
|
+
... engine=IcebergS3Engine(
|
|
445
|
+
... path="s3://datalake/events/",
|
|
446
|
+
... format="Parquet",
|
|
447
|
+
... aws_access_key_id=moose_runtime_env.get("AWS_ACCESS_KEY_ID"),
|
|
448
|
+
... aws_secret_access_key=moose_runtime_env.get("AWS_SECRET_ACCESS_KEY")
|
|
449
|
+
... )
|
|
450
|
+
... )
|
|
451
|
+
... )
|
|
452
|
+
|
|
453
|
+
Note:
|
|
454
|
+
- IcebergS3 engine is read-only
|
|
455
|
+
- Does not support ORDER BY, PARTITION BY, or SAMPLE BY clauses
|
|
456
|
+
- Queries always see the latest Iceberg snapshot (with metadata cache)
|
|
457
|
+
"""
|
|
458
|
+
|
|
459
|
+
# Required fields
|
|
460
|
+
path: str
|
|
461
|
+
format: str
|
|
462
|
+
|
|
463
|
+
# Optional fields
|
|
464
|
+
aws_access_key_id: Optional[str] = None
|
|
465
|
+
aws_secret_access_key: Optional[str] = None
|
|
466
|
+
compression: Optional[str] = None
|
|
467
|
+
|
|
468
|
+
def __post_init__(self):
|
|
469
|
+
"""Validate required fields"""
|
|
470
|
+
if not self.path:
|
|
471
|
+
raise ValueError("IcebergS3 engine requires 'path'")
|
|
472
|
+
if not self.format:
|
|
473
|
+
raise ValueError("IcebergS3 engine requires 'format'")
|
|
474
|
+
if self.format not in ["Parquet", "ORC"]:
|
|
475
|
+
raise ValueError(
|
|
476
|
+
f"IcebergS3 format must be 'Parquet' or 'ORC', got '{self.format}'"
|
|
477
|
+
)
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
@dataclass
|
|
481
|
+
class KafkaEngine(EngineConfig):
|
|
482
|
+
"""Kafka engine for streaming data from Kafka topics.
|
|
483
|
+
|
|
484
|
+
Args:
|
|
485
|
+
broker_list: Kafka broker addresses (e.g., 'kafka:9092')
|
|
486
|
+
topic_list: Topics to consume from
|
|
487
|
+
group_name: Consumer group identifier
|
|
488
|
+
format: Message format (e.g., 'JSONEachRow')
|
|
489
|
+
|
|
490
|
+
Additional settings (kafka_num_consumers, security) go in OlapConfig.settings.
|
|
491
|
+
"""
|
|
492
|
+
|
|
493
|
+
broker_list: str
|
|
494
|
+
topic_list: str
|
|
495
|
+
group_name: str
|
|
496
|
+
format: str
|
|
497
|
+
|
|
498
|
+
def __post_init__(self):
|
|
499
|
+
"""Validate required fields"""
|
|
500
|
+
if not self.broker_list:
|
|
501
|
+
raise ValueError("Kafka engine requires 'broker_list'")
|
|
502
|
+
if not self.topic_list:
|
|
503
|
+
raise ValueError("Kafka engine requires 'topic_list'")
|
|
504
|
+
if not self.group_name:
|
|
505
|
+
raise ValueError("Kafka engine requires 'group_name'")
|
|
506
|
+
if not self.format:
|
|
507
|
+
raise ValueError("Kafka engine requires 'format'")
|
|
508
|
+
|
|
509
|
+
|
|
180
510
|
# ==========================
|
|
181
511
|
# New Table Configuration (Recommended API)
|
|
182
512
|
# ==========================
|
|
183
513
|
|
|
514
|
+
|
|
184
515
|
@dataclass
|
|
185
516
|
class TableConfig:
|
|
186
517
|
"""Modern table configuration with engine-specific settings"""
|
|
187
|
-
|
|
518
|
+
|
|
188
519
|
# Engine configuration (required in new API)
|
|
189
520
|
engine: EngineConfig
|
|
190
|
-
|
|
521
|
+
|
|
191
522
|
# Common settings
|
|
192
523
|
name: str
|
|
193
524
|
columns: Dict[str, str]
|
|
194
525
|
order_by: Optional[str] = None
|
|
195
|
-
|
|
526
|
+
|
|
196
527
|
# Note: Factory methods (with_s3_queue, with_merge_tree, with_replacing_merge_tree)
|
|
197
528
|
# were removed in ENG-856. Use direct configuration instead, e.g.:
|
|
198
529
|
# TableConfig(name="table", columns={...}, engine=S3QueueEngine(s3_path="...", format="..."))
|
|
199
530
|
# TableConfig(name="table", columns={...}, engine=ReplacingMergeTreeEngine(ver="updated_at"))
|
|
200
531
|
|
|
532
|
+
|
|
201
533
|
# ==========================
|
|
202
534
|
# Legacy API Support (Deprecated)
|
|
203
535
|
# ==========================
|
|
204
536
|
|
|
537
|
+
|
|
205
538
|
@dataclass
|
|
206
539
|
class S3QueueEngineConfig:
|
|
207
540
|
"""Legacy S3Queue configuration (deprecated - use S3QueueEngine instead)"""
|
|
541
|
+
|
|
208
542
|
path: str # S3 path pattern (e.g., 's3://bucket/data/*.json')
|
|
209
543
|
format: str # Data format (e.g., 'JSONEachRow', 'CSV', etc.)
|
|
210
544
|
# Optional S3 access credentials - can be NOSIGN for public buckets
|
|
@@ -215,37 +549,48 @@ class S3QueueEngineConfig:
|
|
|
215
549
|
# Optional headers
|
|
216
550
|
headers: Optional[Dict[str, str]] = None
|
|
217
551
|
|
|
552
|
+
|
|
218
553
|
@dataclass
|
|
219
554
|
class TableCreateOptions:
|
|
220
555
|
name: str
|
|
221
556
|
columns: Dict[str, str]
|
|
222
557
|
engine: Optional[ClickHouseEngines] = ClickHouseEngines.MergeTree
|
|
223
558
|
order_by: Optional[str] = None
|
|
224
|
-
s3_queue_engine_config: Optional[S3QueueEngineConfig] =
|
|
559
|
+
s3_queue_engine_config: Optional[S3QueueEngineConfig] = (
|
|
560
|
+
None # Required when engine is S3Queue
|
|
561
|
+
)
|
|
225
562
|
|
|
226
563
|
def __post_init__(self):
|
|
227
564
|
"""Validate S3Queue configuration"""
|
|
228
|
-
if
|
|
565
|
+
if (
|
|
566
|
+
self.engine == ClickHouseEngines.S3Queue
|
|
567
|
+
and self.s3_queue_engine_config is None
|
|
568
|
+
):
|
|
229
569
|
raise ValueError(
|
|
230
570
|
"s3_queue_engine_config is required when using ClickHouseEngines.S3Queue engine. "
|
|
231
571
|
"Please provide s3_queue_engine_config with path, format, and optional settings."
|
|
232
572
|
)
|
|
233
573
|
|
|
574
|
+
|
|
234
575
|
# ==========================
|
|
235
576
|
# Backward Compatibility Layer
|
|
236
577
|
# ==========================
|
|
237
578
|
|
|
579
|
+
|
|
238
580
|
def is_new_config(config: Any) -> bool:
|
|
239
581
|
"""Check if configuration uses new API"""
|
|
240
582
|
if isinstance(config, TableConfig):
|
|
241
583
|
return True
|
|
242
|
-
if hasattr(config,
|
|
584
|
+
if hasattr(config, "engine") and isinstance(
|
|
585
|
+
getattr(config, "engine"), EngineConfig
|
|
586
|
+
):
|
|
243
587
|
return True
|
|
244
588
|
return False
|
|
245
589
|
|
|
590
|
+
|
|
246
591
|
def migrate_legacy_config(legacy: TableCreateOptions) -> TableConfig:
|
|
247
592
|
"""Convert legacy configuration to new format"""
|
|
248
|
-
|
|
593
|
+
|
|
249
594
|
# Show deprecation warning
|
|
250
595
|
warnings.warn(
|
|
251
596
|
"Using deprecated TableCreateOptions. Please migrate to TableConfig:\n"
|
|
@@ -253,9 +598,9 @@ def migrate_legacy_config(legacy: TableCreateOptions) -> TableConfig:
|
|
|
253
598
|
"- For deduplication: Use TableConfig(name='table', columns={...}, engine=ReplacingMergeTreeEngine())\n"
|
|
254
599
|
"See documentation for examples.",
|
|
255
600
|
DeprecationWarning,
|
|
256
|
-
stacklevel=2
|
|
601
|
+
stacklevel=2,
|
|
257
602
|
)
|
|
258
|
-
|
|
603
|
+
|
|
259
604
|
# Handle S3Queue with separate config
|
|
260
605
|
if legacy.engine == ClickHouseEngines.S3Queue and legacy.s3_queue_engine_config:
|
|
261
606
|
s3_config = legacy.s3_queue_engine_config
|
|
@@ -268,11 +613,11 @@ def migrate_legacy_config(legacy: TableCreateOptions) -> TableConfig:
|
|
|
268
613
|
aws_access_key_id=s3_config.aws_access_key_id,
|
|
269
614
|
aws_secret_access_key=s3_config.aws_secret_access_key,
|
|
270
615
|
compression=s3_config.compression,
|
|
271
|
-
headers=s3_config.headers
|
|
616
|
+
headers=s3_config.headers,
|
|
272
617
|
),
|
|
273
|
-
order_by=legacy.order_by
|
|
618
|
+
order_by=legacy.order_by,
|
|
274
619
|
)
|
|
275
|
-
|
|
620
|
+
|
|
276
621
|
# Map legacy engine enum to new engine classes
|
|
277
622
|
engine_map = {
|
|
278
623
|
ClickHouseEngines.MergeTree: MergeTreeEngine(),
|
|
@@ -280,97 +625,115 @@ def migrate_legacy_config(legacy: TableCreateOptions) -> TableConfig:
|
|
|
280
625
|
ClickHouseEngines.AggregatingMergeTree: AggregatingMergeTreeEngine(),
|
|
281
626
|
ClickHouseEngines.SummingMergeTree: SummingMergeTreeEngine(),
|
|
282
627
|
}
|
|
283
|
-
|
|
628
|
+
|
|
284
629
|
engine = engine_map.get(legacy.engine) if legacy.engine else MergeTreeEngine()
|
|
285
630
|
if engine is None:
|
|
286
631
|
engine = MergeTreeEngine()
|
|
287
|
-
|
|
632
|
+
|
|
288
633
|
return TableConfig(
|
|
289
634
|
name=legacy.name,
|
|
290
635
|
columns=legacy.columns,
|
|
291
636
|
engine=engine,
|
|
292
|
-
order_by=legacy.order_by
|
|
637
|
+
order_by=legacy.order_by,
|
|
293
638
|
)
|
|
294
639
|
|
|
640
|
+
|
|
295
641
|
def normalize_config(config: Union[TableConfig, TableCreateOptions]) -> TableConfig:
|
|
296
642
|
"""Normalize any configuration format to new API"""
|
|
297
643
|
if is_new_config(config):
|
|
298
644
|
return config # type: ignore
|
|
299
645
|
return migrate_legacy_config(config) # type: ignore
|
|
300
646
|
|
|
647
|
+
|
|
301
648
|
@dataclass
|
|
302
649
|
class AggregationCreateOptions:
|
|
303
650
|
table_create_options: TableCreateOptions
|
|
304
651
|
materialized_view_name: str
|
|
305
652
|
select: str
|
|
306
653
|
|
|
654
|
+
|
|
307
655
|
@dataclass
|
|
308
656
|
class AggregationDropOptions:
|
|
309
657
|
view_name: str
|
|
310
658
|
table_name: str
|
|
311
659
|
|
|
660
|
+
|
|
312
661
|
@dataclass
|
|
313
662
|
class MaterializedViewCreateOptions:
|
|
314
663
|
name: str
|
|
315
664
|
destination_table: str
|
|
316
665
|
select: str
|
|
317
666
|
|
|
667
|
+
|
|
318
668
|
@dataclass
|
|
319
669
|
class PopulateTableOptions:
|
|
320
670
|
destination_table: str
|
|
321
671
|
select: str
|
|
322
672
|
|
|
673
|
+
|
|
323
674
|
@dataclass
|
|
324
675
|
class Blocks:
|
|
325
676
|
teardown: list[str]
|
|
326
677
|
setup: list[str]
|
|
327
678
|
|
|
679
|
+
|
|
328
680
|
def drop_aggregation(options: AggregationDropOptions) -> list[str]:
|
|
329
681
|
"""
|
|
330
682
|
Drops an aggregation's view & underlying table.
|
|
331
683
|
"""
|
|
332
684
|
return [drop_view(options.view_name), drop_table(options.table_name)]
|
|
333
685
|
|
|
686
|
+
|
|
334
687
|
def drop_table(name: str) -> str:
|
|
335
688
|
"""
|
|
336
689
|
Drops an existing table if it exists.
|
|
337
690
|
"""
|
|
338
691
|
return f"DROP TABLE IF EXISTS {name}".strip()
|
|
339
692
|
|
|
693
|
+
|
|
340
694
|
def drop_view(name: str) -> str:
|
|
341
695
|
"""
|
|
342
696
|
Drops an existing view if it exists.
|
|
343
697
|
"""
|
|
344
698
|
return f"DROP VIEW IF EXISTS {name}".strip()
|
|
345
699
|
|
|
700
|
+
|
|
346
701
|
def create_aggregation(options: AggregationCreateOptions) -> list[str]:
|
|
347
702
|
"""
|
|
348
703
|
Creates an aggregation which includes a table, materialized view, and initial data load.
|
|
349
704
|
"""
|
|
350
705
|
return [
|
|
351
706
|
create_table(options.table_create_options),
|
|
352
|
-
create_materialized_view(
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
707
|
+
create_materialized_view(
|
|
708
|
+
MaterializedViewCreateOptions(
|
|
709
|
+
name=options.materialized_view_name,
|
|
710
|
+
destination_table=options.table_create_options.name,
|
|
711
|
+
select=options.select,
|
|
712
|
+
)
|
|
713
|
+
),
|
|
714
|
+
populate_table(
|
|
715
|
+
PopulateTableOptions(
|
|
716
|
+
destination_table=options.table_create_options.name,
|
|
717
|
+
select=options.select,
|
|
718
|
+
)
|
|
719
|
+
),
|
|
361
720
|
]
|
|
362
721
|
|
|
722
|
+
|
|
363
723
|
def create_materialized_view(options: MaterializedViewCreateOptions) -> str:
|
|
364
724
|
"""
|
|
365
725
|
Creates a materialized view.
|
|
366
726
|
"""
|
|
367
727
|
return f"CREATE MATERIALIZED VIEW IF NOT EXISTS {options.name} \nTO {options.destination_table}\nAS {options.select}".strip()
|
|
368
728
|
|
|
729
|
+
|
|
369
730
|
def create_table(options: TableCreateOptions) -> str:
|
|
370
731
|
"""
|
|
371
732
|
Creates a new table with default MergeTree engine.
|
|
372
733
|
"""
|
|
373
|
-
column_definitions = ",\n".join(
|
|
734
|
+
column_definitions = ",\n".join(
|
|
735
|
+
[f"{name} {type}" for name, type in options.columns.items()]
|
|
736
|
+
)
|
|
374
737
|
order_by_clause = f"ORDER BY {options.order_by}" if options.order_by else ""
|
|
375
738
|
engine = options.engine.value if options.engine else "MergeTree"
|
|
376
739
|
|
|
@@ -383,8 +746,9 @@ def create_table(options: TableCreateOptions) -> str:
|
|
|
383
746
|
{order_by_clause}
|
|
384
747
|
""".strip()
|
|
385
748
|
|
|
749
|
+
|
|
386
750
|
def populate_table(options: PopulateTableOptions) -> str:
|
|
387
751
|
"""
|
|
388
752
|
Populates a table with data.
|
|
389
753
|
"""
|
|
390
|
-
return f"INSERT INTO {options.destination_table}\n{options.select}".strip()
|
|
754
|
+
return f"INSERT INTO {options.destination_table}\n{options.select}".strip()
|