moose-lib 0.6.148.dev3442438466__py3-none-any.whl → 0.6.283__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- moose_lib/__init__.py +34 -3
- moose_lib/blocks.py +416 -52
- moose_lib/clients/redis_client.py +26 -14
- moose_lib/commons.py +37 -30
- moose_lib/config/config_file.py +5 -1
- moose_lib/config/runtime.py +73 -34
- moose_lib/data_models.py +331 -61
- moose_lib/dmv2/__init__.py +69 -73
- moose_lib/dmv2/_registry.py +2 -1
- moose_lib/dmv2/_source_capture.py +37 -0
- moose_lib/dmv2/consumption.py +55 -32
- moose_lib/dmv2/ingest_api.py +9 -2
- moose_lib/dmv2/ingest_pipeline.py +35 -16
- moose_lib/dmv2/life_cycle.py +3 -1
- moose_lib/dmv2/materialized_view.py +24 -14
- moose_lib/dmv2/moose_model.py +165 -0
- moose_lib/dmv2/olap_table.py +299 -151
- moose_lib/dmv2/registry.py +18 -3
- moose_lib/dmv2/sql_resource.py +16 -8
- moose_lib/dmv2/stream.py +75 -23
- moose_lib/dmv2/types.py +14 -8
- moose_lib/dmv2/view.py +13 -6
- moose_lib/dmv2/web_app.py +11 -6
- moose_lib/dmv2/web_app_helpers.py +5 -1
- moose_lib/dmv2/workflow.py +37 -9
- moose_lib/internal.py +340 -56
- moose_lib/main.py +87 -56
- moose_lib/query_builder.py +18 -5
- moose_lib/query_param.py +54 -20
- moose_lib/secrets.py +122 -0
- moose_lib/streaming/streaming_function_runner.py +233 -117
- moose_lib/utilities/sql.py +0 -1
- {moose_lib-0.6.148.dev3442438466.dist-info → moose_lib-0.6.283.dist-info}/METADATA +18 -1
- moose_lib-0.6.283.dist-info/RECORD +63 -0
- tests/__init__.py +1 -1
- tests/conftest.py +6 -5
- tests/test_backward_compatibility.py +85 -0
- tests/test_cluster_validation.py +85 -0
- tests/test_codec.py +75 -0
- tests/test_column_formatting.py +80 -0
- tests/test_fixedstring.py +43 -0
- tests/test_iceberg_config.py +105 -0
- tests/test_int_types.py +211 -0
- tests/test_kafka_config.py +141 -0
- tests/test_materialized.py +74 -0
- tests/test_metadata.py +37 -0
- tests/test_moose.py +21 -30
- tests/test_moose_model.py +153 -0
- tests/test_olap_table_moosemodel.py +89 -0
- tests/test_olap_table_versioning.py +52 -58
- tests/test_query_builder.py +97 -9
- tests/test_redis_client.py +10 -3
- tests/test_s3queue_config.py +211 -110
- tests/test_secrets.py +239 -0
- tests/test_simple_aggregate.py +42 -40
- tests/test_web_app.py +11 -5
- moose_lib-0.6.148.dev3442438466.dist-info/RECORD +0 -47
- {moose_lib-0.6.148.dev3442438466.dist-info → moose_lib-0.6.283.dist-info}/WHEEL +0 -0
- {moose_lib-0.6.148.dev3442438466.dist-info → moose_lib-0.6.283.dist-info}/top_level.txt +0 -0
moose_lib/dmv2/olap_table.py
CHANGED
@@ -4,6 +4,7 @@ OLAP table definitions for Moose Data Model v2 (dmv2).
 This module provides classes for defining and configuring OLAP tables,
 particularly for ClickHouse.
 """
+
 import json
 import warnings
 from clickhouse_connect import get_client
@@ -11,7 +12,17 @@ from clickhouse_connect.driver.client import Client
 from clickhouse_connect.driver.exceptions import ClickHouseError
 from dataclasses import dataclass
 from pydantic import BaseModel
-from typing import
+from typing import (
+    List,
+    Optional,
+    Any,
+    Literal,
+    Union,
+    Tuple,
+    TypeVar,
+    Generic,
+    Iterator,
+)
 from ..blocks import ClickHouseEngines, EngineConfig
 from ..commons import Logger
 from ..config.runtime import RuntimeClickHouseConfig
@@ -20,6 +31,7 @@ from .types import TypedMooseResource, T, Cols
 from ._registry import _tables
 from ..data_models import Column, is_array_nested_type, is_nested_type, _to_columns
 from .life_cycle import LifeCycle
+from ._source_capture import get_source_file_from_stack
 
 
 @dataclass
@@ -33,6 +45,7 @@ class InsertOptions:
         validate: Whether to validate data against schema before insertion.
         skip_validation_on_retry: Whether to skip validation for individual records during retries.
     """
+
     allow_errors: Optional[int] = None
     allow_errors_ratio: Optional[float] = None
     strategy: Literal["fail-fast", "discard", "isolate"] = "fail-fast"
@@ -49,6 +62,7 @@ class FailedRecord(Generic[T]):
        error: The error message describing why the insertion failed.
        index: Optional index of this record in the original batch.
    """
+
    record: T
    error: str
    index: Optional[int] = None
@@ -64,6 +78,7 @@ class ValidationError:
        index: Optional index of this record in the original batch.
        path: Optional path to the field that failed validation.
    """
+
    record: Any
    error: str
    index: Optional[int] = None
@@ -79,6 +94,7 @@ class ValidationResult(Generic[T]):
        invalid: Records that failed validation with detailed error information.
        total: Total number of records processed.
    """
+
    valid: List[T]
    invalid: List[ValidationError]
    total: int
@@ -94,6 +110,7 @@ class InsertResult(Generic[T]):
        total: Total number of records processed.
        failed_records: Detailed information about failed records (if record isolation was used).
    """
+
    successful: int
    failed: int
    total: int
@@ -114,6 +131,11 @@ class OlapConfig(BaseModel):
        partition_by: Optional PARTITION BY expression (single ClickHouse SQL expression).
        sample_by_expression: Optional SAMPLE BY expression for data sampling (single ClickHouse SQL expression).
            Used to enable efficient approximate query processing with SAMPLE clause.
+        primary_key_expression: Optional PRIMARY KEY expression. When specified, this overrides the primary key
+            inferred from Key[T] column annotations. This allows for complex primary keys using
+            functions (e.g., "cityHash64(id)") or different column ordering in primary key vs
+            schema definition. Note: When this is set, any Key[T] annotations on columns are
+            ignored for PRIMARY KEY generation.
        engine: The ClickHouse table engine to use. Can be either a ClickHouseEngines enum value
            (for backward compatibility) or an EngineConfig instance (recommended).
        version: Optional version string for tracking configuration changes.
@@ -121,11 +143,16 @@ class OlapConfig(BaseModel):
        life_cycle: Determines how changes in code will propagate to the resources.
        settings: Optional table-level settings that can be modified with ALTER TABLE MODIFY SETTING.
            These are alterable settings that can be changed without recreating the table.
+        cluster: Optional cluster name for ON CLUSTER support in ClickHouse.
+            Use this to enable replicated tables across ClickHouse clusters.
+            The cluster must be defined in moose.config.toml (dev environment only).
+            Example: cluster="prod_cluster"
    """
    order_by_fields: list[str] = []
    order_by_expression: Optional[str] = None
    partition_by: Optional[str] = None
    sample_by_expression: Optional[str] = None
+    primary_key_expression: Optional[str] = None
    engine: Optional[Union[ClickHouseEngines, EngineConfig]] = None
    version: Optional[str] = None
    metadata: Optional[dict] = None
@@ -133,22 +160,95 @@ class OlapConfig(BaseModel):
    settings: Optional[dict[str, str]] = None
    # Optional table-level TTL expression (without leading 'TTL')
    ttl: Optional[str] = None
+    # Optional cluster name for ON CLUSTER support in ClickHouse
+    cluster: Optional[str] = None
 
    # Optional secondary/data-skipping indexes
    class TableIndex(BaseModel):
        name: str
        expression: str
        type: str
-        arguments:
-        granularity: int
+        arguments: list[str] = []
+        granularity: int = 1
 
    indexes: list[TableIndex] = []
+    database: Optional[str] = None  # Optional database name for multi-database support
 
    def model_post_init(self, __context):
        has_fields = bool(self.order_by_fields)
-        has_expr =
+        has_expr = (
+            isinstance(self.order_by_expression, str)
+            and len(self.order_by_expression) > 0
+        )
        if has_fields and has_expr:
-            raise ValueError(
+            raise ValueError(
+                "Provide either order_by_fields or order_by_expression, not both."
+            )
+
+        # Validate that non-MergeTree engines don't have unsupported clauses
+        if self.engine:
+            from ..blocks import (
+                S3Engine,
+                S3QueueEngine,
+                BufferEngine,
+                DistributedEngine,
+                IcebergS3Engine,
+                KafkaEngine,
+            )
+
+            # S3QueueEngine, BufferEngine, DistributedEngine, KafkaEngine, and IcebergS3Engine don't support ORDER BY
+            # Note: S3Engine DOES support ORDER BY (unlike S3Queue)
+            engines_without_order_by = (
+                S3QueueEngine,
+                BufferEngine,
+                DistributedEngine,
+                KafkaEngine,
+                IcebergS3Engine,
+            )
+            if isinstance(self.engine, engines_without_order_by):
+                engine_name = type(self.engine).__name__
+
+                if has_fields or has_expr:
+                    raise ValueError(
+                        f"{engine_name} does not support ORDER BY clauses. "
+                        f"Remove order_by_fields or order_by_expression from your configuration."
+                    )
+
+            # All non-MergeTree engines don't support SAMPLE BY
+            engines_without_sample_by = (
+                S3Engine,
+                S3QueueEngine,
+                BufferEngine,
+                DistributedEngine,
+                KafkaEngine,
+                IcebergS3Engine,
+            )
+            if isinstance(self.engine, engines_without_sample_by):
+                engine_name = type(self.engine).__name__
+
+                if self.sample_by_expression:
+                    raise ValueError(
+                        f"{engine_name} does not support SAMPLE BY clause. "
+                        f"Remove sample_by_expression from your configuration."
+                    )
+
+            # Only S3QueueEngine, BufferEngine, DistributedEngine, KafkaEngine, and IcebergS3Engine don't support PARTITION BY
+            # S3Engine DOES support PARTITION BY
+            engines_without_partition_by = (
+                S3QueueEngine,
+                BufferEngine,
+                DistributedEngine,
+                KafkaEngine,
+                IcebergS3Engine,
+            )
+            if isinstance(self.engine, engines_without_partition_by):
+                engine_name = type(self.engine).__name__
+
+                if self.partition_by:
+                    raise ValueError(
+                        f"{engine_name} does not support PARTITION BY clause. "
+                        f"Remove partition_by from your configuration."
+                    )
 
 
 class OlapTable(TypedMooseResource, Generic[T]):
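Taken together, these OlapConfig hunks add primary_key_expression, cluster, a per-table database, and defaults for TableIndex. A minimal usage sketch, not part of the diff: the top-level import path, the example model, and the constructor call shape are assumptions, while the field names come from the hunks above.

```python
from pydantic import BaseModel
from moose_lib import OlapConfig, OlapTable  # assumed public re-exports


class PageView(BaseModel):
    user_id: str
    path: str
    viewed_at: str


# ORDER BY comes from order_by_fields; the PRIMARY KEY is overridden with an
# expression, and the table lives in a per-table database.
page_views = OlapTable[PageView](
    "PageView",
    OlapConfig(
        order_by_fields=["user_id", "viewed_at"],
        primary_key_expression="cityHash64(user_id)",
        database="analytics",
    ),
)
```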
@@ -166,6 +266,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
        model_type (type[T]): The Pydantic model associated with this table.
        kind: The kind of the table (e.g., "OlapTable").
    """
+
    config: OlapConfig
    kind: str = "OlapTable"
    _memoized_client: Optional[Client] = None
@@ -178,9 +279,33 @@ class OlapTable(TypedMooseResource, Generic[T]):
        super().__init__()
        self._set_type(name, self._get_type(kwargs))
        self.config = config
-
+
+        if config.metadata:
+            self.metadata = (
+                config.metadata.copy()
+                if isinstance(config.metadata, dict)
+                else config.metadata
+            )
+        else:
+            self.metadata = {}
+
+        if not isinstance(self.metadata, dict):
+            self.metadata = {}
+        if "source" not in self.metadata:
+            source_file = get_source_file_from_stack()
+            if source_file:
+                self.metadata["source"] = {"file": source_file}
+
        self._column_list = _to_columns(self._t)
+
+        # Create Cols instance for backward compatibility
+        # This works for both BaseModel and MooseModel
        self._cols = Cols(self._column_list)
+
+        # NOTE: For MooseModel types, columns are also accessible directly
+        # on the model class (e.g., MyModel.field_name) thanks to the metaclass.
+        # This provides LSP autocomplete without requiring .cols access.
+
        registry_key = f"{name}_{config.version}" if config.version else name
        if registry_key in _tables:
            raise ValueError(
@@ -188,6 +313,39 @@ class OlapTable(TypedMooseResource, Generic[T]):
            )
        _tables[registry_key] = self
 
+        # Validate cluster and explicit replication params are not both specified
+        if config.cluster:
+            from moose_lib.blocks import (
+                ReplicatedMergeTreeEngine,
+                ReplicatedReplacingMergeTreeEngine,
+                ReplicatedAggregatingMergeTreeEngine,
+                ReplicatedSummingMergeTreeEngine,
+                ReplicatedCollapsingMergeTreeEngine,
+                ReplicatedVersionedCollapsingMergeTreeEngine,
+            )
+
+            if isinstance(
+                config.engine,
+                (
+                    ReplicatedMergeTreeEngine,
+                    ReplicatedReplacingMergeTreeEngine,
+                    ReplicatedAggregatingMergeTreeEngine,
+                    ReplicatedSummingMergeTreeEngine,
+                    ReplicatedCollapsingMergeTreeEngine,
+                    ReplicatedVersionedCollapsingMergeTreeEngine,
+                ),
+            ):
+                if (
+                    config.engine.keeper_path is not None
+                    or config.engine.replica_name is not None
+                ):
+                    raise ValueError(
+                        f"OlapTable {name}: Cannot specify both 'cluster' and explicit replication params "
+                        f"('keeper_path' or 'replica_name'). "
+                        f"Use 'cluster' for auto-injected params, or use explicit 'keeper_path' and "
+                        f"'replica_name' without 'cluster'."
+                    )
+
        # Check if using legacy enum-based engine configuration
        if config.engine and isinstance(config.engine, ClickHouseEngines):
            logger = Logger(action="OlapTable")
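The constructor check above makes cluster and explicit replication parameters mutually exclusive. A sketch of the accepted versus rejected shapes; the Replicated* engine classes are the ones imported in the hunk, while their constructor keyword arguments are assumed to mirror the keeper_path/replica_name attributes the check reads.

```python
from moose_lib.blocks import ReplicatedMergeTreeEngine

# Accepted: 'cluster' alone; replication params are injected for you.
replicated_ok = OlapConfig(
    order_by_fields=["user_id"],
    engine=ReplicatedMergeTreeEngine(),
    cluster="prod_cluster",  # must be defined in moose.config.toml
)

# Rejected when the OlapTable is constructed: 'cluster' plus explicit params.
replicated_conflict = OlapConfig(
    order_by_fields=["user_id"],
    engine=ReplicatedMergeTreeEngine(
        keeper_path="/clickhouse/tables/{uuid}/{shard}",
        replica_name="{replica}",
    ),
    cluster="prod_cluster",
)
# OlapTable[PageView]("PageViewReplicated", replicated_conflict) -> ValueError
```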
@@ -217,7 +375,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
                f"Table '{name}' uses deprecated ClickHouseEngines enum. "
                f"Please migrate to engine configuration classes (e.g., {config.engine.value}Engine).",
                DeprecationWarning,
-                stacklevel=2
+                stacklevel=2,
            )
 
    @property
@@ -254,10 +412,13 @@ class OlapTable(TypedMooseResource, Generic[T]):
            A 16-character hex hash of the configuration.
        """
        import hashlib
+
+        # Use per-table database if specified, otherwise fall back to global config
+        effective_database = self.config.database or clickhouse_config.database
        config_string = (
            f"{clickhouse_config.host}:{clickhouse_config.port}:"
            f"{clickhouse_config.username}:{clickhouse_config.password}:"
-            f"{
+            f"{effective_database}:{clickhouse_config.use_ssl}"
        )
        return hashlib.sha256(config_string.encode()).hexdigest()[:16]
 
@@ -292,14 +453,16 @@ class OlapTable(TypedMooseResource, Generic[T]):
 
        try:
            # Create new client with standard configuration
-
+            # Use per-table database if specified, otherwise fall back to global config
+            effective_database = self.config.database or clickhouse_config.database
+            interface = "https" if clickhouse_config.use_ssl else "http"
            client = get_client(
                interface=interface,
                host=clickhouse_config.host,
                port=int(clickhouse_config.port),
                username=clickhouse_config.username,
                password=clickhouse_config.password,
-                database=
+                database=effective_database,
            )
 
            # Cache the new client and config hash
@@ -335,7 +498,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
        Returns:
            Tuple of (validated_data, error_message). If validation succeeds,
            validated_data will be the validated record and error_message will be None.
-            If validation fails for any reason, validated_data will be None and error_message
+            If validation fails for any reason, validated_data will be None and error_message
            will contain the error details.
        """
        try:
@@ -361,23 +524,19 @@ class OlapTable(TypedMooseResource, Generic[T]):
            if validated is not None:
                valid.append(validated)
            else:
-                invalid.append(
-
-
-
-
-
-
-
-
-                    total=len(data)
-                )
+                invalid.append(
+                    ValidationError(
+                        record=record,
+                        error=error or "Validation failed",
+                        index=i,
+                        path="root",
+                    )
+                )
+
+        return ValidationResult(valid=valid, invalid=invalid, total=len(data))
 
    def _validate_insert_parameters(
-        self,
-        data: Union[List[T], Iterator[T]],
-        options: Optional[InsertOptions]
+        self, data: Union[List[T], Iterator[T]], options: Optional[InsertOptions]
    ) -> Tuple[bool, str, bool]:
        """Validate input parameters and strategy compatibility.
 
@@ -399,16 +558,18 @@ class OlapTable(TypedMooseResource, Generic[T]):
        )
 
        if is_stream and should_validate:
-            print(
+            print(
+                "Warning: Validation is not supported with stream input. Validation will be skipped."
+            )
 
        return is_stream, strategy, should_validate
 
    def _perform_pre_insertion_validation(
-
-
-
-
-
+        self,
+        data: List[T],
+        should_validate: bool,
+        strategy: str,
+        options: Optional[InsertOptions] = None,
    ) -> Tuple[List[T], List[ValidationError]]:
        """Perform pre-insertion validation for array data.
 
@@ -431,10 +592,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
 
        if validation_errors:
            self._handle_validation_errors(
-                validation_errors,
-                strategy,
-                data,
-                options
+                validation_errors, strategy, data, options
            )
 
        if strategy == "discard":
@@ -453,11 +611,11 @@ class OlapTable(TypedMooseResource, Generic[T]):
        return data, []
 
    def _handle_validation_errors(
-
-
-
-
-
+        self,
+        validation_errors: List[ValidationError],
+        strategy: str,
+        data: List[T],
+        options: Optional[InsertOptions],
    ) -> None:
        """Handle validation errors based on the specified strategy.
 
@@ -473,17 +631,13 @@ class OlapTable(TypedMooseResource, Generic[T]):
                f"Validation failed for record at index {first_error.index}: {first_error.error}"
            )
        elif strategy == "discard":
-            self._check_validation_thresholds(
-                validation_errors,
-                len(data),
-                options
-            )
+            self._check_validation_thresholds(validation_errors, len(data), options)
 
    def _check_validation_thresholds(
-
-
-
-
+        self,
+        validation_errors: List[ValidationError],
+        total_records: int,
+        options: Optional[InsertOptions],
    ) -> None:
        """Check if validation errors exceed configured thresholds.
 
@@ -495,15 +649,21 @@ class OlapTable(TypedMooseResource, Generic[T]):
        validation_failed_count = len(validation_errors)
        validation_failed_ratio = validation_failed_count / total_records
 
-        if (
-
+        if (
+            options
+            and options.allow_errors is not None
+            and validation_failed_count > options.allow_errors
+        ):
            raise ValueError(
                f"Too many validation failures: {validation_failed_count} > {options.allow_errors}. "
                f"Errors: {', '.join(e.error for e in validation_errors)}"
            )
 
-        if (
-
+        if (
+            options
+            and options.allow_errors_ratio is not None
+            and validation_failed_ratio > options.allow_errors_ratio
+        ):
            raise ValueError(
                f"Validation failure ratio too high: {validation_failed_ratio:.3f} > "
                f"{options.allow_errors_ratio}. Errors: {', '.join(e.error for e in validation_errors)}"
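These thresholds apply to the "discard" strategy: invalid records are dropped until allow_errors or allow_errors_ratio is exceeded, at which point the whole insert raises. An illustrative call, reusing the hypothetical page_views table from the earlier sketch (the row payloads are made up):

```python
from moose_lib import InsertOptions  # assumed public re-export

rows = [
    {"user_id": "u1", "path": "/", "viewed_at": "2024-01-01T00:00:00Z"},
    {"user_id": None, "path": "/broken", "viewed_at": "not-a-date"},  # fails validation
]

# Discard the invalid row, but abort if more than 10 records
# (or more than half of the batch) fail validation.
result = page_views.insert(
    rows,
    InsertOptions(strategy="discard", allow_errors=10, allow_errors_ratio=0.5),
)
# result.successful / result.failed / result.total summarize the batch;
# result.failed_records carries the discarded rows and their errors.
```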
@@ -514,36 +674,44 @@ class OlapTable(TypedMooseResource, Generic[T]):
 
    def _with_wait_end_settings(self, settings: dict) -> dict:
        """Add wait_end_of_query setting to ensure at least once delivery for INSERT operations.
-
+
        Args:
            settings: Base settings dictionary
-
+
        Returns:
            Settings dictionary with wait_end_of_query added
        """
        return {**settings, "wait_end_of_query": 1}
 
    def _prepare_insert_options(
-
-
-
-
-
-
-
+        self,
+        table_name: str,
+        data: Union[List[T], Iterator[T]],
+        validated_data: List[T],
+        is_stream: bool,
+        strategy: str,
+        options: Optional[InsertOptions],
    ) -> tuple[str, bytes, dict]:
        """Prepare insert options for JSONEachRow raw SQL insert, returning settings dict."""
        # Base settings for all inserts
        base_settings = {
            "date_time_input_format": "best_effort",
-            "max_insert_block_size":
+            "max_insert_block_size": (
+                100000 if is_stream else min(len(validated_data), 100000)
+            ),
            "max_block_size": 65536,
            "async_insert": 1 if len(validated_data) > 1000 else 0,
            "wait_for_async_insert": 1,
        }
        settings = self._with_wait_end_settings(base_settings)
-        if (
-
+        if (
+            strategy == "discard"
+            and options
+            and (
+                options.allow_errors is not None
+                or options.allow_errors_ratio is not None
+            )
+        ):
            if options.allow_errors is not None:
                settings["input_format_allow_errors_num"] = options.allow_errors
            if options.allow_errors_ratio is not None:
@@ -556,7 +724,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
            validated_data = [validated_data]
        dict_data = []
        for record in validated_data:
-            if hasattr(record,
+            if hasattr(record, "model_dump"):
                record_dict = record.model_dump()
            else:
                record_dict = record
@@ -568,13 +736,13 @@ class OlapTable(TypedMooseResource, Generic[T]):
        return quote_identifier(table_name), json_lines, settings
 
    def _create_success_result(
-
-
-
-
-
-
-
+        self,
+        data: Union[List[T], Iterator[T]],
+        validated_data: List[T],
+        validation_errors: List[ValidationError],
+        is_stream: bool,
+        should_validate: bool,
+        strategy: str,
    ) -> InsertResult[T]:
        """Create appropriate result based on input type.
 
@@ -590,11 +758,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
            InsertResult with appropriate counts and error information.
        """
        if is_stream:
-            return InsertResult(
-                successful=-1,
-                failed=0,
-                total=-1
-            )
+            return InsertResult(successful=-1, failed=0, total=-1)
 
        inserted_count = len(validated_data)
        total_processed = len(data) if not is_stream else inserted_count
@@ -602,32 +766,30 @@ class OlapTable(TypedMooseResource, Generic[T]):
        result = InsertResult(
            successful=inserted_count,
            failed=len(validation_errors) if should_validate else 0,
-            total=total_processed
+            total=total_processed,
        )
 
-        if
+        if should_validate and validation_errors and strategy == "discard":
            result.failed_records = [
                FailedRecord(
                    record=ve.record,
                    error=f"Validation error: {ve.error}",
-                    index=ve.index
-                )
+                    index=ve.index,
+                )
+                for ve in validation_errors
            ]
 
        return result
 
    def _retry_individual_records(
-
-        client: Client,
-        records: List[T],
-        options: InsertOptions
+        self, client: Client, records: List[T], options: InsertOptions
    ) -> InsertResult[T]:
        successful: List[T] = []
        failed: List[FailedRecord[T]] = []
        table_name = quote_identifier(self._generate_table_name())
        records_dict = []
        for record in records:
-            if hasattr(record,
+            if hasattr(record, "model_dump"):
                record_dict = record.model_dump()
            else:
                record_dict = record
@@ -636,51 +798,52 @@ class OlapTable(TypedMooseResource, Generic[T]):
 
        RETRY_BATCH_SIZE = 10
        for i in range(0, len(records_dict), RETRY_BATCH_SIZE):
-            batch = records_dict[i:i + RETRY_BATCH_SIZE]
+            batch = records_dict[i : i + RETRY_BATCH_SIZE]
            try:
                sql = f"INSERT INTO {table_name} FORMAT JSONEachRow"
                base_settings = {
                    "date_time_input_format": "best_effort",
                    "max_insert_block_size": RETRY_BATCH_SIZE,
                    "max_block_size": RETRY_BATCH_SIZE,
-                    "async_insert": 0
+                    "async_insert": 0,
                }
                settings = self._with_wait_end_settings(base_settings)
                json_lines = self._to_json_each_row(batch)
                client.command(sql, data=json_lines, settings=settings)
-                successful.extend(records[i:i + RETRY_BATCH_SIZE])
+                successful.extend(records[i : i + RETRY_BATCH_SIZE])
            except ClickHouseError as batch_error:
                for j, record_dict in enumerate(batch):
                    try:
                        sql = f"INSERT INTO {table_name} FORMAT JSONEachRow"
-                        individual_settings = self._with_wait_end_settings(
-                            "date_time_input_format": "best_effort",
-
-                        })
+                        individual_settings = self._with_wait_end_settings(
+                            {"date_time_input_format": "best_effort", "async_insert": 0}
+                        )
                        json_line = self._to_json_each_row([record_dict])
-                        client.command(
+                        client.command(
+                            sql, data=json_line, settings=individual_settings
+                        )
                        successful.append(records[i + j])
                    except ClickHouseError as error:
-                        failed.append(
-
-
-
-                        )
+                        failed.append(
+                            FailedRecord(
+                                record=records[i + j], error=str(error), index=i + j
+                            )
+                        )
        return InsertResult(
            successful=len(successful),
            failed=len(failed),
            total=len(records),
-            failed_records=failed if failed else None
+            failed_records=failed if failed else None,
        )
 
    def _insert_array_data(
-
-
-
-
-
-
-
+        self,
+        client: Client,
+        table_name: str,
+        data: List[T],
+        should_validate: bool,
+        strategy: str,
+        options: Optional[InsertOptions],
    ) -> InsertResult[T]:
        """Insert array data into the table with validation and error handling.
 
@@ -696,19 +859,11 @@ class OlapTable(TypedMooseResource, Generic[T]):
            InsertResult with detailed success/failure information.
        """
        validated_data, validation_errors = self._perform_pre_insertion_validation(
-            data,
-            should_validate,
-            strategy,
-            options
+            data, should_validate, strategy, options
        )
        try:
            table_name, json_lines, settings = self._prepare_insert_options(
-                table_name,
-                data,
-                validated_data,
-                False,
-                strategy,
-                options
+                table_name, data, validated_data, False, strategy, options
            )
            sql = f"INSERT INTO {table_name} FORMAT JSONEachRow"
            client.command(sql, data=json_lines, settings=settings)
@@ -718,7 +873,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
                validation_errors,
                False,
                should_validate,
-                strategy
+                strategy,
            )
        except ClickHouseError as e:
            if strategy == "fail-fast":
@@ -729,16 +884,16 @@ class OlapTable(TypedMooseResource, Generic[T]):
                return self._retry_individual_records(
                    client,
                    validated_data if not options.skip_validation_on_retry else data,
-                    options
+                    options,
                )
 
    def _insert_stream(
-
-
-
-
-
-
+        self,
+        client: Client,
+        table_name: str,
+        data: Iterator[T],
+        strategy: str,
+        options: Optional[InsertOptions],
    ) -> InsertResult[T]:
        """Insert data from an iterator into the table.
 
@@ -756,17 +911,12 @@ class OlapTable(TypedMooseResource, Generic[T]):
        total_inserted = 0
 
        _, _, settings = self._prepare_insert_options(
-            table_name,
-            data,
-            [],
-            True,
-            strategy,
-            options
+            table_name, data, [], True, strategy, options
        )
 
        for record in data:
            # Convert record to dict using model_dump if available
-            if hasattr(record,
+            if hasattr(record, "model_dump"):
                batch.append(record.model_dump())
            else:
                batch.append(record)
@@ -791,9 +941,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
                    total_inserted += len(batch)
 
            return InsertResult(
-                successful=total_inserted,
-                failed=0,
-                total=total_inserted
+                successful=total_inserted, failed=0, total=total_inserted
            )
        except ClickHouseError as e:
            if strategy == "fail-fast":
@@ -801,9 +949,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
                raise ValueError(f"Too many errors during stream insert: {e}")
 
    def insert(
-        self,
-        data: Union[List[T], Iterator[T]],
-        options: Optional[InsertOptions] = None
+        self, data: Union[List[T], Iterator[T]], options: Optional[InsertOptions] = None
    ) -> InsertResult[T]:
        """Insert data into the table with validation and error handling.
 
@@ -855,7 +1001,9 @@ class OlapTable(TypedMooseResource, Generic[T]):
        ```
        """
        options = options or InsertOptions()
-        is_stream, strategy, should_validate = self._validate_insert_parameters(
+        is_stream, strategy, should_validate = self._validate_insert_parameters(
+            data, options
+        )
        if (is_stream and not data) or (not is_stream and not data):
            return InsertResult(successful=0, failed=0, total=0)
 
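insert() also accepts an iterator, which is routed to _insert_stream: a warning notes that validation is skipped for stream input, and records are converted with model_dump and flushed in batches. A sketch under the same assumptions as the earlier examples:

```python
from typing import Iterator


def generate_views(n: int) -> Iterator[PageView]:
    # Yield records lazily instead of materializing the whole batch in memory.
    for i in range(n):
        yield PageView(user_id=f"u{i}", path="/", viewed_at="2024-01-01T00:00:00Z")


stream_result = page_views.insert(generate_views(1_000_000))
# Stream input skips pre-insertion validation; the result reflects the rows flushed.
```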
@@ -866,15 +1014,12 @@ class OlapTable(TypedMooseResource, Generic[T]):
            return self._insert_stream(client, table_name, data, strategy, options)
        else:
            return self._insert_array_data(
-                client,
-                table_name,
-                data,
-                should_validate,
-                strategy,
-                options
+                client, table_name, data, should_validate, strategy, options
            )
 
-    def _map_to_clickhouse_record(
+    def _map_to_clickhouse_record(
+        self, record: dict, columns: Optional[List[Column]] = None
+    ) -> dict:
        """
        Recursively transforms a record to match ClickHouse's JSONEachRow requirements.
 
@@ -903,8 +1048,9 @@ class OlapTable(TypedMooseResource, Generic[T]):
 
        if is_array_nested_type(data_type):
            # For Array(Nested(...)), wrap each item in its own array and recurse
-            if
-
+            if isinstance(value, list) and (
+                len(value) == 0 or isinstance(value[0], dict)
+            ):
                nested_columns = data_type.element_type.columns
                result[col.name] = [
                    [self._map_to_clickhouse_record(item, nested_columns)]
@@ -913,7 +1059,9 @@ class OlapTable(TypedMooseResource, Generic[T]):
        elif is_nested_type(data_type):
            # For Nested struct (not array), recurse into it
            if value and isinstance(value, dict):
-                result[col.name] = self._map_to_clickhouse_record(
+                result[col.name] = self._map_to_clickhouse_record(
+                    value, data_type.columns
+                )
            # All other types: leave as is
 
        return result