moose-lib 0.6.90__py3-none-any.whl → 0.6.283__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- moose_lib/__init__.py +38 -3
- moose_lib/blocks.py +497 -37
- moose_lib/clients/redis_client.py +26 -14
- moose_lib/commons.py +94 -5
- moose_lib/config/config_file.py +44 -2
- moose_lib/config/runtime.py +137 -5
- moose_lib/data_models.py +451 -46
- moose_lib/dmv2/__init__.py +88 -60
- moose_lib/dmv2/_registry.py +3 -1
- moose_lib/dmv2/_source_capture.py +37 -0
- moose_lib/dmv2/consumption.py +55 -32
- moose_lib/dmv2/ingest_api.py +9 -2
- moose_lib/dmv2/ingest_pipeline.py +56 -13
- moose_lib/dmv2/life_cycle.py +3 -1
- moose_lib/dmv2/materialized_view.py +24 -14
- moose_lib/dmv2/moose_model.py +165 -0
- moose_lib/dmv2/olap_table.py +304 -119
- moose_lib/dmv2/registry.py +28 -3
- moose_lib/dmv2/sql_resource.py +16 -8
- moose_lib/dmv2/stream.py +241 -21
- moose_lib/dmv2/types.py +14 -8
- moose_lib/dmv2/view.py +13 -6
- moose_lib/dmv2/web_app.py +175 -0
- moose_lib/dmv2/web_app_helpers.py +96 -0
- moose_lib/dmv2/workflow.py +37 -9
- moose_lib/internal.py +537 -68
- moose_lib/main.py +87 -56
- moose_lib/query_builder.py +18 -5
- moose_lib/query_param.py +54 -20
- moose_lib/secrets.py +122 -0
- moose_lib/streaming/streaming_function_runner.py +266 -156
- moose_lib/utilities/sql.py +0 -1
- {moose_lib-0.6.90.dist-info → moose_lib-0.6.283.dist-info}/METADATA +19 -1
- moose_lib-0.6.283.dist-info/RECORD +63 -0
- tests/__init__.py +1 -1
- tests/conftest.py +38 -1
- tests/test_backward_compatibility.py +85 -0
- tests/test_cluster_validation.py +85 -0
- tests/test_codec.py +75 -0
- tests/test_column_formatting.py +80 -0
- tests/test_fixedstring.py +43 -0
- tests/test_iceberg_config.py +105 -0
- tests/test_int_types.py +211 -0
- tests/test_kafka_config.py +141 -0
- tests/test_materialized.py +74 -0
- tests/test_metadata.py +37 -0
- tests/test_moose.py +21 -30
- tests/test_moose_model.py +153 -0
- tests/test_olap_table_moosemodel.py +89 -0
- tests/test_olap_table_versioning.py +210 -0
- tests/test_query_builder.py +97 -9
- tests/test_redis_client.py +10 -3
- tests/test_s3queue_config.py +211 -110
- tests/test_secrets.py +239 -0
- tests/test_simple_aggregate.py +114 -0
- tests/test_web_app.py +227 -0
- moose_lib-0.6.90.dist-info/RECORD +0 -42
- {moose_lib-0.6.90.dist-info → moose_lib-0.6.283.dist-info}/WHEEL +0 -0
- {moose_lib-0.6.90.dist-info → moose_lib-0.6.283.dist-info}/top_level.txt +0 -0
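The largest single change is to moose_lib/dmv2/olap_table.py, shown in full below. As a quick orientation, here is a minimal, hypothetical usage sketch of the new OlapConfig surface visible in that diff (order_by_expression, ttl, indexes, database); the class and field names come from the diff itself, while the import path, constructor shape, and example values are assumptions rather than documented API.

# Hypothetical sketch only -- field names come from the OlapConfig additions in the
# olap_table.py diff below; import paths and values are assumed, not documented.
from datetime import datetime
from pydantic import BaseModel

from moose_lib.dmv2.olap_table import OlapConfig, OlapTable  # assumed import path


class Event(BaseModel):
    id: str
    ts: datetime
    value: float


events = OlapTable[Event](
    "events",
    OlapConfig(
        # order_by_expression and order_by_fields are mutually exclusive
        # (enforced by OlapConfig.model_post_init in the diff below).
        order_by_expression="(id, ts)",
        partition_by="toYYYYMM(ts)",
        ttl="ts + INTERVAL 90 DAY",  # stored without the leading 'TTL' keyword
        indexes=[
            OlapConfig.TableIndex(name="idx_value", expression="value", type="minmax")
        ],
        database="analytics",  # optional per-table database override
    ),
)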
moose_lib/dmv2/olap_table.py
CHANGED
@@ -4,6 +4,7 @@ OLAP table definitions for Moose Data Model v2 (dmv2).
This module provides classes for defining and configuring OLAP tables,
particularly for ClickHouse.
"""
+
import json
import warnings
from clickhouse_connect import get_client
@@ -11,7 +12,17 @@ from clickhouse_connect.driver.client import Client
from clickhouse_connect.driver.exceptions import ClickHouseError
from dataclasses import dataclass
from pydantic import BaseModel
-from typing import
+from typing import (
+    List,
+    Optional,
+    Any,
+    Literal,
+    Union,
+    Tuple,
+    TypeVar,
+    Generic,
+    Iterator,
+)
from ..blocks import ClickHouseEngines, EngineConfig
from ..commons import Logger
from ..config.runtime import RuntimeClickHouseConfig
@@ -20,6 +31,8 @@ from .types import TypedMooseResource, T, Cols
from ._registry import _tables
from ..data_models import Column, is_array_nested_type, is_nested_type, _to_columns
from .life_cycle import LifeCycle
+from ._source_capture import get_source_file_from_stack
+

@dataclass
class InsertOptions:
@@ -32,12 +45,14 @@ class InsertOptions:
        validate: Whether to validate data against schema before insertion.
        skip_validation_on_retry: Whether to skip validation for individual records during retries.
    """
+
    allow_errors: Optional[int] = None
    allow_errors_ratio: Optional[float] = None
    strategy: Literal["fail-fast", "discard", "isolate"] = "fail-fast"
    validate: bool = True
    skip_validation_on_retry: bool = False

+
@dataclass
class FailedRecord(Generic[T]):
    """Represents a failed record during insertion with error details.
@@ -47,10 +62,12 @@ class FailedRecord(Generic[T]):
        error: The error message describing why the insertion failed.
        index: Optional index of this record in the original batch.
    """
+
    record: T
    error: str
    index: Optional[int] = None

+
@dataclass
class ValidationError:
    """Validation error for a record with detailed error information.
@@ -61,11 +78,13 @@ class ValidationError:
        index: Optional index of this record in the original batch.
        path: Optional path to the field that failed validation.
    """
+
    record: Any
    error: str
    index: Optional[int] = None
    path: Optional[str] = None

+
@dataclass
class ValidationResult(Generic[T]):
    """Result of data validation with success/failure breakdown.
@@ -75,10 +94,12 @@ class ValidationResult(Generic[T]):
        invalid: Records that failed validation with detailed error information.
        total: Total number of records processed.
    """
+
    valid: List[T]
    invalid: List[ValidationError]
    total: int

+
@dataclass
class InsertResult(Generic[T]):
    """Result of an insert operation with detailed success/failure information.
@@ -89,20 +110,32 @@ class InsertResult(Generic[T]):
        total: Total number of records processed.
        failed_records: Detailed information about failed records (if record isolation was used).
    """
+
    successful: int
    failed: int
    total: int
    failed_records: Optional[List[FailedRecord[T]]] = None

+
class OlapConfig(BaseModel):
    model_config = {"extra": "forbid"}  # Reject unknown fields for a clean API
-
+
    """Configuration for OLAP tables (e.g., ClickHouse tables).

    Attributes:
        order_by_fields: List of column names to use for the ORDER BY clause.
            Crucial for `ReplacingMergeTree` and performance.
+        order_by_expression: An arbitrary ClickHouse expression for ORDER BY. Example:
+            `order_by_expression="(id, name)"` is equivalent to order_by_fields=["id", "name"], or
+            "tuple()" for no sorting.
        partition_by: Optional PARTITION BY expression (single ClickHouse SQL expression).
+        sample_by_expression: Optional SAMPLE BY expression for data sampling (single ClickHouse SQL expression).
+            Used to enable efficient approximate query processing with SAMPLE clause.
+        primary_key_expression: Optional PRIMARY KEY expression. When specified, this overrides the primary key
+            inferred from Key[T] column annotations. This allows for complex primary keys using
+            functions (e.g., "cityHash64(id)") or different column ordering in primary key vs
+            schema definition. Note: When this is set, any Key[T] annotations on columns are
+            ignored for PRIMARY KEY generation.
        engine: The ClickHouse table engine to use. Can be either a ClickHouseEngines enum value
            (for backward compatibility) or an EngineConfig instance (recommended).
        version: Optional version string for tracking configuration changes.
@@ -110,14 +143,113 @@ class OlapConfig(BaseModel):
        life_cycle: Determines how changes in code will propagate to the resources.
        settings: Optional table-level settings that can be modified with ALTER TABLE MODIFY SETTING.
            These are alterable settings that can be changed without recreating the table.
+        cluster: Optional cluster name for ON CLUSTER support in ClickHouse.
+            Use this to enable replicated tables across ClickHouse clusters.
+            The cluster must be defined in moose.config.toml (dev environment only).
+            Example: cluster="prod_cluster"
    """
    order_by_fields: list[str] = []
+    order_by_expression: Optional[str] = None
    partition_by: Optional[str] = None
+    sample_by_expression: Optional[str] = None
+    primary_key_expression: Optional[str] = None
    engine: Optional[Union[ClickHouseEngines, EngineConfig]] = None
    version: Optional[str] = None
    metadata: Optional[dict] = None
    life_cycle: Optional[LifeCycle] = None
    settings: Optional[dict[str, str]] = None
+    # Optional table-level TTL expression (without leading 'TTL')
+    ttl: Optional[str] = None
+    # Optional cluster name for ON CLUSTER support in ClickHouse
+    cluster: Optional[str] = None
+
+    # Optional secondary/data-skipping indexes
+    class TableIndex(BaseModel):
+        name: str
+        expression: str
+        type: str
+        arguments: list[str] = []
+        granularity: int = 1
+
+    indexes: list[TableIndex] = []
+    database: Optional[str] = None  # Optional database name for multi-database support
+
+    def model_post_init(self, __context):
+        has_fields = bool(self.order_by_fields)
+        has_expr = (
+            isinstance(self.order_by_expression, str)
+            and len(self.order_by_expression) > 0
+        )
+        if has_fields and has_expr:
+            raise ValueError(
+                "Provide either order_by_fields or order_by_expression, not both."
+            )
+
+        # Validate that non-MergeTree engines don't have unsupported clauses
+        if self.engine:
+            from ..blocks import (
+                S3Engine,
+                S3QueueEngine,
+                BufferEngine,
+                DistributedEngine,
+                IcebergS3Engine,
+                KafkaEngine,
+            )
+
+            # S3QueueEngine, BufferEngine, DistributedEngine, KafkaEngine, and IcebergS3Engine don't support ORDER BY
+            # Note: S3Engine DOES support ORDER BY (unlike S3Queue)
+            engines_without_order_by = (
+                S3QueueEngine,
+                BufferEngine,
+                DistributedEngine,
+                KafkaEngine,
+                IcebergS3Engine,
+            )
+            if isinstance(self.engine, engines_without_order_by):
+                engine_name = type(self.engine).__name__
+
+                if has_fields or has_expr:
+                    raise ValueError(
+                        f"{engine_name} does not support ORDER BY clauses. "
+                        f"Remove order_by_fields or order_by_expression from your configuration."
+                    )
+
+            # All non-MergeTree engines don't support SAMPLE BY
+            engines_without_sample_by = (
+                S3Engine,
+                S3QueueEngine,
+                BufferEngine,
+                DistributedEngine,
+                KafkaEngine,
+                IcebergS3Engine,
+            )
+            if isinstance(self.engine, engines_without_sample_by):
+                engine_name = type(self.engine).__name__
+
+                if self.sample_by_expression:
+                    raise ValueError(
+                        f"{engine_name} does not support SAMPLE BY clause. "
+                        f"Remove sample_by_expression from your configuration."
+                    )
+
+            # Only S3QueueEngine, BufferEngine, DistributedEngine, KafkaEngine, and IcebergS3Engine don't support PARTITION BY
+            # S3Engine DOES support PARTITION BY
+            engines_without_partition_by = (
+                S3QueueEngine,
+                BufferEngine,
+                DistributedEngine,
+                KafkaEngine,
+                IcebergS3Engine,
+            )
+            if isinstance(self.engine, engines_without_partition_by):
+                engine_name = type(self.engine).__name__
+
+                if self.partition_by:
+                    raise ValueError(
+                        f"{engine_name} does not support PARTITION BY clause. "
+                        f"Remove partition_by from your configuration."
+                    )
+

class OlapTable(TypedMooseResource, Generic[T]):
    """Represents an OLAP table (e.g., a ClickHouse table) typed with a Pydantic model.
@@ -134,6 +266,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
        model_type (type[T]): The Pydantic model associated with this table.
        kind: The kind of the table (e.g., "OlapTable").
    """
+
    config: OlapConfig
    kind: str = "OlapTable"
    _memoized_client: Optional[Client] = None
@@ -146,15 +279,77 @@ class OlapTable(TypedMooseResource, Generic[T]):
        super().__init__()
        self._set_type(name, self._get_type(kwargs))
        self.config = config
-
+
+        if config.metadata:
+            self.metadata = (
+                config.metadata.copy()
+                if isinstance(config.metadata, dict)
+                else config.metadata
+            )
+        else:
+            self.metadata = {}
+
+        if not isinstance(self.metadata, dict):
+            self.metadata = {}
+        if "source" not in self.metadata:
+            source_file = get_source_file_from_stack()
+            if source_file:
+                self.metadata["source"] = {"file": source_file}
+
        self._column_list = _to_columns(self._t)
+
+        # Create Cols instance for backward compatibility
+        # This works for both BaseModel and MooseModel
        self._cols = Cols(self._column_list)
-
-
+
+        # NOTE: For MooseModel types, columns are also accessible directly
+        # on the model class (e.g., MyModel.field_name) thanks to the metaclass.
+        # This provides LSP autocomplete without requiring .cols access.
+
+        registry_key = f"{name}_{config.version}" if config.version else name
+        if registry_key in _tables:
+            raise ValueError(
+                f"OlapTable with name {name} and version {config.version or 'unversioned'} already exists"
+            )
+        _tables[registry_key] = self
+
+        # Validate cluster and explicit replication params are not both specified
+        if config.cluster:
+            from moose_lib.blocks import (
+                ReplicatedMergeTreeEngine,
+                ReplicatedReplacingMergeTreeEngine,
+                ReplicatedAggregatingMergeTreeEngine,
+                ReplicatedSummingMergeTreeEngine,
+                ReplicatedCollapsingMergeTreeEngine,
+                ReplicatedVersionedCollapsingMergeTreeEngine,
+            )
+
+            if isinstance(
+                config.engine,
+                (
+                    ReplicatedMergeTreeEngine,
+                    ReplicatedReplacingMergeTreeEngine,
+                    ReplicatedAggregatingMergeTreeEngine,
+                    ReplicatedSummingMergeTreeEngine,
+                    ReplicatedCollapsingMergeTreeEngine,
+                    ReplicatedVersionedCollapsingMergeTreeEngine,
+                ),
+            ):
+                if (
+                    config.engine.keeper_path is not None
+                    or config.engine.replica_name is not None
+                ):
+                    raise ValueError(
+                        f"OlapTable {name}: Cannot specify both 'cluster' and explicit replication params "
+                        f"('keeper_path' or 'replica_name'). "
+                        f"Use 'cluster' for auto-injected params, or use explicit 'keeper_path' and "
+                        f"'replica_name' without 'cluster'."
+                    )
+
        # Check if using legacy enum-based engine configuration
        if config.engine and isinstance(config.engine, ClickHouseEngines):
            logger = Logger(action="OlapTable")
-
+
            # Special case for S3Queue - more detailed migration message
            if config.engine == ClickHouseEngines.S3Queue:
                logger.highlight(
@@ -174,13 +369,13 @@ class OlapTable(TypedMooseResource, Generic[T]):
                    f"  New: from moose_lib.blocks import {engine_name}Engine; engine={engine_name}Engine()\n"
                    "The new API provides better type safety and configuration options."
                )
-
+
            # Also emit a Python warning for development environments
            warnings.warn(
                f"Table '{name}' uses deprecated ClickHouseEngines enum. "
                f"Please migrate to engine configuration classes (e.g., {config.engine.value}Engine).",
                DeprecationWarning,
-                stacklevel=2
+                stacklevel=2,
            )

    @property
@@ -217,10 +412,13 @@ class OlapTable(TypedMooseResource, Generic[T]):
            A 16-character hex hash of the configuration.
        """
        import hashlib
+
+        # Use per-table database if specified, otherwise fall back to global config
+        effective_database = self.config.database or clickhouse_config.database
        config_string = (
            f"{clickhouse_config.host}:{clickhouse_config.port}:"
            f"{clickhouse_config.username}:{clickhouse_config.password}:"
-            f"{
+            f"{effective_database}:{clickhouse_config.use_ssl}"
        )
        return hashlib.sha256(config_string.encode()).hexdigest()[:16]

@@ -255,14 +453,16 @@ class OlapTable(TypedMooseResource, Generic[T]):

        try:
            # Create new client with standard configuration
-
+            # Use per-table database if specified, otherwise fall back to global config
+            effective_database = self.config.database or clickhouse_config.database
+            interface = "https" if clickhouse_config.use_ssl else "http"
            client = get_client(
                interface=interface,
                host=clickhouse_config.host,
                port=int(clickhouse_config.port),
                username=clickhouse_config.username,
                password=clickhouse_config.password,
-                database=
+                database=effective_database,
            )

            # Cache the new client and config hash
@@ -298,7 +498,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
        Returns:
            Tuple of (validated_data, error_message). If validation succeeds,
            validated_data will be the validated record and error_message will be None.
-            If validation fails for any reason, validated_data will be None and error_message
+            If validation fails for any reason, validated_data will be None and error_message
            will contain the error details.
        """
        try:
@@ -324,23 +524,19 @@ class OlapTable(TypedMooseResource, Generic[T]):
            if validated is not None:
                valid.append(validated)
            else:
-                invalid.append(
-
-
-
-
-
-
-
-
-
-                    total=len(data)
-                )
+                invalid.append(
+                    ValidationError(
+                        record=record,
+                        error=error or "Validation failed",
+                        index=i,
+                        path="root",
+                    )
+                )
+
+        return ValidationResult(valid=valid, invalid=invalid, total=len(data))

    def _validate_insert_parameters(
-        self,
-        data: Union[List[T], Iterator[T]],
-        options: Optional[InsertOptions]
+        self, data: Union[List[T], Iterator[T]], options: Optional[InsertOptions]
    ) -> Tuple[bool, str, bool]:
        """Validate input parameters and strategy compatibility.

@@ -362,7 +558,9 @@ class OlapTable(TypedMooseResource, Generic[T]):
            )

        if is_stream and should_validate:
-            print(
+            print(
+                "Warning: Validation is not supported with stream input. Validation will be skipped."
+            )

        return is_stream, strategy, should_validate

@@ -371,7 +569,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
        data: List[T],
        should_validate: bool,
        strategy: str,
-        options: Optional[InsertOptions] = None
+        options: Optional[InsertOptions] = None,
    ) -> Tuple[List[T], List[ValidationError]]:
        """Perform pre-insertion validation for array data.

@@ -394,10 +592,7 @@ class OlapTable(TypedMooseResource, Generic[T]):

        if validation_errors:
            self._handle_validation_errors(
-                validation_errors,
-                strategy,
-                data,
-                options
+                validation_errors, strategy, data, options
            )

        if strategy == "discard":
@@ -420,7 +615,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
        validation_errors: List[ValidationError],
        strategy: str,
        data: List[T],
-        options: Optional[InsertOptions]
+        options: Optional[InsertOptions],
    ) -> None:
        """Handle validation errors based on the specified strategy.

@@ -436,17 +631,13 @@ class OlapTable(TypedMooseResource, Generic[T]):
                f"Validation failed for record at index {first_error.index}: {first_error.error}"
            )
        elif strategy == "discard":
-            self._check_validation_thresholds(
-                validation_errors,
-                len(data),
-                options
-            )
+            self._check_validation_thresholds(validation_errors, len(data), options)

    def _check_validation_thresholds(
        self,
        validation_errors: List[ValidationError],
        total_records: int,
-        options: Optional[InsertOptions]
+        options: Optional[InsertOptions],
    ) -> None:
        """Check if validation errors exceed configured thresholds.

@@ -458,15 +649,21 @@ class OlapTable(TypedMooseResource, Generic[T]):
        validation_failed_count = len(validation_errors)
        validation_failed_ratio = validation_failed_count / total_records

-        if (
-
+        if (
+            options
+            and options.allow_errors is not None
+            and validation_failed_count > options.allow_errors
+        ):
            raise ValueError(
                f"Too many validation failures: {validation_failed_count} > {options.allow_errors}. "
                f"Errors: {', '.join(e.error for e in validation_errors)}"
            )

-        if (
-
+        if (
+            options
+            and options.allow_errors_ratio is not None
+            and validation_failed_ratio > options.allow_errors_ratio
+        ):
            raise ValueError(
                f"Validation failure ratio too high: {validation_failed_ratio:.3f} > "
                f"{options.allow_errors_ratio}. Errors: {', '.join(e.error for e in validation_errors)}"
@@ -477,10 +674,10 @@ class OlapTable(TypedMooseResource, Generic[T]):

    def _with_wait_end_settings(self, settings: dict) -> dict:
        """Add wait_end_of_query setting to ensure at least once delivery for INSERT operations.
-
+
        Args:
            settings: Base settings dictionary
-
+
        Returns:
            Settings dictionary with wait_end_of_query added
        """
@@ -493,20 +690,28 @@ class OlapTable(TypedMooseResource, Generic[T]):
        validated_data: List[T],
        is_stream: bool,
        strategy: str,
-        options: Optional[InsertOptions]
+        options: Optional[InsertOptions],
    ) -> tuple[str, bytes, dict]:
        """Prepare insert options for JSONEachRow raw SQL insert, returning settings dict."""
        # Base settings for all inserts
        base_settings = {
            "date_time_input_format": "best_effort",
-            "max_insert_block_size":
+            "max_insert_block_size": (
+                100000 if is_stream else min(len(validated_data), 100000)
+            ),
            "max_block_size": 65536,
            "async_insert": 1 if len(validated_data) > 1000 else 0,
            "wait_for_async_insert": 1,
        }
        settings = self._with_wait_end_settings(base_settings)
-        if (
-
+        if (
+            strategy == "discard"
+            and options
+            and (
+                options.allow_errors is not None
+                or options.allow_errors_ratio is not None
+            )
+        ):
            if options.allow_errors is not None:
                settings["input_format_allow_errors_num"] = options.allow_errors
            if options.allow_errors_ratio is not None:
@@ -519,7 +724,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
            validated_data = [validated_data]
        dict_data = []
        for record in validated_data:
-            if hasattr(record,
+            if hasattr(record, "model_dump"):
                record_dict = record.model_dump()
            else:
                record_dict = record
@@ -537,7 +742,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
        validation_errors: List[ValidationError],
        is_stream: bool,
        should_validate: bool,
-        strategy: str
+        strategy: str,
    ) -> InsertResult[T]:
        """Create appropriate result based on input type.

@@ -553,11 +758,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
            InsertResult with appropriate counts and error information.
        """
        if is_stream:
-            return InsertResult(
-                successful=-1,
-                failed=0,
-                total=-1
-            )
+            return InsertResult(successful=-1, failed=0, total=-1)

        inserted_count = len(validated_data)
        total_processed = len(data) if not is_stream else inserted_count
@@ -565,32 +766,30 @@ class OlapTable(TypedMooseResource, Generic[T]):
        result = InsertResult(
            successful=inserted_count,
            failed=len(validation_errors) if should_validate else 0,
-            total=total_processed
+            total=total_processed,
        )

-        if
+        if should_validate and validation_errors and strategy == "discard":
            result.failed_records = [
                FailedRecord(
                    record=ve.record,
                    error=f"Validation error: {ve.error}",
-                    index=ve.index
-                )
+                    index=ve.index,
+                )
+                for ve in validation_errors
            ]

        return result

    def _retry_individual_records(
-        self,
-        client: Client,
-        records: List[T],
-        options: InsertOptions
+        self, client: Client, records: List[T], options: InsertOptions
    ) -> InsertResult[T]:
        successful: List[T] = []
        failed: List[FailedRecord[T]] = []
        table_name = quote_identifier(self._generate_table_name())
        records_dict = []
        for record in records:
-            if hasattr(record,
+            if hasattr(record, "model_dump"):
                record_dict = record.model_dump()
            else:
                record_dict = record
@@ -599,41 +798,42 @@ class OlapTable(TypedMooseResource, Generic[T]):

        RETRY_BATCH_SIZE = 10
        for i in range(0, len(records_dict), RETRY_BATCH_SIZE):
-            batch = records_dict[i:i + RETRY_BATCH_SIZE]
+            batch = records_dict[i : i + RETRY_BATCH_SIZE]
            try:
                sql = f"INSERT INTO {table_name} FORMAT JSONEachRow"
                base_settings = {
                    "date_time_input_format": "best_effort",
                    "max_insert_block_size": RETRY_BATCH_SIZE,
                    "max_block_size": RETRY_BATCH_SIZE,
-                    "async_insert": 0
+                    "async_insert": 0,
                }
                settings = self._with_wait_end_settings(base_settings)
                json_lines = self._to_json_each_row(batch)
                client.command(sql, data=json_lines, settings=settings)
-                successful.extend(records[i:i + RETRY_BATCH_SIZE])
+                successful.extend(records[i : i + RETRY_BATCH_SIZE])
            except ClickHouseError as batch_error:
                for j, record_dict in enumerate(batch):
                    try:
                        sql = f"INSERT INTO {table_name} FORMAT JSONEachRow"
-                        individual_settings = self._with_wait_end_settings(
-                            "date_time_input_format": "best_effort",
-
-                        })
+                        individual_settings = self._with_wait_end_settings(
+                            {"date_time_input_format": "best_effort", "async_insert": 0}
+                        )
                        json_line = self._to_json_each_row([record_dict])
-                        client.command(
+                        client.command(
+                            sql, data=json_line, settings=individual_settings
+                        )
                        successful.append(records[i + j])
                    except ClickHouseError as error:
-                        failed.append(
-
-
-
-                        )
+                        failed.append(
+                            FailedRecord(
+                                record=records[i + j], error=str(error), index=i + j
+                            )
+                        )
        return InsertResult(
            successful=len(successful),
            failed=len(failed),
            total=len(records),
-            failed_records=failed if failed else None
+            failed_records=failed if failed else None,
        )

    def _insert_array_data(
@@ -643,7 +843,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
        data: List[T],
        should_validate: bool,
        strategy: str,
-        options: Optional[InsertOptions]
+        options: Optional[InsertOptions],
    ) -> InsertResult[T]:
        """Insert array data into the table with validation and error handling.

@@ -659,19 +859,11 @@ class OlapTable(TypedMooseResource, Generic[T]):
            InsertResult with detailed success/failure information.
        """
        validated_data, validation_errors = self._perform_pre_insertion_validation(
-            data,
-            should_validate,
-            strategy,
-            options
+            data, should_validate, strategy, options
        )
        try:
            table_name, json_lines, settings = self._prepare_insert_options(
-                table_name,
-                data,
-                validated_data,
-                False,
-                strategy,
-                options
+                table_name, data, validated_data, False, strategy, options
            )
            sql = f"INSERT INTO {table_name} FORMAT JSONEachRow"
            client.command(sql, data=json_lines, settings=settings)
@@ -681,7 +873,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
                validation_errors,
                False,
                should_validate,
-                strategy
+                strategy,
            )
        except ClickHouseError as e:
            if strategy == "fail-fast":
@@ -692,7 +884,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
            return self._retry_individual_records(
                client,
                validated_data if not options.skip_validation_on_retry else data,
-                options
+                options,
            )

    def _insert_stream(
@@ -701,7 +893,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
        table_name: str,
        data: Iterator[T],
        strategy: str,
-        options: Optional[InsertOptions]
+        options: Optional[InsertOptions],
    ) -> InsertResult[T]:
        """Insert data from an iterator into the table.

@@ -719,17 +911,12 @@ class OlapTable(TypedMooseResource, Generic[T]):
        total_inserted = 0

        _, _, settings = self._prepare_insert_options(
-            table_name,
-            data,
-            [],
-            True,
-            strategy,
-            options
+            table_name, data, [], True, strategy, options
        )

        for record in data:
            # Convert record to dict using model_dump if available
-            if hasattr(record,
+            if hasattr(record, "model_dump"):
                batch.append(record.model_dump())
            else:
                batch.append(record)
@@ -754,9 +941,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
                total_inserted += len(batch)

            return InsertResult(
-                successful=total_inserted,
-                failed=0,
-                total=total_inserted
+                successful=total_inserted, failed=0, total=total_inserted
            )
        except ClickHouseError as e:
            if strategy == "fail-fast":
@@ -764,9 +949,7 @@ class OlapTable(TypedMooseResource, Generic[T]):
                raise ValueError(f"Too many errors during stream insert: {e}")

    def insert(
-        self,
-        data: Union[List[T], Iterator[T]],
-        options: Optional[InsertOptions] = None
+        self, data: Union[List[T], Iterator[T]], options: Optional[InsertOptions] = None
    ) -> InsertResult[T]:
        """Insert data into the table with validation and error handling.

@@ -818,7 +1001,9 @@ class OlapTable(TypedMooseResource, Generic[T]):
        ```
        """
        options = options or InsertOptions()
-        is_stream, strategy, should_validate = self._validate_insert_parameters(
+        is_stream, strategy, should_validate = self._validate_insert_parameters(
+            data, options
+        )
        if (is_stream and not data) or (not is_stream and not data):
            return InsertResult(successful=0, failed=0, total=0)

@@ -829,15 +1014,12 @@ class OlapTable(TypedMooseResource, Generic[T]):
            return self._insert_stream(client, table_name, data, strategy, options)
        else:
            return self._insert_array_data(
-                client,
-                table_name,
-                data,
-                should_validate,
-                strategy,
-                options
+                client, table_name, data, should_validate, strategy, options
            )

-    def _map_to_clickhouse_record(
+    def _map_to_clickhouse_record(
+        self, record: dict, columns: Optional[List[Column]] = None
+    ) -> dict:
        """
        Recursively transforms a record to match ClickHouse's JSONEachRow requirements.

@@ -866,8 +1048,9 @@ class OlapTable(TypedMooseResource, Generic[T]):

            if is_array_nested_type(data_type):
                # For Array(Nested(...)), wrap each item in its own array and recurse
-                if
-
+                if isinstance(value, list) and (
+                    len(value) == 0 or isinstance(value[0], dict)
+                ):
                    nested_columns = data_type.element_type.columns
                    result[col.name] = [
                        [self._map_to_clickhouse_record(item, nested_columns)]
@@ -876,7 +1059,9 @@ class OlapTable(TypedMooseResource, Generic[T]):
            elif is_nested_type(data_type):
                # For Nested struct (not array), recurse into it
                if value and isinstance(value, dict):
-                    result[col.name] = self._map_to_clickhouse_record(
+                    result[col.name] = self._map_to_clickhouse_record(
+                        value, data_type.columns
+                    )
            # All other types: leave as is

-        return result
+        return result