moose-lib 0.4.223__py3-none-any.whl → 0.4.225__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- moose_lib/dmv2/__init__.py +2 -0
- moose_lib/dmv2/olap_table.py +711 -3
- {moose_lib-0.4.223.dist-info → moose_lib-0.4.225.dist-info}/METADATA +4 -1
- {moose_lib-0.4.223.dist-info → moose_lib-0.4.225.dist-info}/RECORD +6 -6
- {moose_lib-0.4.223.dist-info → moose_lib-0.4.225.dist-info}/WHEEL +0 -0
- {moose_lib-0.4.223.dist-info → moose_lib-0.4.225.dist-info}/top_level.txt +0 -0
moose_lib/dmv2/__init__.py
CHANGED
@@ -18,6 +18,7 @@ from .types import (
|
|
18
18
|
from .olap_table import (
|
19
19
|
OlapConfig,
|
20
20
|
OlapTable,
|
21
|
+
InsertOptions,
|
21
22
|
)
|
22
23
|
|
23
24
|
from .stream import (
|
@@ -94,6 +95,7 @@ __all__ = [
|
|
94
95
|
# OLAP Tables
|
95
96
|
'OlapConfig',
|
96
97
|
'OlapTable',
|
98
|
+
'InsertOptions',
|
97
99
|
|
98
100
|
# Streams
|
99
101
|
'StreamConfig',
|
moose_lib/dmv2/olap_table.py
CHANGED
@@ -4,13 +4,91 @@ OLAP table definitions for Moose Data Model v2 (dmv2).
|
|
4
4
|
This module provides classes for defining and configuring OLAP tables,
|
5
5
|
particularly for ClickHouse.
|
6
6
|
"""
|
7
|
-
|
7
|
+
import json
|
8
|
+
from clickhouse_connect import get_client
|
9
|
+
from clickhouse_connect.driver.client import Client
|
10
|
+
from clickhouse_connect.driver.exceptions import ClickHouseError
|
11
|
+
from dataclasses import dataclass
|
8
12
|
from pydantic import BaseModel
|
9
|
-
|
13
|
+
from typing import List, Optional, Any, Literal, Union, Tuple, TypeVar, Generic, Iterator
|
10
14
|
from moose_lib import ClickHouseEngines
|
15
|
+
from ..config.runtime import RuntimeClickHouseConfig
|
11
16
|
from .types import TypedMooseResource, T
|
12
17
|
from ._registry import _tables
|
13
18
|
|
19
|
+
@dataclass
|
20
|
+
class InsertOptions:
|
21
|
+
"""Options for insert operations.
|
22
|
+
|
23
|
+
Attributes:
|
24
|
+
allow_errors: Maximum number of bad records to tolerate before failing.
|
25
|
+
allow_errors_ratio: Maximum ratio of bad records to tolerate (0.0 to 1.0).
|
26
|
+
strategy: Error handling strategy ("fail-fast", "discard", or "isolate").
|
27
|
+
validate: Whether to validate data against schema before insertion.
|
28
|
+
skip_validation_on_retry: Whether to skip validation for individual records during retries.
|
29
|
+
"""
|
30
|
+
allow_errors: Optional[int] = None
|
31
|
+
allow_errors_ratio: Optional[float] = None
|
32
|
+
strategy: Literal["fail-fast", "discard", "isolate"] = "fail-fast"
|
33
|
+
validate: bool = True
|
34
|
+
skip_validation_on_retry: bool = False
|
35
|
+
|
36
|
+
@dataclass
|
37
|
+
class FailedRecord(Generic[T]):
|
38
|
+
"""Represents a failed record during insertion with error details.
|
39
|
+
|
40
|
+
Attributes:
|
41
|
+
record: The original record that failed to insert.
|
42
|
+
error: The error message describing why the insertion failed.
|
43
|
+
index: Optional index of this record in the original batch.
|
44
|
+
"""
|
45
|
+
record: T
|
46
|
+
error: str
|
47
|
+
index: Optional[int] = None
|
48
|
+
|
49
|
+
@dataclass
|
50
|
+
class ValidationError:
|
51
|
+
"""Validation error for a record with detailed error information.
|
52
|
+
|
53
|
+
Attributes:
|
54
|
+
record: The original record that failed validation.
|
55
|
+
error: Detailed validation error message.
|
56
|
+
index: Optional index of this record in the original batch.
|
57
|
+
path: Optional path to the field that failed validation.
|
58
|
+
"""
|
59
|
+
record: Any
|
60
|
+
error: str
|
61
|
+
index: Optional[int] = None
|
62
|
+
path: Optional[str] = None
|
63
|
+
|
64
|
+
@dataclass
|
65
|
+
class ValidationResult(Generic[T]):
|
66
|
+
"""Result of data validation with success/failure breakdown.
|
67
|
+
|
68
|
+
Attributes:
|
69
|
+
valid: Records that passed validation.
|
70
|
+
invalid: Records that failed validation with detailed error information.
|
71
|
+
total: Total number of records processed.
|
72
|
+
"""
|
73
|
+
valid: List[T]
|
74
|
+
invalid: List[ValidationError]
|
75
|
+
total: int
|
76
|
+
|
77
|
+
@dataclass
|
78
|
+
class InsertResult(Generic[T]):
|
79
|
+
"""Result of an insert operation with detailed success/failure information.
|
80
|
+
|
81
|
+
Attributes:
|
82
|
+
successful: Number of records successfully inserted.
|
83
|
+
failed: Number of records that failed to insert.
|
84
|
+
total: Total number of records processed.
|
85
|
+
failed_records: Detailed information about failed records (if record isolation was used).
|
86
|
+
"""
|
87
|
+
successful: int
|
88
|
+
failed: int
|
89
|
+
total: int
|
90
|
+
failed_records: Optional[List[FailedRecord[T]]] = None
|
91
|
+
|
14
92
|
class OlapConfig(BaseModel):
|
15
93
|
"""Configuration for OLAP tables (e.g., ClickHouse tables).
|
16
94
|
|
@@ -48,10 +126,640 @@ class OlapTable(TypedMooseResource, Generic[T]):
|
|
48
126
|
"""
|
49
127
|
config: OlapConfig
|
50
128
|
kind: str = "OlapTable"
|
129
|
+
_memoized_client: Optional[Client] = None
|
130
|
+
_config_hash: Optional[str] = None
|
131
|
+
_cached_table_name: Optional[str] = None
|
51
132
|
|
52
133
|
def __init__(self, name: str, config: OlapConfig = OlapConfig(), **kwargs):
|
53
134
|
super().__init__()
|
54
135
|
self._set_type(name, self._get_type(kwargs))
|
55
136
|
self.config = config
|
56
137
|
self.metadata = config.metadata
|
57
|
-
_tables[name] = self
|
138
|
+
_tables[name] = self
|
139
|
+
|
140
|
+
def _generate_table_name(self) -> str:
|
141
|
+
"""Generate the versioned table name following Moose's naming convention.
|
142
|
+
|
143
|
+
Format: {tableName}_{version_with_dots_replaced_by_underscores}
|
144
|
+
|
145
|
+
Returns:
|
146
|
+
The versioned table name.
|
147
|
+
"""
|
148
|
+
if self._cached_table_name:
|
149
|
+
return self._cached_table_name
|
150
|
+
|
151
|
+
table_version = self.config.version
|
152
|
+
if not table_version:
|
153
|
+
self._cached_table_name = self.name
|
154
|
+
else:
|
155
|
+
version_suffix = table_version.replace(".", "_")
|
156
|
+
self._cached_table_name = f"{self.name}_{version_suffix}"
|
157
|
+
|
158
|
+
return self._cached_table_name
|
159
|
+
|
160
|
+
def _create_config_hash(self, clickhouse_config: RuntimeClickHouseConfig) -> str:
|
161
|
+
"""Create a fast hash of the ClickHouse configuration.
|
162
|
+
|
163
|
+
Args:
|
164
|
+
clickhouse_config: The ClickHouse configuration to hash.
|
165
|
+
|
166
|
+
Returns:
|
167
|
+
A 16-character hex hash of the configuration.
|
168
|
+
"""
|
169
|
+
import hashlib
|
170
|
+
config_string = (
|
171
|
+
f"{clickhouse_config.host}:{clickhouse_config.port}:"
|
172
|
+
f"{clickhouse_config.username}:{clickhouse_config.password}:"
|
173
|
+
f"{clickhouse_config.database}:{clickhouse_config.use_ssl}"
|
174
|
+
)
|
175
|
+
return hashlib.sha256(config_string.encode()).hexdigest()[:16]
|
176
|
+
|
177
|
+
def _get_memoized_client(self) -> Client:
|
178
|
+
"""Get or create a memoized ClickHouse client.
|
179
|
+
|
180
|
+
The client is cached and reused across multiple insert calls for better performance.
|
181
|
+
If the configuration changes, a new client will be created.
|
182
|
+
|
183
|
+
Returns:
|
184
|
+
A ClickHouse client instance.
|
185
|
+
"""
|
186
|
+
from ..config.runtime import config_registry
|
187
|
+
|
188
|
+
# Get configuration from registry (with fallback to file)
|
189
|
+
clickhouse_config = config_registry.get_clickhouse_config()
|
190
|
+
|
191
|
+
# Create a fast hash of the current configuration to detect changes
|
192
|
+
current_config_hash = self._create_config_hash(clickhouse_config)
|
193
|
+
|
194
|
+
# If we have a cached client and the config hasn't changed, reuse it
|
195
|
+
if self._memoized_client and self._config_hash == current_config_hash:
|
196
|
+
return self._memoized_client
|
197
|
+
|
198
|
+
# Close existing client if config changed
|
199
|
+
if self._memoized_client and self._config_hash != current_config_hash:
|
200
|
+
try:
|
201
|
+
self._memoized_client.close()
|
202
|
+
except Exception:
|
203
|
+
# Ignore errors when closing old client
|
204
|
+
pass
|
205
|
+
|
206
|
+
try:
|
207
|
+
# Create new client with standard configuration
|
208
|
+
interface = 'https' if clickhouse_config.use_ssl else 'http'
|
209
|
+
client = get_client(
|
210
|
+
interface=interface,
|
211
|
+
host=clickhouse_config.host,
|
212
|
+
port=int(clickhouse_config.port),
|
213
|
+
username=clickhouse_config.username,
|
214
|
+
password=clickhouse_config.password,
|
215
|
+
database=clickhouse_config.database,
|
216
|
+
)
|
217
|
+
|
218
|
+
# Cache the new client and config hash
|
219
|
+
self._memoized_client = client
|
220
|
+
self._config_hash = current_config_hash
|
221
|
+
|
222
|
+
return client
|
223
|
+
except Exception as e:
|
224
|
+
raise RuntimeError(f"Failed to create ClickHouse client: {e}")
|
225
|
+
|
226
|
+
def close_client(self) -> None:
|
227
|
+
"""Close the memoized ClickHouse client if it exists.
|
228
|
+
|
229
|
+
This is useful for cleaning up connections when the table instance is no longer needed.
|
230
|
+
The client will be automatically recreated on the next insert call if needed.
|
231
|
+
"""
|
232
|
+
if self._memoized_client:
|
233
|
+
try:
|
234
|
+
self._memoized_client.close()
|
235
|
+
except Exception:
|
236
|
+
# Ignore errors when closing
|
237
|
+
pass
|
238
|
+
finally:
|
239
|
+
self._memoized_client = None
|
240
|
+
self._config_hash = None
|
241
|
+
|
242
|
+
def validate_record(self, record: Any) -> Tuple[Optional[T], Optional[str]]:
|
243
|
+
"""Validate a single record using Pydantic validation.
|
244
|
+
|
245
|
+
Args:
|
246
|
+
record: The record to validate.
|
247
|
+
|
248
|
+
Returns:
|
249
|
+
Tuple of (validated_data, error_message). If validation succeeds,
|
250
|
+
validated_data will be the validated record and error_message will be None.
|
251
|
+
If validation fails for any reason, validated_data will be None and error_message
|
252
|
+
will contain the error details.
|
253
|
+
"""
|
254
|
+
try:
|
255
|
+
validated = self._t.model_validate(record)
|
256
|
+
return validated, None
|
257
|
+
except Exception as e:
|
258
|
+
return None, str(e)
|
259
|
+
|
260
|
+
def validate_records(self, data: List[Any]) -> ValidationResult[T]:
|
261
|
+
"""Validate an array of records with comprehensive error reporting.
|
262
|
+
|
263
|
+
Args:
|
264
|
+
data: Array of records to validate.
|
265
|
+
|
266
|
+
Returns:
|
267
|
+
ValidationResult containing valid and invalid records.
|
268
|
+
"""
|
269
|
+
valid: List[T] = []
|
270
|
+
invalid: List[ValidationError] = []
|
271
|
+
|
272
|
+
for i, record in enumerate(data):
|
273
|
+
validated, error = self.validate_record(record)
|
274
|
+
if validated is not None:
|
275
|
+
valid.append(validated)
|
276
|
+
else:
|
277
|
+
invalid.append(ValidationError(
|
278
|
+
record=record,
|
279
|
+
error=error or "Validation failed",
|
280
|
+
index=i,
|
281
|
+
path="root"
|
282
|
+
))
|
283
|
+
|
284
|
+
return ValidationResult(
|
285
|
+
valid=valid,
|
286
|
+
invalid=invalid,
|
287
|
+
total=len(data)
|
288
|
+
)
|
289
|
+
|
290
|
+
def _validate_insert_parameters(
|
291
|
+
self,
|
292
|
+
data: Union[List[T], Iterator[T]],
|
293
|
+
options: Optional[InsertOptions]
|
294
|
+
) -> Tuple[bool, str, bool]:
|
295
|
+
"""Validate input parameters and strategy compatibility.
|
296
|
+
|
297
|
+
Args:
|
298
|
+
data: The data to insert (array or iterator).
|
299
|
+
options: Optional insert options.
|
300
|
+
|
301
|
+
Returns:
|
302
|
+
Tuple of (is_stream, strategy, should_validate).
|
303
|
+
"""
|
304
|
+
is_stream = not isinstance(data, list)
|
305
|
+
strategy = options.strategy if options else "fail-fast"
|
306
|
+
should_validate = options.validate if options else True
|
307
|
+
|
308
|
+
if is_stream and strategy == "isolate":
|
309
|
+
raise ValueError(
|
310
|
+
"The 'isolate' error strategy is not supported with stream input. "
|
311
|
+
"Use 'fail-fast' or 'discard' instead."
|
312
|
+
)
|
313
|
+
|
314
|
+
if is_stream and should_validate:
|
315
|
+
print("Warning: Validation is not supported with stream input. Validation will be skipped.")
|
316
|
+
|
317
|
+
return is_stream, strategy, should_validate
|
318
|
+
|
319
|
+
def _perform_pre_insertion_validation(
|
320
|
+
self,
|
321
|
+
data: List[T],
|
322
|
+
should_validate: bool,
|
323
|
+
strategy: str,
|
324
|
+
options: Optional[InsertOptions] = None
|
325
|
+
) -> Tuple[List[T], List[ValidationError]]:
|
326
|
+
"""Perform pre-insertion validation for array data.
|
327
|
+
|
328
|
+
Args:
|
329
|
+
data: The data to validate.
|
330
|
+
should_validate: Whether to perform validation.
|
331
|
+
strategy: The error handling strategy.
|
332
|
+
options: Optional insert options.
|
333
|
+
|
334
|
+
Returns:
|
335
|
+
Tuple of (validated_data, validation_errors).
|
336
|
+
"""
|
337
|
+
if not should_validate:
|
338
|
+
return data, []
|
339
|
+
|
340
|
+
try:
|
341
|
+
validation_result = self.validate_records(data)
|
342
|
+
validated_data = validation_result.valid
|
343
|
+
validation_errors = validation_result.invalid
|
344
|
+
|
345
|
+
if validation_errors:
|
346
|
+
self._handle_validation_errors(
|
347
|
+
validation_errors,
|
348
|
+
strategy,
|
349
|
+
data,
|
350
|
+
options
|
351
|
+
)
|
352
|
+
|
353
|
+
if strategy == "discard":
|
354
|
+
return validated_data, validation_errors
|
355
|
+
elif strategy == "isolate":
|
356
|
+
return data, validation_errors
|
357
|
+
else: # fail-fast
|
358
|
+
return validated_data, validation_errors
|
359
|
+
|
360
|
+
return validated_data, validation_errors
|
361
|
+
|
362
|
+
except Exception as validation_error:
|
363
|
+
if strategy == "fail-fast":
|
364
|
+
raise ValueError(f"Validation failed: {validation_error}")
|
365
|
+
print(f"Validation error: {validation_error}")
|
366
|
+
return data, []
|
367
|
+
|
368
|
+
def _handle_validation_errors(
|
369
|
+
self,
|
370
|
+
validation_errors: List[ValidationError],
|
371
|
+
strategy: str,
|
372
|
+
data: List[T],
|
373
|
+
options: Optional[InsertOptions]
|
374
|
+
) -> None:
|
375
|
+
"""Handle validation errors based on the specified strategy.
|
376
|
+
|
377
|
+
Args:
|
378
|
+
validation_errors: List of validation errors.
|
379
|
+
strategy: The error handling strategy.
|
380
|
+
data: The original data.
|
381
|
+
options: Optional insert options.
|
382
|
+
"""
|
383
|
+
if strategy == "fail-fast":
|
384
|
+
first_error = validation_errors[0]
|
385
|
+
raise ValueError(
|
386
|
+
f"Validation failed for record at index {first_error.index}: {first_error.error}"
|
387
|
+
)
|
388
|
+
elif strategy == "discard":
|
389
|
+
self._check_validation_thresholds(
|
390
|
+
validation_errors,
|
391
|
+
len(data),
|
392
|
+
options
|
393
|
+
)
|
394
|
+
|
395
|
+
def _check_validation_thresholds(
|
396
|
+
self,
|
397
|
+
validation_errors: List[ValidationError],
|
398
|
+
total_records: int,
|
399
|
+
options: Optional[InsertOptions]
|
400
|
+
) -> None:
|
401
|
+
"""Check if validation errors exceed configured thresholds.
|
402
|
+
|
403
|
+
Args:
|
404
|
+
validation_errors: List of validation errors.
|
405
|
+
total_records: Total number of records processed.
|
406
|
+
options: Optional insert options.
|
407
|
+
"""
|
408
|
+
validation_failed_count = len(validation_errors)
|
409
|
+
validation_failed_ratio = validation_failed_count / total_records
|
410
|
+
|
411
|
+
if (options and options.allow_errors is not None and
|
412
|
+
validation_failed_count > options.allow_errors):
|
413
|
+
raise ValueError(
|
414
|
+
f"Too many validation failures: {validation_failed_count} > {options.allow_errors}. "
|
415
|
+
f"Errors: {', '.join(e.error for e in validation_errors)}"
|
416
|
+
)
|
417
|
+
|
418
|
+
if (options and options.allow_errors_ratio is not None and
|
419
|
+
validation_failed_ratio > options.allow_errors_ratio):
|
420
|
+
raise ValueError(
|
421
|
+
f"Validation failure ratio too high: {validation_failed_ratio:.3f} > "
|
422
|
+
f"{options.allow_errors_ratio}. Errors: {', '.join(e.error for e in validation_errors)}"
|
423
|
+
)
|
424
|
+
|
425
|
+
def _to_json_each_row(self, records: list[dict]) -> bytes:
|
426
|
+
return "\n".join(json.dumps(r, default=str) for r in records).encode("utf-8")
|
427
|
+
|
428
|
+
def _prepare_insert_options(
|
429
|
+
self,
|
430
|
+
table_name: str,
|
431
|
+
data: Union[List[T], Iterator[T]],
|
432
|
+
validated_data: List[T],
|
433
|
+
is_stream: bool,
|
434
|
+
strategy: str,
|
435
|
+
options: Optional[InsertOptions]
|
436
|
+
) -> tuple[str, bytes, dict]:
|
437
|
+
"""Prepare insert options for JSONEachRow raw SQL insert, returning settings dict."""
|
438
|
+
# Base settings for all inserts
|
439
|
+
settings = {
|
440
|
+
"date_time_input_format": "best_effort",
|
441
|
+
"max_insert_block_size": 100000 if is_stream else min(len(validated_data), 100000),
|
442
|
+
"max_block_size": 65536,
|
443
|
+
"async_insert": 1 if len(validated_data) > 1000 else 0,
|
444
|
+
"wait_for_async_insert": 1,
|
445
|
+
}
|
446
|
+
if (strategy == "discard" and options and
|
447
|
+
(options.allow_errors is not None or options.allow_errors_ratio is not None)):
|
448
|
+
if options.allow_errors is not None:
|
449
|
+
settings["input_format_allow_errors_num"] = options.allow_errors
|
450
|
+
if options.allow_errors_ratio is not None:
|
451
|
+
settings["input_format_allow_errors_ratio"] = options.allow_errors_ratio
|
452
|
+
|
453
|
+
if is_stream:
|
454
|
+
return table_name, data, settings
|
455
|
+
|
456
|
+
if not isinstance(validated_data, list):
|
457
|
+
validated_data = [validated_data]
|
458
|
+
dict_data = []
|
459
|
+
for record in validated_data:
|
460
|
+
if hasattr(record, 'model_dump'):
|
461
|
+
dict_data.append(record.model_dump())
|
462
|
+
else:
|
463
|
+
dict_data.append(record)
|
464
|
+
if not dict_data:
|
465
|
+
return table_name, b"", settings
|
466
|
+
json_lines = self._to_json_each_row(dict_data)
|
467
|
+
return table_name, json_lines, settings
|
468
|
+
|
469
|
+
def _create_success_result(
|
470
|
+
self,
|
471
|
+
data: Union[List[T], Iterator[T]],
|
472
|
+
validated_data: List[T],
|
473
|
+
validation_errors: List[ValidationError],
|
474
|
+
is_stream: bool,
|
475
|
+
should_validate: bool,
|
476
|
+
strategy: str
|
477
|
+
) -> InsertResult[T]:
|
478
|
+
"""Create appropriate result based on input type.
|
479
|
+
|
480
|
+
Args:
|
481
|
+
data: The original data (array or stream).
|
482
|
+
validated_data: Validated data for array input.
|
483
|
+
validation_errors: List of validation errors.
|
484
|
+
is_stream: Whether the input is a stream.
|
485
|
+
should_validate: Whether validation was performed.
|
486
|
+
strategy: The error handling strategy.
|
487
|
+
|
488
|
+
Returns:
|
489
|
+
InsertResult with appropriate counts and error information.
|
490
|
+
"""
|
491
|
+
if is_stream:
|
492
|
+
return InsertResult(
|
493
|
+
successful=-1,
|
494
|
+
failed=0,
|
495
|
+
total=-1
|
496
|
+
)
|
497
|
+
|
498
|
+
inserted_count = len(validated_data)
|
499
|
+
total_processed = len(data) if not is_stream else inserted_count
|
500
|
+
|
501
|
+
result = InsertResult(
|
502
|
+
successful=inserted_count,
|
503
|
+
failed=len(validation_errors) if should_validate else 0,
|
504
|
+
total=total_processed
|
505
|
+
)
|
506
|
+
|
507
|
+
if (should_validate and validation_errors and strategy == "discard"):
|
508
|
+
result.failed_records = [
|
509
|
+
FailedRecord(
|
510
|
+
record=ve.record,
|
511
|
+
error=f"Validation error: {ve.error}",
|
512
|
+
index=ve.index
|
513
|
+
) for ve in validation_errors
|
514
|
+
]
|
515
|
+
|
516
|
+
return result
|
517
|
+
|
518
|
+
def _retry_individual_records(
|
519
|
+
self,
|
520
|
+
client: Client,
|
521
|
+
records: List[T],
|
522
|
+
options: InsertOptions
|
523
|
+
) -> InsertResult[T]:
|
524
|
+
successful: List[T] = []
|
525
|
+
failed: List[FailedRecord[T]] = []
|
526
|
+
table_name = self._generate_table_name()
|
527
|
+
records_dict = []
|
528
|
+
for record in records:
|
529
|
+
if hasattr(record, 'model_dump'):
|
530
|
+
records_dict.append(record.model_dump())
|
531
|
+
else:
|
532
|
+
records_dict.append(record)
|
533
|
+
|
534
|
+
RETRY_BATCH_SIZE = 10
|
535
|
+
for i in range(0, len(records_dict), RETRY_BATCH_SIZE):
|
536
|
+
batch = records_dict[i:i + RETRY_BATCH_SIZE]
|
537
|
+
try:
|
538
|
+
sql = f"INSERT INTO {table_name} FORMAT JSONEachRow"
|
539
|
+
settings = {
|
540
|
+
"date_time_input_format": "best_effort",
|
541
|
+
"max_insert_block_size": RETRY_BATCH_SIZE,
|
542
|
+
"max_block_size": RETRY_BATCH_SIZE,
|
543
|
+
"async_insert": 0
|
544
|
+
}
|
545
|
+
json_lines = self._to_json_each_row(batch)
|
546
|
+
client.command(sql, data=json_lines, settings=settings)
|
547
|
+
successful.extend(records[i:i + RETRY_BATCH_SIZE])
|
548
|
+
except ClickHouseError as batch_error:
|
549
|
+
for j, record_dict in enumerate(batch):
|
550
|
+
try:
|
551
|
+
sql = f"INSERT INTO {table_name} FORMAT JSONEachRow"
|
552
|
+
settings = {
|
553
|
+
"date_time_input_format": "best_effort",
|
554
|
+
"async_insert": 0
|
555
|
+
}
|
556
|
+
json_line = self._to_json_each_row([record_dict])
|
557
|
+
client.command(sql, data=json_line, settings=settings)
|
558
|
+
successful.append(records[i + j])
|
559
|
+
except ClickHouseError as error:
|
560
|
+
failed.append(FailedRecord(
|
561
|
+
record=records[i + j],
|
562
|
+
error=str(error),
|
563
|
+
index=i + j
|
564
|
+
))
|
565
|
+
return InsertResult(
|
566
|
+
successful=len(successful),
|
567
|
+
failed=len(failed),
|
568
|
+
total=len(records),
|
569
|
+
failed_records=failed if failed else None
|
570
|
+
)
|
571
|
+
|
572
|
+
def _insert_array_data(
|
573
|
+
self,
|
574
|
+
client: Client,
|
575
|
+
table_name: str,
|
576
|
+
data: List[T],
|
577
|
+
should_validate: bool,
|
578
|
+
strategy: str,
|
579
|
+
options: Optional[InsertOptions]
|
580
|
+
) -> InsertResult[T]:
|
581
|
+
"""Insert array data into the table with validation and error handling.
|
582
|
+
|
583
|
+
Args:
|
584
|
+
client: The ClickHouse client to use.
|
585
|
+
table_name: The name of the table to insert into.
|
586
|
+
data: The original data array.
|
587
|
+
should_validate: Whether validation was performed.
|
588
|
+
strategy: The error handling strategy.
|
589
|
+
options: Optional insert options.
|
590
|
+
|
591
|
+
Returns:
|
592
|
+
InsertResult with detailed success/failure information.
|
593
|
+
"""
|
594
|
+
validated_data, validation_errors = self._perform_pre_insertion_validation(
|
595
|
+
data,
|
596
|
+
should_validate,
|
597
|
+
strategy,
|
598
|
+
options
|
599
|
+
)
|
600
|
+
try:
|
601
|
+
table_name, json_lines, settings = self._prepare_insert_options(
|
602
|
+
table_name,
|
603
|
+
data,
|
604
|
+
validated_data,
|
605
|
+
False,
|
606
|
+
strategy,
|
607
|
+
options
|
608
|
+
)
|
609
|
+
sql = f"INSERT INTO {table_name} FORMAT JSONEachRow"
|
610
|
+
client.command(sql, data=json_lines, settings=settings)
|
611
|
+
return self._create_success_result(
|
612
|
+
data,
|
613
|
+
validated_data,
|
614
|
+
validation_errors,
|
615
|
+
False,
|
616
|
+
should_validate,
|
617
|
+
strategy
|
618
|
+
)
|
619
|
+
except ClickHouseError as e:
|
620
|
+
if strategy == "fail-fast":
|
621
|
+
raise ValueError(f"Insert failed: {e}")
|
622
|
+
elif strategy == "discard":
|
623
|
+
raise ValueError(f"Too many errors during insert: {e}")
|
624
|
+
else: # isolate
|
625
|
+
return self._retry_individual_records(
|
626
|
+
client,
|
627
|
+
validated_data if not options.skip_validation_on_retry else data,
|
628
|
+
options
|
629
|
+
)
|
630
|
+
|
631
|
+
def _insert_stream(
|
632
|
+
self,
|
633
|
+
client: Client,
|
634
|
+
table_name: str,
|
635
|
+
data: Iterator[T],
|
636
|
+
strategy: str,
|
637
|
+
options: Optional[InsertOptions]
|
638
|
+
) -> InsertResult[T]:
|
639
|
+
"""Insert data from an iterator into the table.
|
640
|
+
|
641
|
+
Args:
|
642
|
+
client: The ClickHouse client to use.
|
643
|
+
table_name: The name of the table to insert into.
|
644
|
+
data: An iterator that yields objects to insert.
|
645
|
+
strategy: The error handling strategy.
|
646
|
+
|
647
|
+
Returns:
|
648
|
+
InsertResult with detailed success/failure information.
|
649
|
+
"""
|
650
|
+
try:
|
651
|
+
batch = []
|
652
|
+
total_inserted = 0
|
653
|
+
|
654
|
+
_, _, settings = self._prepare_insert_options(
|
655
|
+
table_name,
|
656
|
+
data,
|
657
|
+
[],
|
658
|
+
True,
|
659
|
+
strategy,
|
660
|
+
options
|
661
|
+
)
|
662
|
+
|
663
|
+
for record in data:
|
664
|
+
# Convert record to dict using model_dump if available
|
665
|
+
if hasattr(record, 'model_dump'):
|
666
|
+
batch.append(record.model_dump())
|
667
|
+
else:
|
668
|
+
batch.append(record)
|
669
|
+
|
670
|
+
if len(batch) >= 1000: # Batch size
|
671
|
+
json_lines = self._to_json_each_row(batch)
|
672
|
+
sql = f"INSERT INTO {table_name} FORMAT JSONEachRow"
|
673
|
+
client.command(sql, data=json_lines, settings=settings)
|
674
|
+
total_inserted += len(batch)
|
675
|
+
batch = []
|
676
|
+
|
677
|
+
if batch: # Insert any remaining records
|
678
|
+
json_lines = self._to_json_each_row(batch)
|
679
|
+
sql = f"INSERT INTO {table_name} FORMAT JSONEachRow"
|
680
|
+
client.command(sql, data=json_lines, settings=settings)
|
681
|
+
total_inserted += len(batch)
|
682
|
+
|
683
|
+
return InsertResult(
|
684
|
+
successful=total_inserted,
|
685
|
+
failed=0,
|
686
|
+
total=total_inserted
|
687
|
+
)
|
688
|
+
except ClickHouseError as e:
|
689
|
+
if strategy == "fail-fast":
|
690
|
+
raise ValueError(f"Stream insert failed: {e}")
|
691
|
+
raise ValueError(f"Too many errors during stream insert: {e}")
|
692
|
+
|
693
|
+
def insert(
|
694
|
+
self,
|
695
|
+
data: Union[List[T], Iterator[T]],
|
696
|
+
options: Optional[InsertOptions] = None
|
697
|
+
) -> InsertResult[T]:
|
698
|
+
"""Insert data into the table with validation and error handling.
|
699
|
+
|
700
|
+
This method provides a typed interface for inserting data into the ClickHouse table,
|
701
|
+
with comprehensive validation and error handling strategies.
|
702
|
+
|
703
|
+
Args:
|
704
|
+
data: Either an array of objects conforming to the table schema, or an iterator
|
705
|
+
that yields objects to insert (e.g., a generator function).
|
706
|
+
options: Optional configuration for error handling, validation, and insertion behavior.
|
707
|
+
|
708
|
+
Returns:
|
709
|
+
InsertResult with detailed success/failure information.
|
710
|
+
|
711
|
+
Example:
|
712
|
+
```python
|
713
|
+
# Create an OlapTable instance
|
714
|
+
user_table = OlapTable[User]('users')
|
715
|
+
|
716
|
+
# Insert with validation
|
717
|
+
result1 = user_table.insert([
|
718
|
+
{'id': 1, 'name': 'John', 'email': 'john@example.com'},
|
719
|
+
{'id': 2, 'name': 'Jane', 'email': 'jane@example.com'}
|
720
|
+
])
|
721
|
+
|
722
|
+
# Insert with a generator (validation not available for streams)
|
723
|
+
def user_stream():
|
724
|
+
for i in range(10):
|
725
|
+
yield User(
|
726
|
+
id=i,
|
727
|
+
name=f'User {i}',
|
728
|
+
email=f'user{i}@example.com'
|
729
|
+
)
|
730
|
+
|
731
|
+
result2 = user_table.insert(user_stream(), options=InsertOptions(strategy='fail-fast'))
|
732
|
+
|
733
|
+
# Insert with validation disabled
|
734
|
+
result3 = user_table.insert(data, options=InsertOptions(validate=False))
|
735
|
+
|
736
|
+
# Insert with error handling strategies
|
737
|
+
result4 = user_table.insert(mixed_data, options=InsertOptions(
|
738
|
+
strategy='discard',
|
739
|
+
allow_errors_ratio=0.1,
|
740
|
+
validate=True
|
741
|
+
))
|
742
|
+
|
743
|
+
# Optional: Clean up connection when done
|
744
|
+
user_table.close_client()
|
745
|
+
```
|
746
|
+
"""
|
747
|
+
options = options or InsertOptions()
|
748
|
+
is_stream, strategy, should_validate = self._validate_insert_parameters(data, options)
|
749
|
+
if (is_stream and not data) or (not is_stream and not data):
|
750
|
+
return InsertResult(successful=0, failed=0, total=0)
|
751
|
+
|
752
|
+
client = self._get_memoized_client()
|
753
|
+
table_name = self._generate_table_name()
|
754
|
+
|
755
|
+
if is_stream:
|
756
|
+
return self._insert_stream(client, table_name, data, strategy, options)
|
757
|
+
else:
|
758
|
+
return self._insert_array_data(
|
759
|
+
client,
|
760
|
+
table_name,
|
761
|
+
data,
|
762
|
+
should_validate,
|
763
|
+
strategy,
|
764
|
+
options
|
765
|
+
)
|
@@ -1,9 +1,10 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: moose_lib
|
3
|
-
Version: 0.4.223
|
3
|
+
Version: 0.4.225
|
4
4
|
Home-page: https://www.fiveonefour.com/moose
|
5
5
|
Author: Fiveonefour Labs Inc.
|
6
6
|
Author-email: support@fiveonefour.com
|
7
|
+
Requires-Python: >=3.12
|
7
8
|
Description-Content-Type: text/markdown
|
8
9
|
Requires-Dist: pyjwt[crypto]==2.9.0
|
9
10
|
Requires-Dist: asyncio==3.4.3
|
@@ -12,12 +13,14 @@ Requires-Dist: temporalio==1.9.0
|
|
12
13
|
Requires-Dist: kafka-python-ng==2.2.2
|
13
14
|
Requires-Dist: redis==6.2.0
|
14
15
|
Requires-Dist: humanfriendly==10.0
|
16
|
+
Requires-Dist: clickhouse_connect==0.7.16
|
15
17
|
Dynamic: author
|
16
18
|
Dynamic: author-email
|
17
19
|
Dynamic: description
|
18
20
|
Dynamic: description-content-type
|
19
21
|
Dynamic: home-page
|
20
22
|
Dynamic: requires-dist
|
23
|
+
Dynamic: requires-python
|
21
24
|
|
22
25
|
# Python Moose Lib
|
23
26
|
|
@@ -9,13 +9,13 @@ moose_lib/query_param.py,sha256=AB5BKu610Ji-h1iYGMBZKfnEFqt85rS94kzhDwhWJnc,6288
|
|
9
9
|
moose_lib/tasks.py,sha256=6MXA0j7nhvQILAJVTQHCAsquwrSOi2zAevghAc_7kXs,1554
|
10
10
|
moose_lib/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
11
|
moose_lib/clients/redis_client.py,sha256=UBCdxwgZpIOIOy2EnPyxJIAYjw_qmNwGsJQCQ66SxUI,8117
|
12
|
-
moose_lib/dmv2/__init__.py,sha256=
|
12
|
+
moose_lib/dmv2/__init__.py,sha256=zyGK2YDN4zQ2SMn9-SAiOuhpU46STePuJjK18LEserg,2269
|
13
13
|
moose_lib/dmv2/_registry.py,sha256=agdZ7xzS99Caou60Q2pEErzEwyNYHqwy6XqV79eEmwg,504
|
14
14
|
moose_lib/dmv2/consumption.py,sha256=71wdv6ZuEi8Om7aX3Lq-d6bAoc1-iw3Wudb8dHESJKI,4072
|
15
15
|
moose_lib/dmv2/ingest_api.py,sha256=Snek9NGwaJl_BuImSWGtQq91m9D3AJ4qBoGiKZ-9yTQ,2323
|
16
16
|
moose_lib/dmv2/ingest_pipeline.py,sha256=Y1gsvHZjlW07gMapLnBRJEsoAPv7ThvLABoLmVV7BHE,6714
|
17
17
|
moose_lib/dmv2/materialized_view.py,sha256=kcx-sJFTM-cH3Uc1GoldgFGodjoz0AegAQEMmohdS38,3826
|
18
|
-
moose_lib/dmv2/olap_table.py,sha256=
|
18
|
+
moose_lib/dmv2/olap_table.py,sha256=D3qpRGMnYF0gu5FRW8E5oDBqdWMCWLRv7fWv81DURsk,28378
|
19
19
|
moose_lib/dmv2/registry.py,sha256=AaGS6Xy0vKz-wHLPgRVxfKfSwW5KksMePjZ8N7-2OKU,2054
|
20
20
|
moose_lib/dmv2/sql_resource.py,sha256=kUZoGqxhZMHMthtBZGYJBxTFjXkspXiWLXhJRYXgGUM,1864
|
21
21
|
moose_lib/dmv2/stream.py,sha256=H5nzqVHIXulFNMNaGZUQnhGjNx7fIg0X95kxAO_qlls,10600
|
@@ -28,7 +28,7 @@ tests/__init__.py,sha256=0Gh4yzPkkC3TzBGKhenpMIxJcRhyrrCfxLSfpTZnPMQ,53
|
|
28
28
|
tests/conftest.py,sha256=ZVJNbnr4DwbcqkTmePW6U01zAzE6QD0kNAEZjPG1f4s,169
|
29
29
|
tests/test_moose.py,sha256=mBsx_OYWmL8ppDzL_7Bd7xR6qf_i3-pCIO3wm2iQNaA,2136
|
30
30
|
tests/test_redis_client.py,sha256=d9_MLYsJ4ecVil_jPB2gW3Q5aWnavxmmjZg2uYI3LVo,3256
|
31
|
-
moose_lib-0.4.
|
32
|
-
moose_lib-0.4.
|
33
|
-
moose_lib-0.4.
|
34
|
-
moose_lib-0.4.
|
31
|
+
moose_lib-0.4.225.dist-info/METADATA,sha256=09ZolwkUTQivLSDh1AG7PQeqsrnjwkqGw4fLUnFPtsU,729
|
32
|
+
moose_lib-0.4.225.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
33
|
+
moose_lib-0.4.225.dist-info/top_level.txt,sha256=XEns2-4aCmGp2XjJAeEH9TAUcGONLnSLy6ycT9FSJh8,16
|
34
|
+
moose_lib-0.4.225.dist-info/RECORD,,
|
File without changes
|
File without changes
|