clickhouse-orm 3.0.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,617 @@
+ from __future__ import annotations
+
+ import sys
+ from collections import OrderedDict
+ from itertools import chain
+ from logging import getLogger
+
+ import pytz
+
+ from .engines import Distributed, Merge
+ from .fields import Field, StringField
+ from .funcs import F
+ from .query import QuerySet
+ from .utils import NO_VALUE, arg_to_sql, get_subclass_names, parse_tsv
+
+ logger = getLogger("clickhouse_orm")
+
+
+ class Constraint:
+     """
+     Defines a model constraint.
+     """
+
+     name = None  # this is set by the parent model
+     parent = None  # this is set by the parent model
+
+     def __init__(self, expr):
+         """
+         Initializer. Expects an expression that ClickHouse will verify when inserting data.
+         """
+         self.expr = expr
+
+     def create_table_sql(self):
+         """
+         Returns the SQL statement for defining this constraint during table creation.
+         """
+         return "CONSTRAINT `%s` CHECK %s" % (self.name, arg_to_sql(self.expr))
+
+
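+ # Illustrative usage sketch: a constraint is declared as a class attribute on a
+ # model, using an expression that ClickHouse checks on INSERT. The model and
+ # field names below are hypothetical.
+ #
+ #     class Visit(Model):
+ #         duration = UInt16Field()
+ #         positive_duration = Constraint(duration > 0)
+ #         engine = Memory()
+ #
+ # The resulting clause would look roughly like:
+ #     CONSTRAINT `positive_duration` CHECK greater(duration, 0)
+
+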
+ class Index:
+     """
+     Defines a data-skipping index.
+     """
+
+     name = None  # this is set by the parent model
+     parent = None  # this is set by the parent model
+
+     def __init__(self, expr, type, granularity):
+         """
+         Initializer.
+
+         - `expr` - a column, expression, or tuple of columns and expressions to index.
+         - `type` - the index type. Use one of the following methods to specify the type:
+           `Index.minmax`, `Index.set`, `Index.ngrambf_v1`, `Index.tokenbf_v1` or `Index.bloom_filter`.
+         - `granularity` - index block size (number of multiples of the `index_granularity` defined by the engine).
+         """
+         self.expr = expr
+         self.type = type
+         self.granularity = granularity
+
+     def create_table_sql(self):
+         """
+         Returns the SQL statement for defining this index during table creation.
+         """
+         return "INDEX `%s` %s TYPE %s GRANULARITY %d" % (self.name, arg_to_sql(self.expr), self.type, self.granularity)
+
+     @staticmethod
+     def minmax():
+         """
+         An index that stores the extremes of the specified expression (if the expression is a tuple, it stores
+         the extremes of each element of the tuple). The stored info is used for skipping blocks of data like the primary key.
+         """
+         return "minmax"
+
+     @staticmethod
+     def set(max_rows):
+         """
+         An index that stores unique values of the specified expression (no more than max_rows rows,
+         or unlimited if max_rows=0). Uses the values to check if the WHERE expression is not satisfiable
+         on a block of data.
+         """
+         return "set(%d)" % max_rows
+
+     @staticmethod
+     def ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed):
+         """
+         An index that stores a Bloom filter containing all ngrams from a block of data.
+         Works only with strings. Can be used to optimize `equals`, `like` and `in` expressions.
+
+         - `n` — ngram size
+         - `size_of_bloom_filter_in_bytes` — Bloom filter size in bytes (you can use large values here,
+           for example 256 or 512, because it can be compressed well).
+         - `number_of_hash_functions` — The number of hash functions used in the Bloom filter.
+         - `random_seed` — The seed for Bloom filter hash functions.
+         """
+         return "ngrambf_v1(%d, %d, %d, %d)" % (n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)
+
+     @staticmethod
+     def tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed):
+         """
+         An index that stores a Bloom filter containing string tokens. Tokens are sequences
+         separated by non-alphanumeric characters.
+
+         - `size_of_bloom_filter_in_bytes` — Bloom filter size in bytes (you can use large values here,
+           for example 256 or 512, because it can be compressed well).
+         - `number_of_hash_functions` — The number of hash functions used in the Bloom filter.
+         - `random_seed` — The seed for Bloom filter hash functions.
+         """
+         return "tokenbf_v1(%d, %d, %d)" % (size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)
+
+     @staticmethod
+     def bloom_filter(false_positive=0.025):
+         """
+         An index that stores a Bloom filter containing values of the index expression.
+
+         - `false_positive` - the probability (between 0 and 1) of receiving a false positive
+           response from the filter.
+         """
+         return "bloom_filter(%f)" % false_positive
+
+
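+ # Illustrative usage sketch: indexes are declared as class attributes on a model.
+ # The model, fields and engine parameters below are hypothetical and assume a
+ # MergeTree-family engine.
+ #
+ #     class Event(Model):
+ #         timestamp = DateTimeField()
+ #         url = StringField()
+ #         ts_range = Index(timestamp, type=Index.minmax(), granularity=1)
+ #         url_ngrams = Index(url, type=Index.ngrambf_v1(3, 256, 2, 0), granularity=4)
+ #         engine = MergeTree(order_by=('timestamp',))
+ #
+ # Each declaration becomes an `INDEX ... TYPE ... GRANULARITY ...` clause in the
+ # generated CREATE TABLE statement.
+
+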
+ class ModelBase(type):
+     """
+     A metaclass for ORM models. It collects the fields, constraints and indexes
+     declared on a model class (and its bases) into the class's metadata.
+     """
+
+     ad_hoc_model_cache = {}
+
+     def __new__(metacls, name, bases, attrs):
+         # Collect fields, constraints and indexes from parent classes
+         fields = {}
+         constraints = {}
+         indexes = {}
+         for base in bases:
+             if isinstance(base, ModelBase):
+                 fields.update(base._fields)
+                 constraints.update(base._constraints)
+                 indexes.update(base._indexes)
+
+         # Add fields, constraints and indexes from this class
+         for n, obj in attrs.items():
+             if isinstance(obj, Field):
+                 fields[n] = obj
+             elif isinstance(obj, Constraint):
+                 constraints[n] = obj
+             elif isinstance(obj, Index):
+                 indexes[n] = obj
+
+         # Convert fields to a list of (name, field) tuples, in the order they were listed in the class
+         fields = sorted(fields.items(), key=lambda item: item[1].creation_counter)
+
+         # Build a dictionary of default values
+         defaults = {}
+         has_funcs_as_defaults = False
+         for n, f in fields:
+             if f.alias or f.materialized:
+                 defaults[n] = NO_VALUE
+             elif isinstance(f.default, F):
+                 defaults[n] = NO_VALUE
+                 has_funcs_as_defaults = True
+             else:
+                 defaults[n] = f.to_python(f.default, pytz.UTC)
+
+         # Create the model class
+         attrs = dict(
+             attrs,
+             _fields=OrderedDict(fields),
+             _constraints=constraints,
+             _indexes=indexes,
+             _writable_fields=OrderedDict([f for f in fields if not f[1].readonly]),
+             _defaults=defaults,
+             _has_funcs_as_defaults=has_funcs_as_defaults,
+         )
+         model = super().__new__(metacls, str(name), bases, attrs)
+
+         # Let each field, constraint and index know its parent and its own name
+         for n, obj in chain(fields, constraints.items(), indexes.items()):
+             obj.parent = model
+             obj.name = n
+
+         return model
+
+     @classmethod
+     def create_ad_hoc_model(cls, fields, model_name="AdHocModel"):
+         # fields is a list of (name, db_type) tuples
+         # Check if the model already exists in the cache
+         fields = list(fields)
+         cache_key = model_name + " " + str(fields)
+         if cache_key in cls.ad_hoc_model_cache:
+             return cls.ad_hoc_model_cache[cache_key]
+         # Create an ad hoc model class
+         attrs = {}
+         for name, db_type in fields:
+             attrs[name] = cls.create_ad_hoc_field(db_type)
+         model_class = cls.__new__(cls, model_name, (Model,), attrs)
+         # Add the model class to the cache
+         cls.ad_hoc_model_cache[cache_key] = model_class
+         return model_class
+
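+     # Illustrative sketch: building a model class on the fly from (name, db_type)
+     # pairs, as is done when reading query results whose schema is only known at
+     # runtime. The column names below are hypothetical.
+     #
+     #     RowModel = ModelBase.create_ad_hoc_model(
+     #         [("event_id", "UInt64"), ("tags", "Array(String)")],
+     #         model_name="EventRow",
+     #     )
+     #     row = RowModel(event_id=1, tags=["a", "b"])
+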
+     @classmethod
+     def create_ad_hoc_field(cls, db_type):
+         import clickhouse_orm.fields as orm_fields
+
+         # Enums
+         if db_type.startswith("Enum"):
+             return orm_fields.BaseEnumField.create_ad_hoc_field(db_type)
+         # DateTime with timezone
+         if db_type.startswith("DateTime("):
+             timezone = db_type[9:-1]
+             return orm_fields.DateTimeField(timezone=timezone[1:-1] if timezone else None)
+         # DateTime64
+         if db_type.startswith("DateTime64("):
+             precision, *timezone = [s.strip() for s in db_type[11:-1].split(",")]
+             return orm_fields.DateTime64Field(
+                 precision=int(precision), timezone=timezone[0][1:-1] if timezone else None
+             )
+         # Arrays
+         if db_type.startswith("Array"):
+             inner_field = cls.create_ad_hoc_field(db_type[6:-1])
+             return orm_fields.ArrayField(inner_field)
+         # Tuples (poor man's version - convert to array)
+         if db_type.startswith("Tuple"):
+             types = [s.strip() for s in db_type[6:-1].split(",")]
+             # newer versions are essentially "named tuples"
+             if any(" " in t for t in types):
+                 assert all(" " in t for t in types), "Either all or none of the tuple types must be named - " + db_type
+                 types = [t.split(" ", 1)[1] for t in types]
+
+             assert len(set(types)) == 1, "No support for mixed types in tuples - " + db_type
+             inner_field = cls.create_ad_hoc_field(types[0])
+             return orm_fields.ArrayField(inner_field)
+         # FixedString
+         if db_type.startswith("FixedString"):
+             length = int(db_type[12:-1])
+             return orm_fields.FixedStringField(length)
+         # Decimal / Decimal32 / Decimal64 / Decimal128
+         if db_type.startswith("Decimal"):
+             p = db_type.index("(")
+             args = [int(n.strip()) for n in db_type[p + 1 : -1].split(",")]
+             field_class = getattr(orm_fields, db_type[:p] + "Field")
+             return field_class(*args)
+         # Nullable
+         if db_type.startswith("Nullable"):
+             inner_field = cls.create_ad_hoc_field(db_type[9:-1])
+             return orm_fields.NullableField(inner_field)
+         # LowCardinality
+         if db_type.startswith("LowCardinality"):
+             inner_field = cls.create_ad_hoc_field(db_type[15:-1])
+             return orm_fields.LowCardinalityField(inner_field)
+         # Simple fields
+         name = db_type + "Field"
+         if not hasattr(orm_fields, name):
+             raise NotImplementedError("No field class for %s" % db_type)
+         return getattr(orm_fields, name)()
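+
+     # A minimal sketch of how database type strings map to field instances
+     # (doctest-style; reprs abbreviated):
+     #
+     #     >>> ModelBase.create_ad_hoc_field("UInt32")
+     #     <clickhouse_orm.fields.UInt32Field ...>
+     #     >>> ModelBase.create_ad_hoc_field("Nullable(String)")
+     #     <clickhouse_orm.fields.NullableField ...>
+     #     >>> ModelBase.create_ad_hoc_field("Array(DateTime('UTC'))")
+     #     <clickhouse_orm.fields.ArrayField ...>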
+
+
+ class Model(metaclass=ModelBase):
+     """
+     A base class for ORM models. Each model class represents a ClickHouse table. For example:
+
+         class CPUStats(Model):
+             timestamp = DateTimeField()
+             cpu_id = UInt16Field()
+             cpu_percent = Float32Field()
+             engine = Memory()
+     """
+
+     engine = None
+
+     # Insert operations are restricted for read only models
+     _readonly = False
+
+     # Create table, drop table, insert operations are restricted for system models
+     _system = False
+
+     _database = None
+
+     def __init__(self, **kwargs):
+         """
+         Creates a model instance, using keyword arguments as field values.
+         Since values are immediately converted to their Pythonic type,
+         invalid values will cause a `ValueError` to be raised.
+         Unrecognized field names will cause an `AttributeError`.
+         """
+         super().__init__()
+         # Assign default values
+         self.__dict__.update(self._defaults)
+         # Assign field values from keyword arguments
+         for name, value in kwargs.items():
+             field = self.get_field(name)
+             if field:
+                 setattr(self, name, value)
+             else:
+                 raise AttributeError("%s does not have a field called %s" % (self.__class__.__name__, name))
+
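+     # A minimal construction sketch (hypothetical values, reusing the CPUStats
+     # example from the class docstring):
+     #
+     #     stats = CPUStats(timestamp=datetime.utcnow(), cpu_id=0, cpu_percent=23.5)
+     #     stats.cpu_percent                  # 23.5, already converted and validated
+     #     CPUStats(cpu_id="not a number")    # raises ValueError
+     #     CPUStats(no_such_field=1)          # raises AttributeError
+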
+     def __setattr__(self, name, value):
+         """
+         When setting a field value, converts the value to its Pythonic type and validates it.
+         This may raise a `ValueError`.
+         """
+         field = self.get_field(name)
+         if field and (value != NO_VALUE):
+             try:
+                 value = field.to_python(value, pytz.utc)
+                 field.validate(value)
+             except ValueError:
+                 tp, v, tb = sys.exc_info()
+                 new_msg = f"{v} (field '{name}')"
+                 raise tp.with_traceback(tp(new_msg), tb)
+         super().__setattr__(name, value)
+
+     def set_database(self, db):
+         """
+         Sets the `Database` that this model instance belongs to.
+         This is done automatically when the instance is read from the database or written to it.
+         """
+         # This can not be imported globally due to circular import
+         from .database import Database
+
+         assert isinstance(db, Database), "database must be database.Database instance"
+         self._database = db
+
+     def get_database(self):
+         """
+         Gets the `Database` that this model instance belongs to.
+         Returns `None` unless the instance was read from the database or written to it.
+         """
+         return self._database
+
+     def get_field(self, name):
+         """
+         Gets a `Field` instance given its name, or `None` if not found.
+         """
+         return self._fields.get(name)
+
+     @classmethod
+     def table_name(cls):
+         """
+         Returns the model's database table name. By default this is the
+         class name converted to lowercase. Override this if you want to use
+         a different table name.
+         """
+         return cls.__name__.lower()
+
+     @classmethod
+     def has_funcs_as_defaults(cls):
+         """
+         Returns True if some of the model's fields use a function expression
+         as a default value. This requires special handling when inserting instances.
+         """
+         return cls._has_funcs_as_defaults
+
+     @classmethod
+     def create_table_sql(cls, db):
+         """
+         Returns the SQL statement for creating a table for this model.
+         """
+         parts = ["CREATE TABLE IF NOT EXISTS `%s`.`%s` (" % (db.db_name, cls.table_name())]
+         # Fields
+         items = []
+         for name, field in cls.fields().items():
+             items.append(" %s %s" % (name, field.get_sql(db=db)))
+         # Constraints
+         for c in cls._constraints.values():
+             items.append(" %s" % c.create_table_sql())
+         # Indexes
+         for i in cls._indexes.values():
+             items.append(" %s" % i.create_table_sql())
+         parts.append(",\n".join(items))
+         parts.append(")")
+         # Engine
+         parts.append("ENGINE = " + cls.engine.create_table_sql(db))
+         return "\n".join(parts)
+
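+     # For the CPUStats example above, the generated DDL has roughly this shape
+     # (a sketch, not exact output; types come from the field definitions):
+     #
+     #     CREATE TABLE IF NOT EXISTS `db_name`.`cpustats` (
+     #      timestamp DateTime,
+     #      cpu_id UInt16,
+     #      cpu_percent Float32
+     #     )
+     #     ENGINE = Memory
+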
+     @classmethod
+     def drop_table_sql(cls, db):
+         """
+         Returns the SQL command for deleting this model's table.
+         """
+         return "DROP TABLE IF EXISTS `%s`.`%s`" % (db.db_name, cls.table_name())
+
+     @classmethod
+     def from_tsv(cls, line, field_names, timezone_in_use=pytz.utc, database=None):
+         """
+         Create a model instance from a tab-separated line. The line may or may not include a newline.
+         The `field_names` list must match the fields defined in the model, but does not have to include all of them.
+
+         - `line`: the TSV-formatted data.
+         - `field_names`: names of the model fields in the data.
+         - `timezone_in_use`: the timezone to use when parsing dates and datetimes. Some fields use their own timezones.
+         - `database`: if given, sets the database that this instance belongs to.
+         """
+         values = iter(parse_tsv(line))
+         kwargs = {}
+         for name in field_names:
+             field = getattr(cls, name)
+             field_timezone = getattr(field, "timezone", None) or timezone_in_use
+             kwargs[name] = field.to_python(next(values), field_timezone)
+
+         obj = cls(**kwargs)
+         if database is not None:
+             obj.set_database(database)
+
+         return obj
+
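+     # A minimal parsing sketch (hypothetical values; the field order must match
+     # `field_names`):
+     #
+     #     line = "2024-01-01 00:00:00\t3\t42.5"
+     #     stats = CPUStats.from_tsv(line, ["timestamp", "cpu_id", "cpu_percent"])
+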
+     def to_tsv(self, include_readonly=True):
+         """
+         Returns the instance's column values as a tab-separated line. A newline is not included.
+
+         - `include_readonly`: if false, returns only fields that can be inserted into the database.
+         """
+         data = self.__dict__
+         fields = self.fields(writable=not include_readonly)
+         return "\t".join(field.to_db_string(data[name], quote=False) for name, field in fields.items())
+
+     def to_tskv(self, include_readonly=True):
+         """
+         Returns the instance's column keys and values as a tab-separated line. A newline is not included.
+         Fields that were not assigned a value are omitted.
+
+         - `include_readonly`: if false, returns only fields that can be inserted into the database.
+         """
+         data = self.__dict__
+         fields = self.fields(writable=not include_readonly)
+         parts = []
+         for name, field in fields.items():
+             if data[name] != NO_VALUE:
+                 parts.append(name + "=" + field.to_db_string(data[name], quote=False))
+         return "\t".join(parts)
+
+     def to_db_string(self):
+         """
+         Returns the instance as a bytestring ready to be inserted into the database.
+         """
+         s = self.to_tskv(False) if self._has_funcs_as_defaults else self.to_tsv(False)
+         s += "\n"
+         return s.encode("utf-8")
+
+     def to_dict(self, include_readonly=True, field_names=None):
+         """
+         Returns the instance's column values as a dict.
+
+         - `include_readonly`: if false, returns only fields that can be inserted into the database.
+         - `field_names`: an iterable of field names to return (optional).
+         """
+         fields = self.fields(writable=not include_readonly)
+
+         if field_names is not None:
+             fields = [f for f in fields if f in field_names]
+
+         data = self.__dict__
+         return {name: data[name] for name in fields}
+
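+     # A minimal serialization sketch (hypothetical values):
+     #
+     #     stats.to_dict()                        # {'timestamp': ..., 'cpu_id': 0, 'cpu_percent': 23.5}
+     #     stats.to_dict(field_names=['cpu_id'])  # {'cpu_id': 0}
+     #     stats.to_tsv()                         # the same values, tab-separated, in DB string form
+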
+     @classmethod
+     def objects_in(cls, database):
+         """
+         Returns a `QuerySet` for selecting instances of this model class.
+         """
+         return QuerySet(cls, database)
+
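+     # A minimal query sketch, assuming a connected `Database` instance named `db`
+     # (fields reuse the CPUStats example):
+     #
+     #     qs = CPUStats.objects_in(db).filter(CPUStats.cpu_id == 0)
+     #     for row in qs:
+     #         print(row.cpu_percent)
+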
+     @classmethod
+     def fields(cls, writable=False):
+         """
+         Returns an `OrderedDict` of the model's fields (from name to `Field` instance).
+         If `writable` is true, only writable fields are included.
+         Callers should not modify the dictionary.
+         """
+         # noinspection PyProtectedMember,PyUnresolvedReferences
+         return cls._writable_fields if writable else cls._fields
+
+     @classmethod
+     def is_read_only(cls):
+         """
+         Returns true if the model is marked as read only.
+         """
+         return cls._readonly
+
+     @classmethod
+     def is_system_model(cls):
+         """
+         Returns true if the model represents a system table.
+         """
+         return cls._system
+
+
+ class BufferModel(Model):
+     @classmethod
+     def create_table_sql(cls, db):
+         """
+         Returns the SQL statement for creating a table for this model.
+         """
+         parts = [
+             "CREATE TABLE IF NOT EXISTS `%s`.`%s` AS `%s`.`%s`"
+             % (db.db_name, cls.table_name(), db.db_name, cls.engine.main_model.table_name())
+         ]
+         engine_str = cls.engine.create_table_sql(db)
+         parts.append(engine_str)
+         return " ".join(parts)
+
+
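+ # Illustrative usage sketch: a BufferModel inherits the fields of a "main" model
+ # and pairs them with a Buffer engine that flushes into the main table. CPUStats
+ # is the hypothetical model from the Model docstring above.
+ #
+ #     class CPUStatsBuffer(BufferModel, CPUStats):
+ #         engine = Buffer(CPUStats)
+ #
+ # create_table_sql() then emits a statement of the form
+ # CREATE TABLE ... AS `db`.`cpustats` ENGINE = Buffer(...).
+
+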
+ class MergeModel(Model):
+     """
+     Model for the Merge engine.
+     Predefines the virtual `_table` column, and ensures that rows cannot be inserted into this table type.
+     https://clickhouse.tech/docs/en/single/index.html#document-table_engines/merge
+     """
+
+     _readonly = True
+
+     # Virtual fields can't be inserted into the database
+     _table = StringField(readonly=True)
+
+     @classmethod
+     def create_table_sql(cls, db):
+         """
+         Returns the SQL statement for creating a table for this model.
+         """
+         assert isinstance(cls.engine, Merge), "engine must be an instance of engines.Merge"
+         parts = ["CREATE TABLE IF NOT EXISTS `%s`.`%s` (" % (db.db_name, cls.table_name())]
+         cols = []
+         for name, field in cls.fields().items():
+             if name != "_table":
+                 cols.append(" %s %s" % (name, field.get_sql(db=db)))
+         parts.append(",\n".join(cols))
+         parts.append(")")
+         parts.append("ENGINE = " + cls.engine.create_table_sql(db))
+         return "\n".join(parts)
+
+
+ # TODO: base class for models that require a specific engine
+
+
+ class DistributedModel(Model):
+     """
+     Model class for use with a `Distributed` engine.
+     """
+
+     def set_database(self, db):
+         """
+         Sets the `Database` that this model instance belongs to.
+         This is done automatically when the instance is read from the database or written to it.
+         """
+         assert isinstance(self.engine, Distributed), "engine must be an instance of engines.Distributed"
+         res = super().set_database(db)
+         return res
+
+     @classmethod
+     def fix_engine_table(cls):
+         """
+         Remember: a Distributed table does not store any data, it just provides distributed access to it.
+
+         So if we define a model whose engine has no table defined for data storage
+         (see FooDistributed below), that table cannot be successfully created.
+         This routine can automatically fix the engine's storage table by finding the first
+         non-distributed model among your model's superclasses.
+
+         >>> class Foo(Model):
+         ...     id = UInt8Field(1)
+         ...
+         >>> class FooDistributed(Foo, DistributedModel):
+         ...     engine = Distributed('my_cluster')
+         ...
+         >>> FooDistributed.engine.table
+         None
+         >>> FooDistributed.fix_engine_table()
+         >>> FooDistributed.engine.table
+         <class '__main__.Foo'>
+
+         However, if you prefer a more explicit way of doing things,
+         you can always mention the Foo model twice without bothering with any fixes:
+
+         >>> class FooDistributedVerbose(Foo, DistributedModel):
+         ...     engine = Distributed('my_cluster', Foo)
+         >>> FooDistributedVerbose.engine.table
+         <class '__main__.Foo'>
+
+         See tests.test_engines:DistributedTestCase for more examples
+         """
+
+         # apply only when the engine has no table defined
+         if cls.engine.table_name:
+             return
+
+         # find all the superclasses of the Model that store any data
+         storage_models = [b for b in cls.__bases__ if issubclass(b, Model) and not issubclass(b, DistributedModel)]
+         if not storage_models:
+             raise TypeError(
+                 "When defining Distributed engine without the table_name ensure that your model has a parent model"
+             )
+
+         if len(storage_models) > 1:
+             raise TypeError(
+                 "When defining Distributed engine without the table_name "
+                 "ensure that your model has exactly one non-distributed superclass"
+             )
+
+         # enable correct SQL for the engine
+         cls.engine.table = storage_models[0]
+
+     @classmethod
+     def create_table_sql(cls, db):
+         """
+         Returns the SQL statement for creating a table for this model.
+         """
+         assert isinstance(cls.engine, Distributed), "engine must be engines.Distributed instance"
+
+         cls.fix_engine_table()
+
+         parts = [
+             f"CREATE TABLE IF NOT EXISTS `{db.db_name}`.`{cls.table_name()}` AS `{db.db_name}`.`{cls.engine.table_name}`",
+             "ENGINE = " + cls.engine.create_table_sql(db),
+         ]
+         return "\n".join(parts)
+
+
+ # Expose only relevant classes in import *
+ __all__ = get_subclass_names(locals(), (Model, Constraint, Index))