clickhouse-orm 3.0.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ from __future__ import annotations
2
+
3
+ from inspect import isclass
4
+
5
+ from .database import * # noqa: F401, F403
6
+ from .engines import * # noqa: F401, F403
7
+ from .fields import * # noqa: F401, F403
8
+ from .funcs import * # noqa: F401, F403
9
+ from .migrations import * # noqa: F401, F403
10
+ from .models import * # noqa: F401, F403
11
+ from .query import * # noqa: F401, F403
12
+ from .system_models import * # noqa: F401, F403
13
+
14
# Export the name of every class that is present in this namespace once the
# star-imports above have run.
__all__ = [obj.__name__ for obj in filter(isclass, locals().values())]
@@ -0,0 +1,457 @@
1
+ from __future__ import annotations
2
+
3
+ import datetime
4
+ import logging
5
+ import re
6
+ from math import ceil
7
+ from string import Template
8
+
9
+ import pytz
10
+ import requests
11
+
12
+ from .models import ModelBase
13
+ from .utils import Page, import_submodules, parse_tsv
14
+
15
# Module-level logger, registered under the name "clickhouse_orm".
logger = logging.getLogger("clickhouse_orm")
16
+
17
+
18
class DatabaseException(Exception):  # noqa: N818
    """Base error signalling that a database operation could not be completed."""
22
+
23
+
24
class ServerError(DatabaseException):
    """
    Raised when a server returns an error.

    Attributes set by the constructor:

    - `code`: the numeric error code extracted from the server message
      (`0` when the message matches none of `ERROR_PATTERNS`).
    - `message`: the extracted error text, or the full message when it
      could not be parsed.
    """

    def __init__(self, message):
        """
        Parses `message` into `code` and `message` attributes.

        - `message`: the raw error text returned by the server.
        """
        self.code = None
        processed = self.get_error_code_msg(message)
        if processed:
            self.code, self.message = processed
        else:
            # get_error_code_msg() in this class always returns a tuple, so this
            # branch is only reachable when an override returns a falsy value.
            self.message = message
        super().__init__(message)

    ERROR_PATTERNS = (
        # ClickHouse prior to v19.3.3
        re.compile(
            r"""
            Code:\ (?P<code>\d+),
            \ e\.displayText\(\)\ =\ (?P<type1>[^ \n]+):\ (?P<msg>.+?),
            \ e.what\(\)\ =\ (?P<type2>[^ \n]+)
            """,
            re.VERBOSE | re.DOTALL,
        ),
        # ClickHouse v19.3.3+
        re.compile(
            r"""
            Code:\ (?P<code>\d+),
            \ e\.displayText\(\)\ =\ (?P<type1>[^ \n]+):\ (?P<msg>.+)
            """,
            re.VERBOSE | re.DOTALL,
        ),
        # ClickHouse v21+
        re.compile(
            r"""
            Code:\ (?P<code>\d+).
            \ (?P<type1>[^ \n]+):\ (?P<msg>.+)
            """,
            re.VERBOSE | re.DOTALL,
        ),
    )

    @classmethod
    def get_error_code_msg(cls, full_error_message):
        """
        Extract the code and message of the exception that clickhouse-server generated.

        The patterns in `ERROR_PATTERNS` are tried in order and the first match wins.
        Returns a `(code, message)` tuple; when no pattern matches, the result is
        `(0, full_error_message)`.

        See the list of error codes here:
        https://github.com/yandex/ClickHouse/blob/master/dbms/src/Common/ErrorCodes.cpp
        """
        for pattern in cls.ERROR_PATTERNS:
            match = pattern.match(full_error_message)
            if match:
                return int(match.group("code")), match.group("msg").strip()

        return 0, full_error_message

    def __str__(self):
        """
        Returns the message followed by the error code in parentheses, or just
        the message when no code is available.
        """
        if self.code is None:
            # __str__ must always return a string; falling through would return
            # None and make str(exc) raise a TypeError.
            return str(self.message)
        return f"{self.message} ({self.code})"
87
+
88
+
89
class Database:
    """
    Database instances connect to a specific ClickHouse database for running queries,
    inserting data and other operations.

    All statements are sent as HTTP POST requests through a `requests.Session`.
    """

    _default_url = "http://localhost:8123/"

    def __init__(
        self,
        db_name,
        db_url=None,
        username=None,
        password=None,
        readonly=False,
        autocreate=True,
        timeout=60,
        verify_ssl_cert=True,
        log_statements=False,
    ):
        """
        Initializes a database instance. Unless it's readonly, the database will be
        created on the ClickHouse server if it does not already exist.

        - `db_name`: name of the database to connect to.
        - `db_url`: URL of the ClickHouse server.
        - `username`: optional connection credentials.
        - `password`: optional connection credentials.
        - `readonly`: use a read-only connection.
        - `autocreate`: automatically create the database if it does not exist (unless in readonly mode).
        - `timeout`: the connection timeout in seconds.
        - `verify_ssl_cert`: whether to verify the server's certificate when connecting via HTTPS.
        - `log_statements`: when True, all database statements are logged.

        Raises `DatabaseException` when `readonly` is requested but the database
        does not exist.
        """
        self.db_name = db_name
        self.db_url = db_url or self._default_url
        # Kept False while the checks below run, so that the probing queries
        # are sent without the readonly flag (see _build_params).
        self.readonly = False
        # Always defined, so the attribute exists on non-readonly instances too.
        self.connection_readonly = False
        self.timeout = timeout
        self.request_session = requests.Session()
        self.request_session.verify = verify_ssl_cert
        if username:
            self.request_session.auth = (username, password or "")
        self.log_statements = log_statements
        self.settings = {}
        self.db_exists = False  # this is required before running _is_existing_database
        self.db_exists = self._is_existing_database()
        if readonly:
            if not self.db_exists:
                raise DatabaseException("Database does not exist, and cannot be created under readonly connection")
            self.connection_readonly = self._is_connection_readonly()
            self.readonly = True
        elif autocreate and not self.db_exists:
            self.create_database()
        self.server_version = self._get_server_version()
        # Versions 1.1.53981 and below don't have timezone function
        self.server_timezone = self._get_server_timezone() if self.server_version > (1, 1, 53981) else pytz.utc
        # Versions 19.1.16 and above support codec compression
        self.has_codec_support = self.server_version >= (19, 1, 16)
        # Version 19.0 and above support LowCardinality
        self.has_low_cardinality_support = self.server_version >= (19, 0)

    def create_database(self):
        """
        Creates the database on the ClickHouse server if it does not already exist.
        """
        self._send("CREATE DATABASE IF NOT EXISTS `%s`" % self.db_name)
        self.db_exists = True

    def drop_database(self):
        """
        Deletes the database on the ClickHouse server.
        """
        self._send("DROP DATABASE `%s`" % self.db_name)
        self.db_exists = False

    def create_table(self, model_class):
        """
        Creates a table for the given model class, if it does not exist already.

        Raises `DatabaseException` for system models and for models without an engine.
        """
        if model_class.is_system_model():
            raise DatabaseException("You can't create system table")
        if model_class.engine is None:
            raise DatabaseException("%s class must define an engine" % model_class.__name__)
        self._send(model_class.create_table_sql(self))

    def drop_table(self, model_class):
        """
        Drops the database table of the given model class, if it exists.

        Raises `DatabaseException` for system models.
        """
        if model_class.is_system_model():
            raise DatabaseException("You can't drop system table")
        self._send(model_class.drop_table_sql(self))

    def does_table_exist(self, model_class):
        """
        Checks whether a table for the given model class already exists.
        Note that this only checks for existence of a table with the expected name.
        """
        sql = "SELECT count() FROM system.tables WHERE database = '%s' AND name = '%s'"
        r = self._send(sql % (self.db_name, model_class.table_name()))
        return r.text.strip() == "1"

    def get_model_for_table(self, table_name, system_table=False):
        """
        Generates a model class from an existing table in the database.
        This can be used for querying tables which don't have a corresponding model class,
        for example system tables.

        - `table_name`: the table to create a model for
        - `system_table`: whether the table is a system table, or belongs to the current database
        """
        db_name = "system" if system_table else self.db_name
        sql = "DESCRIBE `%s`.`%s` FORMAT TSV" % (db_name, table_name)
        lines = self._send(sql).iter_lines()
        # Only the first two columns of each DESCRIBE row are used.
        fields = [parse_tsv(line)[:2] for line in lines]
        model = ModelBase.create_ad_hoc_model(fields, table_name)
        if system_table:
            model._system = model._readonly = True
        return model

    def add_setting(self, name, value):
        """
        Adds a database setting that will be sent with every request.
        For example, `db.add_setting("max_execution_time", 10)` will
        limit query execution time to 10 seconds.
        The name must be string, and the value is converted to string in case
        it isn't. To remove a setting, pass `None` as the value.
        """
        assert isinstance(name, str), "Setting name must be a string"
        if value is None:
            self.settings.pop(name, None)
        else:
            self.settings[name] = str(value)

    def insert(self, model_instances, batch_size=1000):
        """
        Insert records into the database.

        - `model_instances`: any iterable containing instances of a single model class.
        - `batch_size`: number of records to send per chunk (use a lower number if your records are very large).

        Does nothing when `model_instances` is empty. Raises `DatabaseException`
        when the first instance belongs to a read-only or system model.
        """
        from io import BytesIO

        i = iter(model_instances)
        try:
            first_instance = next(i)
        except StopIteration:
            return  # model_instances is empty
        model_class = first_instance.__class__

        if first_instance.is_read_only() or first_instance.is_system_model():
            raise DatabaseException("You can't insert into read only and system tables")

        fields_list = ",".join(["`%s`" % name for name in first_instance.fields(writable=True)])
        fmt = "TSKV" if model_class.has_funcs_as_defaults() else "TabSeparated"
        query = "INSERT INTO $table (%s) FORMAT %s\n" % (fields_list, fmt)

        def gen():
            # Yields the request body in chunks; the first chunk starts with the
            # INSERT statement followed by the first record.
            buf = BytesIO()
            buf.write(self._substitute(query, model_class).encode("utf-8"))
            first_instance.set_database(self)
            buf.write(first_instance.to_db_string())
            # Collect lines in batches of batch_size
            # NOTE(review): the counter starts at 2, so the first chunk holds
            # batch_size - 1 records - presumably the query line is counted too.
            lines = 2
            for instance in i:
                instance.set_database(self)
                buf.write(instance.to_db_string())
                lines += 1
                if lines >= batch_size:
                    # Return the current batch of lines
                    yield buf.getvalue()
                    # Start a new batch
                    buf = BytesIO()
                    lines = 0
            # Return any remaining lines in partial batch
            if lines:
                yield buf.getvalue()

        self._send(gen())

    def count(self, model_class, conditions=None):
        """
        Counts the number of records in the model's table.

        - `model_class`: the model to count.
        - `conditions`: optional SQL conditions (contents of the WHERE clause).

        Returns 0 when the server response is empty.
        """
        from clickhouse_orm.query import Q

        query = "SELECT count() FROM $table"
        if conditions:
            if isinstance(conditions, Q):
                conditions = conditions.to_sql(model_class)
            query += " WHERE " + str(conditions)
        query = self._substitute(query, model_class)
        r = self._send(query)
        return int(r.text) if r.text else 0

    def select(self, query, model_class=None, settings=None):
        """
        Performs a query and returns a generator of model instances.

        - `query`: the SQL query to execute.
        - `model_class`: the model class matching the query's table,
          or `None` for getting back instances of an ad-hoc model.
        - `settings`: query settings to send as HTTP GET parameters

        Nothing is yielded when the response lacks the two header lines
        (field names and field types).
        """
        query += " FORMAT TabSeparatedWithNamesAndTypes"
        query = self._substitute(query, model_class)
        r = self._send(query, settings, True)
        lines = r.iter_lines()
        try:
            names_line = next(lines)
            types_line = next(lines)
        except StopIteration:
            # A StopIteration escaping a generator body is turned into a
            # RuntimeError (PEP 479), so end the generator explicitly instead.
            return
        field_names = parse_tsv(names_line)
        field_types = parse_tsv(types_line)
        model_class = model_class or ModelBase.create_ad_hoc_model(zip(field_names, field_types))
        for line in lines:
            # skip blank line left by WITH TOTALS modifier
            if line:
                yield model_class.from_tsv(line, field_names, self.server_timezone, self)

    def raw(self, query, settings=None, stream=False):
        """
        Performs a query and returns its output as text.

        - `query`: the SQL query to execute.
        - `settings`: query settings to send as HTTP GET parameters
        - `stream`: if true, the HTTP response from ClickHouse will be streamed.
        """
        query = self._substitute(query, None)
        return self._send(query, settings=settings, stream=stream).text

    def paginate(self, model_class, order_by, page_num=1, page_size=100, conditions=None, settings=None):
        """
        Selects records and returns a single page of model instances.

        - `model_class`: the model class matching the query's table,
          or `None` for getting back instances of an ad-hoc model.
        - `order_by`: columns to use for sorting the query (contents of the ORDER BY clause).
        - `page_num`: the page number (1-based), or -1 to get the last page.
        - `page_size`: number of records to return per page.
        - `conditions`: optional SQL conditions (contents of the WHERE clause).
        - `settings`: query settings to send as HTTP GET parameters

        The result is a namedtuple containing `objects` (list), `number_of_objects`,
        `pages_total`, `number` (of the current page), and `page_size`.

        Raises `ValueError` when `page_num` is below 1 and is not -1.
        """
        from clickhouse_orm.query import Q

        count = self.count(model_class, conditions)
        pages_total = int(ceil(count / float(page_size)))
        if page_num == -1:
            # The last page; page 1 when there are no records at all.
            page_num = max(pages_total, 1)
        elif page_num < 1:
            raise ValueError("Invalid page number: %d" % page_num)
        offset = (page_num - 1) * page_size
        query = "SELECT * FROM $table"
        if conditions:
            if isinstance(conditions, Q):
                conditions = conditions.to_sql(model_class)
            query += " WHERE " + str(conditions)
        query += " ORDER BY %s" % order_by
        query += " LIMIT %d, %d" % (offset, page_size)
        query = self._substitute(query, model_class)
        return Page(
            # The SELECT is skipped entirely when the count is zero.
            objects=list(self.select(query, model_class, settings)) if count else [],
            number_of_objects=count,
            pages_total=pages_total,
            number=page_num,
            page_size=page_size,
        )

    def migrate(self, migrations_package_name, up_to=9999):
        """
        Executes schema migrations.

        - `migrations_package_name` - fully qualified name of the Python package
          containing the migrations.
        - `up_to` - number of the last migration to apply.

        Unapplied migration modules are applied in sorted name order; each one is
        recorded in the migration history table after its operations have run.
        """
        from .migrations import MigrationHistory

        migration_logger = logging.getLogger("migrations")
        applied_migrations = self._get_applied_migrations(migrations_package_name)
        modules = import_submodules(migrations_package_name)
        unapplied_migrations = set(modules.keys()) - applied_migrations
        for name in sorted(unapplied_migrations):
            migration_logger.info("Applying migration %s...", name)
            for operation in modules[name].operations:
                operation.apply(self)
            self.insert(
                [
                    MigrationHistory(
                        package_name=migrations_package_name, module_name=name, applied=datetime.date.today()
                    )
                ]
            )
            # The first four characters of the module name are read as the migration number.
            if int(name[:4]) >= up_to:
                break

    def _get_applied_migrations(self, migrations_package_name):
        """
        Returns the set of module names recorded in the migration history table
        for the given package. The history table is created first if needed.
        """
        from .migrations import MigrationHistory

        self.create_table(MigrationHistory)
        query = "SELECT module_name from $table WHERE package_name = '%s'" % migrations_package_name
        query = self._substitute(query, MigrationHistory)
        return {obj.module_name for obj in self.select(query)}

    def _send(self, data, settings=None, stream=False):
        """
        Posts `data` to the server and returns the HTTP response.

        - `data`: the request body; a `str` is encoded as UTF-8, anything else is passed on as is.
        - `settings`: optional per-request settings, sent as URL parameters.
        - `stream`: passed on to `requests` to stream the response.

        Raises `ServerError` when the response status code is not 200.
        """
        if isinstance(data, str):
            data = data.encode("utf-8")
        if self.log_statements:
            logger.info(data)
        params = self._build_params(settings)
        r = self.request_session.post(self.db_url, params=params, data=data, stream=stream, timeout=self.timeout)
        if r.status_code != 200:
            raise ServerError(r.text)
        return r

    def _build_params(self, settings):
        """
        Builds the URL parameters for a request: the given per-request `settings`,
        overridden by the instance-wide settings, plus `database` and `readonly`
        where applicable.
        """
        params = dict(settings or {})
        params.update(self.settings)
        if self.db_exists:
            params["database"] = self.db_name
        # Send the readonly flag, unless the connection is already readonly (to prevent db error)
        if self.readonly and not self.connection_readonly:
            params["readonly"] = "1"
        return params

    def _substitute(self, query, model_class=None):
        """
        Replaces $db and $table placeholders in the query.

        `$table` is only replaced when a `model_class` is given; unknown
        placeholders are left untouched.
        """
        if "$" in query:
            mapping = dict(db="`%s`" % self.db_name)
            if model_class:
                if model_class.is_system_model():
                    mapping["table"] = "`system`.`%s`" % model_class.table_name()
                else:
                    mapping["table"] = "`%s`.`%s`" % (self.db_name, model_class.table_name())
            query = Template(query).safe_substitute(mapping)
        return query

    def _get_server_timezone(self):
        """
        Returns the server's timezone as a pytz timezone, or UTC when the
        server answers with an error.
        """
        try:
            r = self._send("SELECT timezone()")
            return pytz.timezone(r.text.strip())
        except ServerError as e:
            logger.exception("Cannot determine server timezone (%s), assuming UTC", e)
            return pytz.utc

    def _get_server_version(self, as_tuple=True):
        """
        Returns the server version, as a tuple of ints by default or as a string
        when `as_tuple` is false. Falls back to version 1.1.0 when the server
        answers with an error.
        """
        try:
            r = self._send("SELECT version();")
            # Strip surrounding whitespace: a trailing newline would make the
            # last component fail isdigit() and be dropped from the tuple.
            ver = r.text.strip()
        except ServerError as e:
            logger.exception("Cannot determine server version (%s), assuming 1.1.0", e)
            ver = "1.1.0"
        if not as_tuple:
            return ver
        # Non-numeric components are skipped.
        return tuple(int(n) for n in ver.split(".") if n.isdigit())

    def _is_existing_database(self):
        """
        Checks whether a database named `db_name` is listed in `system.databases`.
        """
        r = self._send("SELECT count() FROM system.databases WHERE name = '%s'" % self.db_name)
        return r.text.strip() == "1"

    def _is_connection_readonly(self):
        """
        Checks whether the `readonly` setting reported by the server is anything other than "0".
        """
        r = self._send("SELECT value FROM system.settings WHERE name = 'readonly'")
        return r.text.strip() != "0"
454
+
455
+
456
# Limit what `import *` pulls in from this module to its public classes.
__all__ = [cls.__name__ for cls in (Page, DatabaseException, ServerError, Database)]