quasardb-3.14.2.dev8-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. quasardb/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
  2. quasardb/CMakeFiles/progress.marks +1 -0
  3. quasardb/Makefile +189 -0
  4. quasardb/__init__.py +140 -0
  5. quasardb/__init__.pyi +72 -0
  6. quasardb/cmake_install.cmake +58 -0
  7. quasardb/date/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
  8. quasardb/date/CMakeFiles/Export/b76006b2b7125baf1b0b4d4ca4db82bd/dateTargets.cmake +108 -0
  9. quasardb/date/CMakeFiles/progress.marks +1 -0
  10. quasardb/date/Makefile +189 -0
  11. quasardb/date/cmake_install.cmake +81 -0
  12. quasardb/date/dateConfigVersion.cmake +65 -0
  13. quasardb/date/dateTargets.cmake +63 -0
  14. quasardb/extensions/__init__.py +9 -0
  15. quasardb/extensions/writer.py +195 -0
  16. quasardb/firehose.py +112 -0
  17. quasardb/libqdb_api.so +0 -0
  18. quasardb/numpy/__init__.py +1106 -0
  19. quasardb/pandas/__init__.py +696 -0
  20. quasardb/pool.py +338 -0
  21. quasardb/pybind11/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
  22. quasardb/pybind11/CMakeFiles/progress.marks +1 -0
  23. quasardb/pybind11/Makefile +189 -0
  24. quasardb/pybind11/cmake_install.cmake +50 -0
  25. quasardb/quasardb/__init__.pyi +97 -0
  26. quasardb/quasardb/_batch_column.pyi +5 -0
  27. quasardb/quasardb/_batch_inserter.pyi +32 -0
  28. quasardb/quasardb/_blob.pyi +16 -0
  29. quasardb/quasardb/_cluster.pyi +106 -0
  30. quasardb/quasardb/_continuous.pyi +18 -0
  31. quasardb/quasardb/_double.pyi +7 -0
  32. quasardb/quasardb/_entry.pyi +61 -0
  33. quasardb/quasardb/_error.pyi +15 -0
  34. quasardb/quasardb/_integer.pyi +7 -0
  35. quasardb/quasardb/_node.pyi +26 -0
  36. quasardb/quasardb/_options.pyi +106 -0
  37. quasardb/quasardb/_perf.pyi +7 -0
  38. quasardb/quasardb/_properties.pyi +5 -0
  39. quasardb/quasardb/_query.pyi +2 -0
  40. quasardb/quasardb/_reader.pyi +15 -0
  41. quasardb/quasardb/_retry.pyi +16 -0
  42. quasardb/quasardb/_string.pyi +12 -0
  43. quasardb/quasardb/_table.pyi +140 -0
  44. quasardb/quasardb/_tag.pyi +5 -0
  45. quasardb/quasardb/_timestamp.pyi +9 -0
  46. quasardb/quasardb/_writer.pyi +112 -0
  47. quasardb/quasardb/metrics/__init__.pyi +28 -0
  48. quasardb/quasardb.cpython-310-x86_64-linux-gnu.so +0 -0
  49. quasardb/range-v3/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
  50. quasardb/range-v3/CMakeFiles/Export/48a02d54b5e9e60c30c5f249b431a911/range-v3-targets.cmake +128 -0
  51. quasardb/range-v3/CMakeFiles/progress.marks +1 -0
  52. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/DependInfo.cmake +22 -0
  53. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/build.make +86 -0
  54. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/cmake_clean.cmake +5 -0
  55. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/compiler_depend.make +2 -0
  56. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/compiler_depend.ts +2 -0
  57. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/progress.make +1 -0
  58. quasardb/range-v3/Makefile +204 -0
  59. quasardb/range-v3/cmake_install.cmake +93 -0
  60. quasardb/range-v3/include/range/v3/version.hpp +24 -0
  61. quasardb/range-v3/range-v3-config-version.cmake +83 -0
  62. quasardb/range-v3/range-v3-config.cmake +80 -0
  63. quasardb/stats.py +376 -0
  64. quasardb/table_cache.py +60 -0
  65. quasardb/typing.py +23 -0
  66. quasardb-3.14.2.dev8.dist-info/METADATA +41 -0
  67. quasardb-3.14.2.dev8.dist-info/RECORD +70 -0
  68. quasardb-3.14.2.dev8.dist-info/WHEEL +6 -0
  69. quasardb-3.14.2.dev8.dist-info/licenses/LICENSE.md +11 -0
  70. quasardb-3.14.2.dev8.dist-info/top_level.txt +1 -0
quasardb/numpy/__init__.py
@@ -0,0 +1,1106 @@
# pylint: disable=C0103,C0111,C0302,R0903

# Copyright (c) 2009-2022, quasardb SAS. All rights reserved.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of quasardb nor the names of its contributors may
#      be used to endorse or promote products derived from this software
#      without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY QUASARDB AND CONTRIBUTORS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
from __future__ import annotations

import logging
import time
import warnings
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Type, Union

import quasardb
import quasardb.table_cache as table_cache
from quasardb.quasardb import Table, Writer
from quasardb.typing import DType, MaskedArrayAny, NDArrayAny, NDArrayTime

logger = logging.getLogger("quasardb.numpy")


class NumpyRequired(ImportError):
    """
    Exception raised when trying to use the QuasarDB numpy integration, but
    numpy has not been installed.
    """

    pass


try:
    import numpy as np
    import numpy.ma as ma

except ImportError as err:
    logger.exception(err)
    raise NumpyRequired(
        "The numpy library is required to handle numpy array formats"
    ) from err


class IncompatibleDtypeError(TypeError):
    """
    Exception raised when a provided dtype is not the expected dtype.
    """

    def __init__(
        self,
        cname: Optional[str] = None,
        ctype: Optional[quasardb.ColumnType] = None,
        expected: Optional[List[DType]] = None,
        provided: Optional[DType] = None,
    ):
        self.cname = cname
        self.ctype = ctype
        self.expected = expected
        self.provided = provided
        super().__init__(self.msg())

    def msg(self) -> str:
        return "Data for column '{}' with type '{}' was provided in dtype '{}' but needs '{}'.".format(
            self.cname, self.ctype, self.provided, self.expected
        )


class IncompatibleDtypeErrors(TypeError):
    """
    Wraps multiple dtype errors.
    """

    def __init__(self, xs: List[IncompatibleDtypeError]):
        self.xs = xs
        super().__init__(self.msg())

    def msg(self) -> str:
        return "\n".join(x.msg() for x in self.xs)


class InvalidDataCardinalityError(ValueError):
    """
    Raised when the number of provided data arrays doesn't match the table's columns.
    """

    def __init__(self, data: List[Any], cinfos: List[Any]) -> None:
        self.data = data
        self.cinfos = cinfos
        super().__init__(self.msg())

    def msg(self) -> str:
        return "Provided number of data arrays '{}' does not match the number of table columns '{}', unable to map data to columns".format(
            len(self.data), len(self.cinfos)
        )


# Based on the QuasarDB column type, which dtypes do we accept?
# The first entry is always the 'preferred' dtype; the others are those
# that we can natively convert in native code.
_ctype_to_dtype: Dict[quasardb.ColumnType, List[DType]] = {
    quasardb.ColumnType.String: [np.dtype("U")],
    quasardb.ColumnType.Symbol: [np.dtype("U")],
    quasardb.ColumnType.Int64: [np.dtype("i8"), np.dtype("i4"), np.dtype("i2")],
    quasardb.ColumnType.Double: [np.dtype("f8"), np.dtype("f4")],
    quasardb.ColumnType.Blob: [np.dtype("S"), np.dtype("O")],
    quasardb.ColumnType.Timestamp: [np.dtype("datetime64[ns]")],
}


def _best_dtype_for_ctype(ctype: quasardb.ColumnType) -> DType:
    """
    Returns the 'best' dtype for a certain column type. For example, for blobs,
    even though we accept py::bytes, we prefer bytestrings (as they are faster
    to read in C++).
    """
    possible_dtypes = _ctype_to_dtype[ctype]
    assert len(possible_dtypes) > 0

    # By convention, the first entry is the preferred one
    return possible_dtypes[0]


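# A minimal sketch of how this mapping is meant to be used (illustrative only;
# the values shown are taken from the table above):
#
#   _best_dtype_for_ctype(quasardb.ColumnType.Int64)   # -> np.dtype("i8")
#   _best_dtype_for_ctype(quasardb.ColumnType.Blob)    # -> np.dtype("S")
#
# Any of the other listed dtypes (e.g. np.dtype("i4") for Int64) is also
# accepted and converted in native code.

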
def _coerce_dtype(
    dtype: Optional[
        Union[DType, Dict[str, Optional[DType]], Sequence[Optional[DType]]]
    ],
    columns: List[Tuple[str, quasardb.ColumnType]],
) -> List[Optional[DType]]:
    if dtype is None:
        dtype = [None] * len(columns)

    if isinstance(dtype, np.dtype):
        dtype = [dtype]

    if type(dtype) is dict:
        # Conveniently look up column index by label
        offsets: Dict[str, int] = {}
        for i in range(len(columns)):
            (cname, _) = columns[i]
            offsets[cname] = i

        # Now convert the provided dtype dict to a list that matches
        # the relative offset within the table.
        #
        # Any columns not provided will have a 'None' dtype.
        dtype_: List[Optional[DType]] = [None] * len(columns)

        for k, dt in dtype.items():
            if k not in offsets:
                logger.warning(
                    "Forced dtype provided for column '%s' = %s, but that column is not found in the table. Skipping...",
                    k,
                    dt,
                )
                continue

            i = offsets[k]
            dtype_[i] = dt

        dtype = dtype_

    if type(dtype) is not list:
        raise ValueError(
            "Forced dtype argument provided, but the argument has an incompatible type. Expected: list-like or dict-like, got: {}".format(
                type(dtype)
            )
        )

    if len(dtype) != len(columns):
        raise ValueError(
            "Expected exactly one dtype for each column, but {} dtypes were provided for {} columns".format(
                len(dtype), len(columns)
            )
        )

    return list(dtype)


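# Illustrative sketch of what _coerce_dtype returns, assuming a hypothetical
# two-column table cols = [("ts", Timestamp), ("val", Double)]:
#
#   _coerce_dtype(None, cols)                      # -> [None, None]
#   _coerce_dtype({"val": np.dtype("f4")}, cols)   # -> [None, np.dtype("f4")]
#   _coerce_dtype([None, np.dtype("f4")], cols)    # -> [None, np.dtype("f4")]
#
# i.e. whatever the input shape, the result is one (possibly None) dtype per
# table column, in column order.

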
def _add_desired_dtypes(
    dtype: List[Optional[DType]], columns: List[Tuple[str, quasardb.ColumnType]]
) -> List[Optional[DType]]:
    """
    When infer_types=True, this function sets the 'desired' dtype for each of the columns.
    `dtype` is expected to be the output of `_coerce_dtype`, that is, a list-like with an
    entry for each column.
    """
    assert len(dtype) == len(columns)

    for i in range(len(dtype)):
        # Only touch columns without a dtype explicitly provided by the user
        if dtype[i] is None:
            (cname, ctype) = columns[i]
            dtype_ = _best_dtype_for_ctype(ctype)
            logger.debug(
                "using default dtype '%s' for column '%s' with type %s",
                dtype_,
                cname,
                ctype,
            )
            dtype[i] = dtype_

    return dtype


def _is_all_masked(xs: Any) -> bool:
    if ma.isMA(xs):
        return ma.size(xs) == ma.count_masked(xs)

    if np.size(xs) == 0:
        # Empty arrays are considered "masked"
        return True

    if xs.dtype == np.object_ and xs[0] is None:
        # Very likely that everything is None at this point; we can try a more
        # "expensive" probing function.
        #
        # We defer to the Python `all` function, as numpy doesn't really have
        # great built-ins for object arrays.
        return all(x is None for x in xs)

    logger.debug(
        "%s is not a masked array, not convertible to requested type... ", type(xs)
    )

    # This array is *not* a masked array, it's *not* convertible to the type we want,
    # and it's *not* an object array.
    #
    # The best we can do at this point is log and defer to the (expensive)
    # Python-based check as well.
    return all(x is None for x in xs)


def dtypes_equal(lhs: DType, rhs: DType) -> bool:
    if lhs.kind == "U" or lhs.kind == "S":
        # Unicode and string data has variable length encoding, which means
        # their itemsize can be anything.
        #
        # We only care about the dtype kind in this case.
        return lhs.kind == rhs.kind

    return lhs == rhs


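# A short sketch of the kind-only comparison for string-like dtypes
# (illustrative only):
#
#   dtypes_equal(np.dtype("U"), np.dtype("U32"))   # -> True  (same kind 'U')
#   dtypes_equal(np.dtype("S"), np.dtype("U"))     # -> False (kind 'S' vs 'U')
#   dtypes_equal(np.dtype("i8"), np.dtype("i4"))   # -> False (exact match required)

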
def _dtype_found(needle: DType, haystack: List[DType]) -> bool:
    """
    Returns True if one of the dtypes in `haystack` matches that of `needle`.
    """
    for x in haystack:
        if dtypes_equal(needle, x):
            return True

    return False


def _validate_dtypes(
    data: List[Any], columns: List[Tuple[str, quasardb.ColumnType]]
) -> None:
    errors = list()

    for data_, (cname, ctype) in zip(data, columns):
        expected_ = _ctype_to_dtype[ctype]

        logger.debug("data_.dtype = %s, expected_ = %s", data_.dtype, expected_)

        if not _dtype_found(data_.dtype, expected_):
            errors.append(
                IncompatibleDtypeError(
                    cname=cname, ctype=ctype, provided=data_.dtype, expected=expected_
                )
            )

    if len(errors) > 0:
        raise IncompatibleDtypeErrors(errors)


def _coerce_deduplicate(
    deduplicate: Union[bool, str, List[str]],
    deduplication_mode: str,
    columns: List[Tuple[str, quasardb.ColumnType]],
) -> Union[bool, List[str]]:
    """
    Throws an error when the 'deduplicate' options are incorrect.
    """
    cnames = [cname for (cname, _) in columns]

    if deduplication_mode not in ["drop", "upsert"]:
        raise RuntimeError(
            "deduplication_mode should be one of ['drop', 'upsert'], got: {}".format(
                deduplication_mode
            )
        )

    if isinstance(deduplicate, bool):
        return deduplicate

    # Special value of $timestamp, hardcoded
    if isinstance(deduplicate, str) and deduplicate == "$timestamp":
        deduplicate = ["$timestamp"]
        cnames.append("$timestamp")

    if not isinstance(deduplicate, list):
        raise TypeError(
            "deduplicate should be either a bool or a list, got: "
            + str(type(deduplicate))
        )

    for column_name in deduplicate:
        if column_name not in cnames:
            raise RuntimeError(
                "Provided deduplication column name '{}' not found in table columns.".format(
                    column_name
                )
            )

    return deduplicate


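# Sketch of the accepted `deduplicate` values, assuming hypothetical columns
# cols = [("open", Double), ("close", Double)]:
#
#   _coerce_deduplicate(True, "drop", cols)          # -> True (full-row dedup)
#   _coerce_deduplicate(["close"], "upsert", cols)   # -> ["close"]
#   _coerce_deduplicate("$timestamp", "drop", cols)  # -> ["$timestamp"]
#
# Anything else (unknown column names, a bad deduplication_mode) raises.

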
def _clean_nulls(xs: MaskedArrayAny, dtype: DType) -> MaskedArrayAny:
    """
    Numpy's masked arrays have a downside: when they're unable to convert a
    (masked!) value to the desired dtype, they raise an error. So, for example,
    if we have a masked array of objects that looks like this

      xs:   [1.234  <pd.NA>  5.678]
      mask: [0      1        0]

    then even though pd.NA is not "visible" (it is masked), the conversion to
    float() will fail, because the underlying value cannot be converted!

    This function fixes this by replacing the null values with an acceptable
    value that can always be converted to the desired dtype.
    """

    assert ma.isMA(xs)

    if xs.dtype != np.dtype("object"):
        return xs

    fill_value: Any = None
    if dtype == np.float64 or dtype == np.float32 or dtype == np.float16:
        fill_value = float("nan")
    elif dtype == np.int64 or dtype == np.int32 or dtype == np.int16:
        fill_value = -1
    elif dtype == np.dtype("datetime64[ns]"):
        fill_value = np.datetime64("nat")

    mask = xs.mask
    xs_ = xs.filled(fill_value)

    return ma.array(xs_, mask=mask)


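# Illustrative sketch of the repair performed above: a masked object array
# holding a pd.NA behind its mask can now be cast to float without raising.
#
#   xs = ma.masked_array(np.array([1.234, pd.NA, 5.678], dtype=object),
#                        mask=[False, True, False])
#   _clean_nulls(xs, np.dtype("f8")).astype("f8")   # masked slot now holds NaN
#
# (pd.NA is only used here as an example of a null-like object; pandas is not
# otherwise required by this module.)

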
def _coerce_data(
    data: List[MaskedArrayAny], dtype: List[Optional[DType]]
) -> List[MaskedArrayAny]:
    """
    Coerces each numpy array of `data` to the dtype present in `dtype`.
    """

    assert len(data) == len(dtype)

    for i in range(len(data)):
        dtype_ = dtype[i]
        data_ = data[i]

        if dtype_ is not None and not dtypes_equal(data_.dtype, dtype_):
            data_ = _clean_nulls(data_, dtype_)

            assert ma.isMA(data_)

            logger.debug(
                "data for column with offset %d was provided in dtype '%s', but need '%s': converting data...",
                i,
                data_.dtype,
                dtype_,
            )

            logger.debug("dtype of data[%d] before: %s", i, data_.dtype)
            logger.debug("type of data[%d] before: %s", i, type(data_))
            logger.debug("size of data[%d] before: %s", i, ma.size(data_))
            logger.debug("data of data[%d] before: %s", i, data_)

            try:
                data[i] = ma.masked_array(data_.astype(dtype_))
            except TypeError as err:
                # One 'bug' is that, if everything is masked, the underlying data type
                # can be pretty much anything.
                if _is_all_masked(data_):
                    logger.debug(
                        "array completely empty, re-initializing to empty array of '%s'",
                        dtype_,
                    )
                    data[i] = ma.masked_all(ma.size(data_), dtype=dtype_)

                # Another 'bug' is that when the input data is objects, we may have
                # null-like values (like pd.NA) that cannot easily be converted to,
                # say, float.
                else:
                    logger.error(
                        "An error occurred while coercing input data type from dtype '%s' to dtype '%s': ",
                        data_.dtype,
                        dtype_,
                    )
                    logger.exception(err)
                    raise err

            assert data[i].dtype.kind == dtype_.kind

            logger.debug("type of data[%d] after: %s", i, type(data[i]))
            logger.debug("size of data[%d] after: %s", i, ma.size(data[i]))
            logger.debug("data of data[%d] after: %s", i, data[i])
            assert ma.size(data[i]) == ma.size(data_)

    return data


def _probe_length(
    xs: Union[Dict[Any, NDArrayAny], Iterable[NDArrayAny]]
) -> Optional[int]:
    """
    Returns the length of the first non-null array in `xs`, or None if all arrays
    are null.
    """
    if isinstance(xs, dict):
        return _probe_length(xs.values())

    for x in xs:
        if x is not None:
            return x.size

    return None


def _ensure_list(
    xs: Union[List[Any], Dict[Any, Any], NDArrayAny],
    cinfos: List[Tuple[str, quasardb.ColumnType]],
) -> List[Any]:
    """
    If input data is a dict, ensures it's converted to a list with the correct
    offsets.
    """
    if isinstance(xs, list):
        return xs

    if isinstance(xs, np.ndarray):
        ret = []
        for x in xs:
            ret.append(x)

        return ret

    # As we only accept list-likes or dicts as input data, it *must* be a dict
    # at this point
    assert isinstance(xs, dict)

    logger.debug("data was provided as dict, coercing to list")

    # As we may have non-existing keys, we would like to initialize those as a masked
    # array with all elements masked. In those cases, though, we need to know the size
    # of the array.
    n = _probe_length(xs)

    if n is None:
        logger.error("Unable to probe length: provided arrays: %s", xs)
        raise ValueError("Unable to probe array length: all provided arrays None?")

    ret = list()

    for i in range(len(cinfos)):
        (cname, ctype) = cinfos[i]

        xs_ = None
        if cname in xs:
            xs_ = xs[cname]
        else:
            xs_ = ma.masked_all(n, dtype=_best_dtype_for_ctype(ctype))

        ret.append(xs_)

    return ret


def _coerce_retries(
    retries: Optional[Union[int, quasardb.RetryOptions]]
) -> quasardb.RetryOptions:
    if retries is None:
        return quasardb.RetryOptions()
    elif isinstance(retries, int):
        return quasardb.RetryOptions(retries=retries)
    elif isinstance(retries, quasardb.RetryOptions):
        return retries
    else:
        raise TypeError(
            "retries should either be an integer or quasardb.RetryOptions, got: "
            + str(type(retries))
        )


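# Sketch of the accepted `retries` forms (illustrative only):
#
#   _coerce_retries(None)                      # -> default quasardb.RetryOptions()
#   _coerce_retries(5)                         # -> quasardb.RetryOptions(retries=5)
#   _coerce_retries(quasardb.RetryOptions())   # -> passed through unchanged

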
def _kwarg_deprecation_warning(
    old_kwarg: str,
    old_value: Any,
    new_kwargs: List[str],
    new_values: List[Any],
    stacklevel: int,
) -> None:
    new_declaration = ", ".join(
        f"{new_kwarg}={new_value}"
        for new_kwarg, new_value in zip(new_kwargs, new_values)
    )
    warnings.warn(
        f"The argument '{old_kwarg}' <{type(old_value).__name__}> is deprecated and will be removed in a future version. "
        f"Please use '{new_declaration}' instead.",
        DeprecationWarning,
        stacklevel=stacklevel + 1,
    )


def _type_check(
    var: Any,
    var_name: str,
    target_type: Type,
    raise_error: bool = True,
    allow_none: bool = True,
) -> bool:
    if allow_none and var is None:
        return True
    if not isinstance(var, target_type):
        if raise_error:
            raise quasardb.InvalidArgumentError(
                f"Invalid '{var_name}' type, expected: {target_type}, got: {type(var)}"
            )
        return False
    return True


def _ensure_ma(xs: Any, dtype: Optional[DType] = None) -> MaskedArrayAny:
    # Don't bother if we're already a masked array
    if ma.isMA(xs):
        return xs

    if not isinstance(xs, np.ndarray):
        logger.debug("Provided data is not a numpy array: %s", type(xs))
        xs = np.array(xs, dtype=dtype)

    logger.debug("coercing array with dtype: %s", xs.dtype)

    if xs.dtype.kind in ["O", "U", "S"]:
        logger.debug("Data is object-like, masking None values")

        # Note: `xs == None` is an intentional elementwise comparison that
        # yields a boolean mask; `is None` would not broadcast over the array.
        mask = xs == None
        return ma.masked_array(data=xs, mask=mask)
    else:
        logger.debug("Automatically masking invalid numbers")
        return ma.masked_invalid(xs, copy=False)


def ensure_ma(
    xs: Any, dtype: Optional[Union[DType, List[Optional[DType]]]] = None
) -> Union[List[MaskedArrayAny], MaskedArrayAny]:
    if isinstance(dtype, list):
        assert isinstance(xs, list)
        return [_ensure_ma(xs_, dtype_) for (xs_, dtype_) in zip(xs, dtype)]

    return _ensure_ma(xs, dtype)


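# Illustrative sketch: ensure_ma() masks whatever counts as "null" for the
# array's kind.
#
#   ensure_ma(np.array([1.0, np.nan, 3.0]))              # nan slot becomes masked
#   ensure_ma(np.array(["a", None, "c"], dtype=object))  # None slot becomes masked
#   ensure_ma([xs, ys], dtype=[np.dtype("f8"), None])    # per-column variant
#
# where `xs` and `ys` are hypothetical placeholder arrays.

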
def read_array(
    table: Optional[Table] = None, column: Optional[str] = None, ranges: Any = None
) -> Tuple[NDArrayTime, MaskedArrayAny]:
    if table is None:
        raise RuntimeError("A table is required.")

    if column is None:
        raise RuntimeError("A column is required.")

    kwargs: Dict[str, Any] = {"column": column}

    if ranges is not None:
        kwargs["ranges"] = ranges

    read_with = {
        quasardb.ColumnType.Double: table.double_get_ranges,
        quasardb.ColumnType.Blob: table.blob_get_ranges,
        quasardb.ColumnType.String: table.string_get_ranges,
        quasardb.ColumnType.Symbol: table.string_get_ranges,
        quasardb.ColumnType.Int64: table.int64_get_ranges,
        quasardb.ColumnType.Timestamp: table.timestamp_get_ranges,
    }

    ctype = table.column_type_by_id(column)

    fn = read_with[ctype]
    return fn(**kwargs)


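# Hedged usage sketch, assuming an established connection `conn` and a table
# 'my_table' with a double column 'close' (all names hypothetical):
#
#   table = conn.table("my_table")
#   idx, vals = read_array(table=table, column="close")
#
# `idx` is the datetime64[ns] $timestamp axis and `vals` a masked array of the
# column's values; pass `ranges=...` to restrict the time ranges read.

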
def write_array(
    data: Any = None,
    index: Optional[NDArrayTime] = None,
    table: Optional[Table] = None,
    column: Optional[str] = None,
    dtype: Optional[DType] = None,
    infer_types: bool = True,
) -> None:
    """
    Write a Numpy array to a single column.

    Parameters:
    -----------

    data: np.array
        Numpy array with a dtype that is compatible with the column's type.

    index: np.array
        Numpy array with a datetime64[ns] dtype that will be used as the
        $timestamp axis for the data to be stored.

    dtype: optional np.dtype
        If provided, ensures the data array is converted to this dtype before
        insertion.

    infer_types: optional bool
        If true, when necessary will attempt to convert the data and index array
        to the best type for the column. For example, if you provide float64 data
        while the column's type is int64, it will automatically convert the data.

        Defaults to True. For production use cases where you want to avoid implicit
        conversions, we recommend always setting this to False.

    """

    if table is None:
        raise RuntimeError("A table is required.")

    if column is None:
        raise RuntimeError("A column is required.")

    if data is None:
        raise RuntimeError("A data numpy array is required.")

    if index is None:
        raise RuntimeError("An index numpy timestamp array is required.")

    data = ensure_ma(data, dtype=dtype)
    ctype = table.column_type_by_id(column)

    # We try to reuse some of the other functions, which assume array-like
    # shapes for column info and data. It's a bit hackish, but actually works
    # well.
    #
    # We should probably generalize this block of code with the same found in
    # write_arrays().

    cinfos = [(column, ctype)]
    dtype_: List[Optional[DType]] = [dtype]

    dtype_ = _coerce_dtype(dtype_, cinfos)

    if infer_types is True:
        dtype_ = _add_desired_dtypes(dtype_, cinfos)

    # data_ = an array of [data]
    data_ = [data]
    data_ = _coerce_data(data_, dtype_)
    _validate_dtypes(data_, cinfos)

    # No functions that assume array-of-data anymore, let's put it back
    data = data_[0]

    # Dispatch to the correct function
    write_with = {
        quasardb.ColumnType.Double: table.double_insert,
        quasardb.ColumnType.Blob: table.blob_insert,
        quasardb.ColumnType.String: table.string_insert,
        quasardb.ColumnType.Symbol: table.string_insert,
        quasardb.ColumnType.Int64: table.int64_insert,
        quasardb.ColumnType.Timestamp: table.timestamp_insert,
    }

    logger.info(
        "Writing array (%d rows of dtype %s) to column %s.%s (type %s)",
        len(data),
        data.dtype,
        table.get_name(),
        column,
        ctype,
    )
    write_with[ctype](column, index, data)


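# Hedged usage sketch (hypothetical table/column names; `conn` is an
# established quasardb.Cluster connection):
#
#   idx = np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]")
#   vals = np.array([1.5, 2.5], dtype="f8")
#   write_array(data=vals, index=idx,
#               table=conn.table("my_table"), column="close")

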
def write_arrays(
    data: Any,
    cluster: quasardb.Cluster,
    table: Optional[Union[str, Table]] = None,
    *,
    dtype: Optional[
        Union[DType, Dict[str, Optional[DType]], List[Optional[DType]]]
    ] = None,
    index: Optional[NDArrayTime] = None,
    # TODO: Set the default push_mode after removing _async, fast and truncate
    push_mode: Optional[quasardb.WriterPushMode] = None,
    _async: bool = False,
    fast: bool = False,
    truncate: Union[bool, Tuple[Any, ...]] = False,
    truncate_range: Optional[Tuple[Any, ...]] = None,
    deduplicate: Union[bool, str, List[str]] = False,
    deduplication_mode: str = "drop",
    infer_types: bool = True,
    writer: Optional[Writer] = None,
    write_through: bool = True,
    retries: Union[int, quasardb.RetryOptions] = 3,
    # We accept additional kwargs that will be passed through to the writer.push() method
    **kwargs: Any,
) -> List[Table]:
    """
    Write multiple aligned numpy arrays to a table.

    Parameters:
    -----------

    data: Iterable of np.array, or dict-like of str:np.array
        Numpy arrays to write into the database. Can either be a list of numpy arrays,
        in which case they are expected to be in the same order as table.list_columns(),
        and an array is provided for each of the columns. If `index` is None, the first
        array will be assumed to be an index with dtype `datetime64[ns]`.

        Alternatively, a dict of key/values may be provided, where the key is expected
        to be a table column label, and the value is expected to be a np.array. If present,
        a column with label '$timestamp' will be used as the index.

        In all cases, all numpy arrays are expected to be of exactly the same length as the
        index.

    cluster: quasardb.Cluster
        Active connection to the QuasarDB cluster

    table: quasardb.Table or str
        Either a string or a reference to a QuasarDB Timeseries table object.
        For example, 'my_table' or cluster.table('my_table') are both valid values.

    index: optional np.array with dtype datetime64[ns]
        Optionally explicitly provide an array as the $timestamp index. If not provided,
        the first array provided to `data` will be used as the index.

    dtype: optional dtype, list of dtype, or dict of dtype
        Optional data type to force. If a single dtype, will force that dtype to all
        columns. If list-like, will map dtypes to dataframe columns by their offset.
        If dict-like, will map dtypes to dataframe columns by their label.

        If a dtype for a column is provided in this argument, and infer_types is also
        True, this argument takes precedence.

    deduplicate: bool or list[str]
        Enables server-side deduplication of data when it is written into the table.
        When True, automatically deduplicates rows when all values of a row are identical.
        When a list of strings is provided, deduplicates only based on the values of
        these columns.

        Defaults to False.

    deduplication_mode: 'drop' or 'upsert'
        When `deduplicate` is enabled, decides how deduplication is performed. 'drop' means
        any newly written duplicates are dropped, whereas 'upsert' means that the previously
        written data is updated to reflect the new data.

        Defaults to 'drop'.

    infer_types: optional bool
        If true, will attempt to convert types from Python to QuasarDB native types if
        the provided dataframe has incompatible types. For example, a dataframe with integers
        will automatically convert these to doubles if the QuasarDB table expects it.

        Defaults to True. For production use cases where you want to avoid implicit conversions,
        we recommend setting this to False.

    push_mode: optional quasardb.WriterPushMode
        The mode used for inserting data. Can be either a string or a `WriterPushMode` enumeration item.
        Available options:
        * `Truncate`: Truncate (also referred to as upsert) the data in-place. Will detect time range
          to truncate from the time range inside the dataframe.
        * `Async`: Uses asynchronous insertion API where commits are buffered server-side and
          acknowledged before they are written to disk. If you insert to the same table from
          multiple processes, setting this to True may improve performance.
        * `Fast`: Whether to use 'fast push'. If you incrementally add small batches of data to table,
          you may see better performance if you set this to True.
        * `Transactional`: Ensures full transactional consistency.

        Defaults to `Transactional`.

    truncate: optional bool
        **DEPRECATED** – Use `push_mode=WriterPushMode.Truncate` instead.
        Truncate (also referred to as upsert) the data in-place. Will detect time range to truncate
        from the time range inside the dataframe.

        Defaults to False.

    truncate_range: optional tuple
        Time range to truncate from the time range inside the dataframe.

    _async: optional bool
        **DEPRECATED** – Use `push_mode=WriterPushMode.Async` instead.
        If true, uses asynchronous insertion API where commits are buffered server-side and
        acknowledged before they are written to disk. If you insert to the same table from
        multiple processes, setting this to True may improve performance.

        Defaults to False.

    fast: optional bool
        **DEPRECATED** – Use `push_mode=WriterPushMode.Fast` instead.
        Whether to use 'fast push'. If you incrementally add small batches of data to table,
        you may see better performance if you set this to True.

        Defaults to False.

    write_through: optional bool
        If True, data is not cached after write; when False, caching is left at the
        discretion of the server.

        Defaults to True.

    writer: optional quasardb.Writer
        Allows you to explicitly provide a Writer to use, which is expected to be
        initialized with the `table`.

        Reuse of the Writer allows for some performance improvements.

    retries: optional int or quasardb.RetryOptions
        Number of times to retry in case of a push failure. This is useful in case of async
        pipeline failures, or when doing transactional inserts that may occasionally cause
        transaction conflicts.

        Retries with exponential backoff: starts at 3 seconds, and doubles every retry attempt.

        Alternatively, a quasardb.RetryOptions object can be passed to more carefully fine-tune
        retry behavior.
    """

    if table:
        logger.debug("table explicitly provided, assuming single-table write")
        data = [(table, data)]
        table = None

    _type_check(push_mode, "push_mode", target_type=quasardb.WriterPushMode)
    deprecation_stacklevel = kwargs.pop("deprecation_stacklevel", 1) + 1

    if isinstance(truncate, tuple):
        # Special case: truncate might be a tuple indicating the range.
        _kwarg_deprecation_warning(
            "truncate",
            truncate,
            ["push_mode", "truncate_range"],
            [quasardb.WriterPushMode.Truncate, truncate],
            deprecation_stacklevel,
        )
        truncate_range = truncate_range or truncate
        truncate = True

    kwarg_to_mode = {
        # "kwarg": (kwarg_type, kwarg_push_mode, is_deprecated)
        "fast": (bool, quasardb.WriterPushMode.Fast, True),
        "_async": (bool, quasardb.WriterPushMode.Async, True),
        "truncate": (bool, quasardb.WriterPushMode.Truncate, True),
        "truncate_range": (tuple, quasardb.WriterPushMode.Truncate, False),
    }

    for kwarg, info in kwarg_to_mode.items():
        expected_type, mode, deprecated = info
        kwarg_value = locals()[kwarg]
        _type_check(kwarg_value, kwarg, target_type=expected_type)

        if kwarg_value:
            if push_mode and push_mode != mode:
                raise quasardb.InvalidArgumentError(
                    f"Found '{kwarg}' in kwargs, but push mode is already set to {push_mode}"
                )
            push_mode = mode
            if deprecated:
                _kwarg_deprecation_warning(
                    kwarg, kwarg_value, ["push_mode"], [mode], deprecation_stacklevel
                )

    if not push_mode:
        push_mode = quasardb.WriterPushMode.Transactional

    # Create batch column info from dataframe
    if writer is None:
        writer = cluster.writer()

    ret: List[Table] = []
    n_rows = 0
    push_data = quasardb.WriterData()

    for table_, data_ in data:
        # Acquire reference to table_ if string is provided
        if isinstance(table_, str):
            table_ = table_cache.lookup(table_, cluster)

        cinfos = [(x.name, x.type) for x in table_.list_columns()]
        dtype_ = _coerce_dtype(dtype, cinfos)

        assert type(dtype_) is list
        assert len(dtype_) == len(cinfos)

        if index is None and isinstance(data_, dict) and "$timestamp" in data_:
            # Create shallow copy of `data_` so that we don't modify the reference, i.e.
            # delete keys.
            #
            # This ensures that the user can call the same function multiple times without
            # side-effects.
            data_ = data_.copy()
            index_ = data_.pop("$timestamp")

            if ma.isMA(index_):
                # Index might be a masked array
                index_ = index_.data

            assert "$timestamp" not in data_
        elif index is not None:
            index_ = index
        else:
            raise RuntimeError("Invalid index: no index provided.")

        assert index_ is not None

        if infer_types is True:
            dtype_ = _add_desired_dtypes(dtype_, cinfos)

        data_ = _ensure_list(data_, cinfos)

        if len(data_) != len(cinfos):
            raise InvalidDataCardinalityError(data_, cinfos)

        data_ = ensure_ma(data_, dtype=dtype_)
        assert isinstance(data_, list)
        data_ = _coerce_data(data_, dtype_)

        # Just some additional friendly information about incorrect dtypes; we'd
        # prefer to have this information thrown from Python instead of native
        # code, as it generally makes for somewhat better error context.
        _validate_dtypes(data_, cinfos)

        deduplicate = _coerce_deduplicate(deduplicate, deduplication_mode, cinfos)

        # Sanity check
        assert len(data_) == len(cinfos)

        for i in range(len(data_)):
            assert len(data_[i]) == len(index_)

        push_data.append(table_, index_, data_)

        n_rows += len(index_)
        ret.append(table_)

    retries = _coerce_retries(retries)

    # By default, we push all additional kwargs to the writer.push() function. This
    # allows transparent propagation of arguments.
    #
    # The initial use case was so that we can add additional parameters for test mocks,
    # e.g. `mock_failures`, so that we can validate the retry functionality.
    push_kwargs = kwargs
    push_kwargs["deduplicate"] = deduplicate
    push_kwargs["deduplication_mode"] = deduplication_mode
    push_kwargs["write_through"] = write_through
    push_kwargs["retries"] = retries
    push_kwargs["push_mode"] = push_mode
    if truncate_range:
        push_kwargs["range"] = truncate_range

    logger.debug("pushing %d rows", n_rows)
    start = time.time()

    writer.push(push_data, **push_kwargs)

    logger.debug("pushed %d rows in %s seconds", n_rows, (time.time() - start))

    return ret


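# Hedged usage sketch (hypothetical names; `conn` is an established
# quasardb.Cluster). A dict keyed by column label, with '$timestamp' used as
# the index, is the most explicit form:
#
#   data = {
#       "$timestamp": np.array(["2024-01-01", "2024-01-02"],
#                              dtype="datetime64[ns]"),
#       "open": np.array([1.0, 2.0]),
#       "close": np.array([1.5, 2.5]),
#   }
#   write_arrays(data, conn, "my_table",
#                push_mode=quasardb.WriterPushMode.Fast,
#                deduplicate="$timestamp")

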
def _xform_query_results(
    xs: Sequence[Tuple[str, MaskedArrayAny]],
    index: Optional[Union[str, int]],
    dict: bool,
) -> Tuple[NDArrayAny, Union[Dict[str, MaskedArrayAny], List[MaskedArrayAny]]]:
    if len(xs) == 0:
        return (np.array([], np.dtype("datetime64[ns]")), {} if dict else [])

    n = None
    for x in xs:
        assert isinstance(x, tuple)

        if n is None:
            n = x[1].size
        else:
            assert x[1].size == n

    if index is None:
        # Generate a range, put it in the front of the result list,
        # recurse and tell the function to use that index.
        assert isinstance(n, int)
        xs_: Sequence[Tuple[str, MaskedArrayAny]] = [
            ("$index", ma.masked_array(np.arange(n)))
        ] + list(xs)

        return _xform_query_results(xs_, "$index", dict)

    if isinstance(index, str):
        for i in range(len(xs)):
            (cname, _) = xs[i]
            if cname == index:
                # Now we know that this column has offset `i`,
                # recurse with that offset
                return _xform_query_results(xs, i, dict)

        raise KeyError(
            "Unable to resolve index column: column not found in results: {}".format(
                index
            )
        )

    if not isinstance(index, int):
        raise TypeError(
            "Unable to resolve index column: unrecognized type {}: {}".format(
                type(index), index
            )
        )

    assert isinstance(xs, list)
    idx = xs[index][1]
    del xs[index]

    # Our index *must* be a masked array, and there should be no
    # masked items: every row must have an index value.
    if ma.isMA(idx):
        if ma.count_masked(idx) > 0:
            raise ValueError(
                "Invalid index: null values detected. An index is never allowed to have null values."
            )

        assert isinstance(idx.data, np.ndarray)
        idx = idx.data

    if dict:
        return idx, {x[0]: x[1] for x in xs}
    else:
        return idx, [x[1] for x in xs]


def query(
    cluster: quasardb.Cluster,
    query: str,
    index: Optional[Union[str, int]] = None,
    dict: bool = False,
) -> Tuple[NDArrayAny, Union[Dict[str, MaskedArrayAny], List[MaskedArrayAny]]]:
    """
    Execute a query and return the results as numpy arrays. The shape of the return value
    is always:

      tuple[index, dict | list[np.array]]

    If `dict` is True, constructs a dict[str, np.array] where the key is the column name.
    Otherwise, it returns a list of all the individual data arrays.

    Parameters:
    -----------

    cluster : quasardb.Cluster
        Active connection to the QuasarDB cluster

    query : str
        The query to execute.

    index : optional[str | int]
        If provided, resolves the column and uses that as the index. If string
        (e.g. `$timestamp`), uses that column as the index. If int (e.g. `1`),
        looks up the column based on that offset.

    dict : bool
        If true, returns data arrays as a dict, otherwise a list of np.arrays.
        Defaults to False.

    """

    xs = cluster.query_numpy(query)

    return _xform_query_results(xs, index, dict)
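

# Hedged usage sketch (hypothetical query/table names; `conn` is an
# established quasardb.Cluster):
#
#   idx, cols = query(conn, "select * from my_table",
#                     index="$timestamp", dict=True)
#
# `idx` then holds the $timestamp values and `cols` maps each remaining
# column name to its masked data array.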