quasardb 3.14.2.dev4-cp310-cp310-macosx_11_0_arm64.whl → 3.14.2.dev6-cp310-cp310-macosx_11_0_arm64.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release: this version of quasardb might be problematic.
- quasardb/CMakeFiles/CMakeDirectoryInformation.cmake +2 -2
- quasardb/Makefile +20 -20
- quasardb/__init__.py +21 -7
- quasardb/cmake_install.cmake +5 -5
- quasardb/date/CMakeFiles/CMakeDirectoryInformation.cmake +2 -2
- quasardb/date/CMakeFiles/Export/a52b05f964b070ee926bcad51d3288af/dateTargets.cmake +1 -1
- quasardb/date/Makefile +20 -20
- quasardb/date/cmake_install.cmake +5 -5
- quasardb/date/dateTargets.cmake +1 -1
- quasardb/extensions/writer.py +59 -61
- quasardb/firehose.py +24 -22
- quasardb/libqdb_api.dylib +0 -0
- quasardb/numpy/__init__.py +262 -128
- quasardb/pandas/__init__.py +145 -91
- quasardb/pool.py +13 -2
- quasardb/pybind11/CMakeFiles/CMakeDirectoryInformation.cmake +2 -2
- quasardb/pybind11/Makefile +20 -20
- quasardb/pybind11/cmake_install.cmake +2 -2
- quasardb/quasardb.cpython-310-darwin.so +0 -0
- quasardb/range-v3/CMakeFiles/CMakeDirectoryInformation.cmake +2 -2
- quasardb/range-v3/CMakeFiles/Export/d94ef200eca10a819b5858b33e808f5b/range-v3-targets.cmake +1 -1
- quasardb/range-v3/CMakeFiles/range.v3.headers.dir/build.make +17 -17
- quasardb/range-v3/Makefile +25 -25
- quasardb/range-v3/cmake_install.cmake +8 -8
- quasardb/range-v3/range-v3-config.cmake +1 -1
- quasardb/stats.py +245 -120
- quasardb/table_cache.py +5 -1
- {quasardb-3.14.2.dev4.dist-info → quasardb-3.14.2.dev6.dist-info}/METADATA +3 -2
- quasardb-3.14.2.dev6.dist-info/RECORD +45 -0
- {quasardb-3.14.2.dev4.dist-info → quasardb-3.14.2.dev6.dist-info}/WHEEL +1 -1
- quasardb-3.14.2.dev4.dist-info/RECORD +0 -45
- {quasardb-3.14.2.dev4.dist-info → quasardb-3.14.2.dev6.dist-info/licenses}/LICENSE.md +0 -0
- {quasardb-3.14.2.dev4.dist-info → quasardb-3.14.2.dev6.dist-info}/top_level.txt +0 -0
quasardb/numpy/__init__.py
CHANGED
@@ -29,11 +29,12 @@
 
 import logging
 import time
+import warnings
 
 import quasardb
 import quasardb.table_cache as table_cache
 
-logger = logging.getLogger(
+logger = logging.getLogger("quasardb.numpy")
 
 
 class NumpyRequired(ImportError):
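The module's logger is now created with the explicit name "quasardb.numpy" instead of the truncated bare call shown on the old side. With the standard library's logging module, applications can opt into its debug output selectively; a minimal sketch:

```python
import logging

# Enable debug output for just the numpy integration; the logger name
# "quasardb.numpy" is taken from the hunk above.
logging.basicConfig(level=logging.WARNING)
logging.getLogger("quasardb.numpy").setLevel(logging.DEBUG)
```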
@@ -41,6 +42,7 @@ class NumpyRequired(ImportError):
     Exception raised when trying to use QuasarDB pandas integration, but
     pandas has not been installed.
     """
+
     pass
 
 
@@ -52,7 +54,7 @@ except ImportError as err:
     logger.exception(err)
     raise NumpyRequired(
         "The numpy library is required to handle numpy arrays formats"
-
+    ) from err
 
 
 class IncompatibleDtypeError(TypeError):
@@ -68,13 +70,16 @@ class IncompatibleDtypeError(TypeError):
         super().__init__(self.msg())
 
     def msg(self):
-        return "Data for column '{}' with type '{}' was provided in dtype '{}' but need '{}'.".format(
+        return "Data for column '{}' with type '{}' was provided in dtype '{}' but need '{}'.".format(
+            self.cname, self.ctype, self.provided, self.expected
+        )
 
 
 class IncompatibleDtypeErrors(TypeError):
     """
     Wraps multiple dtype errors
     """
+
     def __init__(self, xs):
         self.xs = xs
         super().__init__(self.msg())
@@ -82,29 +87,33 @@ class IncompatibleDtypeErrors(TypeError):
     def msg(self):
         return "\n".join(x.msg() for x in self.xs)
 
+
 class InvalidDataCardinalityError(ValueError):
     """
     Raised when the provided data arrays doesn't match the table's columns.
     """
+
     def __init__(self, data, cinfos):
         self.data = data
         self.cinfos = cinfos
         super().__init__(self.msg())
 
     def msg(self):
-        return "Provided data array length '{}' exceeds amount of table columns '{}', unable to map data to columns".format(
+        return "Provided data array length '{}' exceeds amount of table columns '{}', unable to map data to columns".format(
+            len(self.data), len(self.cinfos)
+        )
 
 
 # Based on QuasarDB column types, which dtype do we accept?
 # First entry will always be the 'preferred' dtype, other ones
 # those that we can natively convert in native code.
 _ctype_to_dtype = {
-    quasardb.ColumnType.String: [np.dtype(
-    quasardb.ColumnType.Symbol: [np.dtype(
-    quasardb.ColumnType.Int64: [np.dtype(
-    quasardb.ColumnType.Double: [np.dtype(
-    quasardb.ColumnType.Blob: [np.dtype(
-    quasardb.ColumnType.Timestamp: [np.dtype(
+    quasardb.ColumnType.String: [np.dtype("U")],
+    quasardb.ColumnType.Symbol: [np.dtype("U")],
+    quasardb.ColumnType.Int64: [np.dtype("i8"), np.dtype("i4"), np.dtype("i2")],
+    quasardb.ColumnType.Double: [np.dtype("f8"), np.dtype("f4")],
+    quasardb.ColumnType.Blob: [np.dtype("S"), np.dtype("O")],
+    quasardb.ColumnType.Timestamp: [np.dtype("datetime64[ns]")],
 }
 
 
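The `_ctype_to_dtype` table above drives dtype validation: the first entry per column type is the preferred dtype, later entries are accepted because native code can convert them. A standalone sketch of the acceptance check, using a plain string key instead of the real `quasardb.ColumnType` enum:

```python
import numpy as np

# Simplified stand-in for the acceptance table in the hunk above.
accepted = {"int64": [np.dtype("i8"), np.dtype("i4"), np.dtype("i2")]}

def dtype_found(provided, expected):
    # Any listed dtype passes; the first entry is merely the preferred one.
    return any(provided == d for d in expected)

print(dtype_found(np.dtype("i4"), accepted["int64"]))  # True
print(dtype_found(np.dtype("f8"), accepted["int64"]))  # False
```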
@@ -141,9 +150,12 @@ def _coerce_dtype(dtype, columns):
         # Any columns not provided will have a 'None' dtype.
         dtype_ = [None] * len(columns)
 
-        for
+        for k, dt in dtype.items():
             if not k in offsets:
-                logger.warn(
+                logger.warn(
+                    "Forced dtype provided for column '%s' = %s, but that column is not found in the table. Skipping...",
+                    k,
+                )
 
             i = offsets[k]
             dtype_[i] = dt
@@ -151,16 +163,22 @@
         dtype = dtype_
 
     if type(dtype) is not list:
-        raise ValueError(
+        raise ValueError(
+            "Forced dtype argument provided, but the argument has an incompatible type. Expected: list-like or dict-like, got: {}".format(
+                type(dtype)
+            )
+        )
 
     if len(dtype) is not len(columns):
-        raise ValueError(
+        raise ValueError(
+            "Expected exactly one dtype for each column, but %d dtypes were provided for %d columns".format(
+                len(dtype), len(columns)
+            )
+        )
 
     return dtype
 
 
-
-
 def _add_desired_dtypes(dtype, columns):
     """
     When infer_types=True, this function sets the 'desired' dtype for each of the columns.
@@ -174,7 +192,12 @@ def _add_desired_dtypes(dtype, columns):
         if dtype[i] is None:
             (cname, ctype) = columns[i]
             dtype_ = _best_dtype_for_ctype(ctype)
-            logger.debug(
+            logger.debug(
+                "using default dtype '%s' for column '%s' with type %s",
+                dtype_,
+                cname,
+                ctype,
+            )
             dtype[i] = dtype_
 
     return dtype
@@ -196,8 +219,11 @@ def _is_all_masked(xs):
         # built-ins for object arrays
         return all(x is None for x in xs)
 
-
-
+    logger.debug(
+        "{} is not a masked array, not convertible to requested type... ".format(
+            type(xs)
+        )
+    )
 
     # This array is *not* a masked array, it's *not* convertible to the type we want,
     # and it's *not* an object array.
@@ -208,7 +234,7 @@ def _is_all_masked(xs):
 
 
 def dtypes_equal(lhs, rhs):
-    if lhs.kind ==
+    if lhs.kind == "U" or lhs.kind == "S":
         # Unicode and string data has variable length encoding, which means their itemsize
         # can be anything.
         #
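The comment in this hunk explains why `dtypes_equal` compares on `kind` for string data: the itemsize of a `U`/`S` dtype encodes the maximum string length, so exact dtype equality would be too strict. Illustrated with plain numpy:

```python
import numpy as np

a = np.array(["ab"]).dtype      # dtype('<U2')
b = np.array(["abcd"]).dtype    # dtype('<U4')
print(a == b)                   # False: itemsize differs
print(a.kind == b.kind == "U")  # True: both are unicode, which is what matters
```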
@@ -236,43 +262,60 @@ def _validate_dtypes(data, columns):
         (cname, ctype, provided_dtype, expected_dtype) = e
         return
 
-    for
+    for data_, (cname, ctype) in zip(data, columns):
         expected_ = _ctype_to_dtype[ctype]
 
         logger.debug("data_.dtype = %s, expected_ = %s", data_.dtype, expected_)
 
         if _dtype_found(data_.dtype, expected_) == False:
-            errors.append(
+            errors.append(
+                IncompatibleDtypeError(
+                    cname=cname, ctype=ctype, provided=data_.dtype, expected=expected_
+                )
+            )
 
     if len(errors) > 0:
         raise IncompatibleDtypeErrors(errors)
 
+
 def _coerce_deduplicate(deduplicate, deduplication_mode, columns):
     """
     Throws an error when 'deduplicate' options are incorrect.
     """
     cnames = [cname for (cname, ctype) in columns]
 
-    if deduplication_mode not in [
-        raise RuntimeError(
+    if deduplication_mode not in ["drop", "upsert"]:
+        raise RuntimeError(
+            "deduplication_mode should be one of ['drop', 'upsert'], got: {}".format(
+                deduplication_mode
+            )
+        )
 
     if isinstance(deduplicate, bool):
         return deduplicate
 
     # Special value of $timestamp, hardcoded
-    if isinstance(deduplicate, str) and deduplicate ==
-        deduplicate = [
-        cnames.append(
+    if isinstance(deduplicate, str) and deduplicate == "$timestamp":
+        deduplicate = ["$timestamp"]
+        cnames.append("$timestamp")
 
     if not isinstance(deduplicate, list):
-        raise TypeError(
+        raise TypeError(
+            "drop_duplicates should be either a bool or a list, got: "
+            + type(deduplicate)
+        )
 
     for column_name in deduplicate:
         if not column_name in cnames:
-            raise RuntimeError(
+            raise RuntimeError(
+                "Provided deduplication column name '{}' not found in table columns.".format(
+                    column_name
+                )
+            )
 
     return deduplicate
 
+
 def _clean_nulls(xs, dtype):
     """
     Numpy's masked arrays have a downside that in case they're not able to convert a (masked!) value to
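Per `_coerce_deduplicate` above, `deduplicate` may be a bool, the hardcoded special string "$timestamp", or a list of existing column names, and `deduplication_mode` must be "drop" or "upsert". The accepted call shapes, as a sketch (column names are hypothetical):

```python
# Each of these passes the validation in the hunk above.
valid_options = [
    {"deduplicate": True, "deduplication_mode": "drop"},  # dedupe on all columns
    {"deduplicate": "$timestamp"},                        # expands to ["$timestamp"]
    {"deduplicate": ["$timestamp", "price"],              # explicit column list
     "deduplication_mode": "upsert"},
]
```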
@@ -291,16 +334,16 @@ def _clean_nulls(xs, dtype):
 
     assert ma.isMA(xs)
 
-    if xs.dtype is not np.dtype(
+    if xs.dtype is not np.dtype("object"):
         return xs
 
     fill_value = None
     if dtype == np.float64 or dtype == np.float32 or dtype == np.float16:
-        fill_value = float(
+        fill_value = float("nan")
     elif dtype == np.int64 or dtype == np.int32 or dtype == np.int16:
         fill_value = -1
-    elif dtype == np.dtype(
-        fill_value = np.datetime64(
+    elif dtype == np.dtype("datetime64[ns]"):
+        fill_value = np.datetime64("nat")
 
     mask = xs.mask
     xs_ = xs.filled(fill_value)
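`_clean_nulls` works around masked-array conversion issues by filling masked slots of an object array with a dtype-appropriate sentinel (NaN, -1, or NaT) and then restoring the original mask. A minimal standalone reproduction of that fill-and-remask step:

```python
import numpy as np
import numpy.ma as ma

xs = ma.array([1.5, None, 2.5], mask=[False, True, False], dtype=object)
xs_ = xs.filled(float("nan"))  # sentinel for float targets, as in the hunk above
cleaned = ma.array(np.array(xs_, dtype="f8"), mask=xs.mask)
print(cleaned)  # [1.5 -- 2.5]
```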
@@ -308,7 +351,6 @@ def _clean_nulls(xs, dtype):
     return ma.array(xs_, mask=mask)
 
 
-
 def _coerce_data(data, dtype):
     """
     Coerces each numpy array of `data` to the dtype present in `dtype`.
@@ -325,7 +367,12 @@ def _coerce_data(data, dtype):
 
         assert ma.isMA(data_)
 
-        logger.debug(
+        logger.debug(
+            "data for column with offset %d was provided in dtype '%s', but need '%s': converting data...",
+            i,
+            data_.dtype,
+            dtype_,
+        )
 
         logger.debug("dtype of data[%d] before: %s", i, data_.dtype)
         logger.debug("type of data[%d] after: %s", i, type(data_))
@@ -338,14 +385,20 @@ def _coerce_data(data, dtype):
             # One 'bug' is that, if everything is masked, the underlying data type can be
             # pretty much anything.
             if _is_all_masked(data_):
-                logger.debug(
-
-
+                logger.debug(
+                    "array completely empty, re-initializing to empty array of '%s'",
+                    dtype_,
+                )
+                data[i] = ma.masked_all(ma.size(data_), dtype=dtype_)
 
             # Another 'bug' is that when the input data is objects, we may have null-like values (like pd.NA)
             # that cannot easily be converted to, say, float.
             else:
-                logger.error(
+                logger.error(
+                    "An error occured while coercing input data type from dtype '%s' to dtype '%s': ",
+                    data_.dtype,
+                    dtype_,
+                )
                 logger.exception(err)
                 raise err
 
@@ -358,6 +411,7 @@ def _coerce_data(data, dtype):
 
     return data
 
+
 def _probe_length(xs):
     """
     Returns the length of the first non-null array in `xs`, or None if all arrays
@@ -372,6 +426,7 @@ def _probe_length(xs):
 
     return None
 
+
 def _ensure_list(xs, cinfos):
     """
     If input data is a dict, ensures it's converted to a list with the correct
@@ -422,12 +477,42 @@ def _coerce_retries(retries) -> quasardb.RetryOptions:
     elif isinstance(retries, quasardb.RetryOptions):
         return retries
     else:
-        raise TypeError(
+        raise TypeError(
+            "retries should either be an integer or quasardb.RetryOptions, got: "
+            + type(retries)
+        )
+
+
+def _kwarg_deprecation_warning(
+    old_kwarg, old_value, new_kwargs, new_values, stacklevel
+):
+    new_declaration = ", ".join(
+        f"{new_kwarg}={new_value}"
+        for new_kwarg, new_value in zip(new_kwargs, new_values)
+    )
+    warnings.warn(
+        f"The argument '{old_kwarg}' <{type(old_value).__name__}> is deprecated and will be removed in a future version. "
+        f"Please use '{new_declaration}' instead.",
+        DeprecationWarning,
+        stacklevel=stacklevel + 1,
+    )
+
+
+def _type_check(var, var_name, target_type, raise_error=True, allow_none=True):
+    if allow_none and var is None:
+        return True
+    if not isinstance(var, target_type):
+        if raise_error:
+            raise quasardb.quasardb.InvalidArgumentError(
+                f"Invalid '{var_name}' type, expected: {target_type}, got: {type(var)}"
+            )
+        return False
+    return True
 
 
 def ensure_ma(xs, dtype=None):
     if isinstance(dtype, list):
-        assert
+        assert isinstance(xs, list) == True
         return [ensure_ma(xs_, dtype_) for (xs_, dtype_) in zip(xs, dtype)]
 
     # Don't bother if we're already a masked array
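The new `_kwarg_deprecation_warning` helper is a thin wrapper over the stdlib `warnings` machinery; bumping `stacklevel` makes the warning point at the caller's call site rather than at the helper itself. The same pattern in isolation, with hypothetical argument names:

```python
import warnings

def f(new_arg=None, old_arg=None):
    if old_arg is not None:
        warnings.warn(
            "The argument 'old_arg' is deprecated and will be removed in a "
            "future version. Please use 'new_arg=...' instead.",
            DeprecationWarning,
            stacklevel=2,  # attribute the warning to f()'s caller
        )
        new_arg = old_arg
    return new_arg

print(f(old_arg=42))  # warns, then returns 42
```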
@@ -440,7 +525,7 @@ def ensure_ma(xs, dtype=None):
 
     logger.debug("coercing array with dtype: %s", xs.dtype)
 
-    if xs.dtype.kind in [
+    if xs.dtype.kind in ["O", "U", "S"]:
         logger.debug("Data is object-like, masking None values")
 
         mask = xs == None
@@ -450,21 +535,17 @@ def ensure_ma(xs, dtype=None):
     return ma.masked_invalid(xs, copy=False)
 
 
-def read_array(table=None,
-               column=None,
-               ranges=None):
+def read_array(table=None, column=None, ranges=None):
     if table is None:
         raise RuntimeError("A table is required.")
 
     if column is None:
         raise RuntimeError("A column is required.")
 
-    kwargs = {
-        'column': column
-    }
+    kwargs = {"column": column}
 
     if ranges is not None:
-        kwargs[
+        kwargs["ranges"] = ranges
 
     read_with = {
         quasardb.ColumnType.Double: table.double_get_ranges,
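For reference, calling the flattened `read_array` signature above looks roughly like this; a sketch in which the cluster URI, table name, column name, and time range are all illustrative assumptions:

```python
import numpy as np
import quasardb
import quasardb.numpy as qdbnp

with quasardb.Cluster("qdb://127.0.0.1:2836") as cluster:
    t = cluster.table("ts")  # hypothetical existing table
    start = np.datetime64("2024-01-01", "ns")
    end = np.datetime64("2024-01-02", "ns")
    # Dispatches to table.double_get_ranges & co. based on the column's type.
    result = qdbnp.read_array(table=t, column="close", ranges=[(start, end)])
```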
@@ -482,12 +563,8 @@ def read_array(table=None,
 
 
 def write_array(
-    data=None,
-    index=None,
-    table=None,
-    column=None,
-    dtype=None,
-    infer_types=True):
+    data=None, index=None, table=None, column=None, dtype=None, infer_types=True
+):
     """
     Write a Numpy array to a single column.
 
@@ -527,9 +604,8 @@ def write_array(
     if index is None:
         raise RuntimeError("An index numpy timestamp array is required.")
 
-
     data = ensure_ma(data, dtype=dtype)
-    ctype =
+    ctype = table.column_type_by_id(column)
 
     # We try to reuse some of the other functions, which assume array-like
     # shapes for column info and data. It's a bit hackish, but actually works
@@ -564,28 +640,39 @@ def write_array(
         quasardb.ColumnType.Timestamp: table.timestamp_insert,
     }
 
-    logger.info(
+    logger.info(
+        "Writing array (%d rows of dtype %s) to columns %s.%s (type %s)",
+        len(data),
+        data.dtype,
+        table.get_name(),
+        column,
+        ctype,
+    )
     write_with[ctype](column, index, data)
 
+
 def write_arrays(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    data,
+    cluster,
+    table=None,
+    *,
+    dtype=None,
+    index=None,
+    # TODO: Set the default push_mode after removing _async, fast and truncate
+    push_mode=None,
+    _async=False,
+    fast=False,
+    truncate=False,
+    truncate_range=None,
+    deduplicate=False,
+    deduplication_mode="drop",
+    infer_types=True,
+    writer=None,
+    write_through=True,
+    retries=3,
+    # We accept additional kwargs that will be passed through the writer.push() methods
+    **kwargs,
+):
     """
     Write multiple aligned numpy arrays to a table.
 
@@ -649,13 +736,32 @@ def write_arrays(
         Defaults to True. For production use cases where you want to avoid implicit conversions,
         we recommend setting this to False.
 
+    push_mode: optional quasardb.WriterPushMode
+        The mode used for inserting data. Can be either a string or a `WriterPushMode` enumeration item.
+        Available options:
+        * `Truncate`: Truncate (also referred to as upsert) the data in-place. Will detect time range
+          to truncate from the time range inside the dataframe.
+        * `Async`: Uses asynchronous insertion API where commits are buffered server-side and
+          acknowledged before they are written to disk. If you insert to the same table from
+          multiple processes, setting this to True may improve performance.
+        * `Fast`: Whether to use 'fast push'. If you incrementally add small batches of data to table,
+          you may see better performance if you set this to True.
+        * `Transactional`: Ensures full transactional consistency.
+
+        Defaults to `Transactional`.
+
     truncate: optional bool
+        **DEPRECATED** – Use `push_mode=WriterPushMode.Truncate` instead.
         Truncate (also referred to as upsert) the data in-place. Will detect time range to truncate
         from the time range inside the dataframe.
 
         Defaults to False.
 
+    truncate_range: optional tuple
+        Time range to truncate from the time range inside the dataframe.
+
     _async: optional bool
+        **DEPRECATED** – Use `push_mode=WriterPushMode.Async` instead.
         If true, uses asynchronous insertion API where commits are buffered server-side and
         acknowledged before they are written to disk. If you insert to the same table from
         multiple processes, setting this to True may improve performance.
@@ -663,6 +769,7 @@ def write_arrays(
         Defaults to False.
 
     fast: optional bool
+        **DEPRECATED** – Use `push_mode=WriterPushMode.Fast` instead.
         Whether to use 'fast push'. If you incrementally add small batches of data to table,
         you may see better performance if you set this to True.
 
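Putting the new signature and docstring together, a caller migrating off the deprecated flags would change roughly as follows; a sketch based only on this diff, with the URI, table name, and data as illustrative assumptions:

```python
import numpy as np
import quasardb
import quasardb.numpy as qdbnp

index = np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]")
data = {"open": np.array([1.0, 2.0]), "close": np.array([1.5, 2.5])}

with quasardb.Cluster("qdb://127.0.0.1:2836") as cluster:
    # Before (now deprecated): qdbnp.write_arrays(data, cluster, table="ts", fast=True)
    qdbnp.write_arrays(
        data,
        cluster,
        table="ts",
        index=index,
        push_mode=quasardb.WriterPushMode.Fast,
    )
```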
@@ -691,34 +798,60 @@ def write_arrays(
 
     if table:
         logger.debug("table explicitly provided, assuming single-table write")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        data = [(table, data)]
+        table = None
+
+    _type_check(push_mode, "push_mode", target_type=quasardb.WriterPushMode)
+    deprecation_stacklevel = kwargs.pop("deprecation_stacklevel", 1) + 1
+
+    if isinstance(truncate, tuple):
+        # Especial case, truncate might be a tuple indicating the range.
+        _kwarg_deprecation_warning(
+            "truncate",
+            truncate,
+            ["push_mode", "truncate_range"],
+            [quasardb.WriterPushMode.Truncate, truncate],
+            deprecation_stacklevel,
+        )
+        truncate_range = truncate_range or truncate
+        truncate = True
+
+    kwarg_to_mode = {
+        # "kwarg": (kwarg_type, kwarg_push_mode, is_deprecated)
+        "fast": (bool, quasardb.WriterPushMode.Fast, True),
+        "_async": (bool, quasardb.WriterPushMode.Async, True),
+        "truncate": (bool, quasardb.WriterPushMode.Truncate, True),
+        "truncate_range": (tuple, quasardb.WriterPushMode.Truncate, False),
+    }
 
-
+    for kwarg, info in kwarg_to_mode.items():
+        expected_type, mode, deprecated = info
+        kwarg_value = locals()[kwarg]
+        _type_check(kwarg_value, kwarg, target_type=expected_type)
+
+        if kwarg_value:
+            if push_mode and push_mode != mode:
+                raise quasardb.quasardb.InvalidArgumentError(
+                    f"Found '{kwarg}' in kwargs, but push mode is already set to {push_mode}"
+                )
+            push_mode = mode
+            if deprecated:
+                _kwarg_deprecation_warning(
+                    kwarg, kwarg_value, ["push_mode"], [mode], deprecation_stacklevel
+                )
+
+    if not push_mode:
+        push_mode = quasardb.WriterPushMode.Transactional
 
     # Create batch column info from dataframe
     if writer is None:
         writer = cluster.writer()
 
+    ret = []
     n_rows = 0
-
     push_data = quasardb.WriterData()
 
-    for
+    for table, data_ in data:
         # Acquire reference to table if string is provided
         if isinstance(table, str):
             table = table_cache.lookup(table, cluster)
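The `kwarg_to_mode` table above collapses the legacy boolean flags into the single `push_mode` value and rejects contradictory combinations. The control flow, reduced to a standalone sketch with plain strings standing in for the `WriterPushMode` enum:

```python
legacy = {"fast": True, "_async": False, "truncate": False}
kwarg_to_mode = {"fast": "Fast", "_async": "Async", "truncate": "Truncate"}

push_mode = None
for kwarg, mode in kwarg_to_mode.items():
    if legacy[kwarg]:
        if push_mode and push_mode != mode:
            raise ValueError(f"'{kwarg}' conflicts with push_mode={push_mode}")
        push_mode = mode
push_mode = push_mode or "Transactional"  # the documented default
print(push_mode)  # Fast
```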
@@ -729,10 +862,15 @@ def write_arrays(
         assert type(dtype) is list
         assert len(dtype) is len(cinfos)
 
-
-
-
-
+        if index is None and isinstance(data_, dict) and "$timestamp" in data_:
+            # Create shallow copy of `data_` so that we don't modify the reference, i.e.
+            # delete keys.
+            #
+            # This ensures that the user can call the same function multiple times without
+            # side-effects.
+            data_ = data_.copy()
+            index_ = data_.pop("$timestamp")
+            assert "$timestamp" not in data_
         elif index is not None:
             index_ = index
         else:
@@ -751,7 +889,6 @@ def write_arrays(
         data_ = ensure_ma(data_, dtype=dtype)
         data_ = _coerce_data(data_, dtype)
 
-
         # Just some additional friendly information about incorrect dtypes, we'd
         # prefer to have this information thrown from Python instead of native
         # code as it generally makes for somewhat better error context.
@@ -778,37 +915,27 @@ def write_arrays(
     # The initial use case was that so we can add additional parameters for test mocks, e.g. `mock_failures` so that
     # we can validate the retry functionality.
     push_kwargs = kwargs
-    push_kwargs[
-    push_kwargs[
-    push_kwargs[
-    push_kwargs[
+    push_kwargs["deduplicate"] = deduplicate
+    push_kwargs["deduplication_mode"] = deduplication_mode
+    push_kwargs["write_through"] = write_through
+    push_kwargs["retries"] = retries
+    push_kwargs["push_mode"] = push_mode
+    if truncate_range:
+        push_kwargs["range"] = truncate_range
 
     logger.debug("pushing %d rows", n_rows)
     start = time.time()
 
-    if fast is True:
-        push_kwargs['push_mode'] = quasardb.WriterPushMode.Fast
-    elif truncate is True:
-        push_kwargs['push_mode'] = quasardb.WriterPushMode.Truncate
-    elif isinstance(truncate, tuple):
-        push_kwargs['push_mode'] = quasardb.WriterPushMode.Truncate
-        push_kwargs['range'] = truncate
-    elif _async is True:
-        push_kwargs['push_mode'] = quasardb.WriterPushMode.Async
-    else:
-        push_kwargs['push_mode'] = quasardb.WriterPushMode.Transactional
-
     writer.push(push_data, **push_kwargs)
 
-    logger.debug("pushed %d rows in %s seconds",
-                 n_rows, (time.time() - start))
+    logger.debug("pushed %d rows in %s seconds", n_rows, (time.time() - start))
 
     return ret
 
 
 def _xform_query_results(xs, index, dict):
     if len(xs) == 0:
-        return (np.array([], np.dtype(
+        return (np.array([], np.dtype("datetime64[ns]")), np.array([]))
 
     n = None
     for x in xs:
@@ -822,8 +949,8 @@ def _xform_query_results(xs, index, dict):
     if index is None:
         # Generate a range, put it in the front of the result list,
         # recurse and tell the function to use that index.
-        xs_ = [(
-        return _xform_query_results(xs_,
+        xs_ = [("$index", np.arange(n))] + xs
+        return _xform_query_results(xs_, "$index", dict)
 
     if isinstance(index, str):
         for i in range(len(xs)):
@@ -833,10 +960,18 @@ def _xform_query_results(xs, index, dict):
                 # recurse with that offset
                 return _xform_query_results(xs, i, dict)
 
-        raise KeyError(
+        raise KeyError(
+            "Unable to resolve index column: column not found in results: {}".format(
+                index
+            )
+        )
 
     if not isinstance(index, int):
-        raise TypeError(
+        raise TypeError(
+            "Unable to resolve index column: unrecognized type {}: {}".format(
+                type(index), index
+            )
+        )
 
     idx = xs[index][1]
     del xs[index]
@@ -845,7 +980,9 @@ def _xform_query_results(xs, index, dict):
     # masked items: we cannot not have an index for a certain row.
     if ma.isMA(idx):
         if ma.count_masked(idx) > 0:
-            raise ValueError(
+            raise ValueError(
+                "Invalid index: null values detected. An index is never allowed to have null values."
+            )
 
     assert isinstance(idx.data, np.ndarray)
     idx = idx.data
@@ -860,10 +997,7 @@ def _xform_query_results(xs, index, dict):
     return (idx, xs_)
 
 
-def query(cluster,
-          query,
-          index=None,
-          dict=False):
+def query(cluster, query, index=None, dict=False):
     """
     Execute a query and return the results as numpy arrays. The shape of the return value
     is always: