quasardb 3.14.2.dev7__cp39-cp39-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of quasardb might be problematic.
- quasardb/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
- quasardb/CMakeFiles/progress.marks +1 -0
- quasardb/Makefile +189 -0
- quasardb/__init__.py +140 -0
- quasardb/__init__.pyi +72 -0
- quasardb/cmake_install.cmake +58 -0
- quasardb/date/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
- quasardb/date/CMakeFiles/Export/b76006b2b7125baf1b0b4d4ca4db82bd/dateTargets.cmake +108 -0
- quasardb/date/CMakeFiles/progress.marks +1 -0
- quasardb/date/Makefile +189 -0
- quasardb/date/cmake_install.cmake +81 -0
- quasardb/date/dateConfigVersion.cmake +65 -0
- quasardb/date/dateTargets.cmake +63 -0
- quasardb/extensions/__init__.py +8 -0
- quasardb/extensions/writer.py +191 -0
- quasardb/firehose.py +103 -0
- quasardb/libqdb_api.so +0 -0
- quasardb/numpy/__init__.py +1045 -0
- quasardb/pandas/__init__.py +533 -0
- quasardb/pool.py +311 -0
- quasardb/pybind11/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
- quasardb/pybind11/CMakeFiles/progress.marks +1 -0
- quasardb/pybind11/Makefile +189 -0
- quasardb/pybind11/cmake_install.cmake +50 -0
- quasardb/quasardb/__init__.pyi +97 -0
- quasardb/quasardb/_batch_column.pyi +5 -0
- quasardb/quasardb/_batch_inserter.pyi +30 -0
- quasardb/quasardb/_blob.pyi +16 -0
- quasardb/quasardb/_cluster.pyi +100 -0
- quasardb/quasardb/_continuous.pyi +16 -0
- quasardb/quasardb/_double.pyi +7 -0
- quasardb/quasardb/_entry.pyi +60 -0
- quasardb/quasardb/_error.pyi +15 -0
- quasardb/quasardb/_integer.pyi +7 -0
- quasardb/quasardb/_node.pyi +26 -0
- quasardb/quasardb/_options.pyi +105 -0
- quasardb/quasardb/_perf.pyi +5 -0
- quasardb/quasardb/_properties.pyi +5 -0
- quasardb/quasardb/_query.pyi +2 -0
- quasardb/quasardb/_reader.pyi +9 -0
- quasardb/quasardb/_retry.pyi +16 -0
- quasardb/quasardb/_string.pyi +12 -0
- quasardb/quasardb/_table.pyi +125 -0
- quasardb/quasardb/_tag.pyi +5 -0
- quasardb/quasardb/_timestamp.pyi +9 -0
- quasardb/quasardb/_writer.pyi +111 -0
- quasardb/quasardb/metrics/__init__.pyi +20 -0
- quasardb/quasardb.cpython-39-x86_64-linux-gnu.so +0 -0
- quasardb/range-v3/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
- quasardb/range-v3/CMakeFiles/Export/48a02d54b5e9e60c30c5f249b431a911/range-v3-targets.cmake +128 -0
- quasardb/range-v3/CMakeFiles/progress.marks +1 -0
- quasardb/range-v3/CMakeFiles/range.v3.headers.dir/DependInfo.cmake +22 -0
- quasardb/range-v3/CMakeFiles/range.v3.headers.dir/build.make +86 -0
- quasardb/range-v3/CMakeFiles/range.v3.headers.dir/cmake_clean.cmake +5 -0
- quasardb/range-v3/CMakeFiles/range.v3.headers.dir/compiler_depend.make +2 -0
- quasardb/range-v3/CMakeFiles/range.v3.headers.dir/compiler_depend.ts +2 -0
- quasardb/range-v3/CMakeFiles/range.v3.headers.dir/progress.make +1 -0
- quasardb/range-v3/Makefile +204 -0
- quasardb/range-v3/cmake_install.cmake +93 -0
- quasardb/range-v3/include/range/v3/version.hpp +24 -0
- quasardb/range-v3/range-v3-config-version.cmake +83 -0
- quasardb/range-v3/range-v3-config.cmake +80 -0
- quasardb/stats.py +358 -0
- quasardb/table_cache.py +56 -0
- quasardb-3.14.2.dev7.dist-info/METADATA +41 -0
- quasardb-3.14.2.dev7.dist-info/RECORD +69 -0
- quasardb-3.14.2.dev7.dist-info/WHEEL +6 -0
- quasardb-3.14.2.dev7.dist-info/licenses/LICENSE.md +11 -0
- quasardb-3.14.2.dev7.dist-info/top_level.txt +1 -0
quasardb/pandas/__init__.py
@@ -0,0 +1,533 @@
# pylint: disable=C0103,C0111,C0302,R0903

# Copyright (c) 2009-2024, quasardb SAS. All rights reserved.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of quasardb nor the names of its contributors may
#      be used to endorse or promote products derived from this software
#      without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY QUASARDB AND CONTRIBUTORS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

import logging
import warnings
from datetime import datetime
from functools import partial

import quasardb
import quasardb.table_cache as table_cache
import quasardb.numpy as qdbnp


logger = logging.getLogger("quasardb.pandas")


class PandasRequired(ImportError):
    """
    Exception raised when trying to use QuasarDB pandas integration, but
    pandas has not been installed.
    """

    pass


try:
    import numpy as np
    import numpy.ma as ma
    import pandas as pd
    from pandas.core.api import DataFrame, Series
    from pandas.core.base import PandasObject

except ImportError:
    raise PandasRequired("The pandas library is required to handle pandas data formats")


# Constant mapping of numpy dtype to QuasarDB column type
# TODO(leon): support this natively in qdb C api ? we have everything we need
# to understand dtypes.
_dtype_map = {
    np.dtype("int64"): quasardb.ColumnType.Int64,
    np.dtype("int32"): quasardb.ColumnType.Int64,
    np.dtype("float64"): quasardb.ColumnType.Double,
    np.dtype("object"): quasardb.ColumnType.String,
    np.dtype("M8[ns]"): quasardb.ColumnType.Timestamp,
    np.dtype("datetime64[ns]"): quasardb.ColumnType.Timestamp,
    "int64": quasardb.ColumnType.Int64,
    "int32": quasardb.ColumnType.Int64,
    "float32": quasardb.ColumnType.Double,
    "float64": quasardb.ColumnType.Double,
    "timestamp": quasardb.ColumnType.Timestamp,
    "string": quasardb.ColumnType.String,
    "bytes": quasardb.ColumnType.Blob,
    "floating": quasardb.ColumnType.Double,
    "integer": quasardb.ColumnType.Int64,
    "datetime64": quasardb.ColumnType.Timestamp,
}

def read_series(table, col_name, ranges=None):
    """
    Read a Pandas Timeseries from a single column.

    Parameters:
    -----------

    table : quasardb.Timeseries
        QuasarDB Timeseries table object, e.g. qdb_cluster.table('my_table')

    col_name : str
        Name of the column to read.

    ranges : list
        A list of ranges to read, represented as tuples of Numpy datetime64[ns] objects.
    """
    read_with = {
        quasardb.ColumnType.Double: table.double_get_ranges,
        quasardb.ColumnType.Blob: table.blob_get_ranges,
        quasardb.ColumnType.String: table.string_get_ranges,
        quasardb.ColumnType.Int64: table.int64_get_ranges,
        quasardb.ColumnType.Timestamp: table.timestamp_get_ranges,
        quasardb.ColumnType.Symbol: table.string_get_ranges,
    }

    kwargs = {"column": col_name}

    if ranges is not None:
        kwargs["ranges"] = ranges

    # Dispatch based on column type
    t = table.column_type_by_id(col_name)

    logger.info(
        "reading Series from column %s.%s with type %s", table.get_name(), col_name, t
    )

    res = (read_with[t])(**kwargs)

    return Series(res[1], index=res[0])

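For illustration, a minimal read of a single column over an explicit time range might look like the sketch below; the cluster URI, table name, and column name are hypothetical:

    import numpy as np
    import quasardb
    import quasardb.pandas as qdbpd

    conn = quasardb.Cluster("qdb://127.0.0.1:2836")  # hypothetical URI
    table = conn.table("my_table")                   # assumes this table exists

    # Ranges are (begin, end) tuples of numpy datetime64[ns] values.
    ranges = [(np.datetime64("2024-01-01", "ns"), np.datetime64("2024-02-01", "ns"))]

    series = qdbpd.read_series(table, "close", ranges=ranges)
    print(series.head())
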
def write_series(series, table, col_name, infer_types=True, dtype=None):
    """
    Writes a Pandas Timeseries to a single column.

    Parameters:
    -----------

    series : pandas.Series
        Pandas Series, with a numpy.datetime64[ns] index. The underlying data
        will be transformed to the appropriate QuasarDB type where possible.

    table : quasardb.Timeseries
        QuasarDB Timeseries table object, e.g. qdb_cluster.table('my_table')

    col_name : str
        Column name to store data in.
    """

    logger.debug(
        "write_series, table=%s, col_name=%s, infer_types=%s, dtype=%s",
        table.get_name(),
        col_name,
        infer_types,
        dtype,
    )

    data = ma.masked_array(series.to_numpy(copy=False), mask=series.isna())

    if infer_types is True:
        index = series.index.to_numpy("datetime64[ns]", copy=False)
    else:
        index = series.index.to_numpy(copy=False)

    return qdbnp.write_array(
        data=data,
        index=index,
        table=table,
        column=col_name,
        dtype=dtype,
        infer_types=infer_types,
    )

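A matching write sketch, again with hypothetical names; the target table and its 'close' Double column are assumed to already exist:

    import pandas as pd
    import quasardb
    import quasardb.pandas as qdbpd

    conn = quasardb.Cluster("qdb://127.0.0.1:2836")  # hypothetical URI
    table = conn.table("my_table")

    # The index must be datetime64[ns]; the values land in column 'close'.
    idx = pd.DatetimeIndex(["2024-01-01", "2024-01-02"])
    qdbpd.write_series(pd.Series([1.0, 2.0], index=idx), table, "close")
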
def query(
    cluster: quasardb.Cluster,
    query: str,
    index: str = None,
    blobs: bool = False,
    numpy: bool = True,
):
    """
    Execute *query* and return the result as a pandas DataFrame.

    Parameters
    ----------
    cluster : quasardb.Cluster
        Active connection to the QuasarDB cluster.

    query : str
        The query to execute.

    index : str | None, default None
        Column to use as index. When None, a synthetic index is created and
        named "$index".

    blobs, numpy
        Deprecated; no longer used. Supplying a non-default value issues a
        DeprecationWarning and the argument is ignored.
    """
    # ------------------------------------------------------------------ deprecations
    if blobs is not False:
        warnings.warn(
            "`blobs` is deprecated and will be removed in a future version; "
            "the argument is ignored.",
            DeprecationWarning,
            stacklevel=2,
        )
    if numpy is not True:
        warnings.warn(
            "`numpy` is deprecated and will be removed in a future version; "
            "the argument is ignored.",
            DeprecationWarning,
            stacklevel=2,
        )
    # ------------------------------------------------------------------------------

    logger.debug("querying and returning as DataFrame: %s", query)
    index_vals, m = qdbnp.query(cluster, query, index=index, dict=True)

    index_name = "$index" if index is None else index
    index_obj = pd.Index(index_vals, name=index_name)

    return pd.DataFrame(m, index=index_obj)

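For example, assuming a table named 'my_table', either form below returns a DataFrame; the second uses the '$timestamp' column as the index instead of the synthetic '$index':

    import quasardb
    import quasardb.pandas as qdbpd

    conn = quasardb.Cluster("qdb://127.0.0.1:2836")  # hypothetical URI

    df1 = qdbpd.query(conn, "SELECT * FROM my_table")
    df2 = qdbpd.query(conn, "SELECT * FROM my_table", index="$timestamp")
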
def stream_dataframes(
    conn: quasardb.Cluster,
    tables: list,
    *,
    batch_size: int = 2**16,
    column_names: list = None,
    ranges: list = None,
):
    """
    Read Pandas DataFrames from one or more QuasarDB Timeseries tables. Returns a
    generator of dataframes of at most `batch_size` rows each, which is useful when
    traversing a large dataset that does not fit into memory.

    Parameters:
    -----------

    conn : quasardb.Cluster
        Connection to the QuasarDB database.

    tables : list[str | quasardb.Table]
        QuasarDB tables to stream, as a list of strings or quasardb table objects.

    batch_size : int
        The number of rows to fetch in a single read operation. If unset, uses 2^16
        (65536) rows as batch size by default.

    column_names : optional list
        List of columns to read into the dataframe. The timestamp column '$timestamp'
        is always read.

        Defaults to all columns.

    ranges : optional list
        A list of time ranges to read, represented as tuples of Numpy datetime64[ns]
        objects. Defaults to the entire table.
    """
    # Sanitize batch_size
    if batch_size is None:
        batch_size = 2**16
    elif not isinstance(batch_size, int):
        raise TypeError(
            "batch_size should be an integer, but got: {} with value {}".format(
                type(batch_size), str(batch_size)
            )
        )

    kwargs = {"batch_size": batch_size}

    if column_names:
        kwargs["column_names"] = column_names

    if ranges:
        kwargs["ranges"] = ranges

    coerce_table_name_fn = lambda x: x if isinstance(x, str) else x.get_name()
    kwargs["table_names"] = [coerce_table_name_fn(x) for x in tables]

    with conn.reader(**kwargs) as reader:
        for batch in reader:
            # We always expect the timestamp column, and set this as the index
            assert "$timestamp" in batch

            idx = pd.Index(batch.pop("$timestamp"), copy=False, name="$timestamp")
            df = pd.DataFrame(batch, index=idx)

            yield df

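A streaming sketch over two hypothetical tables; each yielded DataFrame is indexed by '$timestamp' and holds at most `batch_size` rows:

    import quasardb
    import quasardb.pandas as qdbpd

    conn = quasardb.Cluster("qdb://127.0.0.1:2836")  # hypothetical URI

    for df in qdbpd.stream_dataframes(conn, ["stocks", "indexes"], batch_size=4096):
        process(df)  # placeholder for your own per-batch logic
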
def stream_dataframe(conn: quasardb.Cluster, table, **kwargs):
    """
    Read a single table and return a stream of dataframes. This is a convenience
    function that wraps around `stream_dataframes`.
    """
    kwargs["tables"] = [table]

    # For backwards compatibility, we drop the `$table` column returned: this is
    # not strictly necessary, but it is reasonable to drop it when reading from a
    # single table, which is the case here.
    clean_df_fn = lambda df: df.drop(columns=["$table"])

    return (clean_df_fn(df) for df in stream_dataframes(conn, **kwargs))

def read_dataframe(conn: quasardb.Cluster, table, **kwargs):
    """
    Read a Pandas Dataframe from a QuasarDB Timeseries table. Wraps around
    stream_dataframe() and concatenates all batches into a single dataframe;
    batch_size is always overridden to 2^16 (65536).

    Parameters:
    -----------

    conn : quasardb.Cluster
        Connection to the QuasarDB database.

    table : str | quasardb.Table
        QuasarDB table to read, either as a string or a table object. When re-executing
        the same function multiple times on the same tables, providing the table as an
        object has a performance benefit.
    """

    if (
        "batch_size" in kwargs
        and kwargs["batch_size"] != 0
        and kwargs["batch_size"] is not None
    ):
        logger.warning(
            "Providing a batch size with read_dataframe is unsupported, overriding batch_size to 65536."
        )
        logger.warning(
            "If you wish to traverse the data in smaller batches, please use: stream_dataframe()."
        )
    kwargs["batch_size"] = 2**16

    # Note that this is *lazy*: dfs is a generator, not a list -- as such, dataframes
    # will be fetched on-demand, which means an error could occur in the middle of
    # processing dataframes.
    dfs = stream_dataframe(conn, table, **kwargs)

    # If the result of stream_dataframe is empty, pd.concat() raises a ValueError.
    # Since stream_dataframe is a generator, there is no easy way to check for this
    # condition without evaluating it; the simplest approach is to catch the
    # ValueError and return an empty DataFrame.
    try:
        return pd.concat(dfs, copy=False)
    except ValueError as e:
        logger.error(
            "Error while concatenating dataframes. This can happen if the result set is empty. Returning empty dataframe. Error: %s",
            e,
        )
        return pd.DataFrame()

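Reading a whole (hypothetical) table into memory then reduces to a single call; an empty result set yields an empty DataFrame rather than an error:

    import quasardb
    import quasardb.pandas as qdbpd

    conn = quasardb.Cluster("qdb://127.0.0.1:2836")  # hypothetical URI
    df = qdbpd.read_dataframe(conn, "my_table")
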
def _extract_columns(df, cinfos):
    """
    Converts a dataframe to a dict of numpy masked arrays, one for each column.

    Arrays are keyed by column name, in the same order as the table's columns.
    If a dataframe column is not present in the table, it will be omitted.
    """
    ret = {}

    # Grab all columns from the DataFrame in the order of table columns,
    # skipping table columns that are not present in the df.
    for i in range(len(cinfos)):
        (cname, ctype) = cinfos[i]

        if cname in df.columns:
            arr = df[cname].array
            ret[cname] = ma.masked_array(arr.to_numpy(copy=False), mask=arr.isna())

    return ret

def write_dataframes(dfs, cluster, *, create=False, shard_size=None, **kwargs):
    """
    Store dataframes into one or more tables. Any additional parameters not
    documented here are passed to numpy.write_arrays(). Please consult the pydoc
    of that function for additional accepted parameters.

    Parameters:
    -----------

    dfs : dict[str | quasardb.Table, pd.DataFrame] | list[tuple[str | quasardb.Table, pd.DataFrame]]
        Either a dict that maps tables (as objects or names) to dataframes, or a
        list of table<>dataframe tuples.

    cluster : quasardb.Cluster
        Active connection to the QuasarDB cluster.

    create : optional bool
        Whether to create the table. Defaults to False.

    shard_size : optional datetime.timedelta
        The shard size of the timeseries you wish to create when `create` is True.
    """

    # If dfs is a dict, we convert it to a list of tuples.
    if isinstance(dfs, dict):
        dfs = dfs.items()

    if shard_size is not None and create is False:
        raise ValueError("Invalid argument: shard size provided while create is False")

    # If the tables are provided as strings, we look them up.
    dfs_ = []
    for table, df in dfs:
        if isinstance(table, str):
            table = table_cache.lookup(table, cluster)

        dfs_.append((table, df))

    data_by_table = []

    for table, df in dfs_:
        logger.debug("quasardb.pandas.write_dataframes, create = %s", create)
        assert isinstance(df, pd.DataFrame)

        # Create table if requested
        if create:
            _create_table_from_df(df, table, shard_size)

        cinfos = [(x.name, x.type) for x in table.list_columns()]

        if not df.index.is_monotonic_increasing:
            logger.warning(
                "dataframe index is unsorted, resorting dataframe based on index"
            )
            df = df.sort_index().reindex()

        # We pass everything else to our qdbnp.write_arrays function, as generally
        # speaking it is (much) more sensible to deal with numpy arrays than Pandas
        # dataframes: pandas has the bad habit of wanting to cast data to different
        # types if your data is sparse, most notably forcing sparse integer arrays
        # to floating points.
        data = _extract_columns(df, cinfos)
        data["$timestamp"] = df.index.to_numpy(copy=False, dtype="datetime64[ns]")

        data_by_table.append((table, data))

    kwargs["deprecation_stacklevel"] = kwargs.get("deprecation_stacklevel", 1) + 1
    return qdbnp.write_arrays(data_by_table, cluster, table=None, index=None, **kwargs)

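A write sketch with table creation enabled; the table names, columns, and shard size below are hypothetical:

    from datetime import timedelta

    import pandas as pd
    import quasardb
    import quasardb.pandas as qdbpd

    conn = quasardb.Cluster("qdb://127.0.0.1:2836")  # hypothetical URI

    idx = pd.DatetimeIndex(["2024-01-01", "2024-01-02"], name="$timestamp")
    dfs = {
        "stocks": pd.DataFrame({"close": [1.0, 2.0]}, index=idx),
        "indexes": pd.DataFrame({"close": [3.0, 4.0]}, index=idx),
    }

    # Creates the tables with daily shards if they do not exist yet.
    qdbpd.write_dataframes(dfs, conn, create=True, shard_size=timedelta(days=1))
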
def write_dataframe(df, cluster, table, **kwargs):
    """
    Store a single dataframe into a table. Takes the same arguments as
    `write_dataframes`, except it takes only a single df/table combination.
    """
    kwargs["deprecation_stacklevel"] = kwargs.get("deprecation_stacklevel", 1) + 1
    write_dataframes([(table, df)], cluster, **kwargs)

def write_pinned_dataframe(*args, **kwargs):
    """
    Legacy wrapper around write_dataframe().
    """
    logger.warning(
        "write_pinned_dataframe is deprecated and will be removed in a future release."
    )
    logger.warning("Please use write_dataframe directly instead.")
    kwargs["deprecation_stacklevel"] = 2
    return write_dataframe(*args, **kwargs)

def _create_table_from_df(df, table, shard_size=None):
    cols = list()

    dtypes = _get_inferred_dtypes(df)

    logger.info("got inferred dtypes: %s", dtypes)
    for c in df.columns:
        dt = dtypes[c]
        ct = _dtype_to_column_type(df[c].dtype, dt)
        logger.debug(
            "probed pandas dtype %s, inferred dtype %s, and mapped to quasardb column type %s",
            df[c].dtype,
            dt,
            ct,
        )
        cols.append(quasardb.ColumnInfo(ct, c))

    try:
        if not shard_size:
            table.create(cols)
        else:
            table.create(cols, shard_size)
    except quasardb.quasardb.AliasAlreadyExistsError:
        # TODO(leon): warn? how?
        pass

    return table

def _dtype_to_column_type(dt, inferred):
    res = _dtype_map.get(inferred, None)
    if res is None:
        res = _dtype_map.get(dt, None)

    if res is None:
        raise ValueError("Incompatible data type: {}".format(dt))

    return res

def _get_inferred_dtypes(df):
    dtypes = dict()
    for i in range(len(df.columns)):
        c = df.columns[i]
        dt = pd.api.types.infer_dtype(df[c].values)
        logger.debug("Determined dtype of column %s to be %s", c, dt)
        dtypes[c] = dt
    return dtypes

def _get_inferred_dtypes_indexed(df):
    dtypes = _get_inferred_dtypes(df)
    # Performance improvement: avoid expensive dict lookups by indexing
    # the column types by relative offset within the df.
    return list(dtypes[c] for c in df.columns)