quasardb-3.14.2.dev8-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. quasardb/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
  2. quasardb/CMakeFiles/progress.marks +1 -0
  3. quasardb/Makefile +189 -0
  4. quasardb/__init__.py +140 -0
  5. quasardb/__init__.pyi +72 -0
  6. quasardb/cmake_install.cmake +58 -0
  7. quasardb/date/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
  8. quasardb/date/CMakeFiles/Export/b76006b2b7125baf1b0b4d4ca4db82bd/dateTargets.cmake +108 -0
  9. quasardb/date/CMakeFiles/progress.marks +1 -0
  10. quasardb/date/Makefile +189 -0
  11. quasardb/date/cmake_install.cmake +81 -0
  12. quasardb/date/dateConfigVersion.cmake +65 -0
  13. quasardb/date/dateTargets.cmake +63 -0
  14. quasardb/extensions/__init__.py +9 -0
  15. quasardb/extensions/writer.py +195 -0
  16. quasardb/firehose.py +112 -0
  17. quasardb/libqdb_api.so +0 -0
  18. quasardb/numpy/__init__.py +1106 -0
  19. quasardb/pandas/__init__.py +696 -0
  20. quasardb/pool.py +338 -0
  21. quasardb/pybind11/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
  22. quasardb/pybind11/CMakeFiles/progress.marks +1 -0
  23. quasardb/pybind11/Makefile +189 -0
  24. quasardb/pybind11/cmake_install.cmake +50 -0
  25. quasardb/quasardb/__init__.pyi +97 -0
  26. quasardb/quasardb/_batch_column.pyi +5 -0
  27. quasardb/quasardb/_batch_inserter.pyi +32 -0
  28. quasardb/quasardb/_blob.pyi +16 -0
  29. quasardb/quasardb/_cluster.pyi +106 -0
  30. quasardb/quasardb/_continuous.pyi +18 -0
  31. quasardb/quasardb/_double.pyi +7 -0
  32. quasardb/quasardb/_entry.pyi +61 -0
  33. quasardb/quasardb/_error.pyi +15 -0
  34. quasardb/quasardb/_integer.pyi +7 -0
  35. quasardb/quasardb/_node.pyi +26 -0
  36. quasardb/quasardb/_options.pyi +106 -0
  37. quasardb/quasardb/_perf.pyi +7 -0
  38. quasardb/quasardb/_properties.pyi +5 -0
  39. quasardb/quasardb/_query.pyi +2 -0
  40. quasardb/quasardb/_reader.pyi +15 -0
  41. quasardb/quasardb/_retry.pyi +16 -0
  42. quasardb/quasardb/_string.pyi +12 -0
  43. quasardb/quasardb/_table.pyi +140 -0
  44. quasardb/quasardb/_tag.pyi +5 -0
  45. quasardb/quasardb/_timestamp.pyi +9 -0
  46. quasardb/quasardb/_writer.pyi +112 -0
  47. quasardb/quasardb/metrics/__init__.pyi +28 -0
  48. quasardb/quasardb.cpython-310-x86_64-linux-gnu.so +0 -0
  49. quasardb/range-v3/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
  50. quasardb/range-v3/CMakeFiles/Export/48a02d54b5e9e60c30c5f249b431a911/range-v3-targets.cmake +128 -0
  51. quasardb/range-v3/CMakeFiles/progress.marks +1 -0
  52. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/DependInfo.cmake +22 -0
  53. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/build.make +86 -0
  54. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/cmake_clean.cmake +5 -0
  55. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/compiler_depend.make +2 -0
  56. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/compiler_depend.ts +2 -0
  57. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/progress.make +1 -0
  58. quasardb/range-v3/Makefile +204 -0
  59. quasardb/range-v3/cmake_install.cmake +93 -0
  60. quasardb/range-v3/include/range/v3/version.hpp +24 -0
  61. quasardb/range-v3/range-v3-config-version.cmake +83 -0
  62. quasardb/range-v3/range-v3-config.cmake +80 -0
  63. quasardb/stats.py +376 -0
  64. quasardb/table_cache.py +60 -0
  65. quasardb/typing.py +23 -0
  66. quasardb-3.14.2.dev8.dist-info/METADATA +41 -0
  67. quasardb-3.14.2.dev8.dist-info/RECORD +70 -0
  68. quasardb-3.14.2.dev8.dist-info/WHEEL +6 -0
  69. quasardb-3.14.2.dev8.dist-info/licenses/LICENSE.md +11 -0
  70. quasardb-3.14.2.dev8.dist-info/top_level.txt +1 -0
@@ -0,0 +1,696 @@
+ # pylint: disable=C0103,C0111,C0302,R0903
+
+ # Copyright (c) 2009-2024, quasardb SAS. All rights reserved.
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are met:
+ #
+ #    * Redistributions of source code must retain the above copyright
+ #      notice, this list of conditions and the following disclaimer.
+ #    * Redistributions in binary form must reproduce the above copyright
+ #      notice, this list of conditions and the following disclaimer in the
+ #      documentation and/or other materials provided with the distribution.
+ #    * Neither the name of quasardb nor the names of its contributors may
+ #      be used to endorse or promote products derived from this software
+ #      without specific prior written permission.
+ #
+ # THIS SOFTWARE IS PROVIDED BY QUASARDB AND CONTRIBUTORS ``AS IS'' AND ANY
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
+ # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ #
+ from __future__ import annotations
+
+ import logging
+ import warnings
+ from datetime import timedelta
+ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+
+ import quasardb
+ import quasardb.numpy as qdbnp
+ import quasardb.table_cache as table_cache
+ from quasardb.quasardb import Cluster, Table, Writer
+ from quasardb.typing import DType, MaskedArrayAny, Range, RangeSet
+
+ logger = logging.getLogger("quasardb.pandas")
+
+
+ class PandasRequired(ImportError):
+     """
+     Exception raised when trying to use QuasarDB pandas integration, but
+     pandas has not been installed.
+     """
+
+     pass
+
+
+ try:
+     import numpy as np
+     import numpy.ma as ma
+     import pandas as pd
+     from pandas.core.api import DataFrame, Series
+     from pandas.core.base import PandasObject  # type: ignore[attr-defined]
+
+ except ImportError:
+     raise PandasRequired("The pandas library is required to handle pandas data formats")
+
+
+ # Constant mapping of numpy dtype to QuasarDB column type
+ # TODO(leon): support this natively in qdb C api ? we have everything we need
+ # to understand dtypes.
+ _dtype_map: Dict[Any, quasardb.ColumnType] = {
+     np.dtype("int64"): quasardb.ColumnType.Int64,
+     np.dtype("int32"): quasardb.ColumnType.Int64,
+     np.dtype("float64"): quasardb.ColumnType.Double,
+     np.dtype("object"): quasardb.ColumnType.String,
+     np.dtype("M8[ns]"): quasardb.ColumnType.Timestamp,
+     np.dtype("datetime64[ns]"): quasardb.ColumnType.Timestamp,
+     "int64": quasardb.ColumnType.Int64,
+     "int32": quasardb.ColumnType.Int64,
+     "float32": quasardb.ColumnType.Double,
+     "float64": quasardb.ColumnType.Double,
+     "timestamp": quasardb.ColumnType.Timestamp,
+     "string": quasardb.ColumnType.String,
+     "bytes": quasardb.ColumnType.Blob,
+     "floating": quasardb.ColumnType.Double,
+     "integer": quasardb.ColumnType.Int64,
+     "datetime64": quasardb.ColumnType.Timestamp,
+ }
+
+ # Type hint for TableLike parameter
+ TableLike = Union[str, Table]
+
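For illustration, this is how the mapping resolves in practice; `_dtype_map` is module-internal and consumed by `_dtype_to_column_type` further down:

```python
import numpy as np
import quasardb

# Both concrete numpy dtypes and pandas' inferred-dtype strings resolve to
# the same QuasarDB column types:
assert _dtype_map[np.dtype("int32")] == quasardb.ColumnType.Int64
assert _dtype_map["floating"] == quasardb.ColumnType.Double
assert _dtype_map["datetime64"] == quasardb.ColumnType.Timestamp
```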
+
+ def read_series(
+     table: Table, col_name: str, ranges: Optional[RangeSet] = None
+ ) -> pd.Series:
+     """
+     Read a Pandas Series from a single column.
+
+     Parameters:
+     -----------
+
+     table : quasardb.Table
+         QuasarDB table object, e.g. qdb_cluster.table('my_table')
+
+     col_name : str
+         Name of the column to read.
+
+     ranges : list
+         A list of time ranges to read, represented as tuples of numpy
+         datetime64[ns] objects.
+     """
+     read_with = {
+         quasardb.ColumnType.Double: table.double_get_ranges,
+         quasardb.ColumnType.Blob: table.blob_get_ranges,
+         quasardb.ColumnType.String: table.string_get_ranges,
+         quasardb.ColumnType.Int64: table.int64_get_ranges,
+         quasardb.ColumnType.Timestamp: table.timestamp_get_ranges,
+         quasardb.ColumnType.Symbol: table.string_get_ranges,
+     }
+
+     kwargs: Dict[str, Any] = {"column": col_name}
+
+     if ranges is not None:
+         kwargs["ranges"] = ranges
+
+     # Dispatch based on column type
+     t = table.column_type_by_id(col_name)
+
+     logger.info(
+         "reading Series from column %s.%s with type %s", table.get_name(), col_name, t
+     )
+
+     res = read_with[t](**kwargs)
+
+     return pd.Series(res[1], index=res[0])
+
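A minimal usage sketch for `read_series` (the cluster URI, table name, and column name are hypothetical):

```python
import quasardb
import quasardb.pandas as qdbpd

conn = quasardb.Cluster("qdb://127.0.0.1:2836")
table = conn.table("my_table")  # hypothetical table with a 'close' double column

series = qdbpd.read_series(table, "close")
print(series.head())
```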
+
+ def write_series(
+     series: pd.Series,
+     table: Table,
+     col_name: str,
+     infer_types: bool = True,
+     dtype: Optional[DType] = None,
+ ) -> None:
+     """
+     Writes a Pandas Series to a single column.
+
+     Parameters:
+     -----------
+
+     series : pandas.Series
+         Pandas Series, with a numpy.datetime64[ns] index. An attempt will be
+         made to convert the underlying data to the appropriate QuasarDB type.
+
+     table : quasardb.Table
+         QuasarDB table object, e.g. qdb_cluster.table('my_table')
+
+     col_name : str
+         Column name to store data in.
+
+     infer_types : optional bool
+         If True (the default), the index is coerced to datetime64[ns]; the flag
+         is also passed through to numpy.write_array.
+
+     dtype : optional dtype
+         Passed through to numpy.write_array.
+     """
+
+     logger.debug(
+         "write_series, table=%s, col_name=%s, infer_types=%s, dtype=%s",
+         table.get_name(),
+         col_name,
+         infer_types,
+         dtype,
+     )
+
+     data = None
+     index = None
+
+     data = ma.masked_array(series.to_numpy(copy=False), mask=series.isna())
+
+     if infer_types is True:
+         index = series.index.to_numpy("datetime64[ns]", copy=False)
+     else:
+         index = series.index.to_numpy(copy=False)
+
+     assert data is not None
+     assert index is not None
+
+     return qdbnp.write_array(
+         data=data,
+         index=index,
+         table=table,
+         column=col_name,
+         dtype=dtype,
+         infer_types=infer_types,
+     )
+
+
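And the corresponding write path, as a sketch (assumes the table already exists with a matching 'close' column):

```python
import pandas as pd
import quasardb
import quasardb.pandas as qdbpd

conn = quasardb.Cluster("qdb://127.0.0.1:2836")
table = conn.table("my_table")

# The datetime index becomes the '$timestamp' of each row.
idx = pd.date_range("2024-01-01", periods=3, freq="1min")
s = pd.Series([1.0, 2.0, 3.0], index=idx)

qdbpd.write_series(s, table, "close")
```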
+ def query(
+     cluster: Cluster,
+     query: str,
+     index: Optional[str] = None,
+     blobs: bool = False,
+     numpy: bool = True,
+ ) -> pd.DataFrame:
+     """
+     Execute *query* and return the result as a pandas DataFrame.
+
+     Parameters
+     ----------
+     cluster : quasardb.Cluster
+         Active connection to the QuasarDB cluster.
+
+     query : str
+         The query to execute.
+
+     index : str | None, default None
+         Column to use as index. When None, a synthetic index is created and
+         named "$index".
+
+     blobs, numpy
+         DEPRECATED - no longer used. Supplying a non-default value emits a
+         DeprecationWarning, and the argument is ignored.
+     """
+     # ------------------------------------------------------------------ deprecations
+     if blobs is not False:
+         warnings.warn(
+             "`blobs` is deprecated and will be removed in a future version; "
+             "the argument is ignored.",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+     if numpy is not True:
+         warnings.warn(
+             "`numpy` is deprecated and will be removed in a future version; "
+             "the argument is ignored.",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+     # ------------------------------------------------------------------------------
+
+     logger.debug("querying and returning as DataFrame: %s", query)
+     index_vals, m = qdbnp.query(cluster, query, index=index, dict=True)
+
+     index_name = "$index" if index is None else index
+     index_obj = pd.Index(index_vals, name=index_name)
+
+     return pd.DataFrame(m, index=index_obj)
+
+
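A usage sketch; the query string and table are hypothetical:

```python
import quasardb
import quasardb.pandas as qdbpd

conn = quasardb.Cluster("qdb://127.0.0.1:2836")

df = qdbpd.query(conn, "SELECT * FROM my_table IN RANGE (2024-01-01, +1d)")
print(df.index.name)  # "$index" unless an index column was requested
```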
+ def stream_dataframes(
+     conn: Cluster,
+     tables: List[TableLike],
+     *,
+     batch_size: Optional[int] = 2**16,
+     column_names: Optional[List[str]] = None,
+     ranges: Optional[RangeSet] = None,
+ ) -> Iterator[pd.DataFrame]:
+     """
+     Read Pandas DataFrames from QuasarDB tables. Returns a generator of
+     dataframes of at most `batch_size` rows each, which is useful when
+     traversing a large dataset which does not fit into memory.
+
+     Parameters:
+     -----------
+
+     conn : quasardb.Cluster
+         Connection to the QuasarDB database.
+
+     tables : list[str | quasardb.Table]
+         QuasarDB tables to stream, as a list of strings or quasardb table objects.
+
+     batch_size : int
+         The number of rows to fetch in a single read operation. If unset, uses
+         2^16 (65536) rows as batch size by default.
+
+     column_names : optional list
+         List of columns to read in dataframe. The timestamp column '$timestamp' is
+         always read.
+
+         Defaults to all columns.
+
+     ranges : optional list
+         A list of time ranges to read, represented as tuples of numpy
+         datetime64[ns] objects. Defaults to the entire table.
+     """
+     # Sanitize batch_size
+     if batch_size is None:
+         batch_size = 2**16
+     elif not isinstance(batch_size, int):
+         raise TypeError(
+             "batch_size should be an integer, but got: {} with value {}".format(
+                 type(batch_size), str(batch_size)
+             )
+         )
+
+     kwargs: Dict[str, Any] = {"batch_size": batch_size}
+
+     if column_names:
+         kwargs["column_names"] = column_names
+
+     if ranges:
+         kwargs["ranges"] = ranges
+
+     coerce_table_name_fn = lambda x: x if isinstance(x, str) else x.get_name()
+     kwargs["table_names"] = [coerce_table_name_fn(x) for x in tables]
+
+     with conn.reader(**kwargs) as reader:
+         for batch in reader:
+             # We always expect the timestamp column, and set this as the index
+             assert "$timestamp" in batch
+
+             idx = pd.Index(batch.pop("$timestamp"), copy=False, name="$timestamp")
+             df = pd.DataFrame(batch, index=idx)
+
+             yield df
+
+
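A streaming sketch over two hypothetical tables; each batch arrives as its own dataframe:

```python
import quasardb
import quasardb.pandas as qdbpd

conn = quasardb.Cluster("qdb://127.0.0.1:2836")

total_rows = 0
for df in qdbpd.stream_dataframes(conn, ["trades_a", "trades_b"], batch_size=10_000):
    # Multi-table batches carry a '$table' column identifying the source table.
    total_rows += len(df)
print(total_rows)
```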
+ def stream_dataframe(
+     conn: Cluster,
+     table: TableLike,
+     *,
+     batch_size: Optional[int] = 2**16,
+     column_names: Optional[List[str]] = None,
+     ranges: Optional[RangeSet] = None,
+ ) -> Iterator[pd.DataFrame]:
+     """
+     Read a single table and return a stream of dataframes. This is a convenience
+     function that wraps around `stream_dataframes` and accepts the same parameters.
+     """
+     # For backwards compatibility, we drop the `$table` column returned: this is not strictly
+     # necessary, but it also is somewhat reasonable to drop it when we're reading from a single
+     # table, which is the case here.
+     clean_df_fn = lambda df: df.drop(columns=["$table"])
+
+     return (
+         clean_df_fn(df)
+         for df in stream_dataframes(
+             conn,
+             [table],
+             batch_size=batch_size,
+             column_names=column_names,
+             ranges=ranges,
+         )
+     )
+
+
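The single-table variant, here restricted to a time range (a sketch; names are hypothetical):

```python
import numpy as np
import quasardb
import quasardb.pandas as qdbpd

conn = quasardb.Cluster("qdb://127.0.0.1:2836")

# Ranges are (begin, end) tuples of numpy datetime64[ns] values.
start = np.datetime64("2024-01-01", "ns")
end = np.datetime64("2024-01-02", "ns")

for df in qdbpd.stream_dataframe(conn, "trades_a", ranges=[(start, end)]):
    print(len(df))
```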
+ def read_dataframe(
+     conn: Cluster,
+     table: TableLike,
+     *,
+     batch_size: Optional[int] = 2**16,
+     column_names: Optional[List[str]] = None,
+     ranges: Optional[RangeSet] = None,
+ ) -> pd.DataFrame:
+     """
+     Read a Pandas DataFrame from a QuasarDB table. Wraps around stream_dataframe(),
+     and returns everything as a single dataframe; any user-provided batch_size is
+     overridden.
+
+     Parameters:
+     -----------
+
+     conn : quasardb.Cluster
+         Connection to the QuasarDB database.
+
+     table : str | quasardb.Table
+         QuasarDB table to read, either as a string or a table object. When re-executing the
+         same function multiple times on the same tables, providing the table as an object
+         has a performance benefit.
+     """
+
+     if batch_size is not None and batch_size != 0:
+         logger.warning(
+             "Providing a batch size with read_dataframe is unsupported, overriding batch_size to 65536."
+         )
+         logger.warning(
+             "If you wish to traverse the data in smaller batches, please use: stream_dataframe()."
+         )
+         batch_size = 2**16
+
+     # Note that this is *lazy*, dfs is a generator, not a list -- as such, dataframes will be
+     # fetched on-demand, which means that an error could occur in the middle of processing
+     # dataframes.
+     dfs = stream_dataframe(
+         conn, table, batch_size=batch_size, column_names=column_names, ranges=ranges
+     )
+
+     # If the result of stream_dataframe is empty, pd.concat() raises a ValueError. As
+     # stream_dataframe is a generator, there is no easy way to check for this condition
+     # without evaluating it, so the simplest approach is to catch the ValueError and
+     # return an empty DataFrame.
+     try:
+         return pd.concat(dfs, copy=False)  # type: ignore[call-overload]
+     except ValueError as e:
+         logger.error(
+             "Error while concatenating dataframes. This can happen if the result set is empty. Returning empty dataframe. Error: %s",
+             e,
+         )
+         return pd.DataFrame()
+
+
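A sketch of the eager read path (hypothetical table and columns):

```python
import quasardb
import quasardb.pandas as qdbpd

conn = quasardb.Cluster("qdb://127.0.0.1:2836")

# Pulls the entire table into memory as one dataframe, indexed by '$timestamp'.
df = qdbpd.read_dataframe(conn, "my_table", column_names=["close", "volume"])
print(df.shape)
```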
+ def _extract_columns(
+     df: pd.DataFrame, cinfos: List[Tuple[str, quasardb.ColumnType]]
+ ) -> Dict[str, MaskedArrayAny]:
+     """
+     Converts a dataframe to a number of numpy masked arrays, one for each column.
+
+     Arrays are keyed by column name, in the same order as the table's columns.
+     If a table column is not present in the dataframe, it is omitted from the
+     result; if a dataframe column is not present in the table, it is omitted
+     as well.
+     """
+     ret: Dict[str, MaskedArrayAny] = {}
+
+     # Grab all columns from the DataFrame in the order of table columns,
+     # skipping those not present in the df.
+     for cname, _ in cinfos:
+         if cname in df.columns:
+             arr = df[cname].array
+             ret[cname] = ma.masked_array(arr.to_numpy(copy=False), mask=arr.isna())
+
+     return ret
+
+
+ def write_dataframes(
+     dfs: Union[
+         Dict[TableLike, pd.DataFrame],
+         List[tuple[TableLike, pd.DataFrame]],
+     ],
+     cluster: quasardb.Cluster,
+     *,
+     create: bool = False,
+     shard_size: Optional[timedelta] = None,
+     # numpy.write_arrays passthrough options
+     dtype: Optional[
+         Union[DType, Dict[str, Optional[DType]], List[Optional[DType]]]
+     ] = None,
+     push_mode: Optional[quasardb.WriterPushMode] = None,
+     _async: bool = False,
+     fast: bool = False,
+     truncate: Union[bool, Range] = False,
+     truncate_range: Optional[Range] = None,
+     deduplicate: Union[bool, str, List[str]] = False,
+     deduplication_mode: str = "drop",
+     infer_types: bool = True,
+     writer: Optional[Writer] = None,
+     write_through: bool = True,
+     retries: Union[int, quasardb.RetryOptions] = 3,
+     **kwargs: Any,
+ ) -> List[Table]:
+     """
+     Store dataframes into tables. Any additional parameters not documented here
+     are passed to numpy.write_arrays(). Please consult the pydoc of that function
+     for additional accepted parameters.
+
+     Parameters:
+     -----------
+
+     dfs : dict[str | quasardb.Table, pd.DataFrame] | list[tuple[str | quasardb.Table, pd.DataFrame]]
+         Either a dict that maps tables (as objects or names) to dataframes, or a
+         list of table/dataframe tuples.
+
+     cluster : quasardb.Cluster
+         Active connection to the QuasarDB cluster
+
+     create : optional bool
+         Whether to create the table. Defaults to False.
+
+     shard_size : optional datetime.timedelta
+         The shard size of the timeseries you wish to create when `create` is True.
+     """
+
+     # If dfs is a dict, we convert it to a list of tuples.
+     if isinstance(dfs, dict):
+         dfs = list(dfs.items())
+
+     if shard_size is not None and not create:
+         raise ValueError("Invalid argument: shard size provided while create is False")
+
+     # If the tables are provided as strings, we look them up.
+     dfs_ = []
+     for table, df in dfs:
+         if isinstance(table, str):
+             table = table_cache.lookup(table, cluster)
+
+         dfs_.append((table, df))
+
+     data_by_table = []
+
+     for table, df in dfs_:
+         logger.debug("quasardb.pandas.write_dataframes, create = %s", create)
+         assert isinstance(df, pd.DataFrame)
+
+         # Create table if requested
+         if create:
+             _create_table_from_df(df, table, shard_size)
+
+         cinfos = [(x.name, x.type) for x in table.list_columns()]
+
+         if not df.index.is_monotonic_increasing:
+             logger.warning(
+                 "dataframe index is unsorted, resorting dataframe based on index"
+             )
+             df = df.sort_index()
+
+         # We pass everything else to our qdbnp.write_arrays function, as generally speaking
+         # it is (much) more sensible to deal with numpy arrays than Pandas dataframes:
+         # pandas has the bad habit of wanting to cast data to different types if your data
+         # is sparse, most notably forcing sparse integer arrays to floating points.
+
+         data = _extract_columns(df, cinfos)
+         data["$timestamp"] = ma.masked_array(
+             df.index.to_numpy(copy=False, dtype="datetime64[ns]")
+         )  # We cast to masked_array to enforce typing compliance
+
+         data_by_table.append((table, data))
+
+     kwargs["deprecation_stacklevel"] = kwargs.get("deprecation_stacklevel", 1) + 1
+     return qdbnp.write_arrays(
+         data_by_table,
+         cluster,
+         table=None,
+         index=None,
+         dtype=dtype,
+         push_mode=push_mode,
+         _async=_async,
+         fast=fast,
+         truncate=truncate,
+         truncate_range=truncate_range,
+         deduplicate=deduplicate,
+         deduplication_mode=deduplication_mode,
+         infer_types=infer_types,
+         writer=writer,
+         write_through=write_through,
+         retries=retries,
+         **kwargs,
+     )
+
+
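A sketch writing two hypothetical dataframes in a single call, creating the tables on the fly:

```python
from datetime import timedelta

import pandas as pd
import quasardb
import quasardb.pandas as qdbpd

conn = quasardb.Cluster("qdb://127.0.0.1:2836")
idx = pd.date_range("2024-01-01", periods=2, freq="1s")

dfs = {
    "sensors_a": pd.DataFrame({"temperature": [20.5, 20.7]}, index=idx),
    "sensors_b": pd.DataFrame({"temperature": [19.9, 20.1]}, index=idx),
}

# Column types are inferred from the dataframes when the tables are created.
qdbpd.write_dataframes(dfs, conn, create=True, shard_size=timedelta(hours=1))
```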
+ def write_dataframe(
+     df: pd.DataFrame,
+     cluster: quasardb.Cluster,
+     table: TableLike,
+     *,
+     create: bool = False,
+     shard_size: Optional[timedelta] = None,
+     # numpy.write_arrays passthrough options
+     dtype: Optional[
+         Union[DType, Dict[str, Optional[DType]], List[Optional[DType]]]
+     ] = None,
+     push_mode: Optional[quasardb.WriterPushMode] = None,
+     _async: bool = False,
+     fast: bool = False,
+     truncate: Union[bool, Range] = False,
+     truncate_range: Optional[Range] = None,
+     deduplicate: Union[bool, str, List[str]] = False,
+     deduplication_mode: str = "drop",
+     infer_types: bool = True,
+     writer: Optional[Writer] = None,
+     write_through: bool = True,
+     retries: Union[int, quasardb.RetryOptions] = 3,
+     **kwargs: Any,
+ ) -> List[Table]:
+     """
+     Store a single dataframe into a table. Takes the same arguments as `write_dataframes`,
+     except only a single df/table combination.
+     """
+     kwargs["deprecation_stacklevel"] = kwargs.get("deprecation_stacklevel", 1) + 1
+     return write_dataframes(
+         [(table, df)],
+         cluster,
+         create=create,
+         shard_size=shard_size,
+         dtype=dtype,
+         push_mode=push_mode,
+         _async=_async,
+         fast=fast,
+         truncate=truncate,
+         truncate_range=truncate_range,
+         deduplicate=deduplicate,
+         deduplication_mode=deduplication_mode,
+         infer_types=infer_types,
+         writer=writer,
+         write_through=write_through,
+         retries=retries,
+         **kwargs,
+     )
+
+
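The single-table variant, sketched here with deduplication on the timestamp column (assumes 'my_table' exists):

```python
import pandas as pd
import quasardb
import quasardb.pandas as qdbpd

conn = quasardb.Cluster("qdb://127.0.0.1:2836")
idx = pd.date_range("2024-01-01", periods=2, freq="1s")
df = pd.DataFrame({"close": [101.2, 101.4]}, index=idx)

# Rows whose '$timestamp' already exists in the table are dropped.
qdbpd.write_dataframe(
    df, conn, "my_table", deduplicate=["$timestamp"], deduplication_mode="drop"
)
```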
+ def write_pinned_dataframe(
+     df: pd.DataFrame,
+     cluster: quasardb.Cluster,
+     table: TableLike,
+     *,
+     create: bool = False,
+     shard_size: Optional[timedelta] = None,
+     # numpy.write_arrays passthrough options
+     dtype: Optional[
+         Union[DType, Dict[str, Optional[DType]], List[Optional[DType]]]
+     ] = None,
+     push_mode: Optional[quasardb.WriterPushMode] = None,
+     _async: bool = False,
+     fast: bool = False,
+     truncate: Union[bool, Range] = False,
+     truncate_range: Optional[Range] = None,
+     deduplicate: Union[bool, str, List[str]] = False,
+     deduplication_mode: str = "drop",
+     infer_types: bool = True,
+     writer: Optional[Writer] = None,
+     write_through: bool = True,
+     retries: Union[int, quasardb.RetryOptions] = 3,
+     **kwargs: Any,
+ ) -> List[Table]:
+     """
+     Legacy wrapper around write_dataframe().
+     """
+     logger.warning(
+         "write_pinned_dataframe is deprecated and will be removed in a future release."
+     )
+     logger.warning("Please use write_dataframe directly instead")
+     kwargs["deprecation_stacklevel"] = 2
+     return write_dataframe(
+         df,
+         cluster,
+         table,
+         create=create,
+         shard_size=shard_size,
+         dtype=dtype,
+         push_mode=push_mode,
+         _async=_async,
+         fast=fast,
+         truncate=truncate,
+         truncate_range=truncate_range,
+         deduplicate=deduplicate,
+         deduplication_mode=deduplication_mode,
+         infer_types=infer_types,
+         writer=writer,
+         write_through=write_through,
+         retries=retries,
+         **kwargs,
+     )
+
+
+ def _create_table_from_df(
+     df: pd.DataFrame, table: Table, shard_size: Optional[timedelta] = None
+ ) -> Table:
+     cols = []
+
+     dtypes = _get_inferred_dtypes(df)
+
+     logger.info("got inferred dtypes: %s", dtypes)
+     for c in df.columns:
+         dt = dtypes[c]
+         ct = _dtype_to_column_type(df[c].dtype, dt)
+         logger.debug(
+             "probed pandas dtype %s, inferred dtype %s, mapped to quasardb column type %s",
+             df[c].dtype,
+             dt,
+             ct,
+         )
+         cols.append(quasardb.ColumnInfo(ct, c))
+
+     try:
+         if not shard_size:
+             table.create(cols)
+         else:
+             table.create(cols, shard_size)
+     except quasardb.AliasAlreadyExistsError:
+         # TODO(leon): warn? how?
+         pass
+
+     return table
+
+
+ def _dtype_to_column_type(dt: Any, inferred: Any) -> quasardb.ColumnType:
+     res = _dtype_map.get(inferred, None)
+     if res is None:
+         res = _dtype_map.get(dt, None)
+
+     if res is None:
+         raise ValueError("Incompatible data type: {}".format(dt))
+
+     return res
+
+
+ def _get_inferred_dtypes(df: pd.DataFrame) -> Dict[str, str]:
+     dtypes = {}
+     for c in df.columns:
+         dt = pd.api.types.infer_dtype(df[c].values)
+         logger.debug("Determined dtype of column %s to be %s", c, dt)
+         dtypes[c] = dt
+     return dtypes
+
+
+ def _get_inferred_dtypes_indexed(df: pd.DataFrame) -> List[str]:
+     dtypes = _get_inferred_dtypes(df)
+     # Performance improvement: avoid expensive dict lookups by indexing
+     # the column types by relative offset within the df.
+     return [dtypes[c] for c in df.columns]
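To illustrate the inference pipeline these helpers implement (a sketch; the strings returned by `pd.api.types.infer_dtype` are the keys that `_dtype_map` resolves):

```python
import pandas as pd

df = pd.DataFrame({"price": [1.5, 2.5], "qty": [1, 2]})

# pandas infers 'floating' for price and 'integer' for qty...
print(_get_inferred_dtypes(df))  # {'price': 'floating', 'qty': 'integer'}
# ...which _dtype_to_column_type maps to ColumnType.Double / ColumnType.Int64.
```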