quasardb 3.14.2.dev6__cp310-cp310-macosx_15_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of quasardb might be problematic.

Files changed (45)
  1. quasardb/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
  2. quasardb/CMakeFiles/progress.marks +1 -0
  3. quasardb/Makefile +189 -0
  4. quasardb/__init__.py +137 -0
  5. quasardb/cmake_install.cmake +53 -0
  6. quasardb/date/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
  7. quasardb/date/CMakeFiles/Export/a52b05f964b070ee926bcad51d3288af/dateTargets.cmake +108 -0
  8. quasardb/date/CMakeFiles/progress.marks +1 -0
  9. quasardb/date/Makefile +189 -0
  10. quasardb/date/cmake_install.cmake +76 -0
  11. quasardb/date/dateConfigVersion.cmake +65 -0
  12. quasardb/date/dateTargets.cmake +63 -0
  13. quasardb/extensions/__init__.py +8 -0
  14. quasardb/extensions/writer.py +191 -0
  15. quasardb/firehose.py +103 -0
  16. quasardb/libqdb_api.dylib +0 -0
  17. quasardb/numpy/__init__.py +1035 -0
  18. quasardb/pandas/__init__.py +501 -0
  19. quasardb/pool.py +305 -0
  20. quasardb/pybind11/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
  21. quasardb/pybind11/CMakeFiles/progress.marks +1 -0
  22. quasardb/pybind11/Makefile +189 -0
  23. quasardb/pybind11/cmake_install.cmake +45 -0
  24. quasardb/quasardb.cpython-310-darwin.so +0 -0
  25. quasardb/range-v3/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
  26. quasardb/range-v3/CMakeFiles/Export/d94ef200eca10a819b5858b33e808f5b/range-v3-targets.cmake +128 -0
  27. quasardb/range-v3/CMakeFiles/progress.marks +1 -0
  28. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/DependInfo.cmake +22 -0
  29. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/build.make +86 -0
  30. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/cmake_clean.cmake +5 -0
  31. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/compiler_depend.make +2 -0
  32. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/compiler_depend.ts +2 -0
  33. quasardb/range-v3/CMakeFiles/range.v3.headers.dir/progress.make +1 -0
  34. quasardb/range-v3/Makefile +204 -0
  35. quasardb/range-v3/cmake_install.cmake +88 -0
  36. quasardb/range-v3/include/range/v3/version.hpp +24 -0
  37. quasardb/range-v3/range-v3-config-version.cmake +83 -0
  38. quasardb/range-v3/range-v3-config.cmake +80 -0
  39. quasardb/stats.py +358 -0
  40. quasardb/table_cache.py +56 -0
  41. quasardb-3.14.2.dev6.dist-info/METADATA +41 -0
  42. quasardb-3.14.2.dev6.dist-info/RECORD +45 -0
  43. quasardb-3.14.2.dev6.dist-info/WHEEL +5 -0
  44. quasardb-3.14.2.dev6.dist-info/licenses/LICENSE.md +11 -0
  45. quasardb-3.14.2.dev6.dist-info/top_level.txt +1 -0
quasardb/pandas/__init__.py
@@ -0,0 +1,501 @@
# pylint: disable=C0103,C0111,C0302,R0903

# Copyright (c) 2009-2024, quasardb SAS. All rights reserved.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of quasardb nor the names of its contributors may
#      be used to endorse or promote products derived from this software
#      without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY QUASARDB AND CONTRIBUTORS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

import logging
from datetime import datetime
from functools import partial

import quasardb
import quasardb.table_cache as table_cache
import quasardb.numpy as qdbnp


logger = logging.getLogger("quasardb.pandas")


class PandasRequired(ImportError):
    """
    Exception raised when the QuasarDB pandas integration is used while
    pandas is not installed.
    """


try:
    import numpy as np
    import numpy.ma as ma
    import pandas as pd
    from pandas.core.api import DataFrame, Series
    from pandas.core.base import PandasObject

except ImportError as exc:
    raise PandasRequired(
        "The pandas library is required to handle pandas data formats"
    ) from exc


# Constant mapping of numpy dtype to QuasarDB column type
# TODO(leon): support this natively in the qdb C API? we have everything we
# need to understand dtypes.
_dtype_map = {
    np.dtype("int64"): quasardb.ColumnType.Int64,
    np.dtype("int32"): quasardb.ColumnType.Int64,
    np.dtype("float64"): quasardb.ColumnType.Double,
    np.dtype("object"): quasardb.ColumnType.String,
    np.dtype("M8[ns]"): quasardb.ColumnType.Timestamp,
    np.dtype("datetime64[ns]"): quasardb.ColumnType.Timestamp,
    "int64": quasardb.ColumnType.Int64,
    "int32": quasardb.ColumnType.Int64,
    "float32": quasardb.ColumnType.Double,
    "float64": quasardb.ColumnType.Double,
    "timestamp": quasardb.ColumnType.Timestamp,
    "string": quasardb.ColumnType.String,
    "bytes": quasardb.ColumnType.Blob,
    "floating": quasardb.ColumnType.Double,
    "integer": quasardb.ColumnType.Int64,
    "datetime64": quasardb.ColumnType.Timestamp,
}


def read_series(table, col_name, ranges=None):
    """
    Read a Pandas Timeseries from a single column.

    Parameters:
    -----------

    table : quasardb.Timeseries
        QuasarDB Timeseries table object, e.g. qdb_cluster.table('my_table')

    col_name : str
        Name of the column to read.

    ranges : list
        A list of ranges to read, represented as tuples of Numpy
        datetime64[ns] objects.
    """
    read_with = {
        quasardb.ColumnType.Double: table.double_get_ranges,
        quasardb.ColumnType.Blob: table.blob_get_ranges,
        quasardb.ColumnType.String: table.string_get_ranges,
        quasardb.ColumnType.Int64: table.int64_get_ranges,
        quasardb.ColumnType.Timestamp: table.timestamp_get_ranges,
        quasardb.ColumnType.Symbol: table.string_get_ranges,
    }

    kwargs = {"column": col_name}

    if ranges is not None:
        kwargs["ranges"] = ranges

    # Dispatch based on column type
    t = table.column_type_by_id(col_name)

    logger.info(
        "reading Series from column %s.%s with type %s", table.get_name(), col_name, t
    )

    res = read_with[t](**kwargs)

    return Series(res[1], index=res[0])


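# Usage sketch for read_series(): assumes a reachable cluster at
# "qdb://127.0.0.1:2836" and an existing table "my_table" with a double
# column "close" -- illustrative names, not part of this package. Kept as an
# unused helper so importing the module has no side effects.
def _example_read_series():
    conn = quasardb.Cluster("qdb://127.0.0.1:2836")
    table = conn.table("my_table")
    # A single range: all rows between two datetime64[ns] bounds.
    rng = (
        np.datetime64("2024-01-01", "ns"),
        np.datetime64("2024-02-01", "ns"),
    )
    return read_series(table, "close", ranges=[rng])

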
def write_series(series, table, col_name, infer_types=True, dtype=None):
    """
    Writes a Pandas Timeseries to a single column.

    Parameters:
    -----------

    series : pandas.Series
        Pandas Series, with a numpy.datetime64[ns] index. The underlying data
        is converted to the appropriate QuasarDB type where possible.

    table : quasardb.Timeseries
        QuasarDB Timeseries table object, e.g. qdb_cluster.table('my_table')

    col_name : str
        Column name to store data in.
    """

    logger.debug(
        "write_series, table=%s, col_name=%s, infer_types=%s, dtype=%s",
        table.get_name(),
        col_name,
        infer_types,
        dtype,
    )

    data = ma.masked_array(series.to_numpy(copy=False), mask=series.isna())

    if infer_types:
        index = series.index.to_numpy("datetime64[ns]", copy=False)
    else:
        index = series.index.to_numpy(copy=False)

    return qdbnp.write_array(
        data=data,
        index=index,
        table=table,
        column=col_name,
        dtype=dtype,
        infer_types=infer_types,
    )


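# Usage sketch for write_series(), under the same hypothetical cluster and
# table names as the read_series() sketch above. The series index must be
# datetime-like so it can be converted to datetime64[ns].
def _example_write_series():
    conn = quasardb.Cluster("qdb://127.0.0.1:2836")
    table = conn.table("my_table")
    idx = pd.date_range("2024-01-01", periods=3, freq="1min")
    series = pd.Series([1.0, 2.0, 3.0], index=idx)
    write_series(series, table, "close")

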
def query(cluster: quasardb.Cluster, query, index=None, blobs=False, numpy=True):
    """
    Execute a query and return the results as a single DataFrame.

    Parameters:
    -----------

    cluster : quasardb.Cluster
        Active connection to the QuasarDB cluster

    query : str
        The query to execute.

    blobs : bool or list
        Determines which QuasarDB blob-columns should be returned as
        bytearrays; otherwise they are returned as UTF-8 strings.

        True means every blob column is returned as a bytearray; a list
        selects specific columns. Defaults to False, meaning all blobs are
        returned as strings.
    """
    logger.debug("querying and returning as DataFrame: %s", query)
    (index, m) = qdbnp.query(cluster, query, index=index, dict=True)
    df = pd.DataFrame(m)

    df.set_index(index, inplace=True)
    return df


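# Usage sketch for query(); the query string and table name are illustrative
# assumptions, not part of this package.
def _example_query():
    conn = quasardb.Cluster("qdb://127.0.0.1:2836")
    return query(conn, "SELECT * FROM my_table")

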
def stream_dataframes(
    conn: quasardb.Cluster,
    tables: list,
    *,
    batch_size: int = 2**16,
    column_names: list = None,
    ranges: list = None,
):
    """
    Read Pandas DataFrames from one or more QuasarDB Timeseries tables.
    Returns a generator of dataframes of at most `batch_size` rows, which is
    useful when traversing a large dataset that does not fit into memory.

    Parameters:
    -----------

    conn : quasardb.Cluster
        Connection to the QuasarDB database.

    tables : list[str | quasardb.Table]
        QuasarDB tables to stream, as a list of strings or quasardb table
        objects.

    batch_size : int
        The number of rows to fetch in a single read operation. If unset,
        uses 2^16 (65536) rows as batch size by default.

    column_names : optional list
        List of columns to read into the dataframe. The timestamp column
        '$timestamp' is always read.

        Defaults to all columns.

    ranges : optional list
        A list of time ranges to read, represented as tuples of Numpy
        datetime64[ns] objects. Defaults to the entire table.
    """
    # Sanitize batch_size
    if batch_size is None:
        batch_size = 2**16
    elif not isinstance(batch_size, int):
        raise TypeError(
            "batch_size should be an integer, but got: {} with value {}".format(
                type(batch_size), str(batch_size)
            )
        )

    kwargs = {"batch_size": batch_size}

    if column_names:
        kwargs["column_names"] = column_names

    if ranges:
        kwargs["ranges"] = ranges

    kwargs["table_names"] = [x if isinstance(x, str) else x.get_name() for x in tables]

    with conn.reader(**kwargs) as reader:
        for batch in reader:
            # We always expect the timestamp column, and set this as the index
            assert "$timestamp" in batch

            idx = pd.Index(batch.pop("$timestamp"), copy=False, name="$timestamp")
            df = pd.DataFrame(batch, index=idx)

            yield df


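# Usage sketch for stream_dataframes(): traverse two hypothetical tables in
# bounded batches without loading everything into memory. Names and batch
# size are illustrative only.
def _example_stream_dataframes():
    conn = quasardb.Cluster("qdb://127.0.0.1:2836")
    total_rows = 0
    for df in stream_dataframes(conn, ["my_table", "other_table"], batch_size=2**14):
        # Each df holds at most batch_size rows, indexed by $timestamp.
        total_rows += len(df)
    return total_rows

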
def stream_dataframe(conn: quasardb.Cluster, table, **kwargs):
    """
    Read a single table and return a stream of dataframes. This is a
    convenience function that wraps around `stream_dataframes`.
    """
    kwargs["tables"] = [table]

    # For backwards compatibility, we drop the `$table` column from the
    # returned batches: this is not strictly necessary, but it is reasonable
    # when reading from a single table, which is the case here.
    return (df.drop(columns=["$table"]) for df in stream_dataframes(conn, **kwargs))


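# Usage sketch for stream_dataframe(): same as above, but for a single
# hypothetical table; the `$table` column is dropped from each batch.
def _example_stream_dataframe():
    conn = quasardb.Cluster("qdb://127.0.0.1:2836")
    for df in stream_dataframe(conn, "my_table", batch_size=2**12):
        logger.debug("got batch of %d rows", len(df))

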
def read_dataframe(conn: quasardb.Cluster, table, **kwargs):
    """
    Read a Pandas DataFrame from a QuasarDB Timeseries table. Wraps around
    stream_dataframes() and returns everything as a single dataframe; any
    user-provided batch_size is overridden.

    Parameters:
    -----------

    conn : quasardb.Cluster
        Connection to the QuasarDB database.

    table : str | quasardb.Table
        QuasarDB table to read, either as a string or a table object. When
        re-executing the same function multiple times on the same tables,
        providing the table as an object has a performance benefit.
    """

    if (
        "batch_size" in kwargs
        and kwargs["batch_size"] != 0
        and kwargs["batch_size"] is not None
    ):
        logger.warning(
            "Providing a batch size with read_dataframe is unsupported, overriding batch_size to 65536."
        )
        logger.warning(
            "If you wish to traverse the data in smaller batches, please use: stream_dataframe()."
        )
        kwargs["batch_size"] = 2**16

    # Note that this is *lazy*: dfs is a generator, not a list -- dataframes
    # are fetched on demand, which means an error can occur in the middle of
    # processing them.
    dfs = stream_dataframe(conn, table, **kwargs)

    return pd.concat(dfs)


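# Usage sketch for read_dataframe(): read a whole (memory-fitting) table in
# one call; the table name is an illustrative assumption.
def _example_read_dataframe():
    conn = quasardb.Cluster("qdb://127.0.0.1:2836")
    return read_dataframe(conn, "my_table")

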
def _extract_columns(df, cinfos):
    """
    Converts a dataframe to a dict of numpy masked arrays, keyed by column
    name.

    Only columns present in both the table and the dataframe are returned;
    a dataframe column that is not present in the table is omitted.
    """
    ret = {}

    # Grab all columns from the DataFrame in the order of the table's
    # columns, skipping those not present in the df.
    for cname, _ctype in cinfos:
        if cname in df.columns:
            arr = df[cname].array
            ret[cname] = ma.masked_array(arr.to_numpy(copy=False), mask=arr.isna())

    return ret


def write_dataframes(dfs, cluster, *, create=False, shard_size=None, **kwargs):
    """
    Store dataframes into tables. Any additional parameters not documented
    here are passed to numpy.write_arrays(). Please consult the pydoc of that
    function for additional accepted parameters.

    Parameters:
    -----------

    dfs : dict[str | quasardb.Table, pd.DataFrame] | list[tuple[str | quasardb.Table, pd.DataFrame]]
        Either a dict that maps tables (objects or names) to dataframes, or a
        list of table/dataframe tuples.

    cluster : quasardb.Cluster
        Active connection to the QuasarDB cluster

    create : optional bool
        Whether to create the table. Defaults to False.

    shard_size : optional datetime.timedelta
        The shard size of the timeseries you wish to create when `create` is
        True.
    """

    # If dfs is a dict, we convert it to a list of tuples.
    if isinstance(dfs, dict):
        dfs = dfs.items()

    if shard_size is not None and create is False:
        raise ValueError("Invalid argument: shard size provided while create is False")

    # If the tables are provided as strings, we look them up.
    dfs_ = []
    for table, df in dfs:
        if isinstance(table, str):
            table = table_cache.lookup(table, cluster)

        dfs_.append((table, df))

    data_by_table = []

    for table, df in dfs_:
        logger.debug("quasardb.pandas.write_dataframe, create = %s", create)
        assert isinstance(df, pd.DataFrame)

        # Create table if requested
        if create:
            _create_table_from_df(df, table, shard_size)

        cinfos = [(x.name, x.type) for x in table.list_columns()]

        if not df.index.is_monotonic_increasing:
            logger.warning(
                "dataframe index is unsorted, sorting dataframe based on index"
            )
            df = df.sort_index()

        # We pass everything else to our qdbnp.write_arrays function, as it is
        # generally (much) more sensible to deal with numpy arrays than Pandas
        # dataframes: pandas has the bad habit of casting data to different
        # types if your data is sparse, most notably forcing sparse integer
        # arrays to floating points.

        data = _extract_columns(df, cinfos)
        data["$timestamp"] = df.index.to_numpy(copy=False, dtype="datetime64[ns]")

        data_by_table.append((table, data))

    kwargs["deprecation_stacklevel"] = kwargs.get("deprecation_stacklevel", 1) + 1
    return qdbnp.write_arrays(data_by_table, cluster, table=None, index=None, **kwargs)


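# Usage sketch for write_dataframes(): write one hypothetical dataframe to
# two hypothetical tables in a single call, creating them with a one-day
# shard size. All names are illustrative assumptions.
def _example_write_dataframes():
    from datetime import timedelta

    conn = quasardb.Cluster("qdb://127.0.0.1:2836")
    idx = pd.date_range("2024-01-01", periods=3, freq="1min")
    df = pd.DataFrame({"close": [1.0, 2.0, 3.0]}, index=idx)
    write_dataframes(
        {"my_table": df, "other_table": df},
        conn,
        create=True,
        shard_size=timedelta(days=1),
    )

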
def write_dataframe(df, cluster, table, **kwargs):
    """
    Store a single dataframe into a table. Takes the same arguments as
    `write_dataframes`, except it accepts only a single df/table combination.
    """
    kwargs["deprecation_stacklevel"] = kwargs.get("deprecation_stacklevel", 1) + 1
    return write_dataframes([(table, df)], cluster, **kwargs)


def write_pinned_dataframe(*args, **kwargs):
    """
    Legacy wrapper around write_dataframe()
    """
    logger.warning(
        "write_pinned_dataframe is deprecated and will be removed in a future release."
    )
    logger.warning("Please use write_dataframe directly instead.")
    kwargs["deprecation_stacklevel"] = 2
    return write_dataframe(*args, **kwargs)


def _create_table_from_df(df, table, shard_size=None):
    cols = []

    dtypes = _get_inferred_dtypes(df)

    logger.info("got inferred dtypes: %s", dtypes)
    for c in df.columns:
        dt = dtypes[c]
        ct = _dtype_to_column_type(df[c].dtype, dt)
        logger.debug(
            "probed pandas dtype %s, inferred dtype %s, mapped to quasardb column type %s",
            df[c].dtype,
            dt,
            ct,
        )
        cols.append(quasardb.ColumnInfo(ct, c))

    try:
        if not shard_size:
            table.create(cols)
        else:
            table.create(cols, shard_size)
    except quasardb.quasardb.AliasAlreadyExistsError:
        # TODO(leon): warn? how?
        pass

    return table


def _dtype_to_column_type(dt, inferred):
    res = _dtype_map.get(inferred, None)
    if res is None:
        res = _dtype_map.get(dt, None)

    if res is None:
        raise ValueError("Incompatible data type: {}".format(dt))

    return res


def _get_inferred_dtypes(df):
    dtypes = {}
    for c in df.columns:
        dt = pd.api.types.infer_dtype(df[c].values)
        logger.debug("Determined dtype of column %s to be %s", c, dt)
        dtypes[c] = dt
    return dtypes


def _get_inferred_dtypes_indexed(df):
    dtypes = _get_inferred_dtypes(df)
    # Performance improvement: avoid expensive dict lookups by indexing
    # the column types by relative offset within the df.
    return [dtypes[c] for c in df.columns]
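

# Local sketch of how the inference helpers combine: pd.api.types.infer_dtype()
# classifies each column ("floating", "integer", "string", ...) and
# _dtype_to_column_type() resolves that against _dtype_map. Runs entirely
# locally; no cluster connection needed.
def _example_infer_column_types():
    df = pd.DataFrame(
        {
            "price": [1.5, 2.5],
            "volume": [10, 20],
            "note": ["a", "b"],
        }
    )
    dtypes = _get_inferred_dtypes(df)
    return {c: _dtype_to_column_type(df[c].dtype, dtypes[c]) for c in df.columns}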