Qubx 0.1.3.tar.gz → 0.1.4.tar.gz

This diff shows the content of publicly available package versions as released to their public registries. It is provided for informational purposes only and reflects the changes between the two package versions.

Potentially problematic release: this version of Qubx has been flagged as possibly problematic.
Files changed (36)
  1. {qubx-0.1.3 → qubx-0.1.4}/PKG-INFO +1 -1
  2. {qubx-0.1.3 → qubx-0.1.4}/pyproject.toml +1 -1
  3. qubx-0.1.4/src/qubx/data/readers.py +513 -0
  4. qubx-0.1.3/src/qubx/data/readers.py +0 -515
  5. {qubx-0.1.3 → qubx-0.1.4}/README.md +0 -0
  6. {qubx-0.1.3 → qubx-0.1.4}/build.py +0 -0
  7. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/__init__.py +0 -0
  8. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/_nb_magic.py +0 -0
  9. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/core/__init__.py +0 -0
  10. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/core/account.py +0 -0
  11. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/core/basics.py +0 -0
  12. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/core/helpers.py +0 -0
  13. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/core/loggers.py +0 -0
  14. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/core/lookups.py +0 -0
  15. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/core/series.pxd +0 -0
  16. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/core/series.pyx +0 -0
  17. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/core/strategy.py +0 -0
  18. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/core/utils.pyx +0 -0
  19. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/impl/ccxt_connector.py +0 -0
  20. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/impl/ccxt_customizations.py +0 -0
  21. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/impl/ccxt_trading.py +0 -0
  22. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/impl/ccxt_utils.py +0 -0
  23. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/math/__init__.py +0 -0
  24. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/math/stats.py +0 -0
  25. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/ta/__init__.py +0 -0
  26. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/ta/indicators.pyx +0 -0
  27. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/trackers/__init__.py +0 -0
  28. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/trackers/rebalancers.py +0 -0
  29. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/utils/__init__.py +0 -0
  30. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/utils/_pyxreloader.py +0 -0
  31. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/utils/charting/mpl_helpers.py +0 -0
  32. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/utils/marketdata/binance.py +0 -0
  33. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/utils/misc.py +0 -0
  34. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/utils/pandas.py +0 -0
  35. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/utils/runner.py +0 -0
  36. {qubx-0.1.3 → qubx-0.1.4}/src/qubx/utils/time.py +0 -0
{qubx-0.1.3 → qubx-0.1.4}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: Qubx
-Version: 0.1.3
+Version: 0.1.4
 Summary: Qubx - quantitative trading framework
 Home-page: https://github.com/dmarienko/Qubx
 Author: Dmitry Marienko
{qubx-0.1.3 → qubx-0.1.4}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "Qubx"
-version = "0.1.3"
+version = "0.1.4"
 description = "Qubx - quantitative trading framework"
 authors = ["Dmitry Marienko <dmitry@gmail.com>"]
 readme = "README.md"
qubx-0.1.4/src/qubx/data/readers.py
@@ -0,0 +1,513 @@
+import re, os
+from typing import Callable, List, Union, Optional, Iterable, Any
+from os.path import exists, join
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+from pyarrow import csv
+import psycopg as pg
+from functools import wraps
+
+from qubx import logger
+from qubx.core.series import TimeSeries, OHLCV, time_as_nsec, Quote, Trade
+from qubx.utils.time import infer_series_frequency, handle_start_stop
+
+_DT = lambda x: pd.Timedelta(x).to_numpy().item()
+D1, H1 = _DT('1D'), _DT('1h')
+
+DEFAULT_DAILY_SESSION = (_DT('00:00:00.100'), _DT('23:59:59.900'))
+STOCK_DAILY_SESSION = (_DT('9:30:00.100'), _DT('15:59:59.900'))
+CME_FUTURES_DAILY_SESSION = (_DT('8:30:00.100'), _DT('15:14:59.900'))
+
+
+def _recognize_t(t: Union[int, str], defaultvalue, timeunit) -> int:
+    if isinstance(t, (str, pd.Timestamp)):
+        try:
+            return np.datetime64(t, timeunit)
+        except:
+            pass
+    return defaultvalue
+
+
+def _find_column_index_in_list(xs, *args):
+    xs = [x.lower() for x in xs]
+    for a in args:
+        ai = a.lower()
+        if ai in xs:
+            return xs.index(ai)
+    raise IndexError(f"Can't find any from {args} in list: {xs}")
+
+
+class DataTransformer:
+
+    def __init__(self) -> None:
+        self.buffer = []
+        self._column_names = []
+
+    def start_transform(self, name: str, column_names: List[str]):
+        self._column_names = column_names
+        self.buffer = []
+
+    def process_data(self, rows_data: Iterable) -> Any:
+        if rows_data is not None:
+            self.buffer.extend(rows_data)
+
+    def collect(self) -> Any:
+        return self.buffer
+
+
+class DataReader:
+
+    def get_names(self) -> List[str]:
+        raise NotImplementedError()
+
+    def read(self, data_id: str, start: str | None=None, stop: str | None=None,
+             transform: DataTransformer = DataTransformer(),
+             chunksize=0,
+             **kwargs
+             ) -> Iterable | List:
+        raise NotImplementedError()
+
+
+class CsvStorageDataReader(DataReader):
+    """
+    Data reader for timeseries data stored as csv files in the specified directory
+    """
+
+    def __init__(self, path: str) -> None:
+        if not exists(path):
+            raise ValueError(f"Folder not found at {path}")
+        self.path = path
+
+    def __find_time_idx(self, arr: pa.ChunkedArray, v) -> int:
+        ix = arr.index(v).as_py()
+        if ix < 0:
+            for c in arr.iterchunks():
+                a = c.to_numpy()
+                ix = np.searchsorted(a, v, side='right')
+                if ix > 0 and ix < len(c):
+                    ix = arr.index(a[ix]).as_py() - 1
+                    break
+        return ix
+
+    def __check_file_name(self, name: str) -> str | None:
+        _f = join(self.path, name)
+        for sfx in ['.csv', '.csv.gz', '']:
+            if exists(p := (_f + sfx)):
+                return p
+        return None
+
+    def read(self, data_id: str, start: str | None=None, stop: str | None=None,
+             transform: DataTransformer = DataTransformer(),
+             chunksize=0,
+             timestamp_formatters=None
+             ) -> Iterable | Any:
+        f_path = self.__check_file_name(data_id)
+        if not f_path:
+            raise ValueError(f"Can't find any csv data for {data_id} in {self.path} !")
+
+        convert_options = None
+        if timestamp_formatters is not None:
+            convert_options = csv.ConvertOptions(timestamp_parsers=timestamp_formatters)
+
+        table = csv.read_csv(
+            f_path,
+            parse_options=csv.ParseOptions(ignore_empty_lines=True),
+            convert_options=convert_options
+        )
+        fieldnames = table.column_names
+
+        # - try to find range to load
+        start_idx, stop_idx = 0, table.num_rows
+        try:
+            _time_field_idx = _find_column_index_in_list(fieldnames, 'time', 'timestamp', 'datetime', 'date')
+            _time_type = table.field(_time_field_idx).type
+            _time_unit = _time_type.unit if hasattr(_time_type, 'unit') else 's'
+            _time_data = table[_time_field_idx]
+
+            # - check if we need to convert time to primitive types (i.e. Date32 -> timestamp[x])
+            _time_cast_function = lambda xs: xs
+            if _time_type != pa.timestamp(_time_unit):
+                _time_cast_function = lambda xs: xs.cast(pa.timestamp(_time_unit))
+                _time_data = _time_cast_function(_time_data)
+
+            # - preprocessing start and stop
+            t_0, t_1 = handle_start_stop(start, stop, convert=lambda x: _recognize_t(x, None, _time_unit))
+
+            # - check requested range
+            if t_0:
+                start_idx = self.__find_time_idx(_time_data, t_0)
+                if start_idx >= table.num_rows:
+                    # no data for requested start date
+                    return None
+
+            if t_1:
+                stop_idx = self.__find_time_idx(_time_data, t_1)
+                if stop_idx < 0 or stop_idx < start_idx:
+                    stop_idx = table.num_rows
+
+        except Exception as exc:
+            logger.warning(exc)
+            logger.info('loading whole file')
+
+        length = (stop_idx - start_idx + 1)
+        selected_table = table.slice(start_idx, length)
+
+        # - in this case we want to return iterable chunks of data
+        if chunksize > 0:
+            def _iter_chunks():
+                for n in range(0, length // chunksize + 1):
+                    transform.start_transform(data_id, fieldnames)
+                    raw_data = selected_table[n*chunksize : min((n+1)*chunksize, length)].to_pandas().to_numpy()
+                    transform.process_data(raw_data)
+                    yield transform.collect()
+            return _iter_chunks()
+
+        transform.start_transform(data_id, fieldnames)
+        raw_data = selected_table.to_pandas().to_numpy()
+        transform.process_data(raw_data)
+        return transform.collect()
+
+    def get_names(self) -> List[str]:
+        _n = []
+        for s in os.listdir(self.path):
+            if (m := re.match(r'(.*)\.csv(.gz)?$', s)):
+                _n.append(m.group(1))
+        return _n
+
+
+class AsPandasFrame(DataTransformer):
+    """
+    List of records to pandas dataframe transformer
+    """
+
+    def start_transform(self, name: str, column_names: List[str]):
+        self._time_idx = _find_column_index_in_list(column_names, 'time', 'timestamp', 'datetime', 'date')
+        self._column_names = column_names
+        self._frame = pd.DataFrame()
+
+    def process_data(self, rows_data: Iterable) -> Any:
+        p = pd.DataFrame.from_records(rows_data, columns=self._column_names)
+        p.set_index(self._column_names[self._time_idx], drop=True, inplace=True)
+        p.sort_index(inplace=True)
+        self._frame = pd.concat((self._frame, p), axis=0, sort=True)
+        return p
+
+    def collect(self) -> Any:
+        return self._frame
+
+
+class AsOhlcvSeries(DataTransformer):
+
+    def __init__(self, timeframe: str | None = None, timestamp_units='ns') -> None:
+        super().__init__()
+        self.timeframe = timeframe
+        self._series = None
+        self._data_type = None
+        self.timestamp_units = timestamp_units
+
+    def start_transform(self, name: str, column_names: List[str]):
+        self._time_idx = _find_column_index_in_list(column_names, 'time', 'timestamp', 'datetime', 'date')
+        self._volume_idx = None
+        self._b_volume_idx = None
+        try:
+            self._close_idx = _find_column_index_in_list(column_names, 'close')
+            self._open_idx = _find_column_index_in_list(column_names, 'open')
+            self._high_idx = _find_column_index_in_list(column_names, 'high')
+            self._low_idx = _find_column_index_in_list(column_names, 'low')
+
+            try:
+                self._volume_idx = _find_column_index_in_list(column_names, 'quote_volume', 'volume', 'vol')
+            except: pass
+
+            try:
+                self._b_volume_idx = _find_column_index_in_list(column_names, 'taker_buy_volume', 'taker_buy_quote_volume', 'buy_volume')
+            except: pass
+
+            self._data_type = 'ohlc'
+        except:
+            try:
+                self._ask_idx = _find_column_index_in_list(column_names, 'ask')
+                self._bid_idx = _find_column_index_in_list(column_names, 'bid')
+                self._data_type = 'quotes'
+            except:
+                try:
+                    self._price_idx = _find_column_index_in_list(column_names, 'price')
+                    self._size_idx = _find_column_index_in_list(column_names, 'quote_qty', 'qty', 'size', 'amount', 'volume')
+                    self._taker_idx = None
+                    try:
+                        self._taker_idx = _find_column_index_in_list(column_names, 'is_buyer_maker', 'side', 'aggressive', 'taker', 'is_taker')
+                    except: pass
+
+                    self._data_type = 'trades'
+                except:
+                    raise ValueError(f"Can't recognize data for update from header: {column_names}")
+
+        self._column_names = column_names
+        self._name = name
+        if self.timeframe:
+            self._series = OHLCV(self._name, self.timeframe)
+
+    def _time(self, t) -> int:
+        if self.timestamp_units == 'ns':
+            return np.datetime64(t, 'ns').item()
+        return np.datetime64(t, self.timestamp_units).astype('datetime64[ns]').item()
+
+    def _proc_ohlc(self, rows_data: List[List]):
+        for d in rows_data:
+            self._series.update_by_bar(
+                self._time(d[self._time_idx]),
+                d[self._open_idx], d[self._high_idx], d[self._low_idx], d[self._close_idx],
+                d[self._volume_idx] if self._volume_idx else 0,
+                d[self._b_volume_idx] if self._b_volume_idx else 0
+            )
+
+    def _proc_quotes(self, rows_data: List[List]):
+        for d in rows_data:
+            self._series.update(
+                self._time(d[self._time_idx]),
+                (d[self._ask_idx] + d[self._bid_idx]) / 2
+            )
+
+    def _proc_trades(self, rows_data: List[List]):
+        for d in rows_data:
+            a = d[self._taker_idx] if self._taker_idx else 0
+            s = d[self._size_idx]
+            b = s if a else 0
+            self._series.update(self._time(d[self._time_idx]), d[self._price_idx], s, b)
+
+    def process_data(self, rows_data: List[List]) -> Any:
+        if self._series is None:
+            ts = [t[self._time_idx] for t in rows_data[:100]]
+            self.timeframe = pd.Timedelta(infer_series_frequency(ts)).asm8.item()
+
+            # - create instance after first data is received
+            self._series = OHLCV(self._name, self.timeframe)
+
+        match self._data_type:
+            case 'ohlc':
+                self._proc_ohlc(rows_data)
+            case 'quotes':
+                self._proc_quotes(rows_data)
+            case 'trades':
+                self._proc_trades(rows_data)
+
+        return None
+
+    def collect(self) -> Any:
+        return self._series
+
+
+class AsQuotes(DataTransformer):
+
+    def start_transform(self, name: str, column_names: List[str]):
+        self.buffer = list()
+        self._time_idx = _find_column_index_in_list(column_names, 'time', 'timestamp', 'datetime')
+        self._bid_idx = _find_column_index_in_list(column_names, 'bid')
+        self._ask_idx = _find_column_index_in_list(column_names, 'ask')
+        self._bidvol_idx = _find_column_index_in_list(column_names, 'bidvol', 'bid_vol', 'bidsize', 'bid_size')
+        self._askvol_idx = _find_column_index_in_list(column_names, 'askvol', 'ask_vol', 'asksize', 'ask_size')
+
+    def process_data(self, rows_data: Iterable) -> Any:
+        if rows_data is not None:
+            for d in rows_data:
+                t = d[self._time_idx]
+                b = d[self._bid_idx]
+                a = d[self._ask_idx]
+                bv = d[self._bidvol_idx]
+                av = d[self._askvol_idx]
+                self.buffer.append(Quote(t.as_unit('ns').asm8.item(), b, a, bv, av))
+
+
+class RestoreTicksFromOHLC(DataTransformer):
+    """
+    Emulates quotes (and trades) from OHLC bars
+    """
+
+    def __init__(self,
+                 trades: bool=False,                             # if we also want 'trades'
+                 default_bid_size=1e9,                           # default bid/ask is big
+                 default_ask_size=1e9,                           # default bid/ask is big
+                 daily_session_start_end=DEFAULT_DAILY_SESSION,
+                 spread=0.0):
+        super().__init__()
+        self._trades = trades
+        self._bid_size = default_bid_size
+        self._ask_size = default_ask_size
+        self._s2 = spread / 2.0
+        self._d_session_start = daily_session_start_end[0]
+        self._d_session_end = daily_session_start_end[1]
+
+    def start_transform(self, name: str, column_names: List[str]):
+        self.buffer = []
+        # - it will fail if received data doesn't look like ohlcv
+        self._time_idx = _find_column_index_in_list(column_names, 'time', 'timestamp', 'datetime', 'date')
+        self._open_idx = _find_column_index_in_list(column_names, 'open')
+        self._high_idx = _find_column_index_in_list(column_names, 'high')
+        self._low_idx = _find_column_index_in_list(column_names, 'low')
+        self._close_idx = _find_column_index_in_list(column_names, 'close')
+        self._volume_idx = None
+        self._freq = None
+        try:
+            self._volume_idx = _find_column_index_in_list(column_names, 'volume', 'vol')
+        except: pass
+
+        if self._volume_idx is None and self._trades:
+            logger.warning("Input OHLC data doesn't contain volume information so trades can't be emulated !")
+            self._trades = False
+
+    def process_data(self, rows_data: List[List]) -> Any:
+        if rows_data is None:
+            return
+
+        s2 = self._s2
+
+        if self._freq is None:
+            ts = [t[self._time_idx] for t in rows_data[:100]]
+            self._freq = infer_series_frequency(ts)
+
+            # - timestamps when we emit simulated quotes
+            dt = self._freq.astype('timedelta64[ns]').item()
+            if dt < D1:
+                self._t_start = dt // 10
+                self._t_mid1 = dt // 2 - dt // 10
+                self._t_mid2 = dt // 2 + dt // 10
+                self._t_end = dt - dt // 10
+            else:
+                self._t_start = self._d_session_start
+                self._t_mid1 = dt // 2 - H1
+                self._t_mid2 = dt // 2 + H1
+                self._t_end = self._d_session_end
+
+        # - input data
+        for data in rows_data:
+            ti = pd.Timestamp(data[self._time_idx]).as_unit('ns').asm8.item()
+            o = data[self._open_idx]
+            h = data[self._high_idx]
+            l = data[self._low_idx]
+            c = data[self._close_idx]
+            rv = data[self._volume_idx] if self._volume_idx else 0
+
+            # - opening quote
+            self.buffer.append(Quote(ti + self._t_start, o - s2, o + s2, self._bid_size, self._ask_size))
+
+            if c >= o:
+                if self._trades:
+                    self.buffer.append(Trade(ti + self._t_start, o - s2, rv * (o - l)))  # sell 1
+                self.buffer.append(Quote(ti + self._t_mid1, l - s2, l + s2, self._bid_size, self._ask_size))
+
+                if self._trades:
+                    self.buffer.append(Trade(ti + self._t_mid1, l + s2, rv * (c - o)))  # buy 1
+                self.buffer.append(Quote(ti + self._t_mid2, h - s2, h + s2, self._bid_size, self._ask_size))
+
+                if self._trades:
+                    self.buffer.append(Trade(ti + self._t_mid2, h - s2, rv * (h - c)))  # sell 2
+            else:
+                if self._trades:
+                    self.buffer.append(Trade(ti + self._t_start, o + s2, rv * (h - o)))  # buy 1
+                self.buffer.append(Quote(ti + self._t_mid1, h - s2, h + s2, self._bid_size, self._ask_size))
+
+                if self._trades:
+                    self.buffer.append(Trade(ti + self._t_mid1, h - s2, rv * (o - c)))  # sell 1
+                self.buffer.append(Quote(ti + self._t_mid2, l - s2, l + s2, self._bid_size, self._ask_size))
+
+                if self._trades:
+                    self.buffer.append(Trade(ti + self._t_mid2, l + s2, rv * (c - l)))  # buy 2
+
+            # - closing quote
+            self.buffer.append(Quote(ti + self._t_end, c - s2, c + s2, self._bid_size, self._ask_size))
+
+
+def _retry(fn):
+    @wraps(fn)
+    def wrapper(*args, **kw):
+        cls = args[0]
+        for x in range(cls._reconnect_tries):
+            # print(x, cls._reconnect_tries)
+            try:
+                return fn(*args, **kw)
+            except (pg.InterfaceError, pg.OperationalError) as e:
+                logger.warning("Database Connection [InterfaceError or OperationalError]")
+                # print ("Idle for %s seconds" % (cls._reconnect_idle))
+                # time.sleep(cls._reconnect_idle)
+                cls._connect()
+    return wrapper
+
+
+class QuestDBConnector(DataReader):
+    """
+    Very first version of QuestDB connector
+
+    # Connect to an existing QuestDB instance
+    >>> db = QuestDBConnector('user=admin password=quest host=localhost port=8812')
+    >>> db.read('BINANCEF.ETHUSDT', '2024-01-01')
+    """
+    _reconnect_tries = 5
+    _reconnect_idle = 0.1  # seconds to wait before retrying
+
+    def __init__(self, connection_url: str) -> None:
+        self._connection = None
+        self._cursor = None
+        self.connection_url = connection_url
+        self._connect()
+
+    def _connect(self):
+        logger.info("Connecting to QuestDB ...")
+        self._connection = pg.connect(self.connection_url, autocommit=True)
+        self._cursor = self._connection.cursor()
+
+    @_retry
+    def read(self, data_id: str, start: str | None=None, stop: str | None=None,
+             transform: DataTransformer = DataTransformer(),
+             chunksize=0,  # TODO: use self._cursor.fetchmany in this case !!!!
+             timeframe: str='1m') -> Any:
+        start, end = handle_start_stop(start, stop)
+        w0 = f"timestamp >= '{start}'" if start else ''
+        w1 = f"timestamp <= '{end}'" if end else ''
+        where = f'where {w0} and {w1}' if (w0 and w1) else f"where {(w0 or w1)}"
+
+        # just a temp hack - actually we need to discuss symbology etc
+        symbol = data_id  # .split('.')[-1]
+
+        self._cursor.execute(
+            f"""
+            select timestamp,
+                first(open) as open,
+                max(high) as high,
+                min(low) as low,
+                last(close) as close,
+                sum(volume) as volume,
+                sum(quote_volume) as quote_volume,
+                sum(count) as count,
+                sum(taker_buy_volume) as taker_buy_volume,
+                sum(taker_buy_quote_volume) as taker_buy_quote_volume
+            from "{symbol.upper()}" {where}
+            SAMPLE by {timeframe};
+            """  # type: ignore
+        )
+        records = self._cursor.fetchall()  # TODO: for chunksize > 0 use fetchmany etc
+        names = [d.name for d in self._cursor.description]
+
+        transform.start_transform(data_id, names)
+
+        # d = np.array(records)
+        transform.process_data(records)
+        return transform.collect()
+
+    @_retry
+    def get_names(self) -> List[str]:
+        self._cursor.execute("select table_name from tables()")
+        records = self._cursor.fetchall()
+        return [r[0] for r in records]
+
+    def __del__(self):
+        for c in (self._cursor, self._connection):
+            try:
+                logger.info("Closing connection")
+                c.close()
+            except:
+                pass
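To make the new 0.1.4 API concrete, here is a minimal usage sketch of the reader/transformer pair added above. The data directory and symbol name are hypothetical placeholders, not files shipped with the package; the call shapes follow the signatures in the diff:

    from qubx.data.readers import CsvStorageDataReader, AsPandasFrame, RestoreTicksFromOHLC

    # 'data/' and 'BTCUSDT' are illustrative only
    reader = CsvStorageDataReader('data/')
    print(reader.get_names())  # csv files found in the folder, without extensions

    # load a date range as a pandas frame indexed by the detected time column
    frame = reader.read('BTCUSDT', '2024-01-01', '2024-02-01', transform=AsPandasFrame())

    # or replay the same bars as emulated quotes and trades
    ticks = reader.read('BTCUSDT', '2024-01-01', '2024-02-01',
                        transform=RestoreTicksFromOHLC(trades=True, spread=0.01))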
qubx-0.1.3/src/qubx/data/readers.py
@@ -1,515 +0,0 @@
-import re
-from typing import List, Union, Optional, Iterable, Any
-from os.path import exists
-import numpy as np
-import pandas as pd
-import pyarrow as pa
-from pyarrow import csv
-import psycopg as pg
-from functools import wraps
-
-from qubx import logger
-from qubx.core.series import TimeSeries, OHLCV, time_as_nsec, Quote, Trade
-from qubx.utils.time import infer_series_frequency, handle_start_stop
-
-_DT = lambda x: pd.Timedelta(x).to_numpy().item()
-D1, H1 = _DT('1D'), _DT('1h')
-
-DEFAULT_DAILY_SESSION = (_DT('00:00:00.100'), _DT('23:59:59.900'))
-STOCK_DAILY_SESSION = (_DT('9:30:00.100'), _DT('15:59:59.900'))
-CME_FUTURES_DAILY_SESSION = (_DT('8:30:00.100'), _DT('15:14:59.900'))
-
-
-def _recognize_t(t: Union[int, str], defaultvalue, timeunit) -> int:
-    if isinstance(t, (str, pd.Timestamp)):
-        try:
-            return np.datetime64(t, timeunit)
-        except:
-            pass
-    return defaultvalue
-
-
-def _find_column_index_in_list(xs, *args):
-    xs = [x.lower() for x in xs]
-    for a in args:
-        ai = a.lower()
-        if ai in xs:
-            return xs.index(ai)
-    raise IndexError(f"Can't find any from {args} in list: {xs}")
-
-
-class DataProcessor:
-    """
-    Common interface for data processor with default aggregating into list implementation
-    """
-    def __init__(self) -> None:
-        self.buffer = {}
-        self._column_names = []
-
-    def start_processing(self, column_names: List[str], name: str | None = None):
-        self._column_names = column_names
-        self.buffer = {c: [] for c in column_names}
-
-    def process_data_columns(self, columns_data: list) -> Optional[Iterable]:
-        for i, c in enumerate(columns_data):
-            self.buffer[self._column_names[i]].append(c)
-        return None
-
-    def process_data_rows(self, rows_data: list) -> Optional[Iterable]:
-        for r in rows_data:
-            c = []
-            for j, d in enumerate(r):
-                self.buffer[self._column_names[j]].append(d)
-        return None
-
-    def get_result(self) -> Any:
-        return self.buffer
-
-
-class DataReader:
-    """
-    Common interface for data reader
-    """
-    _processor: DataProcessor
-
-    def __init__(self, processor=None) -> None:
-        self._processor = DataProcessor() if processor is None else processor
-
-    def read(self, start: Optional[str]=None, stop: Optional[str]=None) -> Any:
-        pass
-
-
-class QuotesDataProcessor(DataProcessor):
-    """
-    Process quotes data and collect them as list
-    """
-    def start_processing(self, fieldnames: List[str], name: str | None = None):
-        self.buffer = list()
-        self._time_idx = _find_column_index_in_list(fieldnames, 'time', 'timestamp', 'datetime')
-        self._bid_idx = _find_column_index_in_list(fieldnames, 'bid')
-        self._ask_idx = _find_column_index_in_list(fieldnames, 'ask')
-        self._bidvol_idx = _find_column_index_in_list(fieldnames, 'bidvol', 'bid_vol', 'bidsize', 'bid_size')
-        self._askvol_idx = _find_column_index_in_list(fieldnames, 'askvol', 'ask_vol', 'asksize', 'ask_size')
-
-    def process_data_columns(self, columns_data: list) -> Optional[Iterable]:
-        tms = columns_data[self._time_idx]
-        bids = columns_data[self._bid_idx]
-        asks = columns_data[self._ask_idx]
-        bidvol = columns_data[self._bidvol_idx]
-        askvol = columns_data[self._askvol_idx]
-        for i in range(len(tms)):
-            self.buffer.append(Quote(tms[i], bids[i], asks[i], bidvol[i], askvol[i]))
-        return None
-
-
-class QuotesFromOHLCVDataProcessor(DataProcessor):
-    """
-    Process OHLC and generate Quotes (+ Trades) from it
-    """
-    def __init__(self, trades: bool=False,
-                 default_bid_size=1e9,  # default bid/ask is big
-                 default_ask_size=1e9,  # default bid/ask is big
-                 daily_session_start_end=DEFAULT_DAILY_SESSION,
-                 spread=0.0,
-                 ) -> None:
-        super().__init__()
-        self._trades = trades
-        self._bid_size = default_bid_size
-        self._ask_size = default_ask_size
-        self._s2 = spread / 2.0
-        self._d_session_start = daily_session_start_end[0]
-        self._d_session_end = daily_session_start_end[1]
-
-    def start_processing(self, fieldnames: List[str], name: str | None = None):
-        self._time_idx = _find_column_index_in_list(fieldnames, 'time', 'timestamp', 'datetime', 'date')
-        self._open_idx = _find_column_index_in_list(fieldnames, 'open')
-        self._high_idx = _find_column_index_in_list(fieldnames, 'high')
-        self._low_idx = _find_column_index_in_list(fieldnames, 'low')
-        self._close_idx = _find_column_index_in_list(fieldnames, 'close')
-        self._volume_idx = None
-        self._timeframe = None
-
-        try:
-            self._volume_idx = _find_column_index_in_list(fieldnames, 'volume', 'vol')
-        except:
-            pass
-
-        self.buffer = []
-
-    def process_data_columns(self, data: list) -> Optional[Iterable]:
-        s2 = self._s2
-        if self._timeframe is None:
-            _freq = infer_series_frequency(data[self._time_idx])
-            self._timeframe = _freq.astype('timedelta64[s]')
-
-            # - timestamps when we emit simulated quotes
-            dt = _freq.astype('timedelta64[ns]').item()
-            if dt < D1:
-                self._t_start = dt // 10
-                self._t_mid1 = dt // 2 - dt // 10
-                self._t_mid2 = dt // 2 + dt // 10
-                self._t_end = dt - dt // 10
-            else:
-                self._t_start = self._d_session_start
-                self._t_mid1 = dt // 2 - H1
-                self._t_mid2 = dt // 2 + H1
-                self._t_end = self._d_session_end
-
-        # - input data
-        times = data[self._time_idx]
-        opens = data[self._open_idx]
-        highs = data[self._high_idx]
-        lows = data[self._low_idx]
-        closes = data[self._close_idx]
-        volumes = data[self._volume_idx] if self._volume_idx else None
-        if volumes is None and self._trades:
-            logger.warning("Input OHLC data doesn't contain volume information so trades can't be emulated !")
-            self._trades = False
-
-        for i in range(len(times)):
-            ti, o, h, l, c = times[i].astype('datetime64[ns]'), opens[i], highs[i], lows[i], closes[i]
-
-            if self._trades:
-                rv = volumes[i] / (h - l)
-
-            # - opening quote
-            self.buffer.append(Quote(ti + self._t_start, o - s2, o + s2, self._bid_size, self._ask_size))
-
-            if c >= o:
-                if self._trades:
-                    self.buffer.append(Trade(ti + self._t_start, o - s2, rv * (o - l)))  # sell 1
-                self.buffer.append(Quote(ti + self._t_mid1, l - s2, l + s2, self._bid_size, self._ask_size))
-
-                if self._trades:
-                    self.buffer.append(Trade(ti + self._t_mid1, l + s2, rv * (c - o)))  # buy 1
-                self.buffer.append(Quote(ti + self._t_mid2, h - s2, h + s2, self._bid_size, self._ask_size))
-
-                if self._trades:
-                    self.buffer.append(Trade(ti + self._t_mid2, h - s2, rv * (h - c)))  # sell 2
-            else:
-                if self._trades:
-                    self.buffer.append(Trade(ti + self._t_start, o + s2, rv * (h - o)))  # buy 1
-                self.buffer.append(Quote(ti + self._t_mid1, h - s2, h + s2, self._bid_size, self._ask_size))
-
-                if self._trades:
-                    self.buffer.append(Trade(ti + self._t_mid1, h - s2, rv * (o - c)))  # sell 1
-                self.buffer.append(Quote(ti + self._t_mid2, l - s2, l + s2, self._bid_size, self._ask_size))
-
-                if self._trades:
-                    self.buffer.append(Trade(ti + self._t_mid2, l + s2, rv * (c - l)))  # buy 2
-
-            # - closing quote
-            self.buffer.append(Quote(ti + self._t_end, c - s2, c + s2, self._bid_size, self._ask_size))
-
-        return None
-
-    def get_result(self) -> Any:
-        return self.buffer
-
-
-class OhlcvDataProcessor(DataProcessor):
-    """
-    Process data and convert it to Qube OHLCV timeseries
-    """
-    def __init__(self, name: str | None = None) -> None:
-        super().__init__()
-        self._name = name
-
-    def start_processing(self, fieldnames: List[str], name: str | None = None):
-        self._time_idx = _find_column_index_in_list(fieldnames, 'time', 'timestamp', 'datetime', 'date')
-        self._open_idx = _find_column_index_in_list(fieldnames, 'open')
-        self._high_idx = _find_column_index_in_list(fieldnames, 'high')
-        self._low_idx = _find_column_index_in_list(fieldnames, 'low')
-        self._close_idx = _find_column_index_in_list(fieldnames, 'close')
-        self._volume_idx = None
-        self._b_volume_idx = None
-        self._timeframe = None
-        self._name = name if name else self._name
-
-        try:
-            self._volume_idx = _find_column_index_in_list(fieldnames, 'quote_volume', 'volume', 'vol')
-        except: pass
-
-        try:
-            self._b_volume_idx = _find_column_index_in_list(fieldnames, 'taker_buy_volume', 'taker_buy_quote_volume', 'buy_volume')
-        except: pass
-
-        self.ohlc = None
-
-    def process_data_columns(self, data: list) -> Optional[Iterable]:
-        if self._timeframe is None:
-            self._timeframe = infer_series_frequency(data[self._time_idx]).astype('timedelta64[s]')
-
-            # - create instance after first data received
-            self.ohlc = OHLCV(self._name, self._timeframe)
-
-        self.ohlc.append_data(
-            data[self._time_idx],
-            data[self._open_idx], data[self._high_idx], data[self._low_idx], data[self._close_idx],
-            data[self._volume_idx] if self._volume_idx else np.empty(0),
-            data[self._b_volume_idx] if self._b_volume_idx else np.empty(0)
-        )
-        return None
-
-    def process_data_rows(self, data: List[list]) -> Iterable | None:
-        if self._timeframe is None:
-            ts = [t[self._time_idx] for t in data[:100]]
-            self._timeframe = pd.Timedelta(infer_series_frequency(ts)).asm8.item()
-
-            # - create instance after first data received
-            self.ohlc = OHLCV(self._name, self._timeframe)
-
-        for d in data:
-            self.ohlc.update_by_bar(
-                np.datetime64(d[self._time_idx], 'ns').item(),
-                d[self._open_idx], d[self._high_idx], d[self._low_idx], d[self._close_idx],
-                d[self._volume_idx] if self._volume_idx else 0,
-                d[self._b_volume_idx] if self._b_volume_idx else 0
-            )
-        return None
-
-    def get_result(self) -> Any:
-        return self.ohlc
-
-
-class OhlcvPandasDataProcessor(DataProcessor):
-    """
-    Process data and convert it to pandas OHLCV dataframes
-    """
-    def __init__(self) -> None:
-        super().__init__()
-        self._fieldnames: List = []
-
-    def start_processing(self, fieldnames: List[str], name: str | None = None):
-        self._time_idx = _find_column_index_in_list(fieldnames, 'time', 'timestamp', 'datetime', 'date')
-        self._open_idx = _find_column_index_in_list(fieldnames, 'open')
-        self._high_idx = _find_column_index_in_list(fieldnames, 'high')
-        self._low_idx = _find_column_index_in_list(fieldnames, 'low')
-        self._close_idx = _find_column_index_in_list(fieldnames, 'close')
-        self._volume_idx = None
-        self._b_volume_idx = None
-        self._timeframe = None
-
-        try:
-            self._volume_idx = _find_column_index_in_list(fieldnames, 'quote_volume', 'volume', 'vol')
-        except: pass
-
-        try:
-            self._b_volume_idx = _find_column_index_in_list(fieldnames, 'taker_buy_volume', 'taker_buy_quote_volume', 'buy_volume')
-        except: pass
-
-        self._time = np.array([], dtype=np.datetime64)
-        self._open = np.array([])
-        self._high = np.array([])
-        self._low = np.array([])
-        self._close = np.array([])
-        self._volume = np.array([])
-        self._bvolume = np.array([])
-        self._fieldnames = fieldnames
-        self._ohlc = pd.DataFrame()
-
-    def process_data_rows(self, data: List[list]) -> Optional[Iterable]:
-        p = pd.DataFrame.from_records(data, columns=self._fieldnames)
-        p.set_index(self._fieldnames[self._time_idx], drop=True, inplace=True)
-        self._ohlc = pd.concat((self._ohlc, p), axis=0, sort=True, copy=True)
-        return None
-
-    def process_data_columns(self, data: list) -> Optional[Iterable]:
-        # p = pd.DataFrame({
-        #     'open': data[self._open_idx],
-        #     'high': data[self._high_idx],
-        #     'low': data[self._low_idx],
-        #     'close': data[self._close_idx],
-        #     'volume': data[self._volume_idx] if self._volume_idx else []},
-        #     index = data[self._time_idx]
-        # )
-        # self.ohlc = pd.concat((self.ohlc, p), axis=0, sort=True, copy=True)
-        self._time = np.concatenate((self._time, data[self._time_idx]))
-        self._open = np.concatenate((self._open, data[self._open_idx]))
-        self._high = np.concatenate((self._high, data[self._high_idx]))
-        self._low = np.concatenate((self._low, data[self._low_idx]))
-        self._close = np.concatenate((self._close, data[self._close_idx]))
-        if self._volume_idx:
-            self._volume = np.concatenate((self._volume, data[self._volume_idx]))
-        if self._b_volume_idx:
-            self._bvolume = np.concatenate((self._bvolume, data[self._b_volume_idx]))
-
-        return None
-
-    def get_result(self) -> Any:
-        if not self._ohlc.empty:
-            return self._ohlc
-
-        rd = {
-            'open': self._open, 'high': self._high, 'low': self._low, 'close': self._close,
-        }
-
-        if self._volume_idx:
-            rd['volume'] = self._volume
-
-        if self._b_volume_idx:
-            rd['taker_buy_quote_volume'] = self._bvolume
-
-        return pd.DataFrame(rd, index=self._time).sort_index()
-
-
-class CsvDataReader(DataReader):
-    """
-    CSV data file reader
-    """
-
-    def __init__(self, path: str, processor: DataProcessor | None=None, timestamp_parsers=None) -> None:
-        if not exists(path):
-            raise ValueError(f"CSV file not found at {path}")
-        super().__init__(processor)
-        self.time_parsers = timestamp_parsers
-        self.path = path
-
-    def __find_time_idx(self, arr: pa.ChunkedArray, v) -> int:
-        ix = arr.index(v).as_py()
-        if ix < 0:
-            for c in arr.iterchunks():
-                a = c.to_numpy()
-                ix = np.searchsorted(a, v, side='right')
-                if ix > 0 and ix < len(c):
-                    ix = arr.index(a[ix]).as_py() - 1
-                    break
-        return ix
-
-    def read(self, start: Optional[str]=None, stop: Optional[str]=None) -> Any:
-        convert_options = None
-        if self.time_parsers:
-            convert_options = csv.ConvertOptions(timestamp_parsers=self.time_parsers)
-
-        table = csv.read_csv(
-            self.path,
-            parse_options=csv.ParseOptions(ignore_empty_lines=True),
-            convert_options=convert_options
-        )
-        fieldnames = table.column_names
-
-        # - try to find range to load
-        start_idx, stop_idx = 0, table.num_rows
-        try:
-            _time_field_idx = _find_column_index_in_list(fieldnames, 'time', 'timestamp', 'datetime', 'date')
-            _time_type = table.field(_time_field_idx).type
-            _time_unit = _time_type.unit if hasattr(_time_type, 'unit') else 's'
-            _time_data = table[_time_field_idx]
-
-            # - check if need convert time to primitive types (i.e. Date32 -> timestamp[x])
-            _time_cast_function = lambda xs: xs
-            if _time_type != pa.timestamp(_time_unit):
-                _time_cast_function = lambda xs: xs.cast(pa.timestamp(_time_unit))
-                _time_data = _time_cast_function(_time_data)
-
-            # - preprocessing start and stop
-            t_0, t_1 = handle_start_stop(start, stop, convert=lambda x: _recognize_t(x, None, _time_unit))
-
-            # - check requested range
-            if t_0:
-                start_idx = self.__find_time_idx(_time_data, t_0)
-                if start_idx >= table.num_rows:
-                    # no data for requested start date
-                    return None
-
-            if t_1:
-                stop_idx = self.__find_time_idx(_time_data, t_1)
-                if stop_idx < 0 or stop_idx < start_idx:
-                    stop_idx = table.num_rows
-
-        except Exception as exc:
-            logger.warning(exc)
-            logger.info('loading whole file')
-
-        length = (stop_idx - start_idx + 1)
-        self._processor.start_processing(fieldnames)
-        selected_table = table.slice(start_idx, length)
-        n_chunks = selected_table[table.column_names[0]].num_chunks
-        for n in range(n_chunks):
-            data = [
-                # - in some cases we need to convert time index to primitive type
-                _time_cast_function(selected_table[k].chunk(n)).to_numpy() if k == _time_field_idx else selected_table[k].chunk(n).to_numpy()
-                for k in range(selected_table.num_columns)]
-            self._processor.process_data_columns(data)
-        return self._processor.get_result()
-
-
-def _retry(fn):
-    @wraps(fn)
-    def wrapper(*args, **kw):
-        cls = args[0]
-        for x in range(cls._reconnect_tries):
-            # print(x, cls._reconnect_tries)
-            try:
-                return fn(*args, **kw)
-            except (pg.InterfaceError, pg.OperationalError) as e:
-                logger.warning("Database Connection [InterfaceError or OperationalError]")
-                # print ("Idle for %s seconds" % (cls._reconnect_idle))
-                # time.sleep(cls._reconnect_idle)
-                cls._connect()
-    return wrapper
-
-
-class QuestDBConnector(DataReader):
-    """
-    Very first version of QuestDB connector
-
-    # Connect to an existing QuestDB instance
-    >>> db = QuestDBConnector('user=admin password=quest host=localhost port=8812', OhlcvPandasDataProcessor())
-    >>> db.read('BINANCEF.ETHUSDT', '5m', '2024-01-01')
-    """
-    _reconnect_tries = 5
-    _reconnect_idle = 0.1  # wait seconds before retrying
-
-    def __init__(self, connection_url: str, processor: DataProcessor | None=None) -> None:
-        super().__init__(processor)
-        self._connection = None
-        self._cursor = None
-        self.connection_url = connection_url
-        self._connect()
-
-    def _connect(self):
-        logger.info("Connecting to QuestDB ...")
-        self._connection = pg.connect(self.connection_url, autocommit=True)
-        self._cursor = self._connection.cursor()
-
-    @_retry
-    def read(self, symbol: str, timeframe: str, start: str | None=None, stop: str | None=None) -> Any:
-        start, end = handle_start_stop(start, stop)
-        w0 = f"timestamp >= '{start}'" if start else ''
-        w1 = f"timestamp <= '{end}'" if end else ''
-        where = f'where {w0} and {w1}' if (w0 and w1) else f"where {(w0 or w1)}"
-
-        self._cursor.execute(
-            f"""
-            select timestamp,
-                first(open) as open,
-                max(high) as high,
-                min(low) as low,
-                last(close) as close,
-                sum(volume) as volume,
-                sum(quote_volume) as quote_volume,
-                sum(count) as count,
-                sum(taker_buy_volume) as taker_buy_volume,
-                sum(taker_buy_quote_volume) as taker_buy_quote_volume
-            from "{symbol.upper()}" {where}
-            SAMPLE by {timeframe};
-            """  # type: ignore
-        )
-        records = self._cursor.fetchall()
-        names = [d.name for d in self._cursor.description]
-
-        self._processor.start_processing(names, re.split(r'[.:]', symbol)[-1])
-
-        # d = np.array(records)
-        self._processor.process_data_rows(records)
-        return self._processor.get_result()
-
-    def __del__(self):
-        for c in (self._cursor, self._connection):
-            try:
-                logger.info("Closing connection")
-                c.close()
-            except:
-                pass
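The deletion above removes the 0.1.3 processor-centric API in favour of the transformer-centric one added earlier in this diff: a processor used to be bound to the reader at construction time, whereas a transform is now passed per read() call. A rough before/after sketch, based on the two QuestDBConnector docstrings (connection URL and symbol are illustrative only):

    # 0.1.3: processor fixed at construction, timeframe as a positional argument
    db = QuestDBConnector('user=admin password=quest host=localhost port=8812',
                          OhlcvPandasDataProcessor())
    ohlc = db.read('BINANCEF.ETHUSDT', '5m', '2024-01-01')

    # 0.1.4: reader constructed once, transform chosen per call
    db = QuestDBConnector('user=admin password=quest host=localhost port=8812')
    ohlc = db.read('BINANCEF.ETHUSDT', '2024-01-01', transform=AsPandasFrame(), timeframe='5m')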