Qubx 0.1.3__cp311-cp311-manylinux_2_35_x86_64.whl → 0.1.4__cp311-cp311-manylinux_2_35_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of Qubx might be problematic.

qubx/data/readers.py CHANGED
@@ -1,6 +1,6 @@
- import re
- from typing import List, Union, Optional, Iterable, Any
- from os.path import exists
+ import re, os
+ from typing import Callable, List, Union, Optional, Iterable, Any
+ from os.path import exists, join
  import numpy as np
  import pandas as pd
  import pyarrow as pa
@@ -38,80 +38,301 @@ def _find_column_index_in_list(xs, *args):
      raise IndexError(f"Can't find any from {args} in list: {xs}")


- class DataProcessor:
-     """
-     Common interface for data processor with default aggregating into list implementation
-     """
+ class DataTransformer:
+
      def __init__(self) -> None:
-         self.buffer = {}
+         self.buffer = []
          self._column_names = []

-     def start_processing(self, column_names: List[str], name: str | None = None):
+     def start_transform(self, name: str, column_names: List[str]):
          self._column_names = column_names
-         self.buffer = {c: [] for c in column_names}
-
-     def process_data_columns(self, columns_data: list) -> Optional[Iterable]:
-         for i, c in enumerate(columns_data):
-             self.buffer[self._column_names[i]].append(c)
-         return None
-
-     def process_data_rows(self, rows_data: list) -> Optional[Iterable]:
-         for r in rows_data:
-             c = []
-             for j, d in enumerate(r):
-                 self.buffer[self._column_names[j]].append(d)
-         return None
+         self.buffer = []
+
+     def process_data(self, rows_data: Iterable) -> Any:
+         if rows_data is not None:
+             self.buffer.extend(rows_data)

-     def get_result(self) -> Any:
+     def collect(self) -> Any:
          return self.buffer


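The new protocol replaces DataProcessor's process_data_columns/process_data_rows pair and get_result with a single row-oriented process_data plus collect. A minimal sketch of a custom transformer against that contract (CountRows is a hypothetical example, not part of the package; the import path follows the file shown in this diff):

    from qubx.data.readers import DataTransformer

    class CountRows(DataTransformer):
        # hypothetical: count rows instead of buffering them
        def start_transform(self, name, column_names):
            self._count = 0

        def process_data(self, rows_data):
            if rows_data is not None:
                self._count += len(rows_data)

        def collect(self):
            return self._count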
  class DataReader:
+
+     def get_names(self) -> List[str]:
+         raise NotImplementedError()
+
+     def read(self, data_id: str, start: str | None=None, stop: str | None=None,
+              transform: DataTransformer = DataTransformer(),
+              chunksize=0,
+              **kwargs
+              ) -> Iterable | List:
+         raise NotImplementedError()
+
+
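Concrete readers drive a transformer through a fixed call order: start_transform(data_id, column_names) once per result (or once per chunk when chunksize > 0), then process_data(rows) for the raw records, then collect() for the final value. A sketch of that sequence, assuming the rows and field names are already in hand (_apply_transform is a hypothetical helper that mirrors what the read() implementations below do):

    def _apply_transform(transform: DataTransformer, data_id: str, fieldnames: list, rows: list):
        # the lifecycle every DataReader implementation follows
        transform.start_transform(data_id, fieldnames)
        transform.process_data(rows)
        return transform.collect()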
+ class CsvStorageDataReader(DataReader):
      """
-     Common interface for data reader
+     Data reader for timeseries data stored as csv files in the specified directory
      """
-     _processor: DataProcessor

-     def __init__(self, processor=None) -> None:
-         self._processor = DataProcessor() if processor is None else processor
+     def __init__(self, path: str) -> None:
+         if not exists(path):
+             raise ValueError(f"Folder is not found at {path}")
+         self.path = path

-     def read(self, start: Optional[str]=None, stop: Optional[str]=None) -> Any:
-         pass
+     def __find_time_idx(self, arr: pa.ChunkedArray, v) -> int:
+         ix = arr.index(v).as_py()
+         if ix < 0:
+             for c in arr.iterchunks():
+                 a = c.to_numpy()
+                 ix = np.searchsorted(a, v, side='right')
+                 if ix > 0 and ix < len(c):
+                     ix = arr.index(a[ix]).as_py() - 1
+                     break
+         return ix

-
- class QuotesDataProcessor(DataProcessor):
+     def __check_file_name(self, name: str) -> str | None:
+         _f = join(self.path, name)
+         for sfx in ['.csv', '.csv.gz', '']:
+             if exists(p:=(_f + sfx)):
+                 return p
+         return None
+
+     def read(self, data_id: str, start: str | None=None, stop: str | None=None,
+              transform: DataTransformer = DataTransformer(),
+              chunksize=0,
+              timestamp_formatters = None
+              ) -> Iterable | Any:
+
+         f_path = self.__check_file_name(data_id)
+         if not f_path:
+             raise ValueError(f"Can't find any csv data for {data_id} in {self.path} !")
+
+         convert_options = None
+         if timestamp_formatters is not None:
+             convert_options = csv.ConvertOptions(timestamp_parsers=timestamp_formatters)
+
+         table = csv.read_csv(
+             f_path,
+             parse_options=csv.ParseOptions(ignore_empty_lines=True),
+             convert_options=convert_options
+         )
+         fieldnames = table.column_names
+
+         # - try to find range to load
+         start_idx, stop_idx = 0, table.num_rows
+         try:
+             _time_field_idx = _find_column_index_in_list(fieldnames, 'time', 'timestamp', 'datetime', 'date')
+             _time_type = table.field(_time_field_idx).type
+             _time_unit = _time_type.unit if hasattr(_time_type, 'unit') else 's'
+             _time_data = table[_time_field_idx]
+
+             # - check if need convert time to primitive types (i.e. Date32 -> timestamp[x])
+             _time_cast_function = lambda xs: xs
+             if _time_type != pa.timestamp(_time_unit):
+                 _time_cast_function = lambda xs: xs.cast(pa.timestamp(_time_unit))
+                 _time_data = _time_cast_function(_time_data)
+
+             # - preprocessing start and stop
+             t_0, t_1 = handle_start_stop(start, stop, convert=lambda x: _recognize_t(x, None, _time_unit))
+
+             # - check requested range
+             if t_0:
+                 start_idx = self.__find_time_idx(_time_data, t_0)
+                 if start_idx >= table.num_rows:
+                     # no data for requested start date
+                     return None
+
+             if t_1:
+                 stop_idx = self.__find_time_idx(_time_data, t_1)
+                 if stop_idx < 0 or stop_idx < start_idx:
+                     stop_idx = table.num_rows
+
+         except Exception as exc:
+             logger.warning(exc)
+             logger.info('loading whole file')
+
+         length = (stop_idx - start_idx + 1)
+         selected_table = table.slice(start_idx, length)
+
+         # - in this case we want to return iterable chunks of data
+         if chunksize > 0:
+             def _iter_chunks():
+                 for n in range(0, length // chunksize + 1):
+                     transform.start_transform(data_id, fieldnames)
+                     raw_data = selected_table[n*chunksize : min((n+1)*chunksize, length)].to_pandas().to_numpy()
+                     transform.process_data(raw_data)
+                     yield transform.collect()
+             return _iter_chunks()
+
+         transform.start_transform(data_id, fieldnames)
+         raw_data = selected_table.to_pandas().to_numpy()
+         transform.process_data(raw_data)
+         return transform.collect()
+
+     def get_names(self) -> List[str]:
+         _n = []
+         for s in os.listdir(self.path):
+             if (m:=re.match(r'(.*)\.csv(.gz)?$', s)):
+                 _n.append(m.group(1))
+         return _n
+
+
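Given the code above, a CsvStorageDataReader resolves <data_id>.csv or <data_id>.csv.gz under its folder and slices by the detected time column; with chunksize > 0, read() returns a generator and start_transform/collect run once per chunk. A usage sketch (folder and symbol are hypothetical):

    reader = CsvStorageDataReader('/data/crypto')          # folder with e.g. BTCUSDT.csv.gz
    print(reader.get_names())                              # ids derived from *.csv / *.csv.gz names

    rows = reader.read('BTCUSDT', start='2023-01-01', stop='2023-02-01')   # default transform: list of rows
    for chunk in reader.read('BTCUSDT', chunksize=10_000): # each yield is a fresh transform.collect()
        pass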
+ class AsPandasFrame(DataTransformer):
      """
-     Process quotes data and collect them as list
+     List of records to pandas dataframe transformer
      """
-     def start_processing(self, fieldnames: List[str], name: str | None = None):
-         self.buffer = list()
-         self._time_idx = _find_column_index_in_list(fieldnames, 'time', 'timestamp', 'datetime')
-         self._bid_idx = _find_column_index_in_list(fieldnames, 'bid')
-         self._ask_idx = _find_column_index_in_list(fieldnames, 'ask')
-         self._bidvol_idx = _find_column_index_in_list(fieldnames, 'bidvol', 'bid_vol', 'bidsize', 'bid_size')
-         self._askvol_idx = _find_column_index_in_list(fieldnames, 'askvol', 'ask_vol', 'asksize', 'ask_size')
-
-     def process_data_columns(self, columns_data: list) -> Optional[Iterable]:
-         tms = columns_data[self._time_idx]
-         bids = columns_data[self._bid_idx]
-         asks = columns_data[self._ask_idx]
-         bidvol = columns_data[self._bidvol_idx]
-         askvol = columns_data[self._askvol_idx]
-         for i in range(len(tms)):
-             self.buffer.append(Quote(tms[i], bids[i], asks[i], bidvol[i], askvol[i]))
+
+     def start_transform(self, name: str, column_names: List[str]):
+         self._time_idx = _find_column_index_in_list(column_names, 'time', 'timestamp', 'datetime', 'date')
+         self._column_names = column_names
+         self._frame = pd.DataFrame()
+
+     def process_data(self, rows_data: Iterable) -> Any:
+         p = pd.DataFrame.from_records(rows_data, columns=self._column_names)
+         p.set_index(self._column_names[self._time_idx], drop=True, inplace=True)
+         p.sort_index(inplace=True)
+         self._frame = pd.concat((self._frame, p), axis=0, sort=True)
+         return p
+
+     def collect(self) -> Any:
+         return self._frame
+
+
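Plugging AsPandasFrame into a reader yields a time-indexed, sorted DataFrame instead of raw rows. A sketch with the same hypothetical folder:

    from qubx.data.readers import CsvStorageDataReader, AsPandasFrame

    df = CsvStorageDataReader('/data/crypto').read('BTCUSDT', start='2023-01-01', transform=AsPandasFrame())
    # df is indexed by the detected time column and sorted by time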
+ class AsOhlcvSeries(DataTransformer):
+
+     def __init__(self, timeframe: str | None = None, timestamp_units='ns') -> None:
+         super().__init__()
+         self.timeframe = timeframe
+         self._series = None
+         self._data_type = None
+         self.timestamp_units = timestamp_units
+
+     def start_transform(self, name: str, column_names: List[str]):
+         self._time_idx = _find_column_index_in_list(column_names, 'time', 'timestamp', 'datetime', 'date')
+         self._volume_idx = None
+         self._b_volume_idx = None
+         try:
+             self._close_idx = _find_column_index_in_list(column_names, 'close')
+             self._open_idx = _find_column_index_in_list(column_names, 'open')
+             self._high_idx = _find_column_index_in_list(column_names, 'high')
+             self._low_idx = _find_column_index_in_list(column_names, 'low')
+
+             try:
+                 self._volume_idx = _find_column_index_in_list(column_names, 'quote_volume', 'volume', 'vol')
+             except: pass
+
+             try:
+                 self._b_volume_idx = _find_column_index_in_list(column_names, 'taker_buy_volume', 'taker_buy_quote_volume', 'buy_volume')
+             except: pass
+
+             self._data_type = 'ohlc'
+         except:
+             try:
+                 self._ask_idx = _find_column_index_in_list(column_names, 'ask')
+                 self._bid_idx = _find_column_index_in_list(column_names, 'bid')
+                 self._data_type = 'quotes'
+             except:
+
+                 try:
+                     self._price_idx = _find_column_index_in_list(column_names, 'price')
+                     self._size_idx = _find_column_index_in_list(column_names, 'quote_qty', 'qty', 'size', 'amount', 'volume')
+                     self._taker_idx = None
+                     try:
+                         self._taker_idx = _find_column_index_in_list(column_names, 'is_buyer_maker', 'side', 'aggressive', 'taker', 'is_taker')
+                     except: pass
+
+                     self._data_type = 'trades'
+                 except:
+                     raise ValueError(f"Can't recognize data for update from header: {column_names}")
+
+         self._column_names = column_names
+         self._name = name
+         if self.timeframe:
+             self._series = OHLCV(self._name, self.timeframe)
+
+     def _time(self, t) -> int:
+         if self.timestamp_units == 'ns':
+             return np.datetime64(t, 'ns').item()
+         return np.datetime64(t, self.timestamp_units).astype('datetime64[ns]').item()
+
+     def _proc_ohlc(self, rows_data: List[List]):
+         for d in rows_data:
+             self._series.update_by_bar(
+                 self._time(d[self._time_idx]),
+                 d[self._open_idx], d[self._high_idx], d[self._low_idx], d[self._close_idx],
+                 d[self._volume_idx] if self._volume_idx else 0,
+                 d[self._b_volume_idx] if self._b_volume_idx else 0
+             )
+
+     def _proc_quotes(self, rows_data: List[List]):
+         for d in rows_data:
+             self._series.update(
+                 self._time(d[self._time_idx]),
+                 (d[self._ask_idx] + d[self._bid_idx])/2
+             )
+
+     def _proc_trades(self, rows_data: List[List]):
+         for d in rows_data:
+             a = d[self._taker_idx] if self._taker_idx else 0
+             s = d[self._size_idx]
+             b = s if a else 0
+             self._series.update(self._time(d[self._time_idx]), d[self._price_idx], s, b)
+
+     def process_data(self, rows_data: List[List]) -> Any:
+         if self._series is None:
+             ts = [t[self._time_idx] for t in rows_data[:100]]
+             self.timeframe = pd.Timedelta(infer_series_frequency(ts)).asm8.item()
+
+             # - create instance after first data is received
+             self._series = OHLCV(self._name, self.timeframe)
+
+         match self._data_type:
+             case 'ohlc':
+                 self._proc_ohlc(rows_data)
+             case 'quotes':
+                 self._proc_quotes(rows_data)
+             case 'trades':
+                 self._proc_trades(rows_data)
+
          return None

+     def collect(self) -> Any:
+         return self._series
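AsOhlcvSeries sniffs the header to decide whether the rows are bars ('ohlc'), 'quotes', or 'trades' and aggregates them into an OHLCV series; if no timeframe is given, one is inferred from the first 100 timestamps. A sketch (symbol, path, and the '1h' timeframe string are assumptions, not documented values):

    from qubx.data.readers import CsvStorageDataReader, AsOhlcvSeries

    ohlc = CsvStorageDataReader('/data/crypto').read('BTCUSDT', transform=AsOhlcvSeries('1h'))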
 
- class QuotesFromOHLCVDataProcessor(DataProcessor):
+
+ class AsQuotes(DataTransformer):
+
+     def start_transform(self, name: str, column_names: List[str]):
+         self.buffer = list()
+         self._time_idx = _find_column_index_in_list(column_names, 'time', 'timestamp', 'datetime')
+         self._bid_idx = _find_column_index_in_list(column_names, 'bid')
+         self._ask_idx = _find_column_index_in_list(column_names, 'ask')
+         self._bidvol_idx = _find_column_index_in_list(column_names, 'bidvol', 'bid_vol', 'bidsize', 'bid_size')
+         self._askvol_idx = _find_column_index_in_list(column_names, 'askvol', 'ask_vol', 'asksize', 'ask_size')
+
+     def process_data(self, rows_data: Iterable) -> Any:
+         if rows_data is not None:
+             for d in rows_data:
+                 t = d[self._time_idx]
+                 b = d[self._bid_idx]
+                 a = d[self._ask_idx]
+                 bv = d[self._bidvol_idx]
+                 av = d[self._askvol_idx]
+                 self.buffer.append(Quote(t.as_unit('ns').asm8.item(), b, a, bv, av))
+
+
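AsQuotes expects time/bid/ask plus bid and ask size columns and buffers Quote objects; note that it assumes pandas Timestamp values in the time column (it calls .as_unit('ns')). A sketch over a hypothetical quotes file:

    quotes = CsvStorageDataReader('/data/quotes').read('EURUSD', transform=AsQuotes())
    first_quote = quotes[0] if quotes else None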
+ class RestoreTicksFromOHLC(DataTransformer):
      """
-     Process OHLC and generate Quotes (+ Trades) from it
+     Emulates quotes (and trades) from OHLC bars
      """
-     def __init__(self, trades: bool=False,
+
+     def __init__(self,
+                  trades: bool=False,              # if we also want 'trades'
                   default_bid_size=1e9,            # default bid/ask is big
                   default_ask_size=1e9,            # default bid/ask is big
                   daily_session_start_end=DEFAULT_DAILY_SESSION,
-                  spread=0.0,
-                  ) -> None:
+                  spread=0.0):
          super().__init__()
          self._trades = trades
          self._bid_size = default_bid_size
@@ -120,30 +341,36 @@ class QuotesFromOHLCVDataProcessor(DataProcessor):
          self._d_session_start = daily_session_start_end[0]
          self._d_session_end = daily_session_start_end[1]

-     def start_processing(self, fieldnames: List[str], name: str | None = None):
-         self._time_idx = _find_column_index_in_list(fieldnames, 'time', 'timestamp', 'datetime', 'date')
-         self._open_idx = _find_column_index_in_list(fieldnames, 'open')
-         self._high_idx = _find_column_index_in_list(fieldnames, 'high')
-         self._low_idx = _find_column_index_in_list(fieldnames, 'low')
-         self._close_idx = _find_column_index_in_list(fieldnames, 'close')
+     def start_transform(self, name: str, column_names: List[str]):
+         self.buffer = []
+         # - it will fail if received data doesn't look like ohlcv
+         self._time_idx = _find_column_index_in_list(column_names, 'time', 'timestamp', 'datetime', 'date')
+         self._open_idx = _find_column_index_in_list(column_names, 'open')
+         self._high_idx = _find_column_index_in_list(column_names, 'high')
+         self._low_idx = _find_column_index_in_list(column_names, 'low')
+         self._close_idx = _find_column_index_in_list(column_names, 'close')
          self._volume_idx = None
-         self._timeframe = None
-
+         self._freq = None
          try:
-             self._volume_idx = _find_column_index_in_list(fieldnames, 'volume', 'vol')
-         except:
-             pass
+             self._volume_idx = _find_column_index_in_list(column_names, 'volume', 'vol')
+         except: pass

-         self.buffer = []
+         if self._volume_idx is None and self._trades:
+             logger.warning("Input OHLC data doesn't contain volume information so trades can't be emulated !")
+             self._trades = False
+
+     def process_data(self, rows_data: List[List]) -> Any:
+         if rows_data is None:
+             return

-     def process_data_columns(self, data: list) -> Optional[Iterable]:
          s2 = self._s2
-         if self._timeframe is None:
-             _freq = infer_series_frequency(data[self._time_idx])
-             self._timeframe = _freq.astype('timedelta64[s]')
+
+         if self._freq is None:
+             ts = [t[self._time_idx] for t in rows_data[:100]]
+             self._freq = infer_series_frequency(ts)

          # - timestamps when we emit simulated quotes
-         dt = _freq.astype('timedelta64[ns]').item()
+         dt = self._freq.astype('timedelta64[ns]').item()
          if dt < D1:
              self._t_start = dt // 10
              self._t_mid1 = dt // 2 - dt // 10
@@ -156,21 +383,13 @@ class QuotesFromOHLCVDataProcessor(DataProcessor):
              self._t_end = self._d_session_end

          # - input data
-         times = data[self._time_idx]
-         opens = data[self._open_idx]
-         highs = data[self._high_idx]
-         lows = data[self._low_idx]
-         closes = data[self._close_idx]
-         volumes = data[self._volume_idx] if self._volume_idx else None
-         if volumes is None and self._trades:
-             logger.warning("Input OHLC data doesn't contain volume information so trades can't be emulated !")
-             self._trades = False
-
-         for i in range(len(times)):
-             ti, o, h, l, c = times[i].astype('datetime64[ns]'), opens[i], highs[i], lows[i], closes[i]
-
-             if self._trades:
-                 rv = volumes[i] / (h - l)
+         for data in rows_data:
+             ti = pd.Timestamp(data[self._time_idx]).as_unit('ns').asm8.item()
+             o = data[self._open_idx]
+             h = data[self._high_idx]
+             l = data[self._low_idx]
+             c = data[self._close_idx]
+             rv = data[self._volume_idx] if self._volume_idx else 0

              # - opening quote
              self.buffer.append(Quote(ti + self._t_start, o - s2, o + s2, self._bid_size, self._ask_size))
@@ -201,238 +420,6 @@ class QuotesFromOHLCVDataProcessor(DataProcessor):
              # - closing quote
              self.buffer.append(Quote(ti + self._t_end, c - s2, c + s2, self._bid_size, self._ask_size))

-         return None
-
-     def get_result(self) -> Any:
-         return self.buffer
-
-
- class OhlcvDataProcessor(DataProcessor):
-     """
-     Process data and convert it to Qube OHLCV timeseries
-     """
-     def __init__(self, name: str | None = None) -> None:
-         super().__init__()
-         self._name = name
-
-     def start_processing(self, fieldnames: List[str], name: str | None = None):
-         self._time_idx = _find_column_index_in_list(fieldnames, 'time', 'timestamp', 'datetime', 'date')
-         self._open_idx = _find_column_index_in_list(fieldnames, 'open')
-         self._high_idx = _find_column_index_in_list(fieldnames, 'high')
-         self._low_idx = _find_column_index_in_list(fieldnames, 'low')
-         self._close_idx = _find_column_index_in_list(fieldnames, 'close')
-         self._volume_idx = None
-         self._b_volume_idx = None
-         self._timeframe = None
-         self._name = name if name else self._name
-
-         try:
-             self._volume_idx = _find_column_index_in_list(fieldnames, 'quote_volume', 'volume', 'vol')
-         except: pass
-
-         try:
-             self._b_volume_idx = _find_column_index_in_list(fieldnames, 'taker_buy_volume', 'taker_buy_quote_volume', 'buy_volume')
-         except: pass
-
-         self.ohlc = None
-
-     def process_data_columns(self, data: list) -> Optional[Iterable]:
-         if self._timeframe is None:
-             self._timeframe = infer_series_frequency(data[self._time_idx]).astype('timedelta64[s]')
-
-             # - create instance after first data received
-             self.ohlc = OHLCV(self._name, self._timeframe)
-
-         self.ohlc.append_data(
-             data[self._time_idx],
-             data[self._open_idx], data[self._high_idx], data[self._low_idx], data[self._close_idx],
-             data[self._volume_idx] if self._volume_idx else np.empty(0),
-             data[self._b_volume_idx] if self._b_volume_idx else np.empty(0)
-         )
-         return None
-
-     def process_data_rows(self, data: List[list]) -> Iterable | None:
-         if self._timeframe is None:
-             ts = [t[self._time_idx] for t in data[:100]]
-             self._timeframe = pd.Timedelta(infer_series_frequency(ts)).asm8.item()
-
-             # - create instance after first data received
-             self.ohlc = OHLCV(self._name, self._timeframe)
-
-         for d in data:
-             self.ohlc.update_by_bar(
-                 np.datetime64(d[self._time_idx], 'ns').item(),
-                 d[self._open_idx], d[self._high_idx], d[self._low_idx], d[self._close_idx],
-                 d[self._volume_idx] if self._volume_idx else 0,
-                 d[self._b_volume_idx] if self._b_volume_idx else 0
-             )
-         return None
-
-     def get_result(self) -> Any:
-         return self.ohlc
-
-
- class OhlcvPandasDataProcessor(DataProcessor):
-     """
-     Process data and convert it to pandas OHLCV dataframes
-     """
-     def __init__(self) -> None:
-         super().__init__()
-         self._fieldnames: List = []
-
-     def start_processing(self, fieldnames: List[str], name: str | None = None):
-         self._time_idx = _find_column_index_in_list(fieldnames, 'time', 'timestamp', 'datetime', 'date')
-         self._open_idx = _find_column_index_in_list(fieldnames, 'open')
-         self._high_idx = _find_column_index_in_list(fieldnames, 'high')
-         self._low_idx = _find_column_index_in_list(fieldnames, 'low')
-         self._close_idx = _find_column_index_in_list(fieldnames, 'close')
-         self._volume_idx = None
-         self._b_volume_idx = None
-         self._timeframe = None
-
-         try:
-             self._volume_idx = _find_column_index_in_list(fieldnames, 'quote_volume', 'volume', 'vol')
-         except: pass
-
-         try:
-             self._b_volume_idx = _find_column_index_in_list(fieldnames, 'taker_buy_volume', 'taker_buy_quote_volume', 'buy_volume')
-         except: pass
-
-         self._time = np.array([], dtype=np.datetime64)
-         self._open = np.array([])
-         self._high = np.array([])
-         self._low = np.array([])
-         self._close = np.array([])
-         self._volume = np.array([])
-         self._bvolume = np.array([])
-         self._fieldnames = fieldnames
-         self._ohlc = pd.DataFrame()
-
-     def process_data_rows(self, data: List[list]) -> Optional[Iterable]:
-         p = pd.DataFrame.from_records(data, columns=self._fieldnames)
-         p.set_index(self._fieldnames[self._time_idx], drop=True, inplace=True)
-         self._ohlc = pd.concat((self._ohlc, p), axis=0, sort=True, copy=True)
-         return None
-
-     def process_data_columns(self, data: list) -> Optional[Iterable]:
-         # p = pd.DataFrame({
-         #     'open': data[self._open_idx],
-         #     'high': data[self._high_idx],
-         #     'low': data[self._low_idx],
-         #     'close': data[self._close_idx],
-         #     'volume': data[self._volume_idx] if self._volume_idx else []},
-         #     index = data[self._time_idx]
-         # )
-         # self.ohlc = pd.concat((self.ohlc, p), axis=0, sort=True, copy=True)
-         self._time = np.concatenate((self._time, data[self._time_idx]))
-         self._open = np.concatenate((self._open, data[self._open_idx]))
-         self._high = np.concatenate((self._high, data[self._high_idx]))
-         self._low = np.concatenate((self._low, data[self._low_idx]))
-         self._close = np.concatenate((self._close, data[self._close_idx]))
-         if self._volume_idx:
-             self._volume = np.concatenate((self._volume, data[self._volume_idx]))
-         if self._b_volume_idx:
-             self._bvolume = np.concatenate((self._bvolume, data[self._b_volume_idx]))
-
-         return None
-
-     def get_result(self) -> Any:
-         if not self._ohlc.empty:
-             return self._ohlc
-
-         rd = {
-             'open': self._open, 'high': self._high, 'low': self._low, 'close': self._close,
-         }
-
-         if self._volume_idx:
-             rd['volume'] = self._volume
-
-         if self._b_volume_idx:
-             rd['taker_buy_quote_volume'] = self._bvolume
-
-         return pd.DataFrame(rd, index = self._time).sort_index()
-
-
- class CsvDataReader(DataReader):
-     """
-     CSV data file reader
-     """
-
-     def __init__(self, path: str, processor: DataProcessor|None=None, timestamp_parsers=None) -> None:
-         if not exists(path):
-             raise ValueError(f"CSV file not found at {path}")
-         super().__init__(processor)
-         self.time_parsers = timestamp_parsers
-         self.path = path
-
-     def __find_time_idx(self, arr: pa.ChunkedArray, v) -> int:
-         ix = arr.index(v).as_py()
-         if ix < 0:
-             for c in arr.iterchunks():
-                 a = c.to_numpy()
-                 ix = np.searchsorted(a, v, side='right')
-                 if ix > 0 and ix < len(c):
-                     ix = arr.index(a[ix]).as_py() - 1
-                     break
-         return ix
-
-     def read(self, start: Optional[str]=None, stop: Optional[str]=None) -> Any:
-         convert_options = None
-         if self.time_parsers:
-             convert_options = csv.ConvertOptions(timestamp_parsers=self.time_parsers)
-
-         table = csv.read_csv(
-             self.path,
-             parse_options=csv.ParseOptions(ignore_empty_lines=True),
-             convert_options=convert_options
-         )
-         fieldnames = table.column_names
-
-         # - try to find range to load
-         start_idx, stop_idx = 0, table.num_rows
-         try:
-             _time_field_idx = _find_column_index_in_list(fieldnames, 'time', 'timestamp', 'datetime', 'date')
-             _time_type = table.field(_time_field_idx).type
-             _time_unit = _time_type.unit if hasattr(_time_type, 'unit') else 's'
-             _time_data = table[_time_field_idx]
-
-             # - check if need convert time to primitive types (i.e. Date32 -> timestamp[x])
-             _time_cast_function = lambda xs: xs
-             if _time_type != pa.timestamp(_time_unit):
-                 _time_cast_function = lambda xs: xs.cast(pa.timestamp(_time_unit))
-                 _time_data = _time_cast_function(_time_data)
-
-             # - preprocessing start and stop
-             t_0, t_1 = handle_start_stop(start, stop, convert=lambda x: _recognize_t(x, None, _time_unit))
-
-             # - check requested range
-             if t_0:
-                 start_idx = self.__find_time_idx(_time_data, t_0)
-                 if start_idx >= table.num_rows:
-                     # no data for requested start date
-                     return None
-
-             if t_1:
-                 stop_idx = self.__find_time_idx(_time_data, t_1)
-                 if stop_idx < 0 or stop_idx < start_idx:
-                     stop_idx = table.num_rows
-
-         except Exception as exc:
-             logger.warning(exc)
-             logger.info('loading whole file')
-
-         length = (stop_idx - start_idx + 1)
-         self._processor.start_processing(fieldnames)
-         selected_table = table.slice(start_idx, length)
-         n_chunks = selected_table[table.column_names[0]].num_chunks
-         for n in range(n_chunks):
-             data = [
-                 # - in some cases we need to convert time index to primitive type
-                 _time_cast_function(selected_table[k].chunk(n)).to_numpy() if k == _time_field_idx else selected_table[k].chunk(n).to_numpy()
-                 for k in range(selected_table.num_columns)]
-             self._processor.process_data_columns(data)
-         return self._processor.get_result()
-

  def _retry(fn):
      @wraps(fn)
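For RestoreTicksFromOHLC, which supersedes the deleted QuotesFromOHLCVDataProcessor above: each bar is expanded into several synthetic quotes (open, mid-bar points around high/low, close) separated by the configured spread, and trades are emulated only when a volume column is present. A sketch with hypothetical inputs:

    ticks = CsvStorageDataReader('/data/ohlc').read(
        'BTCUSDT',
        transform=RestoreTicksFromOHLC(trades=True, spread=0.5),
    )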
@@ -456,13 +443,12 @@ class QuestDBConnector(DataReader):

      # Connect to an existing QuestDB instance
      >>> db = QuestDBConnector('user=admin password=quest host=localhost port=8812', OhlcvPandasDataProcessor())
-     >>> db.read('BINANCEF.ETHUSDT', '5m', '2024-01-01')
+     >>> db.read('BINANCEF.ETHUSDT', '2024-01-01')
      """
      _reconnect_tries = 5
      _reconnect_idle = 0.1   # wait seconds before retrying

-     def __init__(self, connection_url: str, processor: DataProcessor | None=None) -> None:
-         super().__init__(processor)
+     def __init__(self, connection_url: str) -> None:
          self._connection = None
          self._cursor = None
          self.connection_url = connection_url
@@ -474,12 +460,18 @@
          self._cursor = self._connection.cursor()

      @_retry
-     def read(self, symbol: str, timeframe: str, start: str|None=None, stop: str|None=None) -> Any:
+     def read(self, data_id: str, start: str|None=None, stop: str|None=None,
+              transform: DataTransformer = DataTransformer(),
+              chunksize=0,          # TODO: use self._cursor.fetchmany in this case !!!!
+              timeframe: str='1m') -> Any:
          start, end = handle_start_stop(start, stop)
          w0 = f"timestamp >= '{start}'" if start else ''
          w1 = f"timestamp <= '{end}'" if end else ''
          where = f'where {w0} and {w1}' if (w0 and w1) else f"where {(w0 or w1)}"

+         # just a temp hack - actually we need to discuss symbology etc
+         symbol = data_id  # .split('.')[-1]
+
          self._cursor.execute(
              f"""
              select timestamp,
@@ -496,14 +488,20 @@
              SAMPLE by {timeframe};
          """ # type: ignore
          )
-         records = self._cursor.fetchall()
+         records = self._cursor.fetchall()    # TODO: for chunksize > 0 use fetchmany etc
          names = [d.name for d in self._cursor.description]

-         self._processor.start_processing(names, re.split(r'[.:]', symbol)[-1])
+         transform.start_transform(data_id, names)

          # d = np.array(records)
-         self._processor.process_data_rows(records)
-         return self._processor.get_result()
+         transform.process_data(records)
+         return transform.collect()
+
+     @_retry
+     def get_names(self) -> List[str]:
+         self._cursor.execute("select table_name from tables()")
+         records = self._cursor.fetchall()
+         return [r[0] for r in records]

      def __del__(self):
          for c in (self._cursor, self._connection):
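QuestDBConnector.read() now takes a data_id plus the shared transform/chunksize arguments, samples server-side by the timeframe keyword (default '1m'), and get_names() lists the QuestDB tables. A sketch using the connection string from the docstring above (table name hypothetical):

    db = QuestDBConnector('user=admin password=quest host=localhost port=8812')
    print(db.get_names())                                  # table names from tables()
    df = db.read('BINANCEF.ETHUSDT', '2024-01-01', transform=AsPandasFrame(), timeframe='1h')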
qubx-0.1.3.dist-info/METADATA → qubx-0.1.4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: Qubx
- Version: 0.1.3
+ Version: 0.1.4
  Summary: Qubx - quantitative trading framework
  Home-page: https://github.com/dmarienko/Qubx
  Author: Dmitry Marienko
qubx-0.1.3.dist-info/RECORD → qubx-0.1.4.dist-info/RECORD CHANGED
@@ -6,13 +6,13 @@ qubx/core/basics.py,sha256=2u7WV5KX-RbTmzoKfi1yT4HNLDPfQcFMCUZ1pVsM_VE,14777
  qubx/core/helpers.py,sha256=gPE78dO718NBY0-JbfqNGCzIvr4BVatFntNIy2RUrEY,11559
  qubx/core/loggers.py,sha256=HpgavBZegoDv9ssihtqX0pitXKULVAPHUpoE_volJw0,11910
  qubx/core/lookups.py,sha256=4aEC7b2AyEXFqHHGDenex3Z1FZGrpDSb8IwzBZrSqIA,13688
- qubx/core/series.cpython-311-x86_64-linux-gnu.so,sha256=h6lIxJgxb5nl0AxZmdg28hv45xkiJHnP_aaj7NQOIzY,698952
+ qubx/core/series.cpython-311-x86_64-linux-gnu.so,sha256=lwzgrbIdmfZiMopIPJkOlh3tzcS-zIbBudvqXnduwdU,698952
  qubx/core/series.pxd,sha256=IS89NQ5FYp3T0YIHe1lELKZIAKrNvX8K6WlLyac44I4,2847
  qubx/core/series.pyx,sha256=WEAjn4j3zn540Cxx68X5gRXilvwa7NGdbki6myzZbIM,28108
  qubx/core/strategy.py,sha256=Fs4fFyHaEGYuz7mBeQHBWFu3Ipg0yFzcxXhskgsPxJE,30330
- qubx/core/utils.cpython-311-x86_64-linux-gnu.so,sha256=sciJD2GxzxV13hmsIZGiVTpkdgVZQWAuGD8IF6fKju0,74216
+ qubx/core/utils.cpython-311-x86_64-linux-gnu.so,sha256=u2eUKlnC06lHG3LTg-92Om_32Mk7ZGAleV7As7--G1U,74216
  qubx/core/utils.pyx,sha256=6dQ8R02bl8V3f-W3Wk9-e86D9OvDz-5-4NA_dlF_xwc,1368
- qubx/data/readers.py,sha256=H2uEjp6DPKk5rRel6_dQM9HkdH4gK3NGhuA-sKoZBso,20161
+ qubx/data/readers.py,sha256=r5_DhzyaTMNGHr9sDjbIgK2kMcSC8fHYeDrb2ep1NLU,19648
  qubx/impl/ccxt_connector.py,sha256=NqF-tgxfTATnmVqKUonNXCAzECrDU8YrgqM3Nq06fw8,9150
  qubx/impl/ccxt_customizations.py,sha256=kK_4KmOyKvDVgd4MTkVg4CyqdjE-6r41siZIvLj-A-Q,3488
  qubx/impl/ccxt_trading.py,sha256=cmg4P-zd78w-V8j3-IGS2LFxikGhxFPgmCvz3sr065Q,9097
@@ -20,7 +20,7 @@ qubx/impl/ccxt_utils.py,sha256=n6nicE2C_7tVF3soQJYgK0Er0hZrfY0NnN5c84vGiZc,3565
  qubx/math/__init__.py,sha256=AavTKCtU7gRffG9T10Z0uv4LdI31bVvBn-L_Iv81FRk,33
  qubx/math/stats.py,sha256=LnZZFe_3_vj1yW-wcQdtOmI9t5yGkiYfLWa4kVFXkjA,1176
  qubx/ta/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- qubx/ta/indicators.cpython-311-x86_64-linux-gnu.so,sha256=ii-VbImkMv_82oTXfOmlQCit5qONcjUq8wO9ZLtsN1I,284552
+ qubx/ta/indicators.cpython-311-x86_64-linux-gnu.so,sha256=skr9RSXP2ypQ3oFXzrOiMx4iUhuLfvhVAb3KRaYPoSc,284552
  qubx/ta/indicators.pyx,sha256=P-GEYUks2lSHo6hbtUFAB7TWE1AunjLR4jIjwqPHrwU,7708
  qubx/trackers/__init__.py,sha256=1y_yvIy0OQwBqfhAW_EY33NxFzFSWvI0qNAPU6zchYc,60
  qubx/trackers/rebalancers.py,sha256=QCzANCooZBi2VMCBjjCPMq_Dt1h1zrBelATnfmVve74,5522
@@ -32,6 +32,6 @@ qubx/utils/misc.py,sha256=bK9cqNKIt_qER8FnSs23L3RMSVhnJIZ5n4tyLNm5n3s,9837
  qubx/utils/pandas.py,sha256=8gf0hgrkRfuOOiANZxKcSPgj8-KL9FlVlfSvNrCar6A,18605
  qubx/utils/runner.py,sha256=ZUk7jgqx3JYUDZ_ZJLEZv0ug3m2Da-c4Ud2CwfOvC8Q,9277
  qubx/utils/time.py,sha256=mdQ02PGoUBm9iH_wvFIhAhOkBoJOpO24ZanWcGU8oms,4884
- qubx-0.1.3.dist-info/METADATA,sha256=pBztpBxZvNeW0Usz4jVxz3jeeMgaCZY2o0G8lQ6IKzQ,2144
- qubx-0.1.3.dist-info/WHEEL,sha256=MLOa6LysROdjgj4FVxsHitAnIh8Be2D_c9ZSBHKrz2M,110
- qubx-0.1.3.dist-info/RECORD,,
+ qubx-0.1.4.dist-info/METADATA,sha256=GfHFjg69n066xlpMMnpOyMAYlhXjVsyqpTlzUzzFguc,2144
+ qubx-0.1.4.dist-info/WHEEL,sha256=MLOa6LysROdjgj4FVxsHitAnIh8Be2D_c9ZSBHKrz2M,110
+ qubx-0.1.4.dist-info/RECORD,,