forex_data_aggregator 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- forex_data/__init__.py +92 -0
- forex_data/config/__init__.py +20 -0
- forex_data/config/config_file.py +89 -0
- forex_data/data_management/__init__.py +84 -0
- forex_data/data_management/common.py +1773 -0
- forex_data/data_management/database.py +1322 -0
- forex_data/data_management/historicaldata.py +1262 -0
- forex_data/data_management/realtimedata.py +993 -0
- forex_data_aggregator-0.1.2.dist-info/LICENSE +21 -0
- forex_data_aggregator-0.1.2.dist-info/METADATA +562 -0
- forex_data_aggregator-0.1.2.dist-info/RECORD +12 -0
- forex_data_aggregator-0.1.2.dist-info/WHEEL +4 -0
@@ -0,0 +1,1773 @@
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 30 09:23:19 2022

@author: fiora
"""

__all__ = [
    'YEARS',
    'MONTHS',
    'DATE_FORMAT_SQL',
    'DATE_FORMAT_HISTDATA_CSV',
    'HISTDATA_URL_TICKDATA_TEMPLATE',
    'HISTDATA_BASE_DOWNLOAD_METHOD',
    'HISTDATA_BASE_DOWNLOAD_URL',
    'DEFAULT_PATHS',
    'DATA_TYPE',
    'BASE_DATA_COLUMN_NAME',
    'DATA_FILE_COLUMN_INDEX',
    'SUPPORTED_DATA_FILES',
    'SUPPORTED_DATA_ENGINES',
    'ASSET_TYPE',
    'TEMP_FOLDER',
    'TEMP_CSV_FILE',
    'DTYPE_DICT',
    'PYARROW_DTYPE_DICT',
    'POLARS_DTYPE_DICT',
    'DATA_COLUMN_NAMES',
    'FILENAME_TEMPLATE',
    'DATA_KEY',
    'TICK_TIMEFRAME',
    'FILENAME_STR',
    'REALTIME_DATA_PROVIDER',
    'ALPHA_VANTAGE_API_KEY',
    'CANONICAL_INDEX',
    'DATE_NO_HOUR_FORMAT',
    'POLYGON_IO_API_KEY',
    'AV_LIST_URL',
    'PAIR_ALPHAVANTAGE_FORMAT',
    'PAIR_POLYGON_FORMAT',
    'SQL_COMPARISON_OPERATORS',
    'SUPPORTED_SQL_COMPARISON_OPERATORS',
    'SUPPORTED_BASE_DATA_COLUMN_NAME',
    'SQL_CONDITION_AGGREGATION_MODES',
    'SUPPORTED_SQL_CONDITION_AGGREGATION_MODES',

    'validator_file_path',
    'validator_dir_path',
    'get_attrs_names',
    'check_time_offset_str',
    'check_timeframe_str',
    'any_date_to_datetime64',
    'empty_dataframe',
    'is_empty_dataframe',
    'shape_dataframe',
    'get_dataframe_column',
    'get_dataframe_row',
    'get_dataframe_element',
    'get_dotty_leafs',
    'astype',
    'read_csv',
    'polars_datetime',
    'sort_dataframe',
    'concat_data',
    'list_remove_duplicates',
    'get_dotty_key_field',
    'reframe_data',
    'write_csv',
    'write_parquet',
    'read_parquet',
    'to_pandas_dataframe',
    'get_pair_symbols',
    'to_source_symbol',
    'get_date_interval',
    'polygon_agg_to_dict',
    'validator_list_timeframe',
    'get_histdata_tickers',
    'TickerNotFoundError',
    'TickerDataNotFoundError',
    'TickerDataBadTypeException',
    'TickerDataInvalidException'
]

from loguru import logger

from re import (
    fullmatch,
    findall,
    search
)

from typing import (
    cast,
    Any,
    List,
    Literal
)

import requests
from bs4 import BeautifulSoup

from datetime import (
    timedelta
)

# PANDAS
from pandas import (
    DataFrame as pandas_dataframe,
    concat as pandas_concat,
    Timestamp,
    isnull,
    bdate_range,
    to_datetime,
    Timedelta,
    read_parquet as pandas_read_parquet,
    read_csv as pandas_read_csv
)

from pandas.api.types import is_datetime64_any_dtype
from pandas.tseries.frequencies import to_offset
from pandas.tseries.offsets import DateOffset

# PYARROW
from pyarrow import (
    float32 as pyarrow_float32,
    timestamp as pyarrow_timestamp,
    schema as pyarrow_schema,
    Table,
    table as pyarrow_table,
    concat_tables,
    csv as arrow_csv
)

from pyarrow.parquet import (
    write_table,
    read_table
)

# POLARS
from polars import (
    Float32 as polars_float32,
    Datetime as polars_datetime,
    read_csv as polars_read_csv,
    concat as polars_concat,
    col,
    len as polars_len,
    read_parquet as polars_read_parquet,
    from_arrow,
    DataFrame as polars_dataframe,
    LazyFrame as polars_lazyframe,
    scan_csv as polars_scan_csv,
    scan_parquet as polars_scan_parquet
)

# POLYGON real time provider
from polygon.rest.models.aggs import (
    Agg as polygon_agg
)

from dateutil.rrule import (
    rrule,
    DAILY,
    MO,
    TU,
    WE,
    TH,
    FR
)

from datetime import datetime

from pathlib import Path

from attrs import (
    field,
    validators
)


# =============================================================================
# CUSTOM EXCEPTIONS
# =============================================================================

# TickerNotFoundError:
# This exception is raised when the requested ticker is misspelled
# or does not exist in the database.
class TickerNotFoundError(Exception):
    pass


# TickerDataNotFoundError:
# This exception is raised when the ticker is found
# but data is not available or data retrieval failed.
class TickerDataNotFoundError(Exception):
    pass


# TickerDataBadTypeException:
# This exception is raised when the ticker data is found
# but its data type is not compliant with the expected type.
class TickerDataBadTypeException(Exception):
    pass


# TickerDataInvalidException:
# This exception is raised when the ticker data
# is not found or is invalid for generic reasons.
class TickerDataInvalidException(Exception):
    pass


# common functions, constants and templates
TEMP_FOLDER = "Temp"
TEMP_CSV_FILE = "Temp.csv"

HISTDATA_URL_TICKDATA_TEMPLATE = (
    'https://www.histdata.com/download-free-forex-historical-data/?/'
    'ascii/tick-data-quotes/{ticker}/{year}/{month_num}'
)

HISTDATA_URL_ONEMINDATA_TEMPLATE = (
    'http://www.histdata.com/download-free-forex-data/?/'
    'ascii/1-minute-bar-quotes/{pair}/{year}/{month_num}'
)

HISTDATA_BASE_DOWNLOAD_URL = "http://www.histdata.com/get.php"
HISTDATA_BASE_DOWNLOAD_METHOD = 'POST'

MONTHS = ['January', 'February', 'March', 'April', 'May', 'June',
          'July', 'August', 'September', 'October', 'November', 'December']
# years with data available, up to but excluding the current year
YEARS = list(range(2001, datetime.now().year, 1))


DATE_NO_HOUR_FORMAT = '%Y-%m-%d'
DATE_FORMAT_ISO8601 = 'ISO8601'
DATE_FORMAT_SQL = '%Y-%m-%d %H:%M:%S.%f'
DATE_FORMAT_HISTDATA_CSV = '%Y%m%d %H%M%S%f'

# DATA_KEY_TEMPLATE_STR = '{ticker}.Y{year}.{tf}'
# DATA_KEY_TEMPLATE_PATTERN = '^[A-Za-z]+.Y[0-9]+.[A-Za-z0-9]+'
# FILENAME_STR = '{ticker}_Y{year}_{tf}.{file_ext}'
DATA_KEY_TEMPLATE_STR = '{market}.{ticker}.{tf}'
# regex mirror of DATA_KEY_TEMPLATE_STR, with one capture group per field
# so that fullmatch(...).groups() yields (market, ticker, tf)
DATA_KEY_TEMPLATE_PATTERN = r'^([A-Za-z0-9]+)\.([A-Za-z]+)\.([A-Za-z0-9]+)$'
FILENAME_STR = '{market}_{ticker}_{tf}.{file_ext}'
DEFAULT_TIMEZONE = 'utc'
TICK_TIMEFRAME = 'tick'

# ticker PAIR of forex market
SINGLE_CURRENCY_PATTERN_STR = '[A-Za-z]{3}'
TICKER_PATTERN = '^' + SINGLE_CURRENCY_PATTERN_STR \
    + SINGLE_CURRENCY_PATTERN_STR + '$'
PAIR_GENERIC_FORMAT = '{TO}/{FROM}'

# ALPHAVANTAGE
PAIR_ALPHAVANTAGE_FORMAT = '{TO}/{FROM}'
PAIR_ALPHAVANTAGE_PATTERN = '^' + SINGLE_CURRENCY_PATTERN_STR + '/' \
    + SINGLE_CURRENCY_PATTERN_STR + '$'
ALPHA_VANTAGE_API_KEY = 'ALPHA_VANTAGE_API_KEY'
AV_LIST_URL = (
    'https://www.alphavantage.co/query?'
    'function=LISTING_STATUS&apikey={api_key}'
)

# PAIR POLYGON IO
PAIR_POLYGON_FORMAT = 'C:{TO}{FROM}'
PAIR_POLYGON_PATTERN = '^C:' + SINGLE_CURRENCY_PATTERN_STR + \
    SINGLE_CURRENCY_PATTERN_STR + '$'
POLYGON_IO_API_KEY = 'POLYGON_IO_API_KEY'

# TIME PATTERN
TIME_WINDOW_PATTERN_STR = '^[-+]?[0-9]+[A-Za-z]{1,}$'
TIME_WINDOW_COMPONENTS_PATTERN_STR = '^[-+]?[0-9]+|[A-Za-z]{1,}$'
TIME_WINDOW_UNIT_PATTERN_STR = '[A-Za-z]{1,}$'
GET_YEAR_FROM_TICK_KEY_PATTERN_STR = '^[A-Za-z].Y[0-9].TICK'
YEAR_FIELD_PATTERN_STR = '^Y([0-9]{4,})$'

# auxiliary CONSTANT DEFINITIONS

# dotty key template: <market>.<ticker>.<timeframe>


class DATA_KEY:

    MARKET = 0
    TICKER_INDEX = 1
    TF_INDEX = 2


# filename template: <ticker>_Y<year>_<timeframe>.<filetype>
class FILENAME_TEMPLATE:

    TICKER_INDEX = 0
    YEAR_INDEX = 1
    YEAR_NUMERICAL_CHAR = 1
    TF_INDEX = 2
    FILETYPE_INDEX = 3
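

# Illustrative sketch of how the templates above compose; the concrete key
# 'FOREX.EURUSD.1h' is an assumed example, not a value shipped with the package.
def _example_key_templates():
    key = DATA_KEY_TEMPLATE_STR.format(market='FOREX', ticker='EURUSD', tf='1h')
    assert key == 'FOREX.EURUSD.1h'
    # DATA_KEY holds the positional index of each dot-separated field
    assert key.split('.')[DATA_KEY.TICKER_INDEX] == 'EURUSD'
    filename = FILENAME_STR.format(market='FOREX', ticker='EURUSD',
                                   tf='1h', file_ext='parquet')
    assert filename == 'FOREX_EURUSD_1h.parquet'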


class DEFAULT_PATHS:

    BASE_PATH = str(Path.home() / '.database')
    HIST_DATA_FOLDER = 'HistoricalData'
    REALTIME_DATA_FOLDER = 'RealtimeData'


class DATA_TYPE:

    CSV_FILETYPE = 'csv'
    PARQUET_FILETYPE = 'parquet'
    DUCKDB = 'duckdb'


class DATA_FILE_COLUMN_INDEX:

    TIMESTAMP = 0


SUPPORTED_DATA_FILES = [
    DATA_TYPE.CSV_FILETYPE,
    DATA_TYPE.PARQUET_FILETYPE,
    DATA_TYPE.DUCKDB
]

# supported dataframe engines:
# pyarrow is listed, but a reframe operation performed entirely in pyarrow
# is not yet available, so reframe_data() on a pyarrow Table is currently
# masked by a reframe call through polars
SUPPORTED_DATA_ENGINES = [
    'pandas',
    'pyarrow',
    'polars',
    'polars_lazy'
]

# SINGLE BASE DATA COMPOSITION TEMPLATE: ['open', 'close', 'high', 'low']
# with datetime/timestamp as index;
# column names for TICK and timeframe-filtered dataframes,
# OHLC and related column names


class DATA_COLUMN_NAMES:

    TICK_DATA_NO_PVALUE = ['timestamp', 'ask', 'bid', 'vol']
    TICK_DATA = ['timestamp', 'ask', 'bid', 'vol', 'p']
    TF_DATA = ['timestamp', 'open', 'high', 'low', 'close']
    TICK_DATA_TIME_INDEX = ['ask', 'bid', 'vol', 'p']
    TF_DATA_TIME_INDEX = ['open', 'high', 'low', 'close']
    POLYGON_IO_AGGS = ['open', 'high', 'low', 'close', 'volume', 'vwap',
                       'timestamp', 'transactions']


# SELECTED AS SINGLE BASE DATA COMPOSITION TEMPLATE
BASE_DATA = DATA_COLUMN_NAMES.TF_DATA_TIME_INDEX
BASE_DATA_WITH_TIME = DATA_COLUMN_NAMES.TF_DATA


class REALTIME_DATA_PROVIDER:

    ALPHA_VANTAGE = 'ALPHA_VANTAGE'
    POLYGON_IO = 'POLYGON-IO'


REALTIME_DATA_PROVIDER_LIST = [REALTIME_DATA_PROVIDER.ALPHA_VANTAGE,
                               REALTIME_DATA_PROVIDER.POLYGON_IO]


class DB_MODE:

    FULL_MODE = 'FULL_MODE'
    HISTORICAL_MODE = 'HISTORICAL_MODE'
    REALTIME_MODE = 'REALTIME_MODE'


class ASSET_TYPE:

    STOCK = 'STOCK'
    ETF = 'ETF'
    FOREX = 'FOREX'


class BASE_DATA_COLUMN_NAME:

    TIMESTAMP = 'timestamp'
    OPEN = 'open'
    HIGH = 'high'
    LOW = 'low'
    CLOSE = 'close'
    ASK = 'ask'
    BID = 'bid'
    VOL = 'vol'
    P_VALUE = 'p'
    TRANSACTIONS = 'transactions'
    VWAP = 'vwap'
    OTC = 'otc'


SUPPORTED_BASE_DATA_COLUMN_NAME = Literal[
    BASE_DATA_COLUMN_NAME.TIMESTAMP,
    BASE_DATA_COLUMN_NAME.OPEN,
    BASE_DATA_COLUMN_NAME.HIGH,
    BASE_DATA_COLUMN_NAME.LOW,
    BASE_DATA_COLUMN_NAME.CLOSE,
    BASE_DATA_COLUMN_NAME.ASK,
    BASE_DATA_COLUMN_NAME.BID,
    BASE_DATA_COLUMN_NAME.VOL,
    BASE_DATA_COLUMN_NAME.P_VALUE,
    BASE_DATA_COLUMN_NAME.TRANSACTIONS,
    BASE_DATA_COLUMN_NAME.VWAP,
    BASE_DATA_COLUMN_NAME.OTC
]


class CANONICAL_INDEX:

    AV_LATEST_DATA_INDEX = 0
    AV_DF_DATA_INDEX = 0
    AV_DICT_INFO_INDEX = 1


class SQL_COMPARISON_OPERATORS:

    GREATER_THAN = '>'
    LESS_THAN = '<'
    GREATER_THAN_OR_EQUAL = '>='
    LESS_THAN_OR_EQUAL = '<='
    EQUAL = '=='
    NOT_EQUAL = '!='


SUPPORTED_SQL_COMPARISON_OPERATORS = Literal[
    SQL_COMPARISON_OPERATORS.GREATER_THAN,
    SQL_COMPARISON_OPERATORS.LESS_THAN,
    SQL_COMPARISON_OPERATORS.GREATER_THAN_OR_EQUAL,
    SQL_COMPARISON_OPERATORS.LESS_THAN_OR_EQUAL,
    SQL_COMPARISON_OPERATORS.EQUAL,
    SQL_COMPARISON_OPERATORS.NOT_EQUAL
]


class SQL_CONDITION_AGGREGATION_MODES:

    AND = 'AND'
    OR = 'OR'


SUPPORTED_SQL_CONDITION_AGGREGATION_MODES = Literal[
    SQL_CONDITION_AGGREGATION_MODES.AND,
    SQL_CONDITION_AGGREGATION_MODES.OR
]


# auxiliary functions

# get elements from a db key
def get_db_key_elements(key):

    # match against the regex pattern (not the format template),
    # so that .groups() returns the (market, ticker, tf) fields
    res = fullmatch(DATA_KEY_TEMPLATE_PATTERN, key)

    if res:
        return res.groups()
    else:
        logger.error(
            f'key {key} does not respect regex template {DATA_KEY_TEMPLATE_PATTERN}')
        raise ValueError
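

# Minimal usage sketch for the key helper; 'FOREX.EURUSD.1h' is an assumed
# example key following DATA_KEY_TEMPLATE_STR.
def _example_get_db_key_elements():
    market, ticker, tf = get_db_key_elements('FOREX.EURUSD.1h')
    assert (market, ticker, tf) == ('FOREX', 'EURUSD', '1h')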


# parse argument to get a datetime object, with the date format as input
def infer_date_from_format_dt(s, date_format='ISO8601', unit=None, utc=False):

    if unit:
        return to_datetime(s,
                           unit=unit,
                           utc=utc)
    else:
        return to_datetime(s,
                           format=date_format,
                           utc=utc)


# parse a timeframe string and validate it,
# following pandas DateOffset freqstr rules, plus the special value
# 'tick' (= lowest timeframe available);
# link to the official pandas doc:
# https://pandas.pydata.org/docs/user_guide/timeseries.html#dateoffset-objects
# TODO: add compatibility with polars frequency strings

def check_timeframe_str(tf):

    check = False

    # accept the tick timeframe regardless of case ('TICK'/'tick'),
    # consistent with the TICK_TIMEFRAME comparison in reframe_data()
    if tf.lower() == TICK_TIMEFRAME:
        check = True
    else:
        try:
            check = (
                isinstance(to_offset(tf), DateOffset) or
                isinstance(Timedelta(tf).to_pytimedelta(),
                           timedelta)
            )
        except ValueError:
            logger.critical(f"Type check: Invalid timeframe: {tf}")
            raise

    if check:
        return tf
    else:
        logger.critical(f"Type check: Invalid timeframe "
                        f"conversion to timedelta: {tf}")
        raise ValueError
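

# Illustrative check (assumes pandas offset aliases): valid timeframe
# strings pass through unchanged, the tick timeframe is accepted as-is.
def _example_check_timeframe_str():
    assert check_timeframe_str('1h') == '1h'
    assert check_timeframe_str('15min') == '15min'
    assert check_timeframe_str('tick') == 'tick'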


# PAIR symbol functions
def get_pair_symbols(ticker):

    components = findall(SINGLE_CURRENCY_PATTERN_STR, ticker)

    if len(components) == 2:
        return components[0], components[1]
    else:
        return None


def check_symbol(symbol, source):

    if source == REALTIME_DATA_PROVIDER.ALPHA_VANTAGE:

        if fullmatch(PAIR_ALPHAVANTAGE_PATTERN, symbol):
            return True
        else:
            return False

    elif source == REALTIME_DATA_PROVIDER.POLYGON_IO:

        # match against the polygon regex pattern:
        # the 'C:{TO}{FROM}' format string is not a valid regex
        if fullmatch(PAIR_POLYGON_PATTERN, symbol):
            return True
        else:
            return False

    else:

        # default: fall back to the polygon-style pattern
        if fullmatch(PAIR_POLYGON_PATTERN, symbol):
            return True
        else:
            return False


def to_source_symbol(ticker, source):

    to_symbol, from_symbol = get_pair_symbols(ticker)

    if source == REALTIME_DATA_PROVIDER.ALPHA_VANTAGE:
        return PAIR_ALPHAVANTAGE_FORMAT.format(TO=to_symbol,
                                               FROM=from_symbol)
    elif source == REALTIME_DATA_PROVIDER.POLYGON_IO:
        return PAIR_POLYGON_FORMAT.format(TO=to_symbol,
                                          FROM=from_symbol)
    else:
        return PAIR_GENERIC_FORMAT.format(TO=to_symbol,
                                          FROM=from_symbol)
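

# Provider formatting sketch: 'EURUSD' is split into ('EUR', 'USD') and
# rendered per provider; the pair value is an assumed example.
def _example_to_source_symbol():
    assert to_source_symbol('EURUSD', REALTIME_DATA_PROVIDER.ALPHA_VANTAGE) == 'EUR/USD'
    assert to_source_symbol('EURUSD', REALTIME_DATA_PROVIDER.POLYGON_IO) == 'C:EURUSD'
    assert check_symbol('C:EURUSD', REALTIME_DATA_PROVIDER.POLYGON_IO)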


# TIMESTAMP RELATED FUNCTIONS

def check_time_offset_str(timeoffset_str):

    # TODO: add support for polars time/date offset
    return isinstance(to_offset(timeoffset_str), DateOffset)


def timewindow_str_to_timedelta(time_window_str):

    if fullmatch(TIME_WINDOW_PATTERN_STR, time_window_str):
        return Timedelta(time_window_str)
    else:
        logger.error('time window pattern does not match: '
                     'expected a "<integer_multiplier><unit>" str')
        raise ValueError
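

# Time-window parsing sketch; '30d' is an assumed example window string.
def _example_timewindow_str_to_timedelta():
    assert timewindow_str_to_timedelta('30d') == Timedelta(days=30)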


def any_date_to_datetime64(any_date,
                           date_format='ISO8601',
                           unit=None,
                           to_pydatetime=False):

    try:
        any_date = infer_date_from_format_dt(any_date,
                                             date_format,
                                             unit=unit)
        if to_pydatetime:
            any_date = any_date.to_pydatetime()

    except Exception:
        logger.error(f'date {any_date} conversion failed, '
                     f'failed conversion to {date_format} '
                     'date format')
        raise

    # =============================================================================
    # TODO: is a utc timezone necessary when the source is naive?
    # if not any_date.tzinfo:
    #
    #     any_date = any_date.tz_localize('utc')
    # =============================================================================

    return any_date


def get_date_interval(start=None,
                      end=None,
                      interval_start_mode=None,
                      interval_end_mode='now',
                      interval_timespan=None,
                      freq=None,
                      normalize=False,
                      bdays=False):

    # create start and end dates as Timestamp instances
    start_date = Timestamp(start)
    end_date = Timestamp(end)

    if interval_timespan:

        # a variety of interval modes could be implemented

        # 'now' - the end of the date interval is the current timestamp
        if interval_end_mode == 'now':

            end_date = Timestamp.now()
            start_date = end_date - timewindow_str_to_timedelta(interval_timespan)

        if bdays:

            components = findall(TIME_WINDOW_COMPONENTS_PATTERN_STR,
                                 interval_timespan)

            # fixed-days redundancy check, available only when a 'd' type
            # timespan is requested
            if components[1] == 'd':

                days_list = list(
                    rrule(freq=DAILY,
                          dtstart=start_date,
                          until=end_date,
                          byweekday=(MO, TU, WE, TH, FR))
                )

                # widen the interval until it contains the requested
                # number of business days
                while len(days_list) < int(components[0]):

                    start_date = start_date - Timedelta(days=1)

                    days_list = list(
                        rrule(freq=DAILY,
                              dtstart=start_date,
                              until=end_date,
                              byweekday=(MO, TU, WE, TH, FR))
                    )

    # the Timestamp() constructor ensures these are Timestamp objects
    if normalize:

        if not isnull(start_date):
            start_date = Timestamp.normalize(start_date)

        if not isnull(end_date):
            end_date = Timestamp.normalize(end_date)

    start_date = any_date_to_datetime64(start_date)
    end_date = any_date_to_datetime64(end_date)

    # generate a DatetimeIndex if freq is set,
    # otherwise return just the start and end of the interval
    if freq:

        bdate_dtindex = bdate_range(start=start_date,
                                    end=end_date,
                                    freq=freq,
                                    tz=None,
                                    normalize=normalize,
                                    name='timestamp'
                                    )

        return start_date, end_date, bdate_dtindex

    else:

        return start_date, end_date
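

# Interval sketch: a '7d' lookback ending now; the window string is an
# assumed example value.
def _example_get_date_interval():
    start, end = get_date_interval(interval_end_mode='now',
                                   interval_timespan='7d')
    assert (end - start) == Timedelta('7d')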


# BASE OPERATIONS WITH DATAFRAMES,
# depending on the dataframe engine support;
# for supported engines see SUPPORTED_DATA_ENGINES

# DATA ENGINE TYPE DICTIONARIES
class DTYPE_DICT:

    TICK_DTYPE = {'ask': 'float32',
                  'bid': 'float32',
                  'vol': 'float32',
                  'p': 'float32'}
    TF_DTYPE = {'open': 'float32',
                'high': 'float32',
                'low': 'float32',
                'close': 'float32'}
    TIME_TICK_DTYPE = {'timestamp': 'datetime64[ms]',
                       'ask': 'float32',
                       'bid': 'float32',
                       'vol': 'float32',
                       'p': 'float32'}
    TIME_TF_DTYPE = {'timestamp': 'datetime64[ms]',
                     'open': 'float32',
                     'high': 'float32',
                     'low': 'float32',
                     'close': 'float32'}


class PYARROW_DTYPE_DICT:

    TICK_DTYPE = {'ask': pyarrow_float32(),
                  'bid': pyarrow_float32(),
                  'vol': pyarrow_float32(),
                  'p': pyarrow_float32()}
    TF_DTYPE = {'open': pyarrow_float32(),
                'high': pyarrow_float32(),
                'low': pyarrow_float32(),
                'close': pyarrow_float32()}
    TIME_TICK_DTYPE = {'timestamp': pyarrow_timestamp('ms'),
                       'ask': pyarrow_float32(),
                       'bid': pyarrow_float32(),
                       'vol': pyarrow_float32(),
                       'p': pyarrow_float32()}
    TIME_TF_DTYPE = {'timestamp': pyarrow_timestamp('ms'),
                     'open': pyarrow_float32(),
                     'high': pyarrow_float32(),
                     'low': pyarrow_float32(),
                     'close': pyarrow_float32()}


class POLARS_DTYPE_DICT:

    TICK_DTYPE = {'ask': polars_float32,
                  'bid': polars_float32,
                  'vol': polars_float32,
                  'p': polars_float32}
    TF_DTYPE = {'open': polars_float32,
                'high': polars_float32,
                'low': polars_float32,
                'close': polars_float32}
    TIME_TICK_DTYPE = {'timestamp': polars_datetime('ms'),
                       'ask': polars_float32,
                       'bid': polars_float32,
                       'vol': polars_float32,
                       'p': polars_float32}
    TIME_TF_DTYPE = {'timestamp': polars_datetime('ms'),
                     'open': polars_float32,
                     'high': polars_float32,
                     'low': polars_float32,
                     'close': polars_float32}

# DATA ENGINE FUNCTIONS


def empty_dataframe(engine):

    if engine == 'pandas':
        return pandas_dataframe()
    elif engine == 'pyarrow':
        # build from an empty mapping: constructing from an empty list
        # would require column names to be passed alongside the arrays
        return pyarrow_table({})
    elif engine == 'polars':
        return polars_dataframe()
    elif engine == 'polars_lazy':
        return polars_lazyframe()
    else:
        logger.error('function empty_dataframe not available'
                     f' for engine {engine}')
        raise ValueError


def is_empty_dataframe(dataframe):

    if isinstance(dataframe, pandas_dataframe):
        return dataframe.empty
    elif isinstance(dataframe, Table):
        return (not bool(dataframe))
    elif isinstance(dataframe, polars_dataframe):
        return dataframe.is_empty()
    elif isinstance(dataframe, polars_lazyframe):
        return dataframe.collect().is_empty()
    else:
        logger.error('function is_empty_dataframe not available'
                     ' for instance of type'
                     f' {type(dataframe)}')
        raise ValueError
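

# Engine-dispatch sketch: each supported engine yields an empty frame that
# is_empty_dataframe() recognizes (pyarrow is left out here only because its
# empty-table construction above is an assumption).
def _example_empty_dataframe():
    for engine in ('pandas', 'polars', 'polars_lazy'):
        assert is_empty_dataframe(empty_dataframe(engine))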


def shape_dataframe(dataframe):

    if isinstance(dataframe, pandas_dataframe):
        return dataframe.shape
    elif isinstance(dataframe, Table):
        return dataframe.shape
    elif isinstance(dataframe, polars_dataframe):
        return dataframe.shape
    elif isinstance(dataframe, polars_lazyframe):
        return (
            dataframe.select(polars_len()).collect().item(0, 0),
            dataframe.collect_schema().len()
        )
    else:
        logger.error('function shape_dataframe not available'
                     ' for instance of type'
                     f' {type(dataframe)}')
        raise ValueError


def sort_dataframe(dataframe, column):

    if isinstance(dataframe, pandas_dataframe):
        return dataframe.sort_values(by=[column])
    elif isinstance(dataframe, Table):
        return dataframe.sort_by(column)
    elif isinstance(dataframe, polars_dataframe):
        return dataframe.sort(column, nulls_last=True)
    elif isinstance(dataframe, polars_lazyframe):
        return dataframe.sort(column, nulls_last=True)
    else:
        logger.error('function sort_dataframe not available'
                     ' for instance of type'
                     f' {type(dataframe)}')
        raise ValueError


def get_dataframe_column(dataframe, column):

    if isinstance(dataframe, pandas_dataframe):
        return dataframe[column]
    elif isinstance(dataframe, Table):
        return dataframe[column]
    elif isinstance(dataframe, polars_dataframe):
        return dataframe[column]
    elif isinstance(dataframe, polars_lazyframe):
        return dataframe.select(column).collect()
    else:
        logger.error('function get_dataframe_column not available'
                     ' for instance of type'
                     f' {type(dataframe)}')
        raise ValueError


def get_dataframe_row(dataframe, row):

    if isinstance(dataframe, pandas_dataframe):
        return dataframe.loc[row]
    elif isinstance(dataframe, Table):
        return dataframe.slice(row, 1)
    elif isinstance(dataframe, polars_dataframe):
        return dataframe.slice(row, 1)
    elif isinstance(dataframe, polars_lazyframe):
        return dataframe.slice(row, 1)
    else:
        logger.error('function get_dataframe_row not available'
                     ' for instance of type'
                     f' {type(dataframe)}')
        raise ValueError


def get_dataframe_element(dataframe, column, row):

    if isinstance(dataframe, pandas_dataframe):
        return dataframe[column][row]
    elif isinstance(dataframe, Table):
        return dataframe[column][row]
    elif isinstance(dataframe, polars_dataframe):
        return dataframe[column][row]
    elif isinstance(dataframe, polars_lazyframe):
        return dataframe.select(column).collect().item(row, 0)
    else:
        logger.error('function get_dataframe_element not available'
                     ' for instance of type'
                     f' {type(dataframe)}')
        raise ValueError


def dtype_dict_to_pyarrow_schema(dtype_dict):

    return pyarrow_schema(dtype_dict.items())


def astype(dataframe, dtype_dict):

    if isinstance(dataframe, pandas_dataframe):
        return dataframe.astype(dtype_dict)
    elif isinstance(dataframe, Table):
        return dataframe.cast(dtype_dict_to_pyarrow_schema(dtype_dict))
    elif isinstance(dataframe, polars_dataframe):
        return dataframe.cast(dtype_dict)
    elif isinstance(dataframe, polars_lazyframe):
        return dataframe.cast(dtype_dict)
    else:
        logger.error('function astype not available'
                     ' for instance of type'
                     f' {type(dataframe)}')
        raise ValueError
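

# Casting sketch using the polars dtype map defined above; the one-row
# frame is an assumed example.
def _example_astype():
    frame = polars_dataframe({'open': [1.0], 'high': [1.1],
                              'low': [0.9], 'close': [1.0]})
    typed = astype(frame, POLARS_DTYPE_DICT.TF_DTYPE)
    assert all(dtype == polars_float32 for dtype in typed.dtypes)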


def read_parquet(engine, filepath):

    if engine == 'pandas':
        return pandas_read_parquet(filepath)
    elif engine == 'pyarrow':
        return read_table(filepath)
    elif engine == 'polars':
        return polars_read_parquet(filepath)
    elif engine == 'polars_lazy':
        return polars_scan_parquet(filepath)
    else:
        logger.error('function read_parquet not available'
                     f' for engine {engine}')
        raise ValueError


def write_parquet(dataframe, filepath):

    if isinstance(dataframe, pandas_dataframe):
        try:
            dataframe.to_parquet(filepath, index=True)
        except Exception as e:
            logger.exception(f'pandas write parquet failed: {e}')
            raise
    elif isinstance(dataframe, Table):
        try:
            write_table(dataframe, filepath)
        except Exception as e:
            logger.exception(f'pyarrow write parquet failed: {e}')
            raise
    elif isinstance(dataframe, polars_dataframe):
        try:
            dataframe.write_parquet(filepath)
        except Exception as e:
            logger.exception(f'polars write parquet failed: {e}')
            raise
    elif isinstance(dataframe, polars_lazyframe):
        try:
            dataframe.sink_parquet(filepath)
            # alternative to sink_parquet():
            # dataframe.collect(streaming=False).write_parquet(filepath)
        except Exception as e:
            logger.exception('polars lazyframe sink '
                             f'parquet failed: {e}')
            raise
    else:
        logger.error('function write_parquet not available'
                     ' for instance of type'
                     f' {type(dataframe)}')
        raise ValueError


def read_csv(engine, file, **kwargs):

    if engine == 'pandas':
        return pandas_read_csv(file, **kwargs)
    elif engine == 'pyarrow':
        return arrow_csv.read_csv(file, **kwargs)
    elif engine == 'polars':
        return polars_read_csv(file, **kwargs)
    elif engine == 'polars_lazy':
        return polars_scan_csv(file, **kwargs)
    else:
        logger.error('function read_csv not available'
                     f' for engine {engine}')
        raise ValueError


def write_csv(dataframe, file, **kwargs):

    if isinstance(dataframe, pandas_dataframe):
        try:
            # IMPORTANT
            # pandas dataframe case:
            # avoid the date_format parameter, since it is reported to make
            # to_csv excessively slow when a column holds datetime data
            # see: https://github.com/pandas-dev/pandas/issues/37484
            # https://stackoverflow.com/questions/65903287/pandas-1-2-1-to-csv-performance-with-datetime-as-the-index-and-setting-date-form
            dataframe.to_csv(file,
                             header=True,
                             **kwargs)
        except Exception as e:
            logger.exception(f'Error writing csv file {file}'
                             f' with data type {type(dataframe)}: {e}')
            raise IOError
    elif isinstance(dataframe, Table):
        try:
            arrow_csv.write_csv(dataframe, file, **kwargs)
        except Exception as e:
            logger.exception(f'Error writing csv file {file}'
                             f' with data type {type(dataframe)}: {e}')
            raise IOError
    elif isinstance(dataframe, polars_dataframe):
        try:
            dataframe.write_csv(file, **kwargs)
        except Exception as e:
            logger.exception(f'Error writing csv file {file}'
                             f' with data type {type(dataframe)}: {e}')
            raise IOError
    elif isinstance(dataframe, polars_lazyframe):
        try:
            dataframe.sink_csv(file, **kwargs)
        except Exception as e:
            logger.exception(f'Error writing csv file {file}'
                             f' with data type {type(dataframe)}: {e}')
            raise IOError
    else:
        logger.error('function write_csv not available'
                     ' for instance of type'
                     f' {type(dataframe)}')
        raise ValueError
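

# Round-trip sketch through a temporary file; the one-row quote frame is
# an assumed example.
def _example_csv_roundtrip():
    import os
    import tempfile

    frame = polars_dataframe({'timestamp': ['2022-04-30 09:00:00'],
                              'p': [1.05]})
    path = os.path.join(tempfile.mkdtemp(), 'quotes.csv')
    write_csv(frame, path)
    assert read_csv('polars', path).shape == (1, 2)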


def concat_data(data_list):

    if not isinstance(data_list, list):
        logger.error('required input as list')
        raise TypeError

    # assume the data type is unique per input,
    # so take the type from the first element
    if isinstance(data_list[0], pandas_dataframe):
        return pandas_concat(data_list,
                             ignore_index=False,
                             copy=False)
    elif isinstance(data_list[0], Table):
        return concat_tables(data_list)
    elif isinstance(data_list[0], polars_dataframe):
        return polars_concat(data_list, how='vertical')
    elif isinstance(data_list[0], polars_lazyframe):
        return polars_concat(data_list, how='vertical')
    else:
        logger.error('function concat not available'
                     ' for instance of type'
                     f' {type(data_list[0])}')
        raise ValueError
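

# Concatenation sketch with two single-row polars frames (assumed example
# values); the engine is inferred from the first element.
def _example_concat_data():
    a = polars_dataframe({'p': [1.05]})
    b = polars_dataframe({'p': [1.06]})
    assert concat_data([a, b]).shape == (2, 1)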


def to_pandas_dataframe(dataframe):

    # convert to a pandas dataframe;
    # useful for calls requiring pandas as input,
    # or for pandas functions not covered
    # by the other dataframe instances

    if isinstance(dataframe, pandas_dataframe):
        return dataframe
    elif isinstance(dataframe, Table):
        return dataframe.to_pandas()
    elif isinstance(dataframe, polars_dataframe):
        return dataframe.to_pandas(use_pyarrow_extension_array=True)
    elif isinstance(dataframe, polars_lazyframe):
        return dataframe.collect().to_pandas(use_pyarrow_extension_array=True)
    else:
        logger.error('function to_pandas() not available'
                     ' for instance of type'
                     f' {type(dataframe)}')
        raise ValueError


def reframe_data(dataframe, tf):
    '''
    Resample data to the target timeframe.

    Parameters
    ----------
    dataframe : pandas DataFrame, pyarrow Table,
        polars DataFrame or polars LazyFrame.
        Input data with either tick columns (timestamp, ask, bid, vol, p)
        or OHLC columns (timestamp, open, high, low, close).
    tf : str
        Target timeframe string (see check_timeframe_str).

    Raises
    ------
    ValueError
        If the input columns match neither the tick nor the OHLC template.

    Returns
    -------
    Dataframe
        Reframed data of the same engine type as the input.

    '''

    if is_empty_dataframe(dataframe):
        return dataframe

    if isinstance(dataframe, pandas_dataframe):

        # assert the timeframe input value
        tf = check_timeframe_str(tf)

        if tf.lower() == TICK_TIMEFRAME:
            logger.warning(f'reframe not possible with target {TICK_TIMEFRAME}')
            return dataframe

        if not is_datetime64_any_dtype(dataframe.index):

            if BASE_DATA_COLUMN_NAME.TIMESTAMP in dataframe.columns:

                if not is_datetime64_any_dtype(
                        dataframe[BASE_DATA_COLUMN_NAME.TIMESTAMP]):

                    try:
                        dataframe[BASE_DATA_COLUMN_NAME.TIMESTAMP] = any_date_to_datetime64(
                            dataframe[BASE_DATA_COLUMN_NAME.TIMESTAMP])
                    except Exception:
                        logger.exception('Pandas engine: '
                                         'Failed conversion of timestamp columns '
                                         'to DatetimeIndex')
                        raise
            else:
                logger.error('Pandas engine: required column with '
                             f'name {BASE_DATA_COLUMN_NAME.TIMESTAMP}')
                raise ValueError

        # use pandas functions to reframe data on a pandas DataFrame
        dataframe = sort_dataframe(dataframe, BASE_DATA_COLUMN_NAME.TIMESTAMP)
        dataframe = dataframe.set_index(BASE_DATA_COLUMN_NAME.TIMESTAMP,
                                        inplace=False,
                                        drop=True
                                        )

        # resample based on the p value
        if all([col in DATA_COLUMN_NAMES.TICK_DATA_TIME_INDEX
                for col in dataframe.columns]):

            # resample along the 'p' column, data in ask, bid, p format
            dataframe = dataframe.p.resample(tf).ohlc().interpolate(method='nearest')

        elif all([col in DATA_COLUMN_NAMES.TF_DATA_TIME_INDEX
                  for col in dataframe.columns]):

            # resample the given data, already in ohlc format
            dataframe = dataframe.resample(tf).interpolate(method='nearest')

        else:
            logger.error(f'data columns {dataframe.columns} invalid, '
                         f'required {DATA_COLUMN_NAMES.TICK_DATA_TIME_INDEX} '
                         f'or {DATA_COLUMN_NAMES.TF_DATA_TIME_INDEX}')
            raise ValueError

        return dataframe.reset_index(drop=False)

    elif isinstance(dataframe, Table):

        '''
        use pyarrow functions to reframe data on a pyarrow Table:
        could not find an easy way to filter an arrow table
        based on a time interval

        opened an enhancement issue on github

        https://github.com/apache/arrow/issues/41049

        As a temporary alternative, convert the arrow Table to polars
        and perform the reframe with the polars engine
        '''

        if all([col in DATA_COLUMN_NAMES.TICK_DATA
                for col in dataframe.column_names]):

            # convert to a polars dataframe
            dataframe = from_arrow(dataframe,
                                   schema=cast(Any, POLARS_DTYPE_DICT.TIME_TICK_DTYPE))

        elif all([col in DATA_COLUMN_NAMES.TF_DATA
                  for col in dataframe.column_names]):

            # convert to a polars dataframe
            dataframe = from_arrow(dataframe,
                                   schema=cast(Any, POLARS_DTYPE_DICT.TIME_TF_DTYPE))

        else:
            # guard against endless recursion on unexpected columns
            logger.error(f'data columns {dataframe.column_names} invalid, '
                         f'required {DATA_COLUMN_NAMES.TICK_DATA} '
                         f'or {DATA_COLUMN_NAMES.TF_DATA}')
            raise ValueError

        # perform the operation, then
        # convert back to an arrow Table and return
        return reframe_data(dataframe, tf).to_arrow()

    elif isinstance(dataframe, polars_dataframe):

        tf = tf.lower()

        dataframe = sort_dataframe(dataframe, BASE_DATA_COLUMN_NAME.TIMESTAMP)

        if all([col in DATA_COLUMN_NAMES.TICK_DATA
                for col in dataframe.columns]):

            return dataframe.group_by_dynamic(
                BASE_DATA_COLUMN_NAME.TIMESTAMP,
                every=tf).agg(col('p').first().alias(BASE_DATA_COLUMN_NAME.OPEN),
                              col('p').max().alias(BASE_DATA_COLUMN_NAME.HIGH),
                              col('p').min().alias(BASE_DATA_COLUMN_NAME.LOW),
                              col('p').last().alias(BASE_DATA_COLUMN_NAME.CLOSE)
                              )

        elif all([col in DATA_COLUMN_NAMES.TF_DATA
                  for col in dataframe.columns]):

            return dataframe.group_by_dynamic(
                BASE_DATA_COLUMN_NAME.TIMESTAMP,
                every=tf).agg(col(BASE_DATA_COLUMN_NAME.OPEN).first(),
                              col(BASE_DATA_COLUMN_NAME.HIGH).max(),
                              col(BASE_DATA_COLUMN_NAME.LOW).min(),
                              col(BASE_DATA_COLUMN_NAME.CLOSE).last()
                              )

        else:
            logger.error(f'data columns {dataframe.columns} invalid, '
                         f'required {DATA_COLUMN_NAMES.TICK_DATA} '
                         f'or {DATA_COLUMN_NAMES.TF_DATA}')
            raise ValueError

    elif isinstance(dataframe, polars_lazyframe):

        tf = tf.lower()

        dataframe = dataframe.sort('timestamp', nulls_last=True)

        if all([col in DATA_COLUMN_NAMES.TICK_DATA
                for col in dataframe.collect_schema().names()]):

            return dataframe.group_by_dynamic(
                BASE_DATA_COLUMN_NAME.TIMESTAMP,
                every=tf).agg(col('p').first().alias(BASE_DATA_COLUMN_NAME.OPEN),
                              col('p').max().alias(BASE_DATA_COLUMN_NAME.HIGH),
                              col('p').min().alias(BASE_DATA_COLUMN_NAME.LOW),
                              col('p').last().alias(BASE_DATA_COLUMN_NAME.CLOSE)
                              )

        elif all([col in DATA_COLUMN_NAMES.TF_DATA
                  for col in dataframe.collect_schema().names()]):

            return dataframe.group_by_dynamic(
                BASE_DATA_COLUMN_NAME.TIMESTAMP,
                every=tf).agg(col(BASE_DATA_COLUMN_NAME.OPEN).first(),
                              col(BASE_DATA_COLUMN_NAME.HIGH).max(),
                              col(BASE_DATA_COLUMN_NAME.LOW).min(),
                              col(BASE_DATA_COLUMN_NAME.CLOSE).last()
                              )

        else:
            logger.error(f'data columns '
                         f'{dataframe.collect_schema().names()} invalid, '
                         f'required {DATA_COLUMN_NAMES.TICK_DATA} '
                         f'or {DATA_COLUMN_NAMES.TF_DATA}')
            raise ValueError
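

# Reframe sketch: two assumed tick quotes inside the same hour collapse
# into a single 1h OHLC candle via the polars branch above.
def _example_reframe_data():
    ticks = polars_dataframe({
        'timestamp': [datetime(2022, 4, 30, 9, 0),
                      datetime(2022, 4, 30, 9, 30)],
        'ask': [1.051, 1.061], 'bid': [1.049, 1.059],
        'vol': [1.0, 1.0], 'p': [1.05, 1.06]})
    bars = reframe_data(ticks, '1h')
    assert bars.columns == ['timestamp', 'open', 'high', 'low', 'close']
    assert bars.shape == (1, 5)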


# UTILS FOR DOTTY DICTIONARIES


def get_dotty_key_field(key, index):

    if not isinstance(key, str):
        logger.error(f'dotty key {key} invalid type, str required')
        raise TypeError

    try:
        field = key.split('.')[index]
    except IndexError:
        logger.exception(f'index {index} invalid for key {key}')
        raise

    return field


def get_dotty_keys(dotty_dict,
                   root=False,
                   level=None,
                   parent_key=None):

    dotty_copy = dotty_dict.copy()

    if root:
        return dotty_copy.keys()

    elif level:

        if not (
            isinstance(level, int) and
            level >= 0
        ):
            logger.error('level must be zero or a positive integer')
            raise ValueError

        # default start at the root key
        level_counter = 0

        # NOTE: level-based key lookup is not implemented yet;
        # this branch currently falls through and returns None
        pass

    elif parent_key:

        if not isinstance(parent_key, str):
            logger.error('parent key must be str')
            raise TypeError

        parent_dict = dotty_copy.pop(parent_key)

        if parent_dict:
            try:
                keys = parent_dict.keys()
            except KeyError as err:
                logger.exception(f'{err} : keys not found under {parent_key}')
                return []
            else:
                return [str(k) for k in keys]
        else:
            logger.error(f'{parent_key} key does not exist')
            raise KeyError


def get_dotty_leafs(dotty_dict):

    leaf_keys = list()

    def get_leaf(dotty_dict, parent_key):

        try:
            if dotty_dict.keys():
                for key in dotty_dict.keys():
                    key_w_parent = '{parent}.{key}'.format(parent=parent_key,
                                                           key=key)
                    get_leaf(dotty_dict.get(key), key_w_parent)
        except AttributeError:
            leaf_keys.append(parent_key)
        except ValueError:
            leaf_keys.append(parent_key)

    # the root field is temporary, giving all leaf paths a common start
    get_leaf(dotty_dict, 'root')

    # leave the root field out of all paths to leafs
    original_leaf_keys = leaf_keys
    leaf_keys = []
    for leaf in original_leaf_keys:
        match_result = search(r'(?<=root\.)\S+', leaf)
        if match_result:
            leaf_keys.append(match_result.group(0))

    return leaf_keys
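

# Leaf-listing sketch on a plain nested dict (an assumed stand-in for the
# dotty dictionaries used elsewhere in the package).
def _example_get_dotty_leafs():
    tree = {'FOREX': {'EURUSD': {'1h': 'data'}}}
    assert get_dotty_leafs(tree) == ['FOREX.EURUSD.1h']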


def get_dotty_key_parent(key):

    if not isinstance(key, str):
        logger.error('dotty key must be str type')
        raise TypeError

    # prune the last field and rejoin with the '.' separator
    # to recreate a dotty key
    parent_key = '.'.join(key.split('.')[:-1])

    return parent_key


# TODO: function that returns all leafs at a given level


# ATTRS

# ADDED VALIDATORS

def validator_file_path(file_ext=None):

    # NOTE: file_ext is currently unused

    def validate_file_path(instance, attribute, value):

        try:
            filepath = Path(value)
        except Exception as e:
            logger.error(f'File {value} Path creation error: {e}')
            raise
        else:
            if not (
                filepath.exists() or
                filepath.is_file()
            ):
                logger.error(f'file {value} does not exist')
                raise FileNotFoundError

    return validate_file_path


def validator_dir_path(create_if_missing=False):

    def validate_or_create_dir(instance, attribute, value):

        if create_if_missing:
            Path(value).mkdir(parents=True, exist_ok=True)
        else:
            if not (
                Path(value).exists() or
                Path(value).is_dir()
            ):
                logger.error(f'Directory {value} not valid')
                raise TypeError()

    return validate_or_create_dir


def validator_list_timeframe(instance, attribute, value):

    if not isinstance(value, list):
        logger.error(f'Required type list for argument {attribute}')
        raise TypeError

    if not all([
        check_timeframe_str(val)
        for val in value
    ]):

        fails = [
            val for val in value
            if not check_timeframe_str(val)
        ]

        # raise (not return) so the attrs validation actually fails
        raise ValueError('Values are not timeframe compatible: '
                         f'{fails}')


def validator_list_ge(min_value):

    def validator_list_values(instance, attribute, value):

        if not (
            isinstance(value, list) and
            all([isinstance(val, int)
                 for val in value])
        ):
            logger.error('Required list of int type for argument '
                         f'{attribute}')
            raise TypeError

        if any([
            val < min_value
            for val in value
        ]):

            fails = [
                val for val in value
                if val < min_value
            ]

            logger.error(f'Values in {attribute}: {fails} '
                         f'are not greater than {min_value}')
            raise ValueError

    # return the inner validator so attrs can call it
    return validator_list_values


# ATTRIBUTES


def get_attrs_names(instance_object, **kwargs):

    if hasattr(instance_object, '__attrs_attrs__'):
        return [attr.name
                for attr in instance_object.__attrs_attrs__]
    else:
        logger.error('attribute "__attrs_attrs__" not found in '
                     f'object {instance_object}')
        raise KeyError
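

# Introspection sketch with a throwaway attrs class (the class itself is
# an assumed example, not part of the package).
def _example_get_attrs_names():
    from attrs import define

    @define
    class _DemoConfig:
        base_path: str = '.'
        tf: str = '1h'

    assert get_attrs_names(_DemoConfig()) == ['base_path', 'tf']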


# GENERIC UTILITIES


def list_remove_duplicates(list_in):

    return list(dict.fromkeys(list_in))
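

# Unlike a set() round-trip, dict.fromkeys() keeps the first-seen order, e.g.:
#   list_remove_duplicates(['EURUSD', 'GBPUSD', 'EURUSD'])
#   -> ['EURUSD', 'GBPUSD']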


# HISTDATA data provider utilities


# Analyze the Histdata Forex download base page
# https://www.histdata.com/download-free-forex-data/?/ascii/1-minute-bar-quotes
# and get a list of all available tickers, in the form of the example "EURUSD"
def get_histdata_tickers() -> List[str]:
    """
    Get all available tickers from HistData.com.

    Returns
    -------
    List[str]
        List of all available tickers (e.g., ['EURUSD', 'GBPUSD', ...]).
    """
    url = "https://www.histdata.com/download-free-forex-data/?/ascii/1-minute-bar-quotes"

    try:
        # bounded wait so a stalled server cannot hang the call
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        tickers = []
        # tickers are typically in links that lead to the pair's specific page
        for link in soup.find_all('a', href=True):
            href = link['href']
            # pattern check based on the observed links
            if "/ascii/1-minute-bar-quotes/" in href:
                parts = href.split('/')
                ticker = parts[-1]
                # validate it's a plausible ticker (usually 6 chars like EURUSD)
                if ticker and len(ticker) >= 6:
                    tickers.append(ticker.upper())

        return sorted(list(set(tickers)))

    except Exception as e:
        logger.error(f"Failed to retrieve tickers from HistData: {e}")
        return []
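

# Illustrative call (requires network access; returns [] on any failure):
#   tickers = get_histdata_tickers()
#   'EURUSD' in tickers  # expected for the 1-minute-bar listing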


# REAL TIME PROVIDERS UTILITIES


def polygon_agg_to_dict(agg):

    if not isinstance(agg, polygon_agg):
        logger.error('argument invalid type, required '
                     'polygon.rest.models.aggs.Agg')
        # fail early instead of hitting an AttributeError below
        raise TypeError

    return {
        BASE_DATA_COLUMN_NAME.TIMESTAMP: agg.timestamp,
        BASE_DATA_COLUMN_NAME.OPEN: agg.open,
        BASE_DATA_COLUMN_NAME.HIGH: agg.high,
        BASE_DATA_COLUMN_NAME.LOW: agg.low,
        BASE_DATA_COLUMN_NAME.CLOSE: agg.close,
        BASE_DATA_COLUMN_NAME.VOL: agg.volume,
        BASE_DATA_COLUMN_NAME.TRANSACTIONS: agg.transactions,
        BASE_DATA_COLUMN_NAME.VWAP: agg.vwap,
        BASE_DATA_COLUMN_NAME.OTC: agg.otc
    }
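

# Mapping sketch with an Agg built by keyword (assumes the polygon client's
# Agg dataclass accepts these fields, as the mapping above implies).
def _example_polygon_agg_to_dict():
    agg = polygon_agg(open=1.05, high=1.06, low=1.04, close=1.05,
                      volume=100.0, vwap=1.05, timestamp=1651305600000,
                      transactions=10)
    row = polygon_agg_to_dict(agg)
    assert row[BASE_DATA_COLUMN_NAME.OPEN] == 1.05
    assert row[BASE_DATA_COLUMN_NAME.VOL] == 100.0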