forex_data_aggregator 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1773 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Sat Apr 30 09:23:19 2022
4
+
5
+ @author: fiora
6
+ """
7
+
8
+ __all__ = [
9
+ 'YEARS',
10
+ 'MONTHS',
11
+ 'DATE_FORMAT_SQL',
12
+ 'DATE_FORMAT_HISTDATA_CSV',
13
+ 'HISTDATA_URL_TICKDATA_TEMPLATE',
14
+ 'HISTDATA_BASE_DOWNLOAD_METHOD',
15
+ 'HISTDATA_BASE_DOWNLOAD_URL',
16
+ 'DEFAULT_PATHS',
17
+ 'DATA_TYPE',
18
+ 'BASE_DATA_COLUMN_NAME',
19
+ 'DATA_FILE_COLUMN_INDEX',
20
+ 'SUPPORTED_DATA_FILES',
21
+ 'SUPPORTED_DATA_ENGINES',
22
+ 'ASSET_TYPE',
23
+ 'TEMP_FOLDER',
24
+ 'TEMP_CSV_FILE',
25
+ 'DTYPE_DICT',
26
+ 'PYARROW_DTYPE_DICT',
27
+ 'POLARS_DTYPE_DICT',
28
+ 'DATA_COLUMN_NAMES',
29
+ 'FILENAME_TEMPLATE',
30
+ 'DATA_KEY',
31
+ 'TICK_TIMEFRAME',
32
+ 'FILENAME_STR',
33
+ 'REALTIME_DATA_PROVIDER',
34
+ 'ALPHA_VANTAGE_API_KEY',
35
+ 'CANONICAL_INDEX',
36
+ 'DATE_NO_HOUR_FORMAT',
37
+ 'POLYGON_IO_API_KEY',
38
+ 'AV_LIST_URL',
39
+ 'PAIR_ALPHAVANTAGE_FORMAT',
40
+ 'PAIR_POLYGON_FORMAT',
41
+ 'SQL_COMPARISON_OPERATORS',
42
+ 'SUPPORTED_SQL_COMPARISON_OPERATORS',
43
+ 'SUPPORTED_BASE_DATA_COLUMN_NAME',
44
+ 'SQL_CONDITION_AGGREGATION_MODES',
45
+ 'SUPPORTED_SQL_CONDITION_AGGREGATION_MODES',
46
+
47
+ 'validator_file_path',
48
+ 'validator_dir_path',
49
+ 'get_attrs_names',
50
+ 'check_time_offset_str',
51
+ 'check_timeframe_str',
52
+ 'any_date_to_datetime64',
53
+ 'empty_dataframe',
54
+ 'is_empty_dataframe',
55
+ 'shape_dataframe',
56
+ 'get_dataframe_column',
57
+ 'get_dataframe_row',
58
+ 'get_dataframe_element',
59
+ 'get_dotty_leafs',
60
+ 'astype',
61
+ 'read_csv',
62
+ 'polars_datetime',
63
+ 'sort_dataframe',
64
+ 'concat_data',
65
+ 'list_remove_duplicates',
66
+ 'get_dotty_key_field',
67
+ 'reframe_data',
68
+ 'write_csv',
69
+ 'write_parquet',
70
+ 'read_parquet',
71
+ 'to_pandas_dataframe',
72
+ 'get_pair_symbols',
73
+ 'to_source_symbol',
74
+ 'get_date_interval',
75
+ 'polygon_agg_to_dict',
76
+ 'validator_list_timeframe',
77
+ 'get_histdata_tickers',
78
+ 'TickerNotFoundError',
79
+ 'TickerDataNotFoundError',
80
+ 'TickerDataBadTypeException',
81
+ 'TickerDataInvalidException'
82
+ ]
83
+
84
+ from loguru import logger
85
+
86
+ from re import (
87
+ fullmatch,
88
+ findall,
89
+ search
90
+ )
91
+
92
+ from typing import (
93
+ cast,
94
+ Any,
95
+ List,
96
+ Literal
97
+ )
98
+
99
+ import requests
100
+ from bs4 import BeautifulSoup
101
+
102
+ from datetime import (
103
+ timedelta
104
+ )
105
+
106
+ # PANDAS
107
+ from pandas import (
108
+ DataFrame as pandas_dataframe,
109
+ concat as pandas_concat,
110
+ Timestamp,
111
+ isnull,
112
+ bdate_range,
113
+ to_datetime,
114
+ Timedelta,
115
+ read_parquet as pandas_read_parquet,
116
+ read_csv as pandas_read_csv
117
+ )
118
+
119
+ from pandas.api.types import is_datetime64_any_dtype
120
+ from pandas.tseries.frequencies import to_offset
121
+ from pandas.tseries.offsets import DateOffset
122
+
123
+ # PYARROW
124
+ from pyarrow import (
125
+ float32 as pyarrow_float32,
126
+ timestamp as pyarrow_timestamp,
127
+ schema as pyarrow_schema,
128
+ Table,
129
+ table as pyarrow_table,
130
+ concat_tables,
131
+ csv as arrow_csv
132
+ )
133
+
134
+ from pyarrow.parquet import (
135
+ write_table,
136
+ read_table
137
+ )
138
+
139
+ # POLARS
140
+ from polars import (
141
+ Float32 as polars_float32,
142
+ Datetime as polars_datetime,
143
+ read_csv as polars_read_csv,
144
+ concat as polars_concat,
145
+ col,
146
+ len as polars_len,
147
+ read_parquet as polars_read_parquet,
148
+ from_arrow,
149
+ DataFrame as polars_dataframe,
150
+ LazyFrame as polars_lazyframe,
151
+ scan_csv as polars_scan_csv,
152
+ scan_parquet as polars_scan_parquet
153
+ )
154
+
155
+ # POLYGON real time provider
156
+ from polygon.rest.models.aggs import (
157
+ Agg as polygon_agg
158
+ )
159
+
160
+ from dateutil.rrule import (
161
+ rrule,
162
+ DAILY,
163
+ MO,
164
+ TU,
165
+ WE,
166
+ TH,
167
+ FR
168
+ )
169
+
170
+ from datetime import datetime
171
+
172
+ from pathlib import Path
173
+
174
+ from attrs import (
175
+ field,
176
+ validators
177
+ )
178
+
179
+
180
+ # =============================================================================
181
+ # CUSTOM EXCEPTIONS
182
+ # =============================================================================
183
+
184
+ # TickerNotFoundError:
185
+ # This exception is raised when the ticker requested is misspelled
186
+ # or does not exist in the database.
187
+ class TickerNotFoundError(Exception):
188
+ pass
189
+
190
+
191
+ # TickerDataNotFoundError:
192
+ # This exception is raised when the ticker is found
193
+ # but data is not available or data retrieval failed.
194
+ class TickerDataNotFoundError(Exception):
195
+ pass
196
+
197
+
198
+ # TickerDataBadTypeException:
199
+ # This exception is raised when the ticker data
200
+ # is found but data type is not compliant with the expected type.
201
+ class TickerDataBadTypeException(Exception):
202
+ pass
203
+
204
+
205
+ # TickerDataInvalidException:
206
+ # This exception is raised when the ticker data
207
+ # is not found or invalid for generic reasons.
208
+ class TickerDataInvalidException(Exception):
209
+ pass
210
+
211
+
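+ # Editor's sketch (not part of the original API) of how callers are
+ # expected to use the exception hierarchy above; the `database` object
+ # and its `get_ticker_data` method are hypothetical:
+ #
+ #     try:
+ #         data = database.get_ticker_data('EURUSD')
+ #     except TickerNotFoundError:
+ #         ...  # ticker misspelled or absent from the database
+ #     except TickerDataNotFoundError:
+ #         ...  # ticker exists but data retrieval failed
+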
212
+ # common functions, constants and templates
213
+ TEMP_FOLDER = "Temp"
214
+ TEMP_CSV_FILE = "Temp.csv"
215
+
216
+ HISTDATA_URL_TICKDATA_TEMPLATE = (
217
+ 'https://www.histdata.com/download-free-forex-historical-data/?/'
218
+ 'ascii/tick-data-quotes/{ticker}/{year}/{month_num}'
219
+ )
220
+
221
+ HISTDATA_URL_ONEMINDATA_TEMPLATE = (
222
+ 'http://www.histdata.com/download-free-forex-data/?/'
223
+ 'ascii/1-minute-bar-quotes/{pair}/{year}/{month_num}'
224
+ )
225
+
226
+ HISTDATA_BASE_DOWNLOAD_URL = "http://www.histdata.com/get.php"
227
+ HISTDATA_BASE_DOWNLOAD_METHOD = 'POST'
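+
+ # A minimal illustration (editor-added) of how the URL templates above
+ # expand; the values are arbitrary and month_num is assumed 1-based:
+ #
+ #     >>> HISTDATA_URL_TICKDATA_TEMPLATE.format(
+ #     ...     ticker='eurusd', year=2019, month_num=5)
+ #     'https://www.histdata.com/download-free-forex-historical-data/?/ascii/tick-data-quotes/eurusd/2019/5'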
228
+
229
+ MONTHS = ['January', 'February', 'March', 'April', 'May', 'June',
230
+ 'July', 'August', 'September', 'October', 'November', 'December']
231
+ YEARS = list(range(2001, datetime.now().year, 1))
232
+
233
+
234
+ DATE_NO_HOUR_FORMAT = '%Y-%m-%d'
235
+ DATE_FORMAT_ISO8601 = 'ISO8601'
236
+ DATE_FORMAT_SQL = '%Y-%m-%d %H:%M:%S.%f'
237
+ DATE_FORMAT_HISTDATA_CSV = '%Y%m%d %H%M%S%f'
238
+
239
+ # DATA_KEY_TEMPLATE_STR = '{ticker}.Y{year}.{tf}'
240
+ # DATA_KEY_TEMPLATE_PATTERN = '^[A-Za-z]+.Y[0-9]+.[A-Za-z0-9]+'
241
+ # FILENAME_STR = '{ticker}_Y{year}_{tf}.{file_ext}'
242
+ DATA_KEY_TEMPLATE_STR = '{market}.{ticker}.{tf}'
243
+ DATA_KEY_TEMPLATE_PATTERN = r'^([A-Za-z0-9]+)\.([A-Za-z]+)\.([A-Za-z0-9]+)$'
244
+ FILENAME_STR = '{market}_{ticker}_{tf}.{file_ext}'
245
+ DEFAULT_TIMEZONE = 'utc'
246
+ TICK_TIMEFRAME = 'tick'
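+
+ # Editor's illustration of the key and filename templates above, with
+ # hypothetical values:
+ #
+ #     >>> DATA_KEY_TEMPLATE_STR.format(market='forex', ticker='EURUSD', tf='1h')
+ #     'forex.EURUSD.1h'
+ #     >>> FILENAME_STR.format(market='forex', ticker='EURUSD', tf='1h',
+ #     ...                     file_ext='parquet')
+ #     'forex_EURUSD_1h.parquet'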
247
+
248
+ # ticker PAIR of forex market
249
+ SINGLE_CURRENCY_PATTERN_STR = '[A-Za-z]{3}'
250
+ TICKER_PATTERN = '^' + SINGLE_CURRENCY_PATTERN_STR \
251
+ + SINGLE_CURRENCY_PATTERN_STR + '$'
252
+ PAIR_GENERIC_FORMAT = '{TO}/{FROM}'
253
+
254
+ # ALPHAVANTAGE
255
+ PAIR_ALPHAVANTAGE_FORMAT = '{TO}/{FROM}'
256
+ PAIR_ALPHAVANTAGE_PATTERN = '^' + SINGLE_CURRENCY_PATTERN_STR + '/' \
257
+ + SINGLE_CURRENCY_PATTERN_STR + '$'
258
+ ALPHA_VANTAGE_API_KEY = 'ALPHA_VANTAGE_API_KEY'
259
+ AV_LIST_URL = (
260
+ 'https://www.alphavantage.co/query?'
261
+ 'function=LISTING_STATUS&apikey={api_key}'
262
+ )
263
+
264
+ # PAIR POLYGON IO
265
+ PAIR_POLYGON_FORMAT = 'C:{TO}{FROM}'
266
+ PAIR_POLYGON_PATTERN = '^C:' + SINGLE_CURRENCY_PATTERN_STR + \
267
+ SINGLE_CURRENCY_PATTERN_STR + '$'
268
+ POLYGON_IO_API_KEY = 'POLYGON_IO_API_KEY'
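+
+ # Editor's sketch of the provider symbol formats, using EUR/USD as an
+ # arbitrary pair:
+ #
+ #     >>> PAIR_ALPHAVANTAGE_FORMAT.format(TO='EUR', FROM='USD')
+ #     'EUR/USD'
+ #     >>> PAIR_POLYGON_FORMAT.format(TO='EUR', FROM='USD')
+ #     'C:EURUSD'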
269
+
270
+ # TIME PATTERN
271
+ TIME_WINDOW_PATTERN_STR = '^[-+]?[0-9]+[A-Za-z]{1,}$'
272
+ TIME_WINDOW_COMPONENTS_PATTERN_STR = '^[-+]?[0-9]+|[A-Za-z]{1,}$'
273
+ TIME_WINDOW_UNIT_PATTERN_STR = '[A-Za-z]{1,}$'
274
+ GET_YEAR_FROM_TICK_KEY_PATTERN_STR = r'^[A-Za-z]+\.Y[0-9]+\.TICK'
275
+ YEAR_FIELD_PATTERN_STR = '^Y([0-9]{4,})$'
276
+
277
+ # auxiliary CONSTANT DEFINITIONS
278
+
279
+ # dotty key template: <market>.<ticker>.<timeframe>
280
+
281
+
282
+ class DATA_KEY:
283
+
284
+ MARKET = 0
285
+ TICKER_INDEX = 1
286
+ TF_INDEX = 2
287
+
288
+
289
+ # filename template : <ticker>_Y<year>_<timeframe>.<filetype>
290
+ class FILENAME_TEMPLATE:
291
+
292
+ TICKER_INDEX = 0
293
+ YEAR_INDEX = 1
294
+ YEAR_NUMERICAL_CHAR = 1
295
+ TF_INDEX = 2
296
+ FILETYPE_INDEX = 3
297
+
298
+
299
+ class DEFAULT_PATHS:
300
+
301
+ BASE_PATH = str(Path.home() / '.database')
302
+ HIST_DATA_FOLDER = 'HistoricalData'
303
+ REALTIME_DATA_FOLDER = 'RealtimeData'
304
+
305
+
306
+ class DATA_TYPE:
307
+
308
+ CSV_FILETYPE = 'csv'
309
+ PARQUET_FILETYPE = 'parquet'
310
+ DUCKDB = 'duckdb'
311
+
312
+
313
+ class DATA_FILE_COLUMN_INDEX:
314
+
315
+ TIMESTAMP = 0
316
+
317
+
318
+ SUPPORTED_DATA_FILES = [
319
+ DATA_TYPE.CSV_FILETYPE,
320
+ DATA_TYPE.PARQUET_FILETYPE,
321
+ DATA_TYPE.DUCKDB
322
+ ]
323
+
324
+ # supported dataframe engines
325
+ # pyarrow is supported, but a reframe operation implemented
326
+ # entirely in pyarrow is not yet available: for now a
327
+ # reframe_data() call on a pyarrow Table is masked by a
328
+ # conversion to polars and a reframe with the polars engine
329
+ SUPPORTED_DATA_ENGINES = [
330
+ 'pandas',
331
+ 'pyarrow',
332
+ 'polars',
333
+ 'polars_lazy'
334
+ ]
335
+
336
+ # SINGLE BASE DATA COMPOSITION TEMPLATE: ['open','high','low','close']
337
+ # with datetime/timestamp as index
338
+ # column names for dataframes TICK and timeframe filtered
339
+ # OHLC and related column names
340
+
341
+
342
+ class DATA_COLUMN_NAMES:
343
+
344
+ TICK_DATA_NO_PVALUE = ['timestamp', 'ask', 'bid', 'vol']
345
+ TICK_DATA = ['timestamp', 'ask', 'bid', 'vol', 'p']
346
+ TF_DATA = ['timestamp', 'open', 'high', 'low', 'close']
347
+ TICK_DATA_TIME_INDEX = ['ask', 'bid', 'vol', 'p']
348
+ TF_DATA_TIME_INDEX = ['open', 'high', 'low', 'close']
349
+ POLYGON_IO_AGGS = ['open', 'high', 'low', 'close', 'volume', 'vwap',
350
+ 'timestamp', 'transactions']
351
+
352
+
353
+ # SELECTED AS SINGLE BASE DATA COMPOSITION TEMPLATE
354
+ BASE_DATA = DATA_COLUMN_NAMES.TF_DATA_TIME_INDEX
355
+ BASE_DATA_WITH_TIME = DATA_COLUMN_NAMES.TF_DATA
356
+
357
+
358
+ class REALTIME_DATA_PROVIDER:
359
+
360
+ ALPHA_VANTAGE = 'ALPHA_VANTAGE'
361
+ POLYGON_IO = 'POLYGON-IO'
362
+
363
+
364
+ REALTIME_DATA_PROVIDER_LIST = [REALTIME_DATA_PROVIDER.ALPHA_VANTAGE,
365
+ REALTIME_DATA_PROVIDER.POLYGON_IO]
366
+
367
+
368
+ class DB_MODE:
369
+
370
+ FULL_MODE = 'FULL_MODE'
371
+ HISTORICAL_MODE = 'HISTORICAL_MODE'
372
+ REALTIME_MODE = 'REALTIME_MODE'
373
+
374
+
375
+ class ASSET_TYPE:
376
+
377
+ STOCK = 'STOCK'
378
+ ETF = 'ETF'
379
+ FOREX = 'FOREX'
380
+
381
+
382
+ class BASE_DATA_COLUMN_NAME:
383
+
384
+ TIMESTAMP = 'timestamp'
385
+ OPEN = 'open'
386
+ HIGH = 'high'
387
+ LOW = 'low'
388
+ CLOSE = 'close'
389
+ ASK = 'ask'
390
+ BID = 'bid'
391
+ VOL = 'vol'
392
+ P_VALUE = 'p'
393
+ TRANSACTIONS = 'transactions'
394
+ VWAP = 'vwap'
395
+ OTC = 'otc'
396
+
397
+
398
+ SUPPORTED_BASE_DATA_COLUMN_NAME = Literal[
399
+ BASE_DATA_COLUMN_NAME.TIMESTAMP,
400
+ BASE_DATA_COLUMN_NAME.OPEN,
401
+ BASE_DATA_COLUMN_NAME.HIGH,
402
+ BASE_DATA_COLUMN_NAME.LOW,
403
+ BASE_DATA_COLUMN_NAME.CLOSE,
404
+ BASE_DATA_COLUMN_NAME.ASK,
405
+ BASE_DATA_COLUMN_NAME.BID,
406
+ BASE_DATA_COLUMN_NAME.VOL,
407
+ BASE_DATA_COLUMN_NAME.P_VALUE,
408
+ BASE_DATA_COLUMN_NAME.TRANSACTIONS,
409
+ BASE_DATA_COLUMN_NAME.VWAP,
410
+ BASE_DATA_COLUMN_NAME.OTC
411
+ ]
412
+
413
+
414
+ class CANONICAL_INDEX:
415
+
416
+ AV_LATEST_DATA_INDEX = 0
417
+ AV_DF_DATA_INDEX = 0
418
+ AV_DICT_INFO_INDEX = 1
419
+
420
+
421
+ class SQL_COMPARISON_OPERATORS:
422
+
423
+ GREATER_THAN = '>'
424
+ LESS_THAN = '<'
425
+ GREATER_THAN_OR_EQUAL = '>='
426
+ LESS_THAN_OR_EQUAL = '<='
427
+ EQUAL = '=='
428
+ NOT_EQUAL = '!='
429
+
430
+
431
+ SUPPORTED_SQL_COMPARISON_OPERATORS = Literal[
432
+ SQL_COMPARISON_OPERATORS.GREATER_THAN,
433
+ SQL_COMPARISON_OPERATORS.LESS_THAN,
434
+ SQL_COMPARISON_OPERATORS.GREATER_THAN_OR_EQUAL,
435
+ SQL_COMPARISON_OPERATORS.LESS_THAN_OR_EQUAL,
436
+ SQL_COMPARISON_OPERATORS.EQUAL,
437
+ SQL_COMPARISON_OPERATORS.NOT_EQUAL
438
+ ]
439
+
440
+
441
+ class SQL_CONDITION_AGGREGATION_MODES:
442
+
443
+ AND = 'AND'
444
+ OR = 'OR'
445
+
446
+
447
+ SUPPORTED_SQL_CONDITION_AGGREGATION_MODES = Literal[
448
+ SQL_CONDITION_AGGREGATION_MODES.AND,
449
+ SQL_CONDITION_AGGREGATION_MODES.OR
450
+ ]
451
+
452
+
453
+ # auxiliary functions
454
+
455
+ # get elements from db key
456
+ def get_db_key_elements(key):
457
+
458
+ res = fullmatch(DATA_KEY_TEMPLATE_PATTERN, key)
459
+
460
+ if res:
461
+
462
+ return res.groups()
463
+
464
+ else:
465
+
466
+ logger.error(
467
+ f'key {key} does not respect template {DATA_KEY_TEMPLATE_STR}')
468
+ raise ValueError
469
+
470
+
471
+ # parse argument to get datetime object with date format as input
472
+ def infer_date_from_format_dt(s, date_format='ISO8601', unit=None, utc=False):
473
+
474
+ if unit:
475
+
476
+ return to_datetime(s,
477
+ unit=unit,
478
+ utc=utc)
479
+
480
+ else:
481
+
482
+ return to_datetime(s,
483
+ format=date_format,
484
+ utc=utc)
485
+
486
+
487
+ # parse a timeframe string and validate it
489
+ # against pandas DateOffset freqstr rules, plus 'tick' (= lowest timeframe available)
489
+ # link to official pandas doc
490
+ # https://pandas.pydata.org/docs/user_guide/timeseries.html#dateoffset-objects
491
+ # add compatibility to polars frequency string
492
+
493
+ def check_timeframe_str(tf):
494
+
495
+ check = False
496
+
497
+ if tf.lower() == TICK_TIMEFRAME:
498
+
499
+ check = True
500
+
501
+ else:
502
+
503
+ try:
504
+
505
+ check = (
506
+ isinstance(to_offset(tf), DateOffset) or
507
+ isinstance(Timedelta(tf).to_pytimedelta(),
508
+ timedelta)
509
+ )
510
+
511
+ except ValueError:
512
+
513
+ logger.critical(f"Type check: Invalid timeframe: {tf}")
514
+ raise
515
+
516
+ if check:
517
+
518
+ return tf
519
+
520
+ else:
521
+
522
+ logger.critical(f"Type check: Invalid timeframe "
523
+ f"conversion to timedelta: {tf}")
524
+ raise ValueError
525
+
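+ # Illustrative behaviour (editor-added) of the validator above:
+ #
+ #     >>> check_timeframe_str('1h')      # valid pandas offset alias
+ #     '1h'
+ #     >>> check_timeframe_str('tick')    # lowest available timeframe
+ #     'tick'
+ #     >>> check_timeframe_str('banana')  # raises ValueError
+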
526
+
527
+ # PAIR symbol functions
528
+ def get_pair_symbols(ticker):
529
+
530
+ components = findall(SINGLE_CURRENCY_PATTERN_STR, ticker)
531
+
532
+ if len(components) == 2:
533
+
534
+ return components[0], components[1]
535
+
536
+ else:
537
+
538
+ return None
539
+
540
+
541
+ def check_symbol(symbol, source):
542
+
543
+ if source == REALTIME_DATA_PROVIDER.ALPHA_VANTAGE:
544
+
545
+ if fullmatch(PAIR_ALPHAVANTAGE_PATTERN, symbol):
546
+
547
+ return True
548
+
549
+ else:
550
+
551
+ return False
552
+
553
+ elif source == REALTIME_DATA_PROVIDER.POLYGON_IO:
554
+
555
+ if fullmatch(PAIR_POLYGON_PATTERN, symbol):
556
+
557
+ return True
558
+
559
+ else:
560
+
561
+ return False
562
+
563
+ else:
564
+
565
+ if fullmatch(PAIR_POLYGON_PATTERN, symbol):
566
+
567
+ return True
568
+
569
+ else:
570
+
571
+ return False
572
+
573
+
574
+ def to_source_symbol(ticker, source):
575
+
576
+ to_symbol, from_symbol = get_pair_symbols(ticker)
577
+
578
+ if source == REALTIME_DATA_PROVIDER.ALPHA_VANTAGE:
579
+
580
+ return PAIR_ALPHAVANTAGE_FORMAT.format(TO=to_symbol,
581
+ FROM=from_symbol)
582
+
583
+ elif source == REALTIME_DATA_PROVIDER.POLYGON_IO:
584
+
585
+ return PAIR_POLYGON_FORMAT.format(TO=to_symbol,
586
+ FROM=from_symbol)
587
+
588
+ else:
589
+
590
+ return PAIR_GENERIC_FORMAT.format(TO=to_symbol,
591
+ FROM=from_symbol)
592
+
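+ # Editor's illustration of the pair helpers above:
+ #
+ #     >>> get_pair_symbols('EURUSD')
+ #     ('EUR', 'USD')
+ #     >>> to_source_symbol('EURUSD', REALTIME_DATA_PROVIDER.POLYGON_IO)
+ #     'C:EURUSD'
+ #     >>> to_source_symbol('EURUSD', REALTIME_DATA_PROVIDER.ALPHA_VANTAGE)
+ #     'EUR/USD'
+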
593
+
594
+ # TIMESTAMP RELATED FUNCTIONS
595
+
596
+ def check_time_offset_str(timeoffset_str):
597
+
598
+ # TODO: add support for polars time/date offset
599
+ return isinstance(to_offset(timeoffset_str), DateOffset)
600
+
601
+
602
+ def timewindow_str_to_timedelta(time_window_str):
603
+
604
+ if fullmatch(TIME_WINDOW_PATTERN_STR, time_window_str):
605
+
606
+ return Timedelta(time_window_str)
607
+
608
+ else:
609
+
610
+ logger.error('time window string does not match pattern '
611
+ '"<integer_multiplier><unit>"')
612
+ raise ValueError
613
+
614
+
615
+ def any_date_to_datetime64(any_date,
616
+ date_format='ISO8601',
617
+ unit=None,
618
+ to_pydatetime=False):
619
+
620
+ try:
621
+
622
+ any_date = infer_date_from_format_dt(any_date,
623
+ date_format,
624
+ unit=unit)
625
+
626
+ if to_pydatetime:
627
+
628
+ any_date = any_date.to_pydatetime()
629
+
630
+ except Exception as e:
631
+
632
+ logger.error(f'date {any_date} conversion failed: '
633
+ f'could not be parsed with the {date_format} '
634
+ 'date format')
635
+ raise
636
+
637
+ # =============================================================================
638
+ # TODO: is it necessary utc timezone when source is naive?
639
+ # if not any_date.tzinfo:
640
+ #
641
+ # any_date = any_date.tz_localize('utc')
642
+ # =============================================================================
643
+
644
+ return any_date
645
+
646
+
647
+ def get_date_interval(start=None,
648
+ end=None,
649
+ interval_start_mode=None,
650
+ interval_end_mode='now',
651
+ interval_timespan=None,
652
+ freq=None,
653
+ normalize=False,
654
+ bdays=False):
655
+
656
+ # create start and end date as timestamp instances
657
+ start_date = Timestamp(start)
658
+ end_date = Timestamp(end)
659
+
660
+ if interval_timespan:
661
+
662
+ # a variety of interval modes could be implemented
663
+
664
+ # 'now' - end of date interval is timestamp now
665
+ if interval_end_mode == 'now':
666
+
667
+ end_date = Timestamp.now()
668
+ start_date = end_date - timewindow_str_to_timedelta(interval_timespan)
669
+
670
+ if bdays:
671
+
672
+ components = findall(TIME_WINDOW_COMPONENTS_PATTERN_STR,
673
+ interval_timespan)
674
+
675
+ # padding to a fixed number of business days is available
676
+ # only when the requested timespan is in 'd' (days) units
677
+ if components[1] == 'd':
678
+
679
+ days_list = list(
680
+ rrule(freq=DAILY,
681
+ dtstart=start_date,
682
+ until=end_date,
683
+ byweekday=(MO, TU, WE, TH, FR))
684
+ )
685
+
686
+ while len(days_list) < int(components[0]):
687
+
688
+ start_date = start_date - Timedelta(days=1)
689
+
690
+ days_list = list(
691
+ rrule(freq=DAILY,
692
+ dtstart=start_date,
693
+ until=end_date,
694
+ byweekday=(MO, TU, WE, TH, FR))
695
+ )
696
+
697
+ # Timestamp() constructor ensures these are Timestamp objects
698
+ if normalize:
699
+
700
+ if not isnull(start_date):
701
+ start_date = Timestamp.normalize(start_date)
702
+
703
+ if not isnull(end_date):
704
+ end_date = Timestamp.normalize(end_date)
705
+
706
+ start_date = any_date_to_datetime64(start_date)
707
+ end_date = any_date_to_datetime64(end_date)
708
+
709
+ # generate DateTimeIndex if freq is set
710
+ # otherwise return just start and end of interval
711
+ if freq:
712
+
713
+ bdate_dtindex = bdate_range(start=start_date,
714
+ end=end_date,
715
+ freq=freq,
716
+ tz=None,
717
+ normalize=normalize,
718
+ name='timestamp'
719
+ )
720
+
721
+ return start_date, end_date, bdate_dtindex
722
+
723
+ else:
724
+
725
+ return start_date, end_date
726
+
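+ # A minimal usage sketch (editor-added): the last 30 calendar days up to
+ # now, padded to business days, plus a business-day DatetimeIndex:
+ #
+ #     start, end, index = get_date_interval(interval_end_mode='now',
+ #                                           interval_timespan='30d',
+ #                                           bdays=True,
+ #                                           freq='B')
+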
727
+
728
+ # BASE OPERATIONS WITH DATAFRAME
729
+ # depending on dataframe engine support
730
+ # for supported engines see var SUPPORTED_DATA_ENGINES
731
+
732
+ # DATA ENGINES TYPES DICTIONARY
733
+ class DTYPE_DICT:
734
+
735
+ TICK_DTYPE = {'ask': 'float32',
736
+ 'bid': 'float32',
737
+ 'vol': 'float32',
738
+ 'p': 'float32'}
739
+ TF_DTYPE = {'open': 'float32',
740
+ 'high': 'float32',
741
+ 'low': 'float32',
742
+ 'close': 'float32'}
743
+ TIME_TICK_DTYPE = {'timestamp': 'datetime64[ms]',
744
+ 'ask': 'float32',
745
+ 'bid': 'float32',
746
+ 'vol': 'float32',
747
+ 'p': 'float32'}
748
+ TIME_TF_DTYPE = {'timestamp': 'datetime64[ms]',
749
+ 'open': 'float32',
750
+ 'high': 'float32',
751
+ 'low': 'float32',
752
+ 'close': 'float32'}
753
+
754
+
755
+ class PYARROW_DTYPE_DICT:
756
+
757
+ TICK_DTYPE = {'ask': pyarrow_float32(),
758
+ 'bid': pyarrow_float32(),
759
+ 'vol': pyarrow_float32(),
760
+ 'p': pyarrow_float32()}
761
+ TF_DTYPE = {'open': pyarrow_float32(),
762
+ 'high': pyarrow_float32(),
763
+ 'low': pyarrow_float32(),
764
+ 'close': pyarrow_float32()}
765
+ TIME_TICK_DTYPE = {'timestamp': pyarrow_timestamp('ms'),
766
+ 'ask': pyarrow_float32(),
767
+ 'bid': pyarrow_float32(),
768
+ 'vol': pyarrow_float32(),
769
+ 'p': pyarrow_float32()}
770
+ TIME_TF_DTYPE = {'timestamp': pyarrow_timestamp('ms'),
771
+ 'open': pyarrow_float32(),
772
+ 'high': pyarrow_float32(),
773
+ 'low': pyarrow_float32(),
774
+ 'close': pyarrow_float32()}
775
+
776
+
777
+ class POLARS_DTYPE_DICT:
778
+
779
+ TICK_DTYPE = {'ask': polars_float32,
780
+ 'bid': polars_float32,
781
+ 'vol': polars_float32,
782
+ 'p': polars_float32}
783
+ TF_DTYPE = {'open': polars_float32,
784
+ 'high': polars_float32,
785
+ 'low': polars_float32,
786
+ 'close': polars_float32}
787
+ TIME_TICK_DTYPE = {'timestamp': polars_datetime('ms'),
788
+ 'ask': polars_float32,
789
+ 'bid': polars_float32,
790
+ 'vol': polars_float32,
791
+ 'p': polars_float32}
792
+ TIME_TF_DTYPE = {'timestamp': polars_datetime('ms'),
793
+ 'open': polars_float32,
794
+ 'high': polars_float32,
795
+ 'low': polars_float32,
796
+ 'close': polars_float32}
797
+
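+ # Editor's note: the per-engine dtype maps above are meant to be passed
+ # to the astype() helper defined below, e.g. (illustrative):
+ #
+ #     df = astype(df, DTYPE_DICT.TIME_TF_DTYPE)                 # pandas
+ #     table = astype(table, PYARROW_DTYPE_DICT.TIME_TF_DTYPE)   # pyarrow
+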
798
+ # DATA ENGINES FUNCTIONS
799
+
800
+
801
+ def empty_dataframe(engine):
802
+
803
+ if engine == 'pandas':
804
+
805
+ return pandas_dataframe()
806
+
807
+ elif engine == 'pyarrow':
808
+
809
+ return pyarrow_table({})
810
+
811
+ elif engine == 'polars':
812
+
813
+ return polars_dataframe()
814
+
815
+ elif engine == 'polars_lazy':
816
+
817
+ return polars_lazyframe()
818
+
819
+ else:
820
+
821
+ logger.error('function empty_dataframe not available'
822
+ f' for engine {engine}')
823
+ raise ValueError
824
+
825
+
826
+ def is_empty_dataframe(dataframe):
827
+
828
+ if isinstance(dataframe, pandas_dataframe):
829
+
830
+ return dataframe.empty
831
+
832
+ elif isinstance(dataframe, Table):
833
+
834
+ return (not bool(dataframe))
835
+
836
+ elif isinstance(dataframe, polars_dataframe):
837
+
838
+ return dataframe.is_empty()
839
+
840
+ elif isinstance(dataframe, polars_lazyframe):
841
+
842
+ return dataframe.collect().is_empty()
843
+
844
+ else:
845
+
846
+ logger.error('function is_empty_dataframe not available'
847
+ ' for instance of type'
848
+ f' {type(dataframe)}')
849
+ raise ValueError
850
+
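+ # Round-trip sketch (editor-added): an engine's empty frame is reported
+ # empty by is_empty_dataframe(), regardless of the backing library:
+ #
+ #     >>> is_empty_dataframe(empty_dataframe('polars'))
+ #     True
+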
851
+
852
+ def shape_dataframe(dataframe):
853
+
854
+ if isinstance(dataframe, pandas_dataframe):
855
+
856
+ return dataframe.shape
857
+
858
+ elif isinstance(dataframe, Table):
859
+
860
+ return dataframe.shape
861
+
862
+ elif isinstance(dataframe, polars_dataframe):
863
+
864
+ return dataframe.shape
865
+
866
+ elif isinstance(dataframe, polars_lazyframe):
867
+
868
+ return (
869
+ dataframe.select(polars_len()).collect().item(0, 0),
870
+ dataframe.collect_schema().len()
871
+ )
872
+
873
+ else:
874
+
875
+ logger.error('function shape_dataframe not available'
876
+ ' for instance of type'
877
+ f' {type(dataframe)}')
878
+ raise ValueError
879
+
880
+
881
+ def sort_dataframe(dataframe, column):
882
+
883
+ if isinstance(dataframe, pandas_dataframe):
884
+
885
+ return dataframe.sort_values(by=[column])
886
+
887
+ elif isinstance(dataframe, Table):
888
+
889
+ return dataframe.sort_by(column)
890
+
891
+ elif isinstance(dataframe, polars_dataframe):
892
+
893
+ return dataframe.sort(column, nulls_last=True)
894
+
895
+ elif isinstance(dataframe, polars_lazyframe):
896
+
897
+ return dataframe.sort(column, nulls_last=True)
898
+
899
+ else:
900
+
901
+ logger.error('function sort_dataframe not available'
902
+ ' for instance of type'
903
+ f' {type(dataframe)}')
904
+ raise ValueError
905
+
906
+
907
+ def get_dataframe_column(dataframe, column):
908
+
909
+ if isinstance(dataframe, pandas_dataframe):
910
+
911
+ return dataframe[column]
912
+
913
+ elif isinstance(dataframe, Table):
914
+
915
+ return dataframe[column]
916
+
917
+ elif isinstance(dataframe, polars_dataframe):
918
+
919
+ return dataframe[column]
920
+
921
+ elif isinstance(dataframe, polars_lazyframe):
922
+
923
+ return dataframe.select(column).collect()
924
+
925
+ else:
926
+
927
+ logger.error('function get_dataframe_column not available'
928
+ ' for instance of type'
929
+ f' {type(dataframe)}')
+ raise ValueError
930
+
931
+
932
+ def get_dataframe_row(dataframe, row):
933
+
934
+ if isinstance(dataframe, pandas_dataframe):
935
+
936
+ return dataframe.loc[row]
937
+
938
+ elif isinstance(dataframe, Table):
939
+
940
+ return dataframe.slice(row, 1)
941
+
942
+ elif isinstance(dataframe, polars_dataframe):
943
+
944
+ return dataframe.slice(row, 1)
945
+
946
+ elif isinstance(dataframe, polars_lazyframe):
947
+
948
+ return dataframe.slice(row, 1)
949
+
950
+ else:
951
+
952
+ logger.error('function get_dataframe_row not available'
953
+ ' for instance of type'
954
+ f' {type(dataframe)}')
+ raise ValueError
955
+
956
+
957
+ def get_dataframe_element(dataframe, column, row):
958
+
959
+ if isinstance(dataframe, pandas_dataframe):
960
+
961
+ return dataframe[column][row]
962
+
963
+ elif isinstance(dataframe, Table):
964
+
965
+ return dataframe[column][row]
966
+
967
+ elif isinstance(dataframe, polars_dataframe):
968
+
969
+ return dataframe[column][row]
970
+
971
+ elif isinstance(dataframe, polars_lazyframe):
972
+
973
+ return dataframe.select(column).collect().item(row, 0)
974
+
975
+ else:
976
+
977
+ logger.error('function get_dataframe_element not available'
978
+ ' for instance of type'
979
+ f' {type(dataframe)}')
980
+ raise ValueError
981
+
982
+
983
+ def dtype_dict_to_pyarrow_schema(dtype_dict):
984
+
985
+ return pyarrow_schema(dtype_dict.items())
986
+
987
+
988
+ def astype(dataframe, dtype_dict):
989
+
990
+ if isinstance(dataframe, pandas_dataframe):
991
+
992
+ return dataframe.astype(dtype_dict)
993
+
994
+ elif isinstance(dataframe, Table):
995
+
996
+ return dataframe.cast(dtype_dict_to_pyarrow_schema(dtype_dict))
997
+
998
+ elif isinstance(dataframe, polars_dataframe):
999
+
1000
+ return dataframe.cast(dtype_dict)
1001
+
1002
+ elif isinstance(dataframe, polars_lazyframe):
1003
+
1004
+ return dataframe.cast(dtype_dict)
1005
+
1006
+ else:
1007
+
1008
+ logger.error('function astype not available'
1009
+ ' for instance of type'
1010
+ f' {type(dataframe)}')
1011
+ raise ValueError
1012
+
1013
+
1014
+ def read_parquet(engine, filepath):
1015
+
1016
+ if engine == 'pandas':
1017
+
1018
+ return pandas_read_parquet(filepath)
1019
+
1020
+ elif engine == 'pyarrow':
1021
+
1022
+ return read_table(filepath)
1023
+
1024
+ elif engine == 'polars':
1025
+
1026
+ return polars_read_parquet(filepath)
1027
+
1028
+ elif engine == 'polars_lazy':
1029
+
1030
+ return polars_scan_parquet(filepath)
1031
+
1032
+ else:
1033
+
1034
+ logger.error('function read_parquet not available'
1035
+ f' for engine {engine}')
1036
+ raise ValueError
1037
+
1038
+
1039
+ def write_parquet(dataframe, filepath):
1040
+
1041
+ if isinstance(dataframe, pandas_dataframe):
1042
+
1043
+ try:
1044
+
1045
+ dataframe.to_parquet(filepath, index=True)
1046
+
1047
+ except Exception as e:
1048
+
1049
+ logger.exception(f'pandas write parquet failed: {e}')
1050
+ raise
1051
+
1052
+ elif isinstance(dataframe, Table):
1053
+
1054
+ try:
1055
+
1056
+ write_table(dataframe, filepath)
1057
+
1058
+ except Exception as e:
1059
+
1060
+ logger.exception(f'pyarrow write parquet failed: {e}')
1061
+ raise
1062
+
1063
+ elif isinstance(dataframe, polars_dataframe):
1064
+
1065
+ try:
1066
+
1067
+ dataframe.write_parquet(filepath)
1068
+
1069
+ except Exception as e:
1070
+
1071
+ logger.exception(f'polars write parquet failed: {e}')
1072
+ raise
1073
+
1074
+ elif isinstance(dataframe, polars_lazyframe):
1075
+
1076
+ try:
1077
+
1078
+ dataframe.sink_parquet(filepath)
1079
+
1080
+ # alternative to sink_parquet()
1081
+ # dataframe.collect(streaming=False).write_parquet(filepath)
1082
+
1083
+ except Exception as e:
1084
+
1085
+ logger.exception('polars lazyframe sink '
1086
+ f'parquet failed: {e}')
1087
+ raise
1088
+
1089
+ else:
1090
+
1091
+ logger.error('function write_parquet not available'
1092
+ ' for instance of type'
1093
+ f' {type(dataframe)}')
1094
+ raise ValueError
1095
+
1096
+
1097
+ def read_csv(engine, file, **kwargs):
1098
+
1099
+ if engine == 'pandas':
1100
+
1101
+ return pandas_read_csv(file, **kwargs)
1102
+
1103
+ elif engine == 'pyarrow':
1104
+
1105
+ return arrow_csv.read_csv(file, **kwargs)
1106
+
1107
+ elif engine == 'polars':
1108
+
1109
+ return polars_read_csv(file, **kwargs)
1110
+
1111
+ elif engine == 'polars_lazy':
1112
+
1113
+ return polars_scan_csv(file, **kwargs)
1114
+
1115
+ else:
1116
+
1117
+ logger.error('function read_csv not available'
1118
+ f' for engine {engine}')
1119
+ raise ValueError
1120
+
1121
+
1122
+ def write_csv(dataframe, file, **kwargs):
1123
+
1124
+ if isinstance(dataframe, pandas_dataframe):
1125
+
1126
+ try:
1127
+
1128
+ # IMPORTANT
1129
+ # pandas dataframe case
1130
+ # avoid the date_format parameter since it is reported
1131
+ # to make to_csv excessively slow when a column holds
1132
+ # datetime data
1133
+ # see: https://github.com/pandas-dev/pandas/issues/37484
1134
+ # https://stackoverflow.com/questions/65903287/pandas-1-2-1-to-csv-performance-with-datetime-as-the-index-and-setting-date-form
1135
+
1136
+ dataframe.to_csv(file,
1137
+ header=True,
1138
+ **kwargs)
1139
+
1140
+ except Exception as e:
1141
+
1142
+ logger.exception(f'Error writing csv file {file}'
1143
+ f' with data type {type(dataframe)}: {e}')
1144
+ raise IOError
1145
+
1146
+ elif isinstance(dataframe, Table):
1147
+
1148
+ try:
1149
+
1150
+ arrow_csv.write_csv(dataframe, file, **kwargs)
1151
+
1152
+ except Exception as e:
1153
+
1154
+ logger.exception(f'Error writing csv file {file}'
1155
+ f' with data type {type(dataframe)}: {e}')
1156
+ raise IOError
1157
+
1158
+ elif isinstance(dataframe, polars_dataframe):
1159
+
1160
+ try:
1161
+
1162
+ dataframe.write_csv(file, **kwargs)
1163
+
1164
+ except Exception as e:
1165
+
1166
+ logger.exception(f'Error writing csv file {file}'
1167
+ f' with data type {type(dataframe)}: {e}')
1168
+ raise IOError
1169
+
1170
+ elif isinstance(dataframe, polars_lazyframe):
1171
+
1172
+ try:
1173
+
1174
+ dataframe.sink_csv(file, **kwargs)
1175
+
1176
+ except Exception as e:
1177
+
1178
+ logger.exception(f'Error writing csv file {file}'
1179
+ f' with data type {type(dataframe)}: {e}')
1180
+ raise IOError
1181
+
1182
+ else:
1183
+
1184
+ logger.error('function write_csv not available'
1185
+ ' for instance of type'
1186
+ f' {type(dataframe)}')
1187
+ raise ValueError
1188
+
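+ # A round-trip sketch (editor-added); the path is hypothetical and extra
+ # kwargs are forwarded untouched to the underlying engine:
+ #
+ #     df = pandas_dataframe({'open': [1.0], 'high': [1.2],
+ #                            'low': [0.9], 'close': [1.1]})
+ #     write_csv(df, 'Temp/example.csv', index=False)
+ #     df2 = read_csv('pandas', 'Temp/example.csv')
+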
1189
+
1190
+ def concat_data(data_list):
1191
+
1192
+ if not isinstance(data_list, list):
1193
+
1194
+ logger.error('required input as list')
1195
+ raise TypeError
1196
+
1197
+ # assume data type is unique by input
1198
+ # get type from first element
1199
+
1200
+ if isinstance(data_list[0], pandas_dataframe):
1201
+
1202
+ return pandas_concat(data_list,
1203
+ ignore_index=False,
1204
+ copy=False)
1205
+
1206
+ elif isinstance(data_list[0], Table):
1207
+
1208
+ return concat_tables(data_list)
1209
+
1210
+ elif isinstance(data_list[0], polars_dataframe):
1211
+
1212
+ return polars_concat(data_list, how='vertical')
1213
+
1214
+ elif isinstance(data_list[0], polars_lazyframe):
1215
+
1216
+ return polars_concat(data_list, how='vertical')
1217
+
1218
+ else:
1219
+
1220
+ logger.error('function concat not available'
1221
+ ' for instance of type'
1222
+ f' {type(data_list[0])}')
1223
+ raise ValueError
1224
+
1225
+
1226
+ def to_pandas_dataframe(dataframe):
1227
+
1228
+ # convert to pandas dataframe
1229
+ # useful for calls that require
1230
+ # pandas as input or for pandas
1231
+ # functionality not covered by
1232
+ # the other dataframe engines
1233
+
1234
+ if isinstance(dataframe, pandas_dataframe):
1235
+
1236
+ return dataframe
1237
+
1238
+ elif isinstance(dataframe, Table):
1239
+
1240
+ return dataframe.to_pandas()
1241
+
1242
+ elif isinstance(dataframe, polars_dataframe):
1243
+
1244
+ return dataframe.to_pandas(use_pyarrow_extension_array=True)
1245
+
1246
+ elif isinstance(dataframe, polars_lazyframe):
1247
+
1248
+ return dataframe.collect().to_pandas(use_pyarrow_extension_array=True)
1249
+
1250
+ else:
1251
+
1252
+ logger.error('function to_pandas() not available'
1253
+ ' for instance of type'
1254
+ f' {type(dataframe)}')
1255
+ raise ValueError
1256
+
1257
+
1258
+ def reframe_data(dataframe, tf):
1259
+ '''
1260
+ Resample tick or OHLC data to the target timeframe.
1261
+
1262
+ Parameters
1263
+ ----------
1264
+ dataframe : pandas.DataFrame, pyarrow.Table, polars.DataFrame or polars.LazyFrame
1265
+ Input data with a 'timestamp' column, in tick (ask, bid, vol, p) or OHLC format.
1266
+ tf : str
1267
+ Target timeframe, following pandas offset alias rules (lowercased for polars).
1268
+
1269
+ Raises
1270
+ ------
1271
+ ValueError
1272
+ If the input columns are neither tick nor OHLC compatible.
1273
+
1274
+ Returns
1275
+ -------
1276
+ Dataframe
1277
+ Data resampled to tf, using the same engine as the input.
1278
+
1279
+ '''
1280
+
1281
+ if is_empty_dataframe(dataframe):
1282
+
1283
+ return dataframe
1284
+
1285
+ if isinstance(dataframe, pandas_dataframe):
1286
+
1287
+ # assert timeframe input value
1288
+ tf = check_timeframe_str(tf)
1289
+
1290
+ if tf.lower() == TICK_TIMEFRAME:
1291
+
1292
+ logger.warning(f'reframe not possible with target {TICK_TIMEFRAME}')
1293
+
1294
+ return dataframe
1295
+
1296
+ if not is_datetime64_any_dtype(dataframe.index):
1297
+
1298
+ if BASE_DATA_COLUMN_NAME.TIMESTAMP in dataframe.columns:
1299
+
1300
+ if not is_datetime64_any_dtype(
1301
+ dataframe[BASE_DATA_COLUMN_NAME.TIMESTAMP]):
1302
+
1303
+ try:
1304
+
1305
+ dataframe[BASE_DATA_COLUMN_NAME.TIMESTAMP] = any_date_to_datetime64(
1306
+ dataframe[BASE_DATA_COLUMN_NAME.TIMESTAMP])
1307
+
1308
+ except Exception as e:
1309
+
1310
+ logger.exception('Pandas engine: '
1311
+ 'Failed conversion of timestamp columns '
1312
+ 'to DatetimeIndex')
1313
+ raise
1314
+
1315
+ else:
1316
+
1317
+ logger.error('Pandas engine: required column with '
1318
+ f'name {BASE_DATA_COLUMN_NAME.TIMESTAMP}')
1319
+ raise ValueError
1320
+
1321
+ # use pandas functions to reframe data on pandas Dataframe
1322
+
1323
+ dataframe = sort_dataframe(dataframe, BASE_DATA_COLUMN_NAME.TIMESTAMP)
1324
+
1325
+ dataframe = dataframe.set_index(BASE_DATA_COLUMN_NAME.TIMESTAMP,
1326
+ inplace=False,
1327
+ drop=True
1328
+ )
1329
+
1330
+ # resample based on p value
1331
+ if all([col in DATA_COLUMN_NAMES.TICK_DATA_TIME_INDEX
1332
+ for col in dataframe.columns]):
1333
+
1334
+ # resample along 'p' column, data in ask, bid, p format
1335
+ dataframe = dataframe.p.resample(tf).ohlc().interpolate(method='nearest')
1336
+
1337
+ elif all([col in DATA_COLUMN_NAMES.TF_DATA_TIME_INDEX
1338
+ for col in dataframe.columns]):
1339
+
1340
+ # resample along given data already in ohlc format
1341
+ dataframe = dataframe.resample(tf).interpolate(method='nearest')
1342
+
1343
+ else:
1344
+
1345
+ logger.error(f'data columns {dataframe.columns} invalid, '
1346
+ f'required {DATA_COLUMN_NAMES.TICK_DATA_TIME_INDEX} '
1347
+ f'or {DATA_COLUMN_NAMES.TF_DATA_TIME_INDEX}')
1348
+ raise ValueError
1349
+
1350
+ return dataframe.reset_index(drop=False)
1351
+
1352
+ elif isinstance(dataframe, Table):
1353
+
1354
+ '''
1355
+
1356
+ Ideally this branch would reframe data directly with pyarrow
1357
+ functions on the pyarrow Table, but there is no easy way to
1358
+ filter an arrow table based on a time interval.
1359
+
1360
+ An enhancement issue was opened on GitHub:
1361
+
1362
+ https://github.com/apache/arrow/issues/41049
1363
+
1364
+ As a temporary alternative, convert the arrow Table to polars
1365
+ and perform the reframe with the polars engine.
1366
+
1367
+ '''
1368
+
1369
+ if all([col in DATA_COLUMN_NAMES.TICK_DATA
1370
+ for col in dataframe.column_names]):
1371
+
1372
+ # convert to polars dataframe
1373
+ dataframe = from_arrow(dataframe,
1374
+ schema=cast(Any, POLARS_DTYPE_DICT.TIME_TICK_DTYPE))
1375
+
1376
+ elif all([col in DATA_COLUMN_NAMES.TF_DATA
1377
+ for col in dataframe.column_names]):
1378
+
1379
+ # convert to polars dataframe
1380
+ dataframe = from_arrow(dataframe,
1381
+
1383
+ else:
+
+ logger.error(f'data columns {dataframe.column_names} invalid, '
+ f'required {DATA_COLUMN_NAMES.TICK_DATA} '
+ f'or {DATA_COLUMN_NAMES.TF_DATA}')
+ raise ValueError
+
+ # perform the reframe with the polars engine,
1384
+ # convert back to an arrow Table and return
1385
+ return reframe_data(dataframe, tf).to_arrow()
1385
+ return reframe_data(dataframe, tf).to_arrow()
1386
+
1387
+ elif isinstance(dataframe, polars_dataframe):
1388
+
1389
+ tf = tf.lower()
1390
+
1391
+ dataframe = sort_dataframe(dataframe, BASE_DATA_COLUMN_NAME.TIMESTAMP)
1392
+
1393
+ if all([col in DATA_COLUMN_NAMES.TICK_DATA
1394
+ for col in dataframe.columns]):
1395
+
1396
+ return dataframe.group_by_dynamic(
1397
+ BASE_DATA_COLUMN_NAME.TIMESTAMP,
1398
+ every=tf).agg(col('p').first().alias(BASE_DATA_COLUMN_NAME.OPEN),
1399
+ col('p').max().alias(BASE_DATA_COLUMN_NAME.HIGH),
1400
+ col('p').min().alias(BASE_DATA_COLUMN_NAME.LOW),
1401
+ col('p').last().alias(BASE_DATA_COLUMN_NAME.CLOSE)
1402
+ )
1403
+
1404
+ elif all([col in DATA_COLUMN_NAMES.TF_DATA
1405
+ for col in dataframe.columns]):
1406
+
1407
+ return dataframe.group_by_dynamic(
1408
+ BASE_DATA_COLUMN_NAME.TIMESTAMP,
1409
+ every=tf).agg(col(BASE_DATA_COLUMN_NAME.OPEN).first(),
1410
+ col(BASE_DATA_COLUMN_NAME.HIGH).max(),
1411
+ col(BASE_DATA_COLUMN_NAME.LOW).min(),
1412
+ col(BASE_DATA_COLUMN_NAME.CLOSE).last()
1413
+ )
1414
+
1415
+ else:
1416
+
1417
+ logger.error(f'data columns {dataframe.columns} invalid, '
1418
+ f'required {DATA_COLUMN_NAMES.TICK_DATA} '
1419
+ f'or {DATA_COLUMN_NAMES.TF_DATA}')
1420
+ raise ValueError
1421
+
1422
+ elif isinstance(dataframe, polars_lazyframe):
1423
+
1424
+ tf = tf.lower()
1425
+
1426
+ dataframe = dataframe.sort('timestamp', nulls_last=True)
1427
+
1428
+ if all([col in DATA_COLUMN_NAMES.TICK_DATA
1429
+ for col in dataframe.collect_schema().names()]):
1430
+
1431
+ return dataframe.group_by_dynamic(
1432
+ BASE_DATA_COLUMN_NAME.TIMESTAMP,
1433
+ every=tf).agg(col('p').first().alias(BASE_DATA_COLUMN_NAME.OPEN),
1434
+ col('p').max().alias(BASE_DATA_COLUMN_NAME.HIGH),
1435
+ col('p').min().alias(BASE_DATA_COLUMN_NAME.LOW),
1436
+ col('p').last().alias(BASE_DATA_COLUMN_NAME.CLOSE)
1437
+ )
1438
+
1439
+ elif all([col in DATA_COLUMN_NAMES.TF_DATA
1440
+ for col in dataframe.collect_schema().names()]):
1441
+
1442
+ return dataframe.group_by_dynamic(
1443
+ BASE_DATA_COLUMN_NAME.TIMESTAMP,
1444
+ every=tf).agg(col(BASE_DATA_COLUMN_NAME.OPEN).first(),
1445
+ col(BASE_DATA_COLUMN_NAME.HIGH).max(),
1446
+ col(BASE_DATA_COLUMN_NAME.LOW).min(),
1447
+ col(BASE_DATA_COLUMN_NAME.CLOSE).last()
1448
+ )
1449
+
1450
+ else:
1451
+
1452
+ logger.error(f'data columns {dataframe.columns} invalid, '
1453
+ f'required {DATA_COLUMN_NAMES.TICK_DATA} '
1454
+ f'or {DATA_COLUMN_NAMES.TF_DATA}')
1455
+ raise ValueError
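+
+
+ # Editor's usage sketch for reframe_data(), assuming tick input with the
+ # columns in DATA_COLUMN_NAMES.TICK_DATA; '1h' is an arbitrary target:
+ #
+ #     ohlc_df = reframe_data(tick_df, '1h')    # tick -> hourly OHLC
+ #     daily_df = reframe_data(ohlc_df, '1D')   # hourly -> daily OHLC
+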
1456
+
1457
+ # UTILS FOR DOTTY DICTIONARY
1458
+
1459
+
1460
+ def get_dotty_key_field(key, index):
1461
+
1462
+ if not isinstance(key, str):
1463
+
1464
+ logger.error(f'dotty key {key} invalid type, str required')
1465
+ raise TypeError
1466
+
1467
+ try:
1468
+
1469
+ field = key.split('.')[index]
1470
+
1471
+ except IndexError:
1472
+
1473
+ logger.exception(f'index {index} invalid for key {key}')
1474
+ raise
1475
+
1476
+ return field
1477
+
1478
+
1479
+ def get_dotty_keys(dotty_dict,
1480
+ root=False,
1481
+ level=None,
1482
+ parent_key=None):
1483
+
1484
+ dotty_copy = dotty_dict.copy()
1485
+
1486
+ if root:
1487
+
1488
+ return dotty_copy.keys()
1489
+
1490
+ elif level:
1491
+
1492
+ if not (
1493
+ isinstance(level, int) and
1494
+ level >= 0):
1495
+
1496
+ logger.error('level must be zero or positive integer')
1497
+ raise ValueError
1498
+
1499
+ # default start at root key
1500
+ level_counter = 0
1501
+
1502
+ pass  # TODO: level-based key retrieval not yet implemented
1503
+
1504
+ elif parent_key:
1505
+
1506
+ if not isinstance(parent_key, str):
1507
+
1508
+ logger.error('parent key must be str')
+ raise TypeError
1509
+
1510
+ parent_dict = dotty_copy.pop(parent_key)
1511
+
1512
+ if parent_dict:
1513
+
1514
+ try:
1515
+ keys = parent_dict.keys()
1516
+ except KeyError as err:
1517
+
1518
+ logger.exception(f'{err} : keys not found under {parent_key}')
1519
+ return []
1520
+
1521
+ else:
1522
+
1523
+ return [str(k) for k in keys]
1524
+
1525
+ else:
1526
+
1527
+ logger.error(f'{parent_key} key does not exist')
1528
+ raise KeyError
1529
+
1530
+
1531
+ def get_dotty_leafs(dotty_dict):
1532
+
1533
+ leaf_keys = list()
1534
+
1535
+ def get_leaf(dotty_dict, parent_key):
1536
+
1537
+ try:
1538
+
1539
+ if dotty_dict.keys():
1540
+
1541
+ for key in dotty_dict.keys():
1542
+
1543
+ key_w_parent = '{parent}.{key}'.format(parent=parent_key,
1544
+ key=key)
1545
+
1546
+ get_leaf(dotty_dict.get(key), key_w_parent)
1547
+
1548
+ except AttributeError:
1549
+
1550
+ leaf_keys.append(parent_key)
1551
+
1552
+ except ValueError:
1553
+
1554
+ leaf_keys.append(parent_key)
1555
+
1556
+ # the temporary 'root' field gives all leaf paths a common start
1557
+ get_leaf(dotty_dict, 'root')
1558
+
1559
+ # leave out root field from all paths to leafs
1560
+ original_leaf_keys = leaf_keys
1561
+ leaf_keys = []
1562
+ for leaf in original_leaf_keys:
1563
+ match_result = search(r'(?<=root\.)\S+', leaf)
1564
+ if match_result:
1565
+ leaf_keys.append(match_result.group(0))
1566
+
1567
+ return leaf_keys
1568
+
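+ # Editor's illustration, assuming the dotty_dict package provides the
+ # dotty mapping used throughout:
+ #
+ #     >>> from dotty_dict import dotty
+ #     >>> d = dotty({'forex': {'EURUSD': {'1h': 'data'}}})
+ #     >>> get_dotty_leafs(d)
+ #     ['forex.EURUSD.1h']
+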
1569
+
1570
+ def get_dotty_key_parent(key):
1571
+
1572
+ if not isinstance(key, str):
1573
+
1574
+ logger.error('dotty key must be str type')
1575
+ raise TypeError
1576
+
1577
+ # prune last field and rejoin with '.' separator
1578
+ # to recreate a dotty key
1579
+ parent_key = '.'.join(key.split('.')[:-1])
1580
+
1581
+ return parent_key
1582
+
1583
+
1584
+ # TODO: function that returns all leafs at a
1585
+ # given level
1586
+
1587
+
1588
+ # ATTRS
1589
+
1590
+ # ADDED VALIDATORS
1591
+
1592
+ def validator_file_path(file_ext=None):
1593
+
1594
+ def validate_file_path(instance, attribute, value):
1595
+
1596
+ try:
1597
+
1598
+ filepath = Path(value)
1599
+
1600
+ except Exception as e:
1601
+
1602
+ logger.error(f'File {value} Path creation error: {e}')
1603
+ raise
1604
+
1605
+ else:
1606
+
1607
+ if not (
1608
+
1609
+ filepath.exists() and
1610
+ filepath.is_file()
1611
+ ):
1612
+
1613
+ logger.error(f'file {value} does not exist')
1614
+ raise FileNotFoundError
1615
+
1616
+ return validate_file_path
1617
+
1618
+
1619
+ def validator_dir_path(create_if_missing=False):
1620
+
1621
+ def validate_or_create_dir(instance, attribute, value):
1622
+
1623
+ if create_if_missing:
1624
+
1625
+ Path(value).mkdir(parents=True, exist_ok=True)
1626
+
1627
+ else:
1628
+
1629
+ if not (
1630
+ Path(value).exists() and
1631
+ Path(value).is_dir()
1632
+ ):
1633
+
1634
+ logger.error(f'Directory {value} not valid')
1635
+ raise TypeError()
1636
+
1637
+ return validate_or_create_dir
1638
+
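+ # A minimal attrs sketch (editor-added) showing how these validator
+ # factories plug into attrs fields; the DataConfig class is hypothetical:
+ #
+ #     from attrs import define
+ #
+ #     @define
+ #     class DataConfig:
+ #         data_dir: str = field(validator=validator_dir_path(
+ #             create_if_missing=True))
+ #         timeframes: list = field(validator=validator_list_timeframe)
+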
1639
+
1640
+ def validator_list_timeframe(instance, attribute, value):
1641
+
1642
+ if not isinstance(value, list):
1643
+
1644
+ logger.error(f'Required type list for argument {attribute}')
1645
+ raise TypeError
1646
+
1647
+ if not all([
1648
+ check_timeframe_str(val)
1649
+ for val in value
1650
+ ]):
1651
+
1652
+ fails = [
1653
+ val for val in value
1654
+ if not check_timeframe_str(val)
1655
+ ]
1656
+
1657
+ raise ValueError('Values are not timeframe compatible: '
1658
+ f'{fails}')
1659
+
1660
+
1661
+ def validator_list_ge(min_value):
1662
+
1663
+ def validator_list_values(instance, attribute, value):
1664
+
1665
+ if not (
1666
+ isinstance(value, list) and
1667
+ all([isinstance(val, int)
1668
+ for val in value])
1669
+ ):
1670
+
1671
+ logger.error('Required list of int type for argument '
1672
+ f'{attribute}')
1673
+ raise TypeError
1674
+
1675
+ if any([
1676
+ val < min_value
1677
+ for val in value
1678
+ ]):
1679
+
1680
+ fails = [
1681
+ val for val in value
1682
+ if val < min_value
1683
+ ]
1684
+
1685
+ logger.error(f'Values in {attribute}: {fails} '
1686
+ f'are less than {min_value}')
1687
+ raise ValueError
1688
+
+ return validator_list_values
+
1689
+ # ATTRIBUTES
1690
+
1691
+
1692
+ def get_attrs_names(instance_object, **kwargs):
1693
+
1694
+ if hasattr(instance_object, '__attrs_attrs__'):
1695
+
1696
+ return [attr.name
1697
+ for attr in instance_object.__attrs_attrs__]
1698
+
1699
+ else:
1700
+
1701
+ logger.error('attribute "__attrs_attrs__" not found in '
1702
+ f'object {instance_object}')
1703
+ raise KeyError
1704
+
1705
+ # GENERIC UTILITIES
1706
+
1707
+
1708
+ def list_remove_duplicates(list_in):
1709
+
1710
+ return list(dict.fromkeys(list_in))
1711
+
1712
+ # HISTDATA data provider utilities
1713
+
1714
+
1715
+ # Analyze the Histdata Forex download base page
1716
+ # https://www.histdata.com/download-free-forex-data/?/ascii/1-minute-bar-quotes
1717
+ # and get a list of all available tickers in the form of the example "EURUSD"
1718
+ def get_histdata_tickers() -> List[str]:
1719
+ """
1720
+ Get all available tickers from HistData.com.
1721
+
1722
+ Returns
1723
+ -------
1724
+ List[str]
1725
+ List of all available tickers (e.g., ['EURUSD', 'GBPUSD', ...]).
1726
+ """
1727
+ url = "https://www.histdata.com/download-free-forex-data/?/ascii/1-minute-bar-quotes"
1728
+
1729
+ # TODO: test connection with url, if fails return empty list and log error
1730
+
1731
+ try:
1732
+ response = requests.get(url, timeout=30)
1733
+ response.raise_for_status()
1734
+ soup = BeautifulSoup(response.content, 'html.parser')
1735
+
1736
+ tickers = []
1737
+ # Tickers are typically in links that lead to the pair's specific page
1738
+ for link in soup.find_all('a', href=True):
1739
+ href = link['href']
1740
+ # Pattern check based on the observed links
1741
+ if "/ascii/1-minute-bar-quotes/" in href:
1742
+ parts = href.split('/')
1743
+ ticker = parts[-1]
1744
+ # Validate it's a valid ticker (usually 6 chars like EURUSD)
1745
+ if ticker and len(ticker) >= 6:
1746
+ tickers.append(ticker.upper())
1747
+
1748
+ return sorted(list(set(tickers)))
1749
+
1750
+ except Exception as e:
1751
+ logger.error(f"Failed to retrieve tickers from HistData: {e}")
1752
+ return []
1753
+
+
+ # REAL TIME PROVIDERS UTILITIES
1754
+
1755
+
1756
+ def polygon_agg_to_dict(agg):
1757
+
1758
+ if not isinstance(agg, polygon_agg):
1759
+
1760
+ logger.error('argument invalid type, required '
1761
+ 'polygon.rest.models.aggs.Agg')
+ raise TypeError
1762
+
1763
+ return {
1764
+ BASE_DATA_COLUMN_NAME.TIMESTAMP: agg.timestamp,
1765
+ BASE_DATA_COLUMN_NAME.OPEN: agg.open,
1766
+ BASE_DATA_COLUMN_NAME.HIGH: agg.high,
1767
+ BASE_DATA_COLUMN_NAME.LOW: agg.low,
1768
+ BASE_DATA_COLUMN_NAME.CLOSE: agg.close,
1769
+ BASE_DATA_COLUMN_NAME.VOL: agg.volume,
1770
+ BASE_DATA_COLUMN_NAME.TRANSACTIONS: agg.transactions,
1771
+ BASE_DATA_COLUMN_NAME.VWAP: agg.vwap,
1772
+ BASE_DATA_COLUMN_NAME.OTC: agg.otc
1773
+ }
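+
+
+ # Editor's usage sketch for polygon_agg_to_dict(), assuming `aggs` is an
+ # iterable of polygon.rest.models.aggs.Agg objects returned by the
+ # polygon REST client; the records feed a pandas dataframe directly:
+ #
+ #     records = [polygon_agg_to_dict(agg) for agg in aggs]
+ #     df = pandas_dataframe(records)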