forex_data_aggregator 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1262 @@
+
+ from loguru import logger
+ from typing import Any, Dict, List, Optional, Union
+ from datetime import datetime
+
+ from attrs import (
+     define,
+     field,
+     validate,
+     validators
+ )
+
+ # PANDAS
+ from pandas import (
+     DataFrame as pandas_dataframe,
+     to_datetime
+ )
+
+ # PYARROW
+ from pyarrow import (
+     int64 as pyarrow_int64,
+     string as pyarrow_string,
+     BufferReader,
+     csv as arrow_csv,
+     compute as pc,
+     schema,
+     Table,
+     table as pyarrow_table,
+     duration
+ )
+
+ # POLARS
+ from polars import (
+     String as polars_string,
+     Datetime as polars_datetime,
+     col,
+     DataFrame as polars_dataframe,
+     LazyFrame as polars_lazyframe
+ )
+
+ from zipfile import (
+     ZipFile,
+     ZipExtFile,
+     BadZipFile
+ )
+
+ from re import (
+     search,
+     match
+ )
+
+ from mplfinance import (
+     plot as mpf_plot,
+     show as mpf_show
+ )
+
+ from numpy import array
+
+ from pathlib import Path
+ from requests import Session
+ from io import BytesIO
+ from shutil import rmtree
+
+ # python base
+ from dotty_dict import (
+     Dotty,
+     dotty
+ )
+
+ from iteration_utilities import (
+     duplicates,
+     unique_everseen
+ )
+
+ # internally defined
+ from .common import *
+ from ..config import (
+     read_config_file,
+     read_config_string,
+     read_config_folder
+ )
+
+ from .database import (
+     DatabaseConnector,
+     DuckDBConnector,
+     LocalDBConnector
+ )
+
+
+ __all__ = ['HistoricalManagerDB']
+
+
+ # HISTORICAL DATA MANAGER
+ @define(kw_only=True, slots=True)
+ class HistoricalManagerDB:
+
+     # interface parameters
+     config: str = field(default='',
+                         validator=validators.instance_of(str))
+     data_type: str = field(default='parquet',
+                            validator=validators.in_(SUPPORTED_DATA_FILES))
+     engine: str = field(default='polars_lazy',
+                         validator=validators.in_(SUPPORTED_DATA_ENGINES))
+
+     # internal
+     _db_connector = field(factory=DatabaseConnector)
+     _tf_list = field(factory=list, validator=validators.instance_of(list))
+     _dataframe_type = field(default=pandas_dataframe)
+     _data_path = field(default=Path(DEFAULT_PATHS.BASE_PATH),
+                        validator=validator_dir_path(create_if_missing=True))
+     _histdata_path = field(
+         default=Path(DEFAULT_PATHS.BASE_PATH) / DEFAULT_PATHS.HIST_DATA_FOLDER,
+         validator=validator_dir_path(create_if_missing=True))
+     _temporary_data_path = field(
+         default=(Path(DEFAULT_PATHS.BASE_PATH) /
+                  DEFAULT_PATHS.HIST_DATA_FOLDER /
+                  TEMP_FOLDER),
+         validator=validator_dir_path(create_if_missing=True))
+
+     # If a valid config file or string is passed,
+     # the arguments it contains are assigned here;
+     # values passed at instantiation override the
+     # corresponding arguments in the next
+     # initialization step.
+
+     # If an argument value is set neither at
+     # instantiation nor in the config file, the
+     # argument falls back to its associated default
+     # or factory generator.
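+
+     # A minimal sketch of the precedence above (hypothetical config
+     # values, not from the source): given a 'data_config.yaml' containing
+     # the line "engine: polars",
+     #
+     #     m = HistoricalManagerDB(config='data_config.yaml',
+     #                             data_type='csv')
+     #
+     # would resolve to engine='polars' (from the config file) and
+     # data_type='csv' (instantiation overrides config), with every other
+     # attribute falling back to its attrs default or factory.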
+
+     def __init__(self, **kwargs: Any) -> None:
+
+         _class_attributes_name = get_attrs_names(self, **kwargs)
+         _not_assigned_attrs_index_mask = [True] * len(_class_attributes_name)
+
+         if 'config' in kwargs.keys():
+
+             if kwargs['config']:
+
+                 config_path = Path(kwargs['config'])
+
+                 if (
+                     config_path.exists()
+                     and
+                     config_path.is_dir()
+                 ):
+
+                     config_filepath = read_config_folder(
+                         config_path, file_pattern='data_config.yaml')
+
+                 else:
+
+                     config_filepath = Path()
+
+                 config_args = {}
+                 if config_filepath.exists() \
+                         and \
+                         config_filepath.is_file() \
+                         and \
+                         config_filepath.suffix == '.yaml':
+
+                     # read parameters from config file
+                     # and force keys to lower case
+                     config_args = {key.lower(): val for key, val in
+                                    read_config_file(str(config_filepath)).items()}
+
+                 elif isinstance(kwargs['config'], str):
+
+                     # read parameters from config string
+                     # and force keys to lower case
+                     config_args = {key.lower(): val for key, val in
+                                    read_config_string(kwargs['config']).items()}
+
+                 else:
+
+                     logger.critical('invalid config type '
+                                     f'{kwargs["config"]}: '
+                                     'required str or Path, got '
+                                     f'{type(kwargs["config"])}')
+                     raise TypeError
+
+                 # check consistency of config_args
+                 if (
+                     not isinstance(config_args, dict)
+                     or
+                     not bool(config_args)
+                 ):
+
+                     logger.critical(f'config {kwargs["config"]} '
+                                     'has no valid yaml formatted data')
+                     raise TypeError
+
+                 # set args from config file
+                 attrs_keys_configfile = \
+                     set(_class_attributes_name).intersection(config_args.keys())
+
+                 for attr_key in attrs_keys_configfile:
+
+                     self.__setattr__(attr_key,
+                                      config_args[attr_key])
+
+                     _not_assigned_attrs_index_mask[
+                         _class_attributes_name.index(attr_key)
+                     ] = False
+
+                 # set args from instantiation;
+                 # override if attr already has a value from config
+                 attrs_keys_input = \
+                     set(_class_attributes_name).intersection(kwargs.keys())
+
+                 for attr_key in attrs_keys_input:
+
+                     self.__setattr__(attr_key,
+                                      kwargs[attr_key])
+
+                     _not_assigned_attrs_index_mask[
+                         _class_attributes_name.index(attr_key)
+                     ] = False
+
+                 # attrs not present in config file or instance inputs
+                 # (--> self.attr leads to KeyError)
+                 # are manually assigned the default value derived
+                 # from __attrs_attrs__
+                 for attr_key in array(_class_attributes_name)[
+                     _not_assigned_attrs_index_mask
+                 ]:
+
+                     try:
+
+                         attr = [attr
+                                 for attr in self.__attrs_attrs__
+                                 if attr.name == attr_key][0]
+
+                     except KeyError:
+
+                         logger.warning('KeyError: initializing object has no '
+                                        f'attribute {attr_key}')
+                         raise
+
+                     except IndexError:
+
+                         logger.warning('IndexError: initializing object has no '
+                                        f'attribute {attr_key}')
+                         raise
+
+                     else:
+
+                         # assign default value:
+                         # try default and factory subsequently;
+                         # if neither is present, assign None
+                         if hasattr(attr, 'default'):
+
+                             if hasattr(attr.default, 'factory'):
+
+                                 self.__setattr__(attr.name,
+                                                  attr.default.factory())
+
+                             else:
+
+                                 self.__setattr__(attr.name,
+                                                  attr.default)
+
+                         else:
+
+                             self.__setattr__(attr.name,
+                                              None)
+
+             else:
+
+                 logger.trace(
+                     f'config {kwargs["config"]} is empty, using default configuration')
+
+         else:
+
+             # no config file is defined:
+             # call the attrs generated init
+             self.__attrs_init__(**kwargs)  # type: ignore[attr-defined]
+
+         validate(self)
+
+         self.__attrs_post_init__()
+
+     def __attrs_post_init__(self, **kwargs: Any) -> None:
+
+         # set up log sink for the historical manager
+         logger.add(self._data_path / 'log' / 'forexhistdata.log',
+                    level="TRACE",
+                    rotation="5 MB",
+                    filter=lambda record: record['extra'].get('target') == 'histmanager')
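+
+         # only records bound with target='histmanager' reach the sink
+         # above, e.g. (illustrative):
+         #     logger.bind(target='histmanager').info('written to file sink')
+         #     logger.info('skipped: no target bound')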
+
+         # set up dataframe engine internal var based on config selection
+         if self.engine == 'pandas':
+
+             self._dataframe_type = pandas_dataframe
+
+         elif self.engine == 'pyarrow':
+
+             self._dataframe_type = pyarrow_table
+
+         elif self.engine == 'polars':
+
+             self._dataframe_type = polars_dataframe
+
+         elif self.engine == 'polars_lazy':
+
+             self._dataframe_type = polars_lazyframe
+
+         else:
+
+             logger.bind(target='histmanager').error(f'Engine {self.engine} not supported')
+             raise ValueError(f'Engine {self.engine} not supported')
+
+         self._temporary_data_path = self._histdata_path \
+             / TEMP_FOLDER
+
+         self._clear_temporary_data_folder()
+
+         # instantiate the database connector selected by data_type
+         if self.data_type == DATA_TYPE.DUCKDB:
+
+             self._db_connector = DuckDBConnector(duckdb_filepath=str(
+                 self._histdata_path / 'DuckDB' / 'marketdata.duckdb'))
+
+         elif (
+             self.data_type == DATA_TYPE.CSV_FILETYPE or
+             self.data_type == DATA_TYPE.PARQUET_FILETYPE
+         ):
+
+             self._db_connector = \
+                 LocalDBConnector(
+                     data_folder=str(self._histdata_path / 'LocalDB'),
+                     data_type=self.data_type,
+                     engine=self.engine
+                 )
+
+         else:
+
+             logger.bind(target='histmanager').error(f'Data type {self.data_type} not supported')
+             raise ValueError(f'Data type {self.data_type} not supported')
+
+     def _clear_temporary_data_folder(self) -> None:
+
+         # delete temporary data path
+         if (
+             self._temporary_data_path.exists() and
+             self._temporary_data_path.is_dir()
+         ):
+
+             try:
+
+                 rmtree(str(self._temporary_data_path))
+
+             except Exception as e:
+
+                 logger.bind(target='histmanager').warning(
+                     'Deleting temporary data folder '
+                     f'{str(self._temporary_data_path)} not successful: {e}')
+
+         else:
+             logger.bind(target='histmanager').trace(
+                 f'Temporary data folder '
+                 f'{self._temporary_data_path} does not exist')
+
+     def _get_ticker_list(self) -> List[str]:
+
+         # return the list of available tickers as str elements
+         return self._db_connector.get_tickers_list()
+
+     def _get_ticker_keys(
+             self,
+             ticker: str,
+             timeframe: Optional[str] = None) -> List[str]:
+
+         # return the list of ticker keys as str elements
+         return self._db_connector.get_ticker_keys(ticker,
+                                                   timeframe=timeframe)
+
+     def _get_ticker_years_list(
+             self,
+             ticker: str,
+             timeframe: str = TICK_TIMEFRAME) -> List[int]:
+
+         # return the list of years covered by the ticker data;
+         # with the default timeframe this means the years
+         # available in tick data
+         return self._db_connector.get_ticker_years_list(ticker,
+                                                         timeframe=timeframe)
+
+     def _complete_timeframe(self) -> None:
+
+         for ticker in self._get_ticker_list():
+
+             years_tick = self._get_ticker_years_list(ticker)
+
+             for tf in self._tf_list:
+
+                 ticker_years_list = self._get_ticker_years_list(ticker, timeframe=tf)
+
+                 if set(years_tick).difference(ticker_years_list):
+
+                     years = set(years_tick).difference(ticker_years_list)
+
+                     end_year = max(years)
+                     start_year = min(years)
+
+                     year_start = f'{start_year}-01-01 00:00:00.000'
+                     year_end = f'{end_year + 1}-01-01 00:00:00.000'
+
+                     # read missing years from tick timeframe
+                     start = datetime.strptime(year_start, DATE_FORMAT_SQL)
+                     end = datetime.strptime(year_end, DATE_FORMAT_SQL)
+
+                     dataframe = self._db_connector.read_data(
+                         market='forex',
+                         ticker=ticker,
+                         timeframe=TICK_TIMEFRAME,
+                         start=start,
+                         end=end
+                     )
+
+                     # reframe to target timeframe
+                     dataframe_tf = reframe_data(dataframe, tf)
+
+                     # get key for dotty dict
+                     tf_key = self._db_connector._db_key(
+                         'forex',
+                         ticker,
+                         tf
+                     )
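+                     # the key addresses the database tree by market, ticker
+                     # and timeframe; with a dotty dict backend a plausible
+                     # (hypothetical) form is 'forex.eurusd.1h' - the actual
+                     # format is defined by the connector's _db_key()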
+
+                     # write to database to complete the years:
+                     # upload the reframed dataframe
+                     self._db_connector.write_data(tf_key,
+                                                   dataframe_tf)
+
+                     ticker_years_list = self._get_ticker_years_list(ticker,
+                                                                     timeframe=tf)
+
+                     # redo the check for consistency
+                     if set(years_tick).difference(ticker_years_list):
+
+                         logger.bind(target='histmanager').critical(
+                             f'ticker {ticker}: {tf} timeframe completing'
+                             ' operation FAILED')
+                         raise KeyError
+
+                     else:
+                         logger.bind(target='histmanager').trace(
+                             f'ticker {ticker}: {tf} timeframe completing operation successful')
+
+                 else:
+                     logger.bind(target='histmanager').trace(
+                         f'ticker {ticker}: {tf} timeframe already complete')
+
+     def _update_db(self) -> None:
+
+         self._complete_timeframe()
+
+     def _download_month_raw(self,
+                             ticker,
+                             url,
+                             year,
+                             month_num
+                             ) -> ZipExtFile:
+         """
+         Download one month of tick data from histdata.com.
+
+         Parameters
+         ----------
+         ticker : str
+             Currency pair symbol, e.g. 'EURUSD'.
+         url : str
+             Download page URL for the requested ticker and month.
+         year : int
+             Year of the requested data.
+         month_num : int
+             Month number, 1-12.
+
+         Returns
+         -------
+         ZipExtFile
+             The CSV file opened from the downloaded zip archive.
+         """
+
+         session = Session()
+         r = session.get(url)
+
+         token = None
+         try:
+             token = search('id="tk" value="(.*?)"', r.text).groups()[0]
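+             # the token is read from a hidden form input on the download
+             # page, e.g. (illustrative HTML, not from the source):
+             #     <input type="hidden" id="tk" value="0a1b2c3d...">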
+         except AttributeError:
+             logger.bind(target='histmanager').critical(
+                 'token value was not found scraping '
+                 f'url {url}: {ticker} not existing or '
+                 f'not supported by histdata.com: {ticker} - '
+                 f'{year}-{MONTHS[month_num - 1]}')
+
+         # if the exception was caught, token is still None
+         if token is None:
+             raise TickerNotFoundError(
+                 f"Ticker {ticker} not found or not supported by histdata.com")
+
+         ''' Alternative: using BeautifulSoup parser
+         r = session.get(url, allow_redirects=True)
+         soup = BeautifulSoup(r.content, 'html.parser')
+
+         with logger.catch(exception=AttributeError,
+                           level='CRITICAL',
+                           message=f'token value was not found scraping url {url}'):
+
+             token = soup.find('input', {'id': 'tk'}).attrs['value']
+         '''
+
+         headers = {'Referer': url}
+         data = {
+             'tk': token,
+             'date': year,
+             'datemonth': "%d%02d" % (year, month_num),
+             'platform': 'ASCII',
+             'timeframe': 'T',
+             'fxpair': ticker
+         }
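+         # e.g. year=2020, month_num=3 gives datemonth='202003';
+         # platform 'ASCII' and timeframe 'T' fill the histdata.com
+         # download form ('T' presumably selecting tick data here)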
+
+         # trace that the specified ticker, year and month are being downloaded
+         logger.bind(target='histmanager').trace(f'{ticker} - {year} - {MONTHS[month_num - 1]}: downloading')
+         r = session.request(
+             HISTDATA_BASE_DOWNLOAD_METHOD,
+             HISTDATA_BASE_DOWNLOAD_URL,
+             data=data,
+             headers=headers,
+             stream=True
+         )
+
+         bio = BytesIO()
+
+         # write content to stream
+         bio.write(r.content)
+
+         try:
+
+             zf = ZipFile(bio)
+
+         except BadZipFile as e:
+
+             logger.bind(target='histmanager').error(f'{ticker} - {year} - {MONTHS[month_num - 1]}: {e}')
+             raise TickerDataBadTypeException(
+                 f"Data {ticker} - {year} - {MONTHS[month_num - 1]} BadZipFile error: {e}")
+
+         else:
+
+             # return opened zip file
+             try:
+                 ExtFile = zf.open(zf.namelist()[0])
+             except Exception as e:
+                 logger.bind(target='histmanager').error(
+                     f'{ticker} - {year} - {MONTHS[month_num - 1]}: '
+                     f'not found or invalid download: {e}')
+                 raise TickerDataNotFoundError(
+                     f"Data {ticker} - {year} - {MONTHS[month_num - 1]} not found or not supported by histdata.com")
+
+             else:
+                 if isinstance(ExtFile, ZipExtFile):
+                     return ExtFile
+                 else:
+                     logger.bind(target='histmanager').error(
+                         f'{ticker} - {year} - {MONTHS[month_num - 1]}: '
+                         f'data type not expected')
+                     raise TickerDataBadTypeException(
+                         f"Data {ticker} - {year} - {MONTHS[month_num - 1]} type not expected")
+
+     def _raw_zipfile_to_df(self, raw_file, temp_filepath,
+                            engine='polars') -> Union[pandas_dataframe,
+                                                      Table,
+                                                      polars_dataframe,
+                                                      polars_lazyframe]:
+         """
+         Parse a raw zipped CSV file into a dataframe of the selected engine.
+
+         Parameters
+         ----------
+         raw_file : ZipExtFile
+             CSV file opened from the downloaded zip archive.
+         temp_filepath : str
+             Path of the temporary CSV file used by the pyarrow and
+             polars engines.
+         engine : str, optional
+             Dataframe engine. The default is 'polars'.
+
+         Returns
+         -------
+         A dataframe of the type matching the selected engine.
+         """
+
+         if engine == 'pandas':
+
+             # pandas can read a runtime opened zip file directly
+
+             # parsing is specific to the format of the files
+             # downloaded from histdata.com
+             df = read_csv(
+                 'pandas',
+                 raw_file,
+                 sep=',',
+                 names=DATA_COLUMN_NAMES.TICK_DATA_NO_PVALUE,
+                 dtype=DTYPE_DICT.TICK_DTYPE,
+                 parse_dates=[DATA_FILE_COLUMN_INDEX.TIMESTAMP],
+                 date_format=DATE_FORMAT_HISTDATA_CSV,
+                 engine='c'
+             )
+
+             # calculate 'p'
+             df['p'] = (df['ask'] + df['bid']) / 2
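+             # 'p' is the bid/ask mid price, e.g. ask=1.10020 and
+             # bid=1.10000 give p=1.10010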
+
+         elif engine == 'pyarrow':
+
+             # no way was found to directly open a runtime zip file
+             # with pyarrow;
+             # strategy falls back to a temporary file: download,
+             # open and read all, then delete the temporary file
+
+             # read the raw file into a pyarrow buffer
+             buf = BufferReader(raw_file.read())
+
+             if (
+                 Path(temp_filepath).exists() and
+                 Path(temp_filepath).is_file()
+             ):
+
+                 Path(temp_filepath).unlink(missing_ok=True)
+
+             else:
+
+                 # create temporary files directory if not present
+                 tempdir_path = Path(temp_filepath).parent
+                 tempdir_path.mkdir(exist_ok=True)
+
+             # download buffer to file
+             buf.download(temp_filepath)
+
+             # in histdata raw files column 'p' is not present
+             # raw_file_dtypes = DTYPE_DICT.TICK_DTYPE.copy()
+             # raw_file_dtypes.pop('p')
+
+             # read temporary csv file
+
+             # alternative: use pandas read_csv and its options with
+             # engine='pyarrow' and dtype_backend='pyarrow'
+             # df = read_csv(
+             #     'pyarrow',
+             #     temp_filepath,
+             #     sep=',',
+             #     index_col=0,
+             #     names=DATA_COLUMN_NAMES.TICK_DATA,
+             #     dtype=raw_file_dtypes,
+             #     parse_dates=[0],
+             #     date_format=DATE_FORMAT_HISTDATA_CSV,
+             #     engine='pyarrow',
+             #     dtype_backend='pyarrow'
+             # )
+             # then convert the index into a datetime64 dtype
+             # df.index = any_date_to_datetime64(df.index,
+             #                                   date_format=DATE_FORMAT_HISTDATA_CSV,
+             #                                   unit='ms')
+
+             # use pyarrow native options
+             read_opts = arrow_csv.ReadOptions(
+                 use_threads=True,
+                 column_names=DATA_COLUMN_NAMES.TICK_DATA_NO_PVALUE
+             )
+
+             parse_opts = arrow_csv.ParseOptions(
+                 delimiter=','
+             )
+
+             modtypes = PYARROW_DTYPE_DICT.TIME_TICK_DTYPE.copy()
+             modtypes[BASE_DATA_COLUMN_NAME.TIMESTAMP] = pyarrow_string()
+             modtypes.pop(BASE_DATA_COLUMN_NAME.P_VALUE)
+
+             convert_opts = arrow_csv.ConvertOptions(
+                 column_types=modtypes
+             )
+
+             # at first read the file with timestamp as a string
+             df = read_csv(
+                 'pyarrow',
+                 temp_filepath,
+                 read_options=read_opts,
+                 parse_options=parse_opts,
+                 convert_options=convert_opts
+             )
+
+             # convert timestamp string array to pyarrow timestamp('ms')
+
+             # pandas/numpy solution:
+             # std_datetime = to_datetime(df[BASE_DATA_COLUMN_NAME.TIMESTAMP].to_numpy(),
+             #                            format=DATE_FORMAT_HISTDATA_CSV)
+             # timecol = pyarrow_array(std_datetime,
+             #                         type=pyarrow_timestamp('ms'))
+
+             # all-pyarrow ops solution, suggested here:
+             # https://github.com/apache/arrow/issues/41132#issuecomment-2052555361
+             mod_format = DATE_FORMAT_HISTDATA_CSV.removesuffix('%f')
+             ts2 = pc.strptime(pc.utf8_slice_codeunits(
+                 df[BASE_DATA_COLUMN_NAME.TIMESTAMP], 0, 15), format=mod_format, unit="ms")
+             d = pc.utf8_slice_codeunits(df[BASE_DATA_COLUMN_NAME.TIMESTAMP],
+                                         15,
+                                         99).cast(pyarrow_int64()).cast(duration("ms"))
+             timecol = pc.add(ts2, d)
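+             # worked example of the two-step parse above, assuming the
+             # histdata timestamp format 'YYYYMMDD HHMMSSfff':
+             #     '20200102 170000123'
+             #     ts2     -> 2020-01-02 17:00:00  (first 15 code units)
+             #     d       -> 123 ms               (remaining digits)
+             #     timecol -> 2020-01-02 17:00:00.123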
+
+             # calculate 'p'
+             p_value = pc.divide(
+                 pc.add_checked(df['ask'], df['bid']),
+                 2
+             )
+
+             # aggregate in a new table
+             df = Table.from_arrays(
+                 [
+                     timecol,
+                     df[BASE_DATA_COLUMN_NAME.ASK],
+                     df[BASE_DATA_COLUMN_NAME.BID],
+                     df[BASE_DATA_COLUMN_NAME.VOL],
+                     p_value
+                 ],
+                 schema=schema(PYARROW_DTYPE_DICT.TIME_TICK_DTYPE.copy().items())
+             )
+
+         elif engine == 'polars':
+
+             # download to a temporary csv file
+             # for best performance with polars
+
+             # read the raw file into a pyarrow buffer
+             buf = BufferReader(raw_file.read())
+
+             if (
+                 Path(temp_filepath).exists() and
+                 Path(temp_filepath).is_file()
+             ):
+
+                 Path(temp_filepath).unlink(missing_ok=True)
+
+             else:
+
+                 # create temporary files directory if not present
+                 tempdir_path = Path(temp_filepath).parent
+                 tempdir_path.mkdir(exist_ok=True)
+
+             buf.download(temp_filepath)
+
+             # in histdata raw files column 'p' is not present
+             raw_file_dtypes = POLARS_DTYPE_DICT.TIME_TICK_DTYPE.copy()
+             raw_file_dtypes.pop('p')
+             raw_file_dtypes[BASE_DATA_COLUMN_NAME.TIMESTAMP] = polars_string
+
+             # read file;
+             # set the schema for all columns except the timestamp
+             df = read_csv(
+                 'polars',
+                 temp_filepath,
+                 separator=',',
+                 has_header=False,
+                 new_columns=DATA_COLUMN_NAMES.TICK_DATA_NO_PVALUE,
+                 schema=raw_file_dtypes,
+                 use_pyarrow=True
+             )
+
+             # convert timestamp column to datetime data type
+             df = df.with_columns(
+                 col(BASE_DATA_COLUMN_NAME.TIMESTAMP).str.strptime(
+                     polars_datetime('ms'),
+                     format=DATE_FORMAT_HISTDATA_CSV
+                 )
+             )
+
+             # calculate 'p'
+             df = df.with_columns(
+                 ((col('ask') + col('bid')) / 2).alias('p')
+             )
+
+             # final cast to standard dtypes
+             df = df.cast(POLARS_DTYPE_DICT.TIME_TICK_DTYPE)
+
+             # drop duplicated timestamp rows, keep first by default
+             df = df.unique(subset=[BASE_DATA_COLUMN_NAME.TIMESTAMP],
+                            keep='first')
+
+         elif engine == 'polars_lazy':
+
+             # download to a temporary csv file
+             # for best performance with polars
+
+             # read the raw file into a pyarrow buffer
+             buf = BufferReader(raw_file.read())
+
+             if (
+                 Path(temp_filepath).exists() and
+                 Path(temp_filepath).is_file()
+             ):
+
+                 Path(temp_filepath).unlink(missing_ok=True)
+
+             else:
+
+                 # create temporary files directory if not present
+                 tempdir_path = Path(temp_filepath).parent
+                 tempdir_path.mkdir(exist_ok=True)
+
+             # download buffer to file
+             buf.download(temp_filepath)
+
+             # in histdata raw files column 'p' is not present
+             raw_file_dtypes = POLARS_DTYPE_DICT.TIME_TICK_DTYPE.copy()
+             raw_file_dtypes.pop('p')
+             raw_file_dtypes[BASE_DATA_COLUMN_NAME.TIMESTAMP] = polars_string
+
+             # read file;
+             # set the schema for all columns except the timestamp
+             df = read_csv(
+                 'polars_lazy',
+                 temp_filepath,
+                 separator=',',
+                 has_header=False,
+                 new_columns=DATA_COLUMN_NAMES.TICK_DATA_NO_PVALUE,
+                 schema=raw_file_dtypes
+             )
+
+             # convert timestamp column to datetime data type
+             df = df.with_columns(
+                 col(BASE_DATA_COLUMN_NAME.TIMESTAMP).str.strptime(
+                     polars_datetime('ms'),
+                     format=DATE_FORMAT_HISTDATA_CSV
+                 )
+             )
+
+             # calculate 'p'
+             df = df.with_columns(
+                 ((col('ask') + col('bid')) / 2).alias('p')
+             )
+
+             # final cast to standard dtypes
+             df = df.cast(POLARS_DTYPE_DICT.TIME_TICK_DTYPE)
+
+             # drop duplicated timestamp rows, keep first by default
+             df = df.unique(subset=[BASE_DATA_COLUMN_NAME.TIMESTAMP],
+                            keep='first')
+
+         else:
+
+             logger.bind(target='histmanager').error(f'Engine {engine} is not supported')
+             raise TypeError
+
+         # return dataframe
+         return df
+
+     def _download_year(self,
+                        ticker,
+                        year) -> Union[polars_dataframe,
+                                       polars_lazyframe,
+                                       pandas_dataframe,
+                                       Table,
+                                       None]:
+
+         year_tick_df = empty_dataframe(self.engine)
+
+         for month in MONTHS:
+
+             month_num = MONTHS.index(month) + 1
+             url = HISTDATA_URL_TICKDATA_TEMPLATE.format(
+                 ticker=ticker.lower(),
+                 year=year,
+                 month_num=month_num)
+             # TODO: test connection with url, if it fails raise a connection error
+
+             file = self._download_month_raw(
+                 ticker,
+                 url,
+                 year,
+                 month_num
+             )
+
+             if file and isinstance(file, ZipExtFile):
+
+                 month_data = self._raw_zipfile_to_df(file,
+                                                      str(self._temporary_data_path /
+                                                          (f'{ticker}_' +
+                                                           f'{year}_' +
+                                                           f'{month}_' +
+                                                           TEMP_CSV_FILE)),
+                                                      engine=self.engine)
+
+                 # if first iteration, assign instead of concat
+                 if is_empty_dataframe(year_tick_df):
+
+                     year_tick_df = month_data
+
+                 else:
+
+                     year_tick_df = concat_data([year_tick_df, month_data])
+
+             else:
+
+                 logger.bind(target='histmanager').critical(
+                     f"Ticker {ticker}-{year}-{MONTHS[month_num - 1]} data not found or invalid")
+                 raise TickerDataInvalidException(
+                     f"Ticker {ticker} - {year} - {MONTHS[month_num - 1]} data not found or invalid: generic error")
+
+         return sort_dataframe(year_tick_df,
+                               BASE_DATA_COLUMN_NAME.TIMESTAMP)
+
+     def _download(self,
+                   ticker,
+                   years: List[int]) -> None:
+
+         if not (
+             isinstance(years, list)
+         ):
+
+             logger.bind(target='histmanager').error(f'years {years} invalid, must be list type')
+             raise TypeError
+
+         if not (
+             set(years).issubset(YEARS)
+         ):
+
+             logger.bind(target='histmanager').error(
+                 f'requested years {years} not available. '
+                 f'Years must be limited to: {YEARS}')
+             raise ValueError
+
+         else:
+             logger.bind(target='histmanager').trace(f'Requested years {years} are valid')
+
+         # convert to list of int
+         if not all(isinstance(year, int) for year in years):
+             years = [int(year) for year in years]
+
+         for year in years:
+
+             year_tick_df = self._download_year(
+                 ticker,
+                 year
+             )
+
+             # get key for dotty dict: TICK timeframe
+             tick_key = self._db_connector._db_key('forex',
+                                                   ticker,
+                                                   TICK_TIMEFRAME)
+
+             # upload df to database if not empty
+             if not is_empty_dataframe(year_tick_df):
+                 self._db_connector.write_data(tick_key,
+                                               year_tick_df)
+             else:
+                 logger.bind(target='histmanager').warning(
+                     f'Year tick dataframe for {tick_key} is empty, skipping database write')
+
+         # update manager database
+         self._update_db()
+
+     def clear_database(self, filter: Optional[str] = None) -> None:
+
+         self._db_connector.clear_database(filter=filter)
+
+     def add_timeframe(self, timeframe: Union[str, List[str]]) -> None:
+         """
+         Add and cache a new timeframe to the database.
+
+         Creates aggregated data for the specified timeframe from tick data and
+         caches it in the database for faster future access. The timeframe is
+         added to the internal list of available timeframes.
+
+         Args:
+             timeframe (str | List[str]): Timeframe(s) to add. Can be a single string
+                 or list of strings. Supported values: '1m', '5m', '15m', '30m',
+                 '1h', '4h', '1D', '1W', '1M'
+
+         Returns:
+             None
+
+         Raises:
+             TypeError: If timeframe is not a string or list of strings
+
+         Example:
+             >>> manager = HistoricalManagerDB(config='data_config.yaml')
+             >>> manager.add_timeframe('1W')  # Add weekly timeframe
+             >>> manager.add_timeframe(['4h', '1D'])  # Add multiple timeframes
+
+         Note:
+             - Only new timeframes (not already in the list) will be processed
+             - Aggregation can take time for large datasets
+             - Once added, the timeframe is permanently cached in the database
+         """
+
+         if not hasattr(self, '_tf_list'):
+             self._tf_list = []
+         else:
+             logger.bind(target='histmanager').trace('_tf_list already exists')
+
+         if isinstance(timeframe, str):
+
+             timeframe = [timeframe]
+
+         if not (
+             isinstance(timeframe, list) and
+             all([isinstance(tf, str) for tf in timeframe])
+         ):
+
+             logger.bind(target='histmanager').error('timeframe invalid: str or list required')
+             raise TypeError
+
+         tf_list = [check_timeframe_str(tf) for tf in timeframe]
+
+         if not set(tf_list).issubset(self._tf_list):
+
+             # extend the timeframe list with only the
+             # elements not already present
+             self._tf_list.extend(set(tf_list).difference(self._tf_list))
+             self._update_db()
+
+     def get_data(
+             self,
+             ticker,
+             timeframe,
+             start,
+             end,
+             comparison_column_name: List[str] | str | None = None,
+             check_level: List[int | float] | int | float | None = None,
+             comparison_operator: List[SUPPORTED_SQL_COMPARISON_OPERATORS] | SUPPORTED_SQL_COMPARISON_OPERATORS | None = None,
+             aggregation_mode: SUPPORTED_SQL_CONDITION_AGGREGATION_MODES | None = None,
+     ) -> Union[polars_dataframe, polars_lazyframe]:
+         """
+         Retrieve OHLC historical data for the specified ticker and timeframe.
+
+         Fetches historical forex data from the database, automatically downloading
+         and aggregating data if not already available. Supports multiple timeframes
+         and date ranges.
+
+         Args:
+             ticker (str): Currency pair symbol (e.g., 'EURUSD', 'GBPUSD', 'NZDUSD').
+                 Case-insensitive.
+             timeframe (str): Candle timeframe for data aggregation. Supported values:
+                 '1m', '5m', '15m', '30m', '1h', '4h', '1D', '1W', '1M'
+             start (str | datetime): Start date for data retrieval. Accepts:
+                 - ISO format: 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS'
+                 - datetime object
+             end (str | datetime): End date for data retrieval. Same format as start.
+                 Must be after start date.
+             comparison_column_name (List[str] | str | None): Column name(s) the
+                 filter condition(s) apply to. Default is None.
+             check_level (List[int | float] | int | float | None): Threshold level(s)
+                 the column values are compared against. Default is None.
+             comparison_operator (List[SUPPORTED_SQL_COMPARISON_OPERATORS] | SUPPORTED_SQL_COMPARISON_OPERATORS | None):
+                 Comparison operator(s) used in the filter condition(s). Default is None.
+             aggregation_mode (SUPPORTED_SQL_CONDITION_AGGREGATION_MODES | None):
+                 How multiple filter conditions are combined. Default is None.
+
+         Returns:
+             polars.DataFrame | polars.LazyFrame: DataFrame containing OHLC data with columns:
+                 - timestamp: datetime column with candle timestamps
+                 - open: Opening price (float32)
+                 - high: Highest price (float32)
+                 - low: Lowest price (float32)
+                 - close: Closing price (float32)
+
+         Raises:
+             TickerNotFoundError: If the ticker is not available in the historical database
+             ValueError: If timeframe is invalid or end date is before start date
+
+         Example:
+             >>> manager = HistoricalManagerDB(config='data_config.yaml')
+             >>> data = manager.get_data(
+             ...     ticker='EURUSD',
+             ...     timeframe='1h',
+             ...     start='2020-01-01',
+             ...     end='2020-01-31'
+             ... )
+             >>> print(f"Retrieved {len(data)} hourly candles")
+             Retrieved 744 hourly candles
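+
+             Filtered retrieval (hypothetical condition values; supported
+             operators and aggregation modes are defined by the database
+             connector):
+             >>> filtered = manager.get_data(
+             ...     ticker='EURUSD',
+             ...     timeframe='1h',
+             ...     start='2020-01-01',
+             ...     end='2020-01-31',
+             ...     comparison_column_name='close',
+             ...     check_level=1.10,
+             ...     comparison_operator='>'
+             ... )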
+
+         Note:
+             - Data is automatically downloaded from histdata.com if not cached locally
+             - First call for a new timeframe may take longer as it builds the aggregation
+             - Downloaded data is cached for faster subsequent access
+             - Ticker names are case-insensitive and automatically normalized
+         """
+
+         # check the ticker exists among the tickers available
+         # from histdata.com or already present in the database
+         if (
+             ticker.upper() not in get_histdata_tickers()
+             and
+             ticker.lower() not in self._get_ticker_list()
+         ):
+             logger.bind(target='histmanager').error(f'ticker {ticker.upper()} not found in database')
+             raise TickerNotFoundError(f'ticker {ticker} not found in database')
+
+         # force ticker parameter to lower case
+         ticker = ticker.lower()
+
+         if not check_timeframe_str(timeframe):
+
+             logger.bind(target='histmanager').error(f'timeframe request {timeframe} invalid')
+             raise ValueError
+
+         else:
+
+             start = any_date_to_datetime64(start)
+             end = any_date_to_datetime64(end)
+
+         if end < start:
+
+             logger.bind(target='histmanager').error(
+                 'date interval not coherent, '
+                 'start must be earlier than end')
+             return self._dataframe_type([])
+
+         # get the years covering the requested interval
+         years_interval_req = list(range(start.year, end.year + 1, 1))
+
+         # get all years available for the specific ticker and timeframe
+         ticker_years_list = self._get_ticker_years_list(ticker, timeframe=timeframe)
+
+         # aggregate data to the current instance if necessary
+         if not set(years_interval_req).issubset(ticker_years_list):
+
+             year_tf_missing = list(
+                 set(years_interval_req).difference(ticker_years_list))
+
+             year_tick_keys = self._get_ticker_years_list(
+                 ticker, timeframe=TICK_TIMEFRAME)
+
+             year_tick_missing = list(set(years_interval_req).difference(year_tick_keys))
+
+             # if tick data is missing --> download missing years
+             if year_tick_missing:
+
+                 self._download(
+                     ticker,
+                     year_tick_missing
+                 )
+
+             # if the requested timeframe is already in tf_list
+             # the requested data should at this point be available;
+             # otherwise add the specific timeframe requested
+             if timeframe not in self._tf_list:
+
+                 # call add single tf data
+                 self.add_timeframe(timeframe)
+
+             else:
+                 logger.bind(target='histmanager').trace(f'Timeframe {timeframe} already in _tf_list')
+
+             # get all keys referring to the specific ticker
+             ticker_keys = self._get_ticker_keys(ticker)
+
+             # get all years now available for the ticker and timeframe
+             ticker_years_list = self._get_ticker_years_list(ticker, timeframe=timeframe)
+
+             if not set(years_interval_req).issubset(ticker_years_list):
+
+                 logger.bind(target='histmanager').critical(
+                     'processing year data completion for '
+                     f'{years_interval_req} not ok')
+                 raise ValueError
+
+             else:
+                 logger.bind(target='histmanager').trace(
+                     f'Year data completion for {years_interval_req} successful')
+
+         # at this point all requested data has been aggregated to the database
+
+         # execute a read query on the database
+         return self._db_connector.read_data(
+             market='forex',
+             ticker=ticker,
+             timeframe=timeframe,
+             start=start,
+             end=end,
+             comparison_column_name=comparison_column_name,
+             check_level=check_level,
+             comparison_operator=comparison_operator,
+             comparison_aggregation_mode=aggregation_mode
+         )
+
+     def plot(
+             self,
+             ticker,
+             timeframe,
+             start_date,
+             end_date
+     ) -> None:
+         """
+         Plot a candlestick chart for the specified ticker and date range.
+
+         Generates an interactive candlestick chart using mplfinance, displaying
+         OHLC (Open, High, Low, Close) data for the specified time period.
+
+         Args:
+             ticker (str): Currency pair symbol (e.g., 'EURUSD', 'GBPUSD')
+             timeframe (str): Candle timeframe (e.g., '1m', '5m', '1h', '1D', '1W')
+             start_date (str): Start date in ISO format 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS'
+             end_date (str): End date in ISO format 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS'
+
+         Returns:
+             None: Displays the chart using matplotlib
+
+         Example:
+             >>> manager = HistoricalManagerDB(config='data_config.yaml')
+             >>> manager.plot(
+             ...     ticker='EURUSD',
+             ...     timeframe='1D',
+             ...     start_date='2020-01-01',
+             ...     end_date='2020-12-31'
+             ... )
+
+         Note:
+             The chart will be displayed in a matplotlib window. The data is automatically
+             fetched using get_data() and converted to the appropriate format for plotting.
+         """
+
+         logger.bind(target='histmanager').info(f'Chart request: '
+                                                f'ticker {ticker}, '
+                                                f'timeframe {timeframe}, '
+                                                f'from {start_date} '
+                                                f'to {end_date}')
+
+         chart_data = self.get_data(ticker=ticker,
+                                    timeframe=timeframe,
+                                    start=start_date,
+                                    end=end_date)
+
+         chart_data = to_pandas_dataframe(chart_data)
+
+         if chart_data.index.name != BASE_DATA_COLUMN_NAME.TIMESTAMP:
+
+             chart_data.set_index(BASE_DATA_COLUMN_NAME.TIMESTAMP,
+                                  inplace=True)
+
+             chart_data.index = to_datetime(chart_data.index)
+
+         else:
+             logger.bind(target='histmanager').trace(
+                 f'Chart data already has {BASE_DATA_COLUMN_NAME.TIMESTAMP} as index')
+
+         # candlestick chart using mplfinance
+         chart_kwargs = dict(style='charles',
+                             title=ticker,
+                             ylabel='Quotation',
+                             xlabel='Timestamp',
+                             volume=False,
+                             figratio=(12, 8),
+                             figscale=1
+                             )
+
+         mpf_plot(chart_data, type='candle', **chart_kwargs)
+
+         mpf_show()