forex_data_aggregator-0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- forex_data/__init__.py +92 -0
- forex_data/config/__init__.py +20 -0
- forex_data/config/config_file.py +89 -0
- forex_data/data_management/__init__.py +84 -0
- forex_data/data_management/common.py +1773 -0
- forex_data/data_management/database.py +1322 -0
- forex_data/data_management/historicaldata.py +1262 -0
- forex_data/data_management/realtimedata.py +993 -0
- forex_data_aggregator-0.1.2.dist-info/LICENSE +21 -0
- forex_data_aggregator-0.1.2.dist-info/METADATA +562 -0
- forex_data_aggregator-0.1.2.dist-info/RECORD +12 -0
- forex_data_aggregator-0.1.2.dist-info/WHEEL +4 -0
forex_data/data_management/historicaldata.py
@@ -0,0 +1,1262 @@

from loguru import logger
from typing import Any, Dict, List, Optional, Union
from datetime import datetime

from attrs import (
    define,
    field,
    validate,
    validators
)

# PANDAS
from pandas import (
    DataFrame as pandas_dataframe,
    to_datetime
)

# PYARROW
from pyarrow import (
    int64 as pyarrow_int64,
    string as pyarrow_string,
    BufferReader,
    csv as arrow_csv,
    compute as pc,
    schema,
    Table,
    table as pyarrow_table,
    duration
)

# POLARS
from polars import (
    String as polars_string,
    Datetime as polars_datetime,  # needed by the str.strptime calls below
    col,
    DataFrame as polars_dataframe,
    LazyFrame as polars_lazyframe
)

from zipfile import (
    ZipFile,
    ZipExtFile,
    BadZipFile
)

from re import (
    search,
    match
)

from mplfinance import (
    plot as mpf_plot,
    show as mpf_show
)

from numpy import array

from pathlib import Path
from requests import Session
from io import BytesIO
from shutil import rmtree

# python base
from dotty_dict import (
    Dotty,
    dotty
)

from iteration_utilities import (
    duplicates,
    unique_everseen
)

# internally defined
from .common import *
from ..config import (
    read_config_file,
    read_config_string,
    read_config_folder
)

from .database import (
    DatabaseConnector,
    DuckDBConnector,
    LocalDBConnector
)


__all__ = ['HistoricalManagerDB']


# HISTORICAL DATA MANAGER
@define(kw_only=True, slots=True)
class HistoricalManagerDB:

    # interface parameters
    config: str = field(default='',
                        validator=validators.instance_of(str))
    data_type: str = field(default='parquet',
                           validator=validators.in_(SUPPORTED_DATA_FILES))
    engine: str = field(default='polars_lazy',
                        validator=validators.in_(SUPPORTED_DATA_ENGINES))

    # internal
    _db_connector = field(factory=DatabaseConnector)
    _tf_list = field(factory=list, validator=validators.instance_of(list))
    _dataframe_type = field(default=pandas_dataframe)
    _data_path = field(default=Path(DEFAULT_PATHS.BASE_PATH),
                       validator=validator_dir_path(create_if_missing=True))
    _histdata_path = field(
        default=Path(DEFAULT_PATHS.BASE_PATH) / DEFAULT_PATHS.HIST_DATA_FOLDER,
        validator=validator_dir_path(create_if_missing=True))
    _temporary_data_path = field(
        default=(Path(DEFAULT_PATHS.BASE_PATH) /
                 DEFAULT_PATHS.HIST_DATA_FOLDER /
                 TEMP_FOLDER),
        validator=validator_dir_path(create_if_missing=True))

    # if a valid config file or string is passed,
    # the arguments it contains are assigned here;
    # values passed at instantiation override the
    # related argument in the next initialization step

    # if an argument is set neither at instantiation
    # nor in the config file, it is assigned its
    # associated default or factory generator
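
    # usage sketch (hypothetical config values): assuming 'data_config.yaml'
    # sets ENGINE: 'polars', then
    #
    #   HistoricalManagerDB(config='data_config.yaml')   # engine -> 'polars'
    #   HistoricalManagerDB(config='data_config.yaml',
    #                       engine='pandas')             # kwarg overrides -> 'pandas'
    #   HistoricalManagerDB()                            # attrs default -> 'polars_lazy'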

    def __init__(self, **kwargs: Any) -> None:

        _class_attributes_name = get_attrs_names(self, **kwargs)
        _not_assigned_attrs_index_mask = [True] * len(_class_attributes_name)

        if 'config' in kwargs:

            if kwargs['config']:

                config_path = Path(kwargs['config'])

                if (
                    config_path.exists()
                    and
                    config_path.is_dir()
                ):

                    config_filepath = read_config_folder(
                        config_path, file_pattern='data_config.yaml')

                else:

                    config_filepath = Path()

                config_args = {}
                if config_filepath.exists() \
                        and \
                        config_filepath.is_file() \
                        and \
                        config_filepath.suffix == '.yaml':

                    # read parameters from config file
                    # and force keys to lower case
                    config_args = {key.lower(): val for key, val in
                                   read_config_file(str(config_filepath)).items()}

                elif isinstance(kwargs['config'], str):

                    # read parameters from config string
                    # and force keys to lower case
                    config_args = {key.lower(): val for key, val in
                                   read_config_string(kwargs['config']).items()}

                else:

                    logger.critical('invalid config type '
                                    f'{kwargs["config"]}: '
                                    'required str or Path, got '
                                    f'{type(kwargs["config"])}')
                    raise TypeError

                # check consistency of config_args
                if (
                    not isinstance(config_args, dict)
                    or
                    not bool(config_args)
                ):

                    logger.critical(f'config {kwargs["config"]} '
                                    'has no valid yaml formatted data')
                    raise TypeError

                # set args from config file
                attrs_keys_configfile = \
                    set(_class_attributes_name).intersection(config_args.keys())

                for attr_key in attrs_keys_configfile:

                    self.__setattr__(attr_key,
                                     config_args[attr_key])

                    _not_assigned_attrs_index_mask[
                        _class_attributes_name.index(attr_key)
                    ] = False

                # set args from instantiation
                # override if attr already has a value from config
                attrs_keys_input = \
                    set(_class_attributes_name).intersection(kwargs.keys())

                for attr_key in attrs_keys_input:

                    self.__setattr__(attr_key,
                                     kwargs[attr_key])

                    _not_assigned_attrs_index_mask[
                        _class_attributes_name.index(attr_key)
                    ] = False

                # attrs present neither in the config file nor in instance
                # inputs (--> self.attr would lead to a KeyError)
                # are manually assigned the default value derived
                # from __attrs_attrs__

                for attr_key in array(_class_attributes_name)[
                    _not_assigned_attrs_index_mask
                ]:

                    try:

                        attr = [attr
                                for attr in self.__attrs_attrs__
                                if attr.name == attr_key][0]

                    except KeyError:

                        logger.warning('KeyError: initializing object has no '
                                       f'attribute {attr_key}')
                        raise

                    except IndexError:

                        logger.warning('IndexError: initializing object has no '
                                       f'attribute {attr_key}')
                        raise

                    else:

                        # assign default value
                        # try default and factory subsequently
                        # if neither is present
                        # assign None
                        if hasattr(attr, 'default'):

                            if hasattr(attr.default, 'factory'):

                                self.__setattr__(attr.name,
                                                 attr.default.factory())

                            else:

                                self.__setattr__(attr.name,
                                                 attr.default)

                        else:

                            self.__setattr__(attr.name,
                                             None)

            else:

                logger.trace(
                    f'config {kwargs["config"]} is empty, using default configuration')
                # fall back to the generated init so defaults are applied
                self.__attrs_init__(**kwargs)  # type: ignore[attr-defined]

        else:

            # no config file is defined
            # call generated init
            self.__attrs_init__(**kwargs)  # type: ignore[attr-defined]

        validate(self)

        self.__attrs_post_init__()

    def __attrs_post_init__(self, **kwargs: Any) -> None:

        # set up log sink for historical manager
        logger.add(self._data_path / 'log' / 'forexhistdata.log',
                   level="TRACE",
                   rotation="5 MB",
                   filter=lambda record: record['extra'].get('target') == 'histmanager')

        # set up dataframe engine internal var based on config selection
        if self.engine == 'pandas':

            self._dataframe_type = pandas_dataframe

        elif self.engine == 'pyarrow':

            self._dataframe_type = pyarrow_table

        elif self.engine == 'polars':

            self._dataframe_type = polars_dataframe

        elif self.engine == 'polars_lazy':

            self._dataframe_type = polars_lazyframe

        else:

            logger.bind(target='histmanager').error(f'Engine {self.engine} not supported')
            raise ValueError(f'Engine {self.engine} not supported')

        self._temporary_data_path = self._histdata_path \
            / TEMP_FOLDER

        self._clear_temporary_data_folder()

        # instance database connector if selected
        if self.data_type == DATA_TYPE.DUCKDB:

            self._db_connector = DuckDBConnector(duckdb_filepath=str(
                self._histdata_path / 'DuckDB' / 'marketdata.duckdb'))

        elif (
            self.data_type == DATA_TYPE.CSV_FILETYPE or
            self.data_type == DATA_TYPE.PARQUET_FILETYPE
        ):

            self._db_connector = \
                LocalDBConnector(
                    data_folder=str(self._histdata_path / 'LocalDB'),
                    data_type=self.data_type,
                    engine=self.engine
                )

        else:

            logger.bind(target='histmanager').error(f'Data type {self.data_type} not supported')
            raise ValueError(f'Data type {self.data_type} not supported')

    def _clear_temporary_data_folder(self) -> None:

        # delete temporary data path
        if (
            self._temporary_data_path.exists() and
            self._temporary_data_path.is_dir()
        ):

            try:

                rmtree(str(self._temporary_data_path))

            except Exception as e:

                logger.bind(target='histmanager').warning(
                    'Deleting temporary data folder '
                    f'{str(self._temporary_data_path)} not successful: {e}')

        else:
            logger.bind(target='histmanager').trace(
                f'Temporary data folder {self._temporary_data_path} does not exist')

    def _get_ticker_list(self) -> List[str]:

        # return the list of available tickers as str

        return self._db_connector.get_tickers_list()

    def _get_ticker_keys(
            self,
            ticker: str,
            timeframe: Optional[str] = None) -> List[str]:

        # return the list of ticker keys as str

        return self._db_connector.get_ticker_keys(ticker,
                                                  timeframe=timeframe)

    def _get_ticker_years_list(
            self,
            ticker: str,
            timeframe: str = TICK_TIMEFRAME) -> List[int]:

        # return the list of years covered by the ticker data;
        # with the default timeframe this means years covered by tick data

        return self._db_connector.get_ticker_years_list(ticker,
                                                        timeframe=timeframe)

    def _complete_timeframe(self) -> None:

        for ticker in self._get_ticker_list():

            years_tick = self._get_ticker_years_list(ticker)

            for tf in self._tf_list:

                ticker_years_list = self._get_ticker_years_list(ticker, timeframe=tf)

                if set(years_tick).difference(ticker_years_list):
                    years = set(years_tick).difference(ticker_years_list)

                    end_year = max(years)
                    start_year = min(years)

                    year_start = f'{start_year}-01-01 00:00:00.000'
                    year_end = f'{end_year + 1}-01-01 00:00:00.000'
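
                    # worked example: if years {2019, 2021} are missing for tf,
                    # start_year=2019 and end_year=2021, so ticks are read over
                    # the single span 2019-01-01 00:00:00.000 ->
                    # 2022-01-01 00:00:00.000 and any cached year in between
                    # (here 2020) is re-derived as well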
                    # read missing years from tick timeframe
                    start = datetime.strptime(year_start, DATE_FORMAT_SQL)
                    end = datetime.strptime(year_end, DATE_FORMAT_SQL)

                    dataframe = self._db_connector.read_data(
                        market='forex',
                        ticker=ticker,
                        timeframe=TICK_TIMEFRAME,
                        start=start,
                        end=end
                    )

                    # reframe to timeframe
                    dataframe_tf = reframe_data(dataframe, tf)

                    # get key for dotty dict for this timeframe
                    tf_key = self._db_connector._db_key(
                        'forex',
                        ticker,
                        tf
                    )

                    # write to database to complete the years
                    # call to upload df to database
                    self._db_connector.write_data(tf_key,
                                                  dataframe_tf)

                    ticker_years_list = self._get_ticker_years_list(ticker,
                                                                    timeframe=tf)

                    # redo the check for consistency
                    if set(years_tick).difference(ticker_years_list):

                        logger.bind(target='histmanager').critical(
                            f'ticker {ticker}: {tf} timeframe completing'
                            ' operation FAILED')

                        raise KeyError

                    else:
                        logger.bind(target='histmanager').trace(
                            f'ticker {ticker}: {tf} timeframe completing operation successful')

                else:
                    logger.bind(target='histmanager').trace(
                        f'ticker {ticker}: {tf} timeframe already complete')

    def _update_db(self) -> None:

        self._complete_timeframe()

    def _download_month_raw(self,
                            ticker,
                            url,
                            year,
                            month_num
                            ) -> ZipExtFile:
        """
        Download one month of tick data for a ticker from histdata.com.

        Parameters
        ----------
        ticker : str
            Currency pair symbol, e.g. 'eurusd'.
        url : str
            Download page url to scrape the form token from.
        year : int
            Year to download.
        month_num : int
            Month number, 1-12.

        Returns
        -------
        ZipExtFile
            The csv member opened from the downloaded zip archive.

        """

        session = Session()
        r = session.get(url)

        token = None
        try:
            token = search('id="tk" value="(.*?)"', r.text).groups()[0]
        except AttributeError:
            logger.bind(target='histmanager').critical(
                f'token value was not found scraping '
                f'url {url}: {ticker} not existing or '
                f'not supported by histdata.com: {ticker} - '
                f'{year}-{MONTHS[month_num - 1]}')

        # if the exception was caught, token is still None
        if token is None:
            raise TickerNotFoundError(
                f"Ticker {ticker} not found or not supported by histdata.com")

        ''' Alternative: using BeautifulSoup parser
        r = session.get(url, allow_redirects=True)
        soup = BeautifulSoup(r.content, 'html.parser')

        with logger.catch(exception=AttributeError,
                          level='CRITICAL',
                          message=f'token value was not found scraping url {url}'):

            token = soup.find('input', {'id': 'tk'}).attrs['value']

        '''

        headers = {'Referer': url}
        data = {
            'tk': token,
            'date': year,
            'datemonth': "%d%02d" % (year, month_num),
            'platform': 'ASCII',
            'timeframe': 'T',
            'fxpair': ticker
        }
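
        # e.g. year=2020, month_num=3 -> datemonth '202003';
        # platform 'ASCII' with timeframe 'T' selects histdata.com's
        # ascii tick export (presumed from the tick-download context)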

        # logger trace: ticker, year and month specified are being downloaded
        logger.bind(target='histmanager').trace(f'{ticker} - {year} - {MONTHS[month_num - 1]}: downloading')
        r = session.request(
            HISTDATA_BASE_DOWNLOAD_METHOD,
            HISTDATA_BASE_DOWNLOAD_URL,
            data=data,
            headers=headers,
            stream=True
        )

        bio = BytesIO()

        # write content to stream
        bio.write(r.content)

        try:

            zf = ZipFile(bio)

        except BadZipFile as e:

            # log and re-raise as a package exception
            logger.bind(target='histmanager').error(f'{ticker} - {year} - {MONTHS[month_num - 1]}: {e}')
            raise TickerDataBadTypeException(
                f"Data {ticker} - {year} - {MONTHS[month_num - 1]} BadZipFile error: {e}")

        else:

            # return opened zip file
            try:
                ExtFile = zf.open(zf.namelist()[0])
            except Exception as e:
                logger.bind(target='histmanager').error(
                    f'{ticker} - {year} - {MONTHS[month_num - 1]}: '
                    f'not found or invalid download: {e}')
                raise TickerDataNotFoundError(
                    f"Data {ticker} - {year} - {MONTHS[month_num - 1]} not found or not supported by histdata.com")

            else:
                if isinstance(ExtFile, ZipExtFile):
                    return ExtFile
                else:
                    logger.bind(target='histmanager').error(
                        f'{ticker} - {year} - {MONTHS[month_num - 1]}: '
                        f'data type not expected')
                    raise TickerDataBadTypeException(
                        f"Data {ticker} - {year} - {MONTHS[month_num - 1]} type not expected")

    def _raw_zipfile_to_df(self, raw_file, temp_filepath,
                           engine='polars') -> Union[pandas_dataframe,
                                                     Table,
                                                     polars_dataframe,
                                                     polars_lazyframe]:
        """
        Parse a raw histdata csv file into a dataframe of the selected engine.

        Parameters
        ----------
        raw_file : ZipExtFile
            The csv member opened from the downloaded zip archive.
        temp_filepath : str
            Path of the temporary csv file used by the pyarrow and polars
            engines.
        engine : str, optional
            Dataframe engine: 'pandas', 'pyarrow', 'polars' or 'polars_lazy'.
            The default is 'polars'.

        Returns
        -------
        Tick dataframe in the type of the selected engine.

        """

        if engine == 'pandas':

            # pandas read_csv can consume the runtime-opened
            # zip file object directly

            # this function is specific to the format of the downloaded files
            # parse file passed as input
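
            # hedged sketch of the expected raw row, assuming histdata.com's
            # ascii tick export and the column order given by
            # DATA_COLUMN_NAMES.TICK_DATA_NO_PVALUE (timestamp, ask, bid, vol):
            #
            #   20200101 170000123,1.12034,1.12046,0
            #
            # the 'p' mid-price column is added afterwards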

            df = read_csv(
                'pandas',
                raw_file,
                sep=',',
                names=DATA_COLUMN_NAMES.TICK_DATA_NO_PVALUE,
                dtype=DTYPE_DICT.TICK_DTYPE,
                parse_dates=[DATA_FILE_COLUMN_INDEX.TIMESTAMP],
                date_format=DATE_FORMAT_HISTDATA_CSV,
                engine='c'
            )

            # calculate 'p'
            df['p'] = (df['ask'] + df['bid']) / 2

        elif engine == 'pyarrow':

            # no way found to directly open a runtime zip file
            # with pyarrow
            # strategy rolls back to temporary file download
            # open and read all
            # delete temporary file

            # alternative using pyarrow
            buf = BufferReader(raw_file.read())

            if (
                Path(temp_filepath).exists() and
                Path(temp_filepath).is_file()
            ):

                Path(temp_filepath).unlink(missing_ok=True)

            else:

                # create temporary files directory if not present
                tempdir_path = Path(temp_filepath).parent
                tempdir_path.mkdir(exist_ok=True)

            # download buffer to file
            buf.download(temp_filepath)

            # in histdata raw files column 'p' is not present
            # raw_file_dtypes = DTYPE_DICT.TICK_DTYPE.copy()
            # raw_file_dtypes.pop('p')

            # read temporary csv file

            # use pandas read_csv and its options with
            # engine = 'pyarrow'
            # dtype_backend = 'pyarrow'
            # df = read_csv(
            #     'pyarrow',
            #     temp_filepath,
            #     sep=',',
            #     index_col=0,
            #     names=DATA_COLUMN_NAMES.TICK_DATA,
            #     dtype=raw_file_dtypes,
            #     parse_dates=[0],
            #     date_format=DATE_FORMAT_HISTDATA_CSV,
            #     engine = 'pyarrow',
            #     dtype_backend = 'pyarrow'
            # )
            # perform step to convert index
            # into a datetime64 dtype
            # df.index = any_date_to_datetime64(df.index,
            #                                   date_format=DATE_FORMAT_HISTDATA_CSV,
            #                                   unit='ms')

            # use pyarrow native options
            read_opts = arrow_csv.ReadOptions(
                use_threads=True,
                column_names=DATA_COLUMN_NAMES.TICK_DATA_NO_PVALUE,
            )

            parse_opts = arrow_csv.ParseOptions(
                delimiter=','
            )

            modtypes = PYARROW_DTYPE_DICT.TIME_TICK_DTYPE.copy()
            modtypes[BASE_DATA_COLUMN_NAME.TIMESTAMP] = pyarrow_string()
            modtypes.pop(BASE_DATA_COLUMN_NAME.P_VALUE)

            convert_opts = arrow_csv.ConvertOptions(
                column_types=modtypes
            )

            # at first read the file with the timestamp as a string
            df = read_csv(
                'pyarrow',
                temp_filepath,
                read_options=read_opts,
                parse_options=parse_opts,
                convert_options=convert_opts
            )

            # convert timestamp string array to pyarrow timestamp('ms')

            # pandas/numpy solution
            # std_datetime = to_datetime(df[BASE_DATA_COLUMN_NAME.TIMESTAMP].to_numpy(),
            #                            format=DATE_FORMAT_HISTDATA_CSV)

            # timecol = pyarrow_array(std_datetime,
            #                         type=pyarrow_timestamp('ms'))

            # all pyarrow ops solution
            # suggested here
            # https://github.com/apache/arrow/issues/41132#issuecomment-2052555361

            mod_format = DATE_FORMAT_HISTDATA_CSV.removesuffix('%f')
            ts2 = pc.strptime(pc.utf8_slice_codeunits(
                df[BASE_DATA_COLUMN_NAME.TIMESTAMP], 0, 15), format=mod_format, unit="ms")
            d = pc.utf8_slice_codeunits(df[BASE_DATA_COLUMN_NAME.TIMESTAMP],
                                        15,
                                        99).cast(pyarrow_int64()).cast(duration("ms"))
            timecol = pc.add(ts2, d)
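
            # worked example, assuming DATE_FORMAT_HISTDATA_CSV == '%Y%m%d %H%M%S%f':
            # '20200101 170000123' -> chars [0:15) = '20200101 170000' parsed
            # with '%Y%m%d %H%M%S' at ms resolution, chars [15:) = '123' cast
            # to a 123 ms duration; their sum is 2020-01-01 17:00:00.123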

            # calculate 'p'
            p_value = pc.divide(
                pc.add_checked(df['ask'], df['bid']),
                2
            )

            # aggregate in a new table
            df = Table.from_arrays(
                [
                    timecol,
                    df[BASE_DATA_COLUMN_NAME.ASK],
                    df[BASE_DATA_COLUMN_NAME.BID],
                    df[BASE_DATA_COLUMN_NAME.VOL],
                    p_value
                ],
                schema=schema(PYARROW_DTYPE_DICT.TIME_TICK_DTYPE.copy().items())
            )

        elif engine == 'polars':

            # download to temporary csv file
            # for best performance with polars

            # alternative using pyarrow
            buf = BufferReader(raw_file.read())

            if (
                Path(temp_filepath).exists() and
                Path(temp_filepath).is_file()
            ):

                Path(temp_filepath).unlink(missing_ok=True)

            else:

                # create temporary files directory if not present
                tempdir_path = Path(temp_filepath).parent
                tempdir_path.mkdir(exist_ok=True)

            buf.download(temp_filepath)

            # in histdata raw files column 'p' is not present
            raw_file_dtypes = POLARS_DTYPE_DICT.TIME_TICK_DTYPE.copy()
            raw_file_dtypes.pop('p')
            raw_file_dtypes[BASE_DATA_COLUMN_NAME.TIMESTAMP] = polars_string

            # read file
            # set schema for the columns but skip the timestamp column
            df = read_csv(
                'polars',
                temp_filepath,
                separator=',',
                has_header=False,
                new_columns=DATA_COLUMN_NAMES.TICK_DATA_NO_PVALUE,
                schema=raw_file_dtypes,
                use_pyarrow=True
            )

            # convert timestamp column to datetime data type
            df = df.with_columns(
                col(BASE_DATA_COLUMN_NAME.TIMESTAMP).str.strptime(
                    polars_datetime('ms'),
                    format=DATE_FORMAT_HISTDATA_CSV
                )
            )

            # calculate 'p'
            df = df.with_columns(
                ((col('ask') + col('bid')) / 2).alias('p')
            )

            # final cast to standard dtypes
            df = df.cast(POLARS_DTYPE_DICT.TIME_TICK_DTYPE)

            # clean duplicated timestamp rows, keep first by default
            df = df.unique(subset=[BASE_DATA_COLUMN_NAME.TIMESTAMP],
                           keep='first')

        elif engine == 'polars_lazy':

            # download to temporary csv file
            # for best performance with polars

            # alternative using pyarrow
            buf = BufferReader(raw_file.read())

            if (
                Path(temp_filepath).exists() and
                Path(temp_filepath).is_file()
            ):

                Path(temp_filepath).unlink(missing_ok=True)

            else:

                # create temporary files directory if not present
                tempdir_path = Path(temp_filepath).parent
                tempdir_path.mkdir(exist_ok=True)

            # download buffer to file
            buf.download(temp_filepath)

            # in histdata raw files column 'p' is not present
            raw_file_dtypes = POLARS_DTYPE_DICT.TIME_TICK_DTYPE.copy()
            raw_file_dtypes.pop('p')
            raw_file_dtypes[BASE_DATA_COLUMN_NAME.TIMESTAMP] = polars_string

            # read file
            # set schema for the columns but skip the timestamp column
            df = read_csv(
                'polars_lazy',
                temp_filepath,
                separator=',',
                has_header=False,
                new_columns=DATA_COLUMN_NAMES.TICK_DATA_NO_PVALUE,
                schema=raw_file_dtypes
            )

            # convert timestamp column to datetime data type
            df = df.with_columns(
                col(BASE_DATA_COLUMN_NAME.TIMESTAMP).str.strptime(
                    polars_datetime('ms'),
                    format=DATE_FORMAT_HISTDATA_CSV
                )
            )

            # calculate 'p'
            df = df.with_columns(
                ((col('ask') + col('bid')) / 2).alias('p')
            )

            # final cast to standard dtypes
            df = df.cast(POLARS_DTYPE_DICT.TIME_TICK_DTYPE)

            # clean duplicated timestamp rows, keep first by default
            df = df.unique(subset=[BASE_DATA_COLUMN_NAME.TIMESTAMP],
                           keep='first')

        else:

            logger.bind(target='histmanager').error(f'Engine {engine} is not supported')
            raise TypeError

        # return dataframe
        return df

    def _download_year(self,
                       ticker,
                       year) -> Union[polars_dataframe,
                                      polars_lazyframe,
                                      pandas_dataframe,
                                      Table,
                                      None]:

        year_tick_df = empty_dataframe(self.engine)

        for month in MONTHS:

            month_num = MONTHS.index(month) + 1
            url = HISTDATA_URL_TICKDATA_TEMPLATE.format(
                ticker=ticker.lower(),
                year=year,
                month_num=month_num)
            # TODO: test connection with url, if it fails raise a connection error

            file = self._download_month_raw(
                ticker,
                url,
                year,
                month_num
            )

            if file and isinstance(file, ZipExtFile):

                month_data = self._raw_zipfile_to_df(file,
                                                     str(self._temporary_data_path /
                                                         (f'{ticker}_' +
                                                          f'{year}_' +
                                                          f'{month}_' +
                                                          TEMP_CSV_FILE)
                                                         ),
                                                     engine=self.engine
                                                     )

                # if first iteration, assign instead of concat
                if is_empty_dataframe(year_tick_df):

                    year_tick_df = month_data

                else:

                    year_tick_df = concat_data([year_tick_df, month_data])

            else:

                logger.bind(target='histmanager').critical(
                    f"Ticker {ticker}-{year}-{MONTHS[month_num - 1]} data not found or invalid")
                raise TickerDataInvalidException(
                    f"Ticker {ticker} - {year} - {MONTHS[month_num - 1]} data not found or invalid: generic error")

        return sort_dataframe(year_tick_df,
                              BASE_DATA_COLUMN_NAME.TIMESTAMP)

    def _download(self,
                  ticker,
                  years: List[int]) -> None:

        if not (
            isinstance(years, list)
        ):

            logger.bind(target='histmanager').error(f'years {years} invalid, must be list type')
            raise TypeError

        if not (
            set(years).issubset(YEARS)
        ):

            logger.bind(target='histmanager').error(
                f'requested years {years} not available. '
                f'Years must be limited to: {YEARS}')
            raise ValueError

        else:
            logger.bind(target='histmanager').trace(f'Requested years {years} are valid')

        # convert to list of int
        if not all(isinstance(year, int) for year in years):
            years = [int(year) for year in years]

        for year in years:

            year_tick_df = self._download_year(
                ticker,
                year
            )

            # get key for dotty dict: TICK
            tick_key = self._db_connector._db_key('forex',
                                                  ticker,
                                                  TICK_TIMEFRAME)

            # call to upload df to database if not empty
            if not is_empty_dataframe(year_tick_df):
                self._db_connector.write_data(tick_key,
                                              year_tick_df)
            else:
                logger.bind(target='histmanager').warning(
                    f'Year tick dataframe for {tick_key} is empty, skipping database write')

        # update manager database
        self._update_db()

    def clear_database(self, filter: Optional[str] = None) -> None:

        self._db_connector.clear_database(filter=filter)

    def add_timeframe(self, timeframe: Union[str, List[str]]) -> None:
        """
        Add and cache a new timeframe to the database.

        Creates aggregated data for the specified timeframe from tick data and
        caches it in the database for faster future access. The timeframe is
        added to the internal list of available timeframes.

        Args:
            timeframe (str | List[str]): Timeframe(s) to add. Can be a single string
                or list of strings. Supported values: '1m', '5m', '15m', '30m',
                '1h', '4h', '1D', '1W', '1M'

        Returns:
            None

        Raises:
            TypeError: If timeframe is not a string or list of strings

        Example:
            >>> manager = HistoricalManagerDB(config='data_config.yaml')
            >>> manager.add_timeframe('1W')  # Add weekly timeframe
            >>> manager.add_timeframe(['4h', '1D'])  # Add multiple timeframes

        Note:
            - Only new timeframes (not already in the list) will be processed
            - Aggregation can take time for large datasets
            - Once added, the timeframe is permanently cached in the database
        """

        if not hasattr(self, '_tf_list'):
            self._tf_list = []
        else:
            logger.bind(target='histmanager').trace('_tf_list already exists')

        if isinstance(timeframe, str):

            timeframe = [timeframe]

        if not (
            isinstance(timeframe, list) and
            all([isinstance(tf, str) for tf in timeframe])
        ):

            logger.bind(target='histmanager').error('timeframe invalid: str or list required')
            raise TypeError

        tf_list = [check_timeframe_str(tf) for tf in timeframe]

        if not set(tf_list).issubset(self._tf_list):

            # extend the timeframe list accordingly,
            # with only the new elements not already present
            self._tf_list.extend(set(tf_list).difference(self._tf_list))
            self._update_db()

    def get_data(
            self,
            ticker,
            timeframe,
            start,
            end,
            comparison_column_name: List[str] | str | None = None,
            check_level: List[int | float] | int | float | None = None,
            comparison_operator: List[SUPPORTED_SQL_COMPARISON_OPERATORS] | SUPPORTED_SQL_COMPARISON_OPERATORS | None = None,
            aggregation_mode: SUPPORTED_SQL_CONDITION_AGGREGATION_MODES | None = None,
    ) -> Union[polars_dataframe, polars_lazyframe]:
        """
        Retrieve OHLC historical data for the specified ticker and timeframe.

        Fetches historical forex data from the database, automatically downloading
        and aggregating data if not already available. Supports multiple timeframes
        and date ranges.

        Args:
            ticker (str): Currency pair symbol (e.g., 'EURUSD', 'GBPUSD', 'NZDUSD').
                Case-insensitive.
            timeframe (str): Candle timeframe for data aggregation. Supported values:
                '1m', '5m', '15m', '30m', '1h', '4h', '1D', '1W', '1M'
            start (str | datetime): Start date for data retrieval. Accepts:
                - ISO format: 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS'
                - datetime object
            end (str | datetime): End date for data retrieval. Same format as start.
                Must be after start date.
            comparison_column_name (List[str] | str | None): Column name(s) used in
                the filter conditions. Default is None.
            check_level (List[int | float] | int | float | None): Level(s) the
                comparison columns are checked against. Default is None.
            comparison_operator (List[SUPPORTED_SQL_COMPARISON_OPERATORS] | SUPPORTED_SQL_COMPARISON_OPERATORS | None): Comparison operator(s) for the filter conditions. Default is None.
            aggregation_mode (SUPPORTED_SQL_CONDITION_AGGREGATION_MODES | None): How multiple filter conditions are combined. Default is None.

        Returns:
            polars.DataFrame | polars.LazyFrame: DataFrame containing OHLC data with columns:
                - timestamp: datetime column with candle timestamps
                - open: Opening price (float32)
                - high: Highest price (float32)
                - low: Lowest price (float32)
                - close: Closing price (float32)

        Raises:
            TickerNotFoundError: If the ticker is not available in the historical database
            ValueError: If timeframe is invalid or end date is before start date

        Example:
            >>> manager = HistoricalManagerDB(config='data_config.yaml')
            >>> data = manager.get_data(
            ...     ticker='EURUSD',
            ...     timeframe='1h',
            ...     start='2020-01-01',
            ...     end='2020-01-31'
            ... )
            >>> print(f"Retrieved {len(data)} hourly candles")
            Retrieved 744 hourly candles

        Note:
            - Data is automatically downloaded from histdata.com if not cached locally
            - First call for a new timeframe may take longer as it builds the aggregation
            - Downloaded data is cached for faster subsequent access
            - Ticker names are case-insensitive and automatically normalized
        """

        # check ticker exists in available tickers
        # from histdata database
        if (
            ticker.upper() not in get_histdata_tickers()
            and
            ticker.lower() not in self._get_ticker_list()
        ):
            logger.bind(target='histmanager').error(f'ticker {ticker.upper()} not found in database')
            raise TickerNotFoundError(f'ticker {ticker} not found in database')

        # force ticker parameter to lower case
        ticker = ticker.lower()

        if not check_timeframe_str(timeframe):

            logger.bind(target='histmanager').error(f'timeframe request {timeframe} invalid')
            raise ValueError

        else:

            start = any_date_to_datetime64(start)
            end = any_date_to_datetime64(end)

            if end < start:

                logger.bind(target='histmanager').error(
                    'date interval not coherent, '
                    'start must be older than end')
                return self._dataframe_type([])

            # get years covering the interval requested
            years_interval_req = list(range(start.year, end.year + 1, 1))
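
            # e.g. start 2020-06-15, end 2022-02-01 -> [2020, 2021, 2022]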

            # get all years available for the ticker at this timeframe
            ticker_years_list = self._get_ticker_years_list(ticker, timeframe=timeframe)

            # aggregate data to current instance if necessary
            if not set(years_interval_req).issubset(ticker_years_list):

                year_tf_missing = list(
                    set(years_interval_req).difference(ticker_years_list))

                year_tick_keys = self._get_ticker_years_list(
                    ticker, timeframe=TICK_TIMEFRAME)

                year_tick_missing = list(set(years_interval_req).difference(year_tick_keys))

                # if tick is missing --> download missing years
                if year_tick_missing:

                    self._download(
                        ticker,
                        year_tick_missing
                    )

                # if the requested timeframe is in tf_list,
                # the data requested should at this point be available;
                # otherwise add the specific timeframe requested
                if timeframe not in self._tf_list:

                    # call add single tf data
                    self.add_timeframe(timeframe)

                else:
                    logger.bind(target='histmanager').trace(f'Timeframe {timeframe} already in _tf_list')

                # get all keys referring to the specific ticker
                ticker_keys = self._get_ticker_keys(ticker)

                # refresh the years list for the requested timeframe
                ticker_years_list = self._get_ticker_years_list(ticker, timeframe=timeframe)

                if not set(years_interval_req).issubset(ticker_years_list):

                    logger.bind(target='histmanager').critical(
                        f'processing year data completion for '
                        f'{years_interval_req} not ok')
                    raise ValueError

                else:
                    logger.bind(target='histmanager').trace(
                        f'Year data completion for {years_interval_req} successful')

            # at this point all data requested have been aggregated to the database

            # execute a read query on database
            return self._db_connector.read_data(
                market='forex',
                ticker=ticker,
                timeframe=timeframe,
                start=start,
                end=end,
                comparison_column_name=comparison_column_name,
                check_level=check_level,
                comparison_operator=comparison_operator,
                comparison_aggregation_mode=aggregation_mode
            )

    def plot(
            self,
            ticker,
            timeframe,
            start_date,
            end_date
    ) -> None:
        """
        Plot a candlestick chart for the specified ticker and date range.

        Generates a candlestick chart using mplfinance, displaying
        OHLC (Open, High, Low, Close) data for the specified time period.

        Args:
            ticker (str): Currency pair symbol (e.g., 'EURUSD', 'GBPUSD')
            timeframe (str): Candle timeframe (e.g., '1m', '5m', '1h', '1D', '1W')
            start_date (str): Start date in ISO format 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS'
            end_date (str): End date in ISO format 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS'

        Returns:
            None: Displays the chart using matplotlib

        Example:
            >>> manager = HistoricalManagerDB(config='data_config.yaml')
            >>> manager.plot(
            ...     ticker='EURUSD',
            ...     timeframe='1D',
            ...     start_date='2020-01-01',
            ...     end_date='2020-12-31'
            ... )

        Note:
            The chart will be displayed in a matplotlib window. The data is automatically
            fetched using get_data() and converted to the appropriate format for plotting.
        """

        logger.bind(target='histmanager').info(f'''Chart request:
                    ticker {ticker}
                    timeframe {timeframe}
                    from {start_date}
                    to {end_date}''')

        chart_data = self.get_data(ticker=ticker,
                                   timeframe=timeframe,
                                   start=start_date,
                                   end=end_date)

        chart_data = to_pandas_dataframe(chart_data)

        if chart_data.index.name != BASE_DATA_COLUMN_NAME.TIMESTAMP:

            chart_data.set_index(BASE_DATA_COLUMN_NAME.TIMESTAMP,
                                 inplace=True)

            chart_data.index = to_datetime(chart_data.index)

        else:
            logger.bind(target='histmanager').trace(f'Chart data already has {BASE_DATA_COLUMN_NAME.TIMESTAMP} as index')

        # candlestick chart type
        # use mplfinance
        chart_kwargs = dict(style='charles',
                            title=ticker,
                            ylabel='Quotation',
                            xlabel='Timestamp',
                            volume=False,
                            figratio=(12, 8),
                            figscale=1
                            )

        mpf_plot(chart_data, type='candle', **chart_kwargs)

        mpf_show()