featureSQL 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
featureSQL/dump_bin.py ADDED
@@ -0,0 +1,668 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT License.
3
+
4
+ import abc
5
+ import shutil
6
+ import traceback
7
+ from pathlib import Path
8
+ from typing import Iterable, List, Union
9
+ from functools import partial
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor
11
+
12
+ import fire
13
+ import numpy as np
14
+ import pandas as pd
15
+ from tqdm import tqdm
16
+ from loguru import logger
17
+
18
+
19
def code_to_fname(code: str):
    """Convert a stock code into a filesystem-safe file name.

    Parameters
    ----------
    code: str
    """
    # NOTE: On Windows these names are I/O devices and a file with the
    # corresponding name cannot be created.
    # reference: https://superuser.com/questions/86999/why-cant-i-name-a-folder-or-file-con-in-windows
    reserved = {"CON", "PRN", "AUX", "NUL"}
    reserved.update(f"COM{i}" for i in range(10))
    reserved.update(f"LPT{i}" for i in range(10))

    # fallback prefix used when a code matches a reserved filename
    prefix = "_bin_"
    if str(code).upper() in reserved:
        code = prefix + str(code)

    return code
38
+
39
def fname_to_code(fname: str):
    """Convert a file name back into a stock code.

    Inverse of ``code_to_fname``: removes the ``_bin_`` prefix that was
    prepended when the code collided with a Windows-reserved device name.

    Parameters
    ----------
    fname: str
    """
    prefix = "_bin_"
    if fname.startswith(prefix):
        # BUG FIX: the original used fname.lstrip(prefix), which strips any
        # leading run of the *characters* "_", "b", "i", "n" rather than the
        # prefix string itself (e.g. "_bin_nio" -> "o"). Slice the exact
        # prefix off instead.
        fname = fname[len(prefix):]
    return fname
51
+
52
+
53
def read_as_df(file_path: Union[str, Path], store=None, **kwargs) -> pd.DataFrame:
    """
    Read a csv or parquet file into a pandas DataFrame.

    Parameters
    ----------
    file_path : Union[str, Path]
        Path to the data file.
    store : Option storage backend.
    **kwargs :
        Additional keyword arguments passed to the underlying pandas
        reader. Only whitelisted keys are forwarded per format.

    Returns
    -------
    pd.DataFrame
    """
    import io

    from .storage import get_storage, FileSystemStore

    if store is None:
        store = get_storage("fs")

    # stringify to deal with non-fs paths uniformly
    file_str = str(file_path)
    suffix = pathlib_suffix(file_str)

    # forward only the kwargs the chosen reader understands
    keep_keys = {".csv": ("low_memory",)}
    kept_kwargs = {k: kwargs[k] for k in keep_keys.get(suffix, ()) if k in kwargs}

    if suffix == ".csv":
        if isinstance(store, FileSystemStore):
            df = pd.read_csv(file_path, **kept_kwargs)
        else:
            df = pd.read_csv(io.BytesIO(store.read_bytes(file_str)), **kept_kwargs)
        # normalize pandas' "string" extension dtype to plain object columns
        for col in df.select_dtypes(include=["string"]):
            df[col] = df[col].astype("object")
        return df

    if suffix == ".parquet":
        if isinstance(store, FileSystemStore):
            return pd.read_parquet(file_path, **kept_kwargs)
        return pd.read_parquet(io.BytesIO(store.read_bytes(file_str)), **kept_kwargs)

    raise ValueError(f"Unsupported file format: {suffix}")
101
+
102
def pathlib_suffix(p: str) -> str:
    """Return the lower-cased extension of *p* (including the dot), or ''.

    Operates on plain '/'-separated strings so it also works for non-local
    paths (e.g. bucket keys) that pathlib could not handle uniformly.
    """
    basename = str(p).rsplit("/", 1)[-1]
    if "." not in basename:
        return ""
    return "." + basename.rsplit(".", 1)[-1].lower()
107
+
108
+
109
class DumpDataBase:
    """Base class for dumping per-symbol csv/parquet data into binary
    feature files plus calendar and instrument index files.

    Layout written under ``dump_dir``:
        calendars/<freq>.txt            one formatted datetime per line
        instruments/all.txt             symbol<TAB>start<TAB>end per symbol
        features/<symbol>/<field>.<freq>.bin
    """

    INSTRUMENTS_START_FIELD = "start_datetime"
    INSTRUMENTS_END_FIELD = "end_datetime"
    CALENDARS_DIR_NAME = "calendars"
    FEATURES_DIR_NAME = "features"
    INSTRUMENTS_DIR_NAME = "instruments"
    DUMP_FILE_SUFFIX = ".bin"
    DAILY_FORMAT = "%Y-%m-%d"
    HIGH_FREQ_FORMAT = "%Y-%m-%d %H:%M:%S"
    INSTRUMENTS_SEP = "\t"
    INSTRUMENTS_FILE_NAME = "all.txt"

    UPDATE_MODE = "update"
    ALL_MODE = "all"

    def __init__(
        self,
        data_path: str,
        dump_dir: str,
        backup_dir: str = None,
        freq: str = "day",
        max_workers: int = 16,
        date_field_name: str = "date",
        file_suffix: str = ".csv",
        symbol_field_name: str = "symbol",
        exclude_fields: str = "",
        include_fields: str = "",
        limit_nums: int = None,
        store_type: str = "fs",
    ):
        """
        Parameters
        ----------
        data_path: str
            stock data path or directory (may be a bucket/prefix for non-fs stores)
        dump_dir: str
            target directory for generated binary files
        backup_dir: str, default None
            if backup_dir is not None, backup dump_dir to backup_dir
        freq: str, default "day"
            transaction frequency
        max_workers: int, default 16
            number of workers
        date_field_name: str, default "date"
            the name of the date field in the csv
        file_suffix: str, default ".csv"
            file suffix
        symbol_field_name: str, default "symbol"
            symbol field name
        include_fields: str
            comma-separated fields to dump; takes precedence over exclude_fields
        exclude_fields: str
            comma-separated fields not dumped
        limit_nums: int
            cap on the number of input files; use when debugging, default None
        store_type: str
            the storage backend type
        """
        from .storage import get_storage

        self.store = get_storage(store_type, dump_dir)
        self.store_type = store_type

        # data_path might be bucket/prefix in GCS
        if store_type == "fs":
            data_path_obj = Path(data_path).expanduser()
            if data_path_obj.is_dir():
                self.df_files = sorted([str(p) for p in data_path_obj.glob(f"*{file_suffix}")])
            else:
                self.df_files = [str(data_path_obj)]
        else:
            self.df_files = sorted(self.store.glob(data_path, f"*{file_suffix}"))
            # eliminate any entries that don't actually end in the expected suffix
            self.df_files = [f for f in self.df_files if pathlib_suffix(f) == file_suffix]
            if not self.df_files:
                # if the user provided a direct file path matching suffix, honor it;
                # otherwise treat this as an error since no matching files were found.
                if pathlib_suffix(data_path) == file_suffix:
                    self.df_files = [data_path]
                else:
                    raise FileNotFoundError(
                        f"no files with suffix '{file_suffix}' found under '{data_path}' on store '{store_type}'"
                    )

        if isinstance(exclude_fields, str):
            exclude_fields = exclude_fields.split(",")
        if isinstance(include_fields, str):
            include_fields = include_fields.split(",")
        self._exclude_fields = tuple(filter(lambda x: len(x) > 0, map(str.strip, exclude_fields)))
        self._include_fields = tuple(filter(lambda x: len(x) > 0, map(str.strip, include_fields)))
        self.file_suffix = file_suffix
        self.symbol_field_name = symbol_field_name

        if limit_nums is not None:
            self.df_files = self.df_files[: int(limit_nums)]

        self.dump_dir = str(Path(dump_dir).expanduser()) if store_type == "fs" else dump_dir

        if backup_dir is not None:
            self.backup_dir = str(Path(backup_dir).expanduser()) if store_type == "fs" else backup_dir
            self._backup_dir(self.backup_dir)
        else:
            self.backup_dir = None

        self.freq = freq
        self.calendar_format = self.DAILY_FORMAT if self.freq == "day" else self.HIGH_FREQ_FORMAT

        self.works = max_workers
        self.date_field_name = date_field_name

        self._calendars_dir = self.store.joinpath(self.dump_dir, self.CALENDARS_DIR_NAME)
        self._features_dir = self.store.joinpath(self.dump_dir, self.FEATURES_DIR_NAME)
        self._instruments_dir = self.store.joinpath(self.dump_dir, self.INSTRUMENTS_DIR_NAME)

        self._calendars_list = []

        self._mode = self.ALL_MODE
        self._kwargs = {}

    def _backup_dir(self, target_dir: str):
        """Copy the current dump_dir tree to target_dir (local fs only)."""
        if self.store_type == "fs":
            shutil.copytree(str(Path(self.dump_dir).resolve()), str(Path(target_dir).resolve()))
        else:
            logger.warning("backup_dir is not yet fully supported for non-fs storage")

    def _format_datetime(self, datetime_d: Union[str, pd.Timestamp]) -> str:
        """Format a datetime-like value using the frequency-dependent format."""
        # FIX: the annotation was the list literal [str, pd.Timestamp]; Union is correct
        datetime_d = pd.Timestamp(datetime_d)
        return datetime_d.strftime(self.calendar_format)

    def _get_date(
        self, file_or_df: Union[str, pd.DataFrame], *, is_begin_end: bool = False, as_set: bool = False
    ) -> Iterable[pd.Timestamp]:
        """Extract the dates of a source file path or an in-memory DataFrame.

        Returns, depending on the flags:
            is_begin_end and as_set -> ((min, max), set(dates))
            is_begin_end            -> (min, max)
            as_set                  -> set(dates)
            neither                 -> list(dates)
        """
        # guard against invalid file paths that slipped through
        if isinstance(file_or_df, str) and not file_or_df:
            # empty string, nothing to read
            if is_begin_end and as_set:
                return (None, None), set()
            if is_begin_end:
                return None, None
            if as_set:
                return set()
            return []
        if not isinstance(file_or_df, pd.DataFrame):
            df = self._get_source_data(file_or_df)
        else:
            df = file_or_df
        if df.empty or self.date_field_name not in df.columns.tolist():
            # empty float series so min()/max() yield NaN rather than raising
            _calendars = pd.Series(dtype=np.float32)
        else:
            _calendars = df[self.date_field_name]

        if is_begin_end and as_set:
            return (_calendars.min(), _calendars.max()), set(_calendars)
        elif is_begin_end:
            return _calendars.min(), _calendars.max()
        elif as_set:
            return set(_calendars)
        else:
            return _calendars.tolist()

    def _get_source_data(self, file_path: str) -> pd.DataFrame:
        """Read one source file and parse its date column to datetimes."""
        df = read_as_df(file_path, store=self.store, low_memory=False)
        if self.date_field_name in df.columns:
            df[self.date_field_name] = pd.to_datetime(df[self.date_field_name])
        return df

    def get_symbol_from_file(self, file_path: str) -> str:
        """Derive the (lower-cased) stock code from a source file path."""
        # file_path is a string; take the basename up to the first dot
        stem = str(file_path).split("/")[-1].split(".")[0]
        return fname_to_code(stem.strip().lower())

    def get_dump_fields(self, df_columns: Iterable[str]) -> Iterable[str]:
        """Select the columns to dump: include list wins, else all minus excludes."""
        return (
            self._include_fields
            if self._include_fields
            else set(df_columns) - set(self._exclude_fields) if self._exclude_fields else df_columns
        )

    def _read_calendars(self, calendar_path: str) -> List[pd.Timestamp]:
        """Load an existing calendar file as a sorted list of Timestamps."""
        import io

        if self.store_type == "fs":
            df = pd.read_csv(calendar_path, header=None)
        else:
            df = pd.read_csv(io.BytesIO(self.store.read_bytes(calendar_path)), header=None)

        return sorted(
            map(
                pd.Timestamp,
                df.loc[:, 0].tolist(),
            )
        )

    def _read_instruments(self, instrument_path: str) -> pd.DataFrame:
        """Load an existing instruments file (symbol, start, end per row)."""
        import io

        if self.store_type == "fs":
            df = pd.read_csv(
                instrument_path,
                sep=self.INSTRUMENTS_SEP,
                names=[
                    self.symbol_field_name,
                    self.INSTRUMENTS_START_FIELD,
                    self.INSTRUMENTS_END_FIELD,
                ],
            )
        else:
            df = pd.read_csv(
                io.BytesIO(self.store.read_bytes(instrument_path)),
                sep=self.INSTRUMENTS_SEP,
                names=[
                    self.symbol_field_name,
                    self.INSTRUMENTS_START_FIELD,
                    self.INSTRUMENTS_END_FIELD,
                ],
            )

        return df

    def save_calendars(self, calendars_data: list):
        """Write the calendar file (one formatted datetime per line)."""
        self.store.mkdir(self._calendars_dir, parents=True, exist_ok=True)
        calendars_path = self.store.joinpath(self._calendars_dir, f"{self.freq}.txt")
        result_calendars_list = [self._format_datetime(x) for x in calendars_data]
        if self.store_type == "fs":
            np.savetxt(calendars_path, result_calendars_list, fmt="%s", encoding="utf-8")
        else:
            import io

            bio = io.BytesIO()
            np.savetxt(bio, result_calendars_list, fmt="%s", encoding="utf-8")
            self.store.write_bytes(calendars_path, bio.getvalue())

    def save_instruments(self, instruments_data: Union[list, pd.DataFrame]):
        """Write the instruments file from a list of joined rows or a DataFrame."""
        self.store.mkdir(self._instruments_dir, parents=True, exist_ok=True)
        instruments_path = self.store.joinpath(self._instruments_dir, self.INSTRUMENTS_FILE_NAME)
        import io

        if isinstance(instruments_data, pd.DataFrame):
            _df_fields = [self.symbol_field_name, self.INSTRUMENTS_START_FIELD, self.INSTRUMENTS_END_FIELD]
            instruments_data = instruments_data.loc[:, _df_fields]
            # normalize symbols: strip any reserved-name prefix, upper-case
            instruments_data[self.symbol_field_name] = instruments_data[self.symbol_field_name].apply(
                lambda x: fname_to_code(x.lower()).upper()
            )
            if self.store_type == "fs":
                instruments_data.to_csv(instruments_path, header=False, sep=self.INSTRUMENTS_SEP, index=False)
            else:
                bio = io.BytesIO()
                instruments_data.to_csv(bio, header=False, sep=self.INSTRUMENTS_SEP, index=False)
                self.store.write_bytes(instruments_path, bio.getvalue())
        else:
            if self.store_type == "fs":
                np.savetxt(instruments_path, instruments_data, fmt="%s", encoding="utf-8")
            else:
                bio = io.BytesIO()
                np.savetxt(bio, instruments_data, fmt="%s", encoding="utf-8")
                self.store.write_bytes(instruments_path, bio.getvalue())

    def data_merge_calendar(self, df: pd.DataFrame, calendars_list: List[pd.Timestamp]) -> pd.DataFrame:
        """Reindex df onto the calendar entries spanning its own date range."""
        calendars_df = pd.DataFrame(data=calendars_list, columns=[self.date_field_name])
        calendars_df[self.date_field_name] = calendars_df[self.date_field_name].astype("datetime64[ns]")
        # FIX: .copy() the slice — set_index(inplace=True) on a filtered view
        # triggers SettingWithCopy; also avoid mutating the caller's df
        cal_df = calendars_df[
            (calendars_df[self.date_field_name] >= df[self.date_field_name].min())
            & (calendars_df[self.date_field_name] <= df[self.date_field_name].max())
        ].copy()
        # align index
        cal_df.set_index(self.date_field_name, inplace=True)
        df = df.set_index(self.date_field_name)
        r_df = df.reindex(cal_df.index)
        return r_df

    @staticmethod
    def get_datetime_index(df: pd.DataFrame, calendar_list: List[pd.Timestamp]) -> int:
        """Position of df's first date within the calendar (bin file header)."""
        return calendar_list.index(df.index.min())

    def _data_to_bin(self, df: pd.DataFrame, calendar_list: List[pd.Timestamp], features_dir: str, code: str):
        """Write (or append) one .bin file per dumped field of one symbol."""
        if df.empty:
            logger.warning(f"{code} data is None or empty")
            return
        if not calendar_list:
            logger.warning("calendar_list is empty")
            return
        # align index
        _df = self.data_merge_calendar(df, calendar_list)
        if _df.empty:
            logger.warning(f"{code} data is not in calendars")
            return
        # used when creating a bin file
        date_index = self.get_datetime_index(_df, calendar_list)
        for field in self.get_dump_fields(_df.columns):
            bin_path = self.store.joinpath(features_dir, f"{field.lower()}.{self.freq}{self.DUMP_FILE_SUFFIX}")
            if field not in _df.columns:
                continue
            if self.store.exists(bin_path) and self._mode == self.UPDATE_MODE:
                # update: append little-endian float32 values to the existing file
                if self.store_type == "fs":
                    with Path(bin_path).open("ab") as fp:
                        np.array(_df[field]).astype("<f").tofile(fp)
                else:
                    self.store.append_bytes(bin_path, np.array(_df[field]).astype("<f").tobytes())
                logger.info(f"updated bin file: {bin_path} (symbol={code}, field={field})")
            else:
                # full write; self._mode == self.ALL_MODE or the file doesn't exist.
                # first float32 is the calendar start index, then the values
                if self.store_type == "fs":
                    np.hstack([date_index, _df[field]]).astype("<f").tofile(str(Path(bin_path).resolve()))
                else:
                    self.store.write_bytes(bin_path, np.hstack([date_index, _df[field]]).astype("<f").tobytes())
                logger.info(f"created/overwritten bin file: {bin_path} (symbol={code}, field={field})")

    def _dump_bin(self, file_or_data: Union[str, pd.DataFrame], calendar_list: List[pd.Timestamp]):
        """Dump one symbol's data (from a file path or DataFrame) to bins."""
        if not calendar_list:
            logger.warning("calendar_list is empty")
            return
        if isinstance(file_or_data, pd.DataFrame):
            if file_or_data.empty:
                return
            code = fname_to_code(str(file_or_data.iloc[0][self.symbol_field_name]).lower())
            df = file_or_data
        elif isinstance(file_or_data, (str, Path)):
            code = self.get_symbol_from_file(file_or_data)
            df = self._get_source_data(file_or_data)
        else:
            raise ValueError(f"not support {type(file_or_data)}")
        if df is None or df.empty:
            logger.warning(f"{code} data is None or empty")
            return

        # try to remove dup rows or it will cause exception when reindex.
        df = df.drop_duplicates(self.date_field_name)

        # features save dir
        features_dir = self.store.joinpath(self._features_dir, code_to_fname(code).lower())
        self.store.mkdir(features_dir, parents=True, exist_ok=True)
        self._data_to_bin(df, calendar_list, features_dir, code)

    @abc.abstractmethod
    def dump(self):
        raise NotImplementedError("dump not implemented!")

    def __call__(self, *args, **kwargs):
        self.dump()
447
+
448
+
449
class DumpDataAll(DumpDataBase):
    """Dump everything from scratch: calendar, instruments, then feature bins."""

    def _get_all_date(self):
        """Collect the union of all dates plus each symbol's begin/end range."""
        logger.info("start get all date......")
        datetime_union = set()
        instrument_rows = []
        probe = partial(self._get_date, as_set=True, is_begin_end=True)
        # process pools cannot pickle non-fs store backends, so use threads there
        pool_cls = ProcessPoolExecutor if self.store_type == "fs" else ThreadPoolExecutor
        with tqdm(total=len(self.df_files)) as p_bar:
            with pool_cls(max_workers=self.works) as pool:
                for path, ((begin, end), date_set) in zip(self.df_files, pool.map(probe, self.df_files)):
                    datetime_union |= date_set
                    if isinstance(begin, pd.Timestamp) and isinstance(end, pd.Timestamp):
                        row = [
                            self.get_symbol_from_file(path).upper(),
                            self._format_datetime(begin),
                            self._format_datetime(end),
                        ]
                        instrument_rows.append(self.INSTRUMENTS_SEP.join(row))
                    p_bar.update()
        self._kwargs["all_datetime_set"] = datetime_union
        self._kwargs["date_range_list"] = instrument_rows
        logger.info("end of get all date.\n")

    def _dump_calendars(self):
        """Sort the collected dates and persist them as the calendar file."""
        logger.info("start dump calendars......")
        self._calendars_list = sorted(map(pd.Timestamp, self._kwargs["all_datetime_set"]))
        self.save_calendars(self._calendars_list)
        logger.info("end of calendars dump.\n")

    def _dump_instruments(self):
        """Persist the per-symbol date ranges gathered by _get_all_date."""
        logger.info("start dump instruments......")
        self.save_instruments(self._kwargs["date_range_list"])
        logger.info("end of instruments dump.\n")

    def _dump_features(self):
        """Write one .bin file per (symbol, field), aligned on the calendar."""
        logger.info("start dump features......")
        writer = partial(self._dump_bin, calendar_list=self._calendars_list)
        pool_cls = ProcessPoolExecutor if self.store_type == "fs" else ThreadPoolExecutor
        with tqdm(total=len(self.df_files)) as p_bar:
            with pool_cls(max_workers=self.works) as pool:
                for _ in pool.map(writer, self.df_files):
                    p_bar.update()

        logger.info("end of features dump.\n")

    def dump(self):
        """Run the full pipeline: dates -> calendars -> instruments -> features."""
        self._get_all_date()
        self._dump_calendars()
        self._dump_instruments()
        self._dump_features()
500
+
501
+
502
class DumpDataFix(DumpDataAll):
    """Add instruments/features for symbols missing from an existing dump,
    reusing the dump's existing calendar."""

    def _dump_instruments(self):
        """Append date ranges for symbols not yet in instruments/all.txt."""
        logger.info("start dump instruments......")
        _fun = partial(self._get_date, is_begin_end=True)
        new_stock_files = sorted(
            filter(
                lambda x: self.get_symbol_from_file(x).upper() not in self._old_instruments,
                self.df_files,
            )
        )
        # CONSISTENCY FIX: the original always used ProcessPoolExecutor here,
        # but process pools cannot pickle non-fs store backends; mirror the
        # store-type-aware executor choice used in DumpDataAll.
        executor_class = ProcessPoolExecutor if self.store_type == "fs" else ThreadPoolExecutor
        with tqdm(total=len(new_stock_files)) as p_bar:
            with executor_class(max_workers=self.works) as execute:
                for file_path, (_begin_time, _end_time) in zip(new_stock_files, execute.map(_fun, new_stock_files)):
                    if isinstance(_begin_time, pd.Timestamp) and isinstance(_end_time, pd.Timestamp):
                        symbol = self.get_symbol_from_file(file_path).upper()
                        _dt_map = self._old_instruments.setdefault(symbol, dict())
                        _dt_map[self.INSTRUMENTS_START_FIELD] = self._format_datetime(_begin_time)
                        _dt_map[self.INSTRUMENTS_END_FIELD] = self._format_datetime(_end_time)
                    p_bar.update()
        _inst_df = pd.DataFrame.from_dict(self._old_instruments, orient="index")
        _inst_df.index.names = [self.symbol_field_name]
        self.save_instruments(_inst_df.reset_index())
        logger.info("end of instruments dump.\n")

    def dump(self):
        """Load existing calendar + instruments, then dump only what's missing."""
        cal_path = self.store.joinpath(self._calendars_dir, f"{self.freq}.txt")
        self._calendars_list = self._read_calendars(cal_path)
        inst_path = self.store.joinpath(self._instruments_dir, self.INSTRUMENTS_FILE_NAME)
        # noinspection PyAttributeOutsideInit
        self._old_instruments = (
            self._read_instruments(inst_path)
            .set_index([self.symbol_field_name])
            .to_dict(orient="index")
        )  # type: dict
        self._dump_instruments()
        self._dump_features()
538
+
539
+
540
class DumpDataUpdate(DumpDataBase):
    """Append new rows/symbols to an existing dump (update mode)."""

    def __init__(
        self,
        data_path: str,
        dump_dir: str,
        backup_dir: str = None,
        freq: str = "day",
        max_workers: int = 16,
        date_field_name: str = "date",
        file_suffix: str = ".csv",
        symbol_field_name: str = "symbol",
        exclude_fields: str = "",
        include_fields: str = "",
        limit_nums: int = None,
        store_type: str = "fs",
    ):
        """See DumpDataBase.__init__ for parameter documentation."""
        super().__init__(
            data_path,
            dump_dir,
            backup_dir,
            freq,
            max_workers,
            date_field_name,
            file_suffix,
            symbol_field_name,
            exclude_fields,
            include_fields,
            limit_nums,
            store_type=store_type,
        )
        self._mode = self.UPDATE_MODE
        cal_path = self.store.joinpath(self._calendars_dir, f"{self.freq}.txt")
        self._old_calendar_list = self._read_calendars(cal_path)
        # NOTE: all.txt only exists once for each stock
        # NOTE: if a stock corresponds to multiple different time ranges, user need to modify self._update_instruments
        inst_path = self.store.joinpath(self._instruments_dir, self.INSTRUMENTS_FILE_NAME)
        self._update_instruments = (
            self._read_instruments(inst_path)
            .set_index([self.symbol_field_name])
            .to_dict(orient="index")
        )  # type: dict

        # load all csv files
        self._all_data = self._load_all_source_data()  # type: pd.DataFrame
        # extend the old calendar with any strictly-newer dates found in the data
        if self._old_calendar_list:
            _last = self._old_calendar_list[-1]
            self._new_calendar_list = self._old_calendar_list + sorted(
                filter(lambda x: x > _last, self._all_data[self.date_field_name].unique())
            )
        else:
            # ROBUSTNESS FIX: an empty existing calendar previously raised
            # IndexError on self._old_calendar_list[-1]
            self._new_calendar_list = sorted(self._all_data[self.date_field_name].unique())

    def _load_all_source_data(self):
        """Read and concatenate all source files into one DataFrame."""
        # NOTE: Need more memory
        logger.info("start load all source data....")
        all_df = []

        def _read_df(file_path):
            # BUG FIX: pass the configured store so non-fs backends work here
            # (previously this always read from the local filesystem).
            _df = read_as_df(file_path, store=self.store)
            if self.date_field_name in _df.columns and not np.issubdtype(
                _df[self.date_field_name].dtype, np.datetime64
            ):
                _df[self.date_field_name] = pd.to_datetime(_df[self.date_field_name])
            if self.symbol_field_name not in _df.columns:
                _df[self.symbol_field_name] = self.get_symbol_from_file(file_path)
            return _df

        with tqdm(total=len(self.df_files)) as p_bar:
            with ThreadPoolExecutor(max_workers=self.works) as executor:
                for df in executor.map(_read_df, self.df_files):
                    if not df.empty:
                        all_df.append(df)
                    p_bar.update()

        logger.info("end of load all data.\n")
        return pd.concat(all_df, sort=False)

    def _dump_calendars(self):
        # calendar is written directly in dump()
        pass

    def _dump_instruments(self):
        # instruments are written directly in dump()
        pass

    def _dump_features(self):
        """Append new rows for known symbols; write full bins for new symbols."""
        logger.info("start dump features......")
        error_code = {}
        # CONSISTENCY FIX: process pools cannot pickle non-fs store backends;
        # mirror the store-type-aware executor choice used elsewhere in this module
        executor_class = ProcessPoolExecutor if self.store_type == "fs" else ThreadPoolExecutor
        with executor_class(max_workers=self.works) as executor:
            futures = {}
            for _code, _df in self._all_data.groupby(self.symbol_field_name, group_keys=False):
                _code = fname_to_code(str(_code).lower()).upper()
                _start, _end = self._get_date(_df, is_begin_end=True)
                if not (isinstance(_start, pd.Timestamp) and isinstance(_end, pd.Timestamp)):
                    continue
                if _code in self._update_instruments:
                    # existing stock: append only the strictly-newer dates
                    _update_calendars = (
                        _df[_df[self.date_field_name] > self._update_instruments[_code][self.INSTRUMENTS_END_FIELD]][
                            self.date_field_name
                        ]
                        .sort_values()
                        .to_list()
                    )
                    if _update_calendars:
                        self._update_instruments[_code][self.INSTRUMENTS_END_FIELD] = self._format_datetime(_end)
                        futures[executor.submit(self._dump_bin, _df, _update_calendars)] = _code
                else:
                    # new stock: dump its whole history against the new calendar
                    _dt_range = self._update_instruments.setdefault(_code, dict())
                    _dt_range[self.INSTRUMENTS_START_FIELD] = self._format_datetime(_start)
                    _dt_range[self.INSTRUMENTS_END_FIELD] = self._format_datetime(_end)
                    futures[executor.submit(self._dump_bin, _df, self._new_calendar_list)] = _code

            with tqdm(total=len(futures)) as p_bar:
                for _future in as_completed(futures):
                    try:
                        _future.result()
                    except Exception:
                        error_code[futures[_future]] = traceback.format_exc()
                    p_bar.update()
            logger.info(f"dump bin errors: {error_code}")

        logger.info("end of features dump.\n")

    def dump(self):
        """Write the updated calendar, dump features, then rewrite instruments."""
        self.save_calendars(self._new_calendar_list)
        self._dump_features()
        df = pd.DataFrame.from_dict(self._update_instruments, orient="index")
        df.index.names = [self.symbol_field_name]
        self.save_instruments(df.reset_index())
665
+
666
+
667
if __name__ == "__main__":
    # CLI entry point via python-fire, e.g.:
    #   python dump_bin.py dump_all --data_path <src> --dump_dir <dst>
    fire.Fire({"dump_all": DumpDataAll, "dump_fix": DumpDataFix, "dump_update": DumpDataUpdate})