featureSQL 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featureSQL/__init__.py +30 -0
- featureSQL/cli.py +248 -0
- featureSQL/duck.py +140 -0
- featureSQL/dump_bin.py +668 -0
- featureSQL/storage.py +220 -0
- featureSQL/utils.py +29 -0
- featureSQL/yahoo.py +334 -0
- featuresql-0.1.0.dist-info/METADATA +279 -0
- featuresql-0.1.0.dist-info/RECORD +13 -0
- featuresql-0.1.0.dist-info/WHEEL +5 -0
- featuresql-0.1.0.dist-info/licenses/LICENSE +21 -0
- featuresql-0.1.0.dist-info/licenses/LICENSE.qlib +21 -0
- featuresql-0.1.0.dist-info/top_level.txt +1 -0
featureSQL/dump_bin.py
ADDED
|
@@ -0,0 +1,668 @@
|
|
|
1
|
+
# Copyright (c) Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License.
|
|
3
|
+
|
|
4
|
+
import abc
|
|
5
|
+
import shutil
|
|
6
|
+
import traceback
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Iterable, List, Union
|
|
9
|
+
from functools import partial
|
|
10
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor
|
|
11
|
+
|
|
12
|
+
import fire
|
|
13
|
+
import numpy as np
|
|
14
|
+
import pandas as pd
|
|
15
|
+
from tqdm import tqdm
|
|
16
|
+
from loguru import logger
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def code_to_fname(code: str):
    """Map a stock code to a filesystem-safe file name.

    Windows reserves certain device names (CON, PRN, AUX, NUL, COM0-9,
    LPT0-9) and refuses to create files with those names, so any code that
    matches one (case-insensitively) is given a ``_bin_`` fallback prefix.
    reference: https://superuser.com/questions/86999/why-cant-i-name-a-folder-or-file-con-in-windows

    Parameters
    ----------
    code: str
    """
    reserved = {"CON", "PRN", "AUX", "NUL"}
    reserved.update(f"COM{i}" for i in range(10))
    reserved.update(f"LPT{i}" for i in range(10))

    # fallback prefix used when a code collides with a reserved device name
    fallback_prefix = "_bin_"
    if str(code).upper() in reserved:
        return fallback_prefix + str(code)
    return code
|
|
38
|
+
|
|
39
|
+
def fname_to_code(fname: str):
    """Map a file name back to a stock code.

    Inverse of :func:`code_to_fname`: removes the ``_bin_`` fallback prefix
    when present.

    Parameters
    ----------
    fname: str

    Returns
    -------
    str
        The original stock code.
    """
    prefix = "_bin_"
    # BUGFIX: the previous `fname.lstrip(prefix)` stripped any run of the
    # characters {_, b, i, n} from the left (e.g. "_bin_nvda" -> "vda"),
    # because str.lstrip takes a character set, not a prefix. Remove the
    # exact prefix by slicing instead.
    if fname.startswith(prefix):
        fname = fname[len(prefix):]
    return fname
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def read_as_df(file_path: Union[str, Path], store=None, **kwargs) -> pd.DataFrame:
    """Load a ``.csv`` or ``.parquet`` file into a :class:`pandas.DataFrame`.

    Parameters
    ----------
    file_path : Union[str, Path]
        Path (or store key) of the data file.
    store : optional storage backend; defaults to the local filesystem store.
    **kwargs :
        Extra reader options; only a per-format whitelist is forwarded to
        the underlying pandas reader (currently ``low_memory`` for csv).

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    ValueError
        If the file extension is neither ``.csv`` nor ``.parquet``.
    """
    import io

    from .storage import get_storage, FileSystemStore

    backend = get_storage("fs") if store is None else store

    # stringify to deal with non-fs paths uniformly
    path_str = str(file_path)
    ext = pathlib_suffix(path_str)

    # forward only the reader options that the given format understands
    allowed = {".csv": ("low_memory",)}
    reader_opts = {k: kwargs[k] for k in allowed.get(ext, ()) if k in kwargs}

    is_local = isinstance(backend, FileSystemStore)

    if ext == ".csv":
        source = file_path if is_local else io.BytesIO(backend.read_bytes(path_str))
        frame = pd.read_csv(source, **reader_opts)
        # normalise pandas "string"-dtype columns back to plain object dtype
        for column in frame.select_dtypes(include=["string"]):
            frame[column] = frame[column].astype("object")
        return frame
    if ext == ".parquet":
        source = file_path if is_local else io.BytesIO(backend.read_bytes(path_str))
        return pd.read_parquet(source, **reader_opts)
    raise ValueError(f"Unsupported file format: {ext}")
|
|
101
|
+
|
|
102
|
+
def pathlib_suffix(p: str) -> str:
    """Return the lowercased extension (dot included) of the final
    "/"-separated component of *p*, or ``""`` when it contains no dot.

    Operates on plain strings so it also works for non-filesystem keys
    (e.g. object-store paths) where ``pathlib`` is not applicable.
    """
    basename = str(p).rsplit("/", 1)[-1]
    _, dot, ext = basename.rpartition(".")
    return "." + ext.lower() if dot else ""
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class DumpDataBase:
    """Base class for dumping per-symbol csv/parquet data into the qlib-style
    binary layout: a ``calendars/<freq>.txt`` file, an ``instruments/all.txt``
    file, and one ``features/<symbol>/<field>.<freq>.bin`` file per field.

    Subclasses implement :meth:`dump` for the full / fix / update workflows.
    """

    # column names used in instruments/all.txt
    INSTRUMENTS_START_FIELD = "start_datetime"
    INSTRUMENTS_END_FIELD = "end_datetime"
    # sub-directory names under dump_dir
    CALENDARS_DIR_NAME = "calendars"
    FEATURES_DIR_NAME = "features"
    INSTRUMENTS_DIR_NAME = "instruments"
    DUMP_FILE_SUFFIX = ".bin"
    DAILY_FORMAT = "%Y-%m-%d"
    HIGH_FREQ_FORMAT = "%Y-%m-%d %H:%M:%S"
    INSTRUMENTS_SEP = "\t"
    INSTRUMENTS_FILE_NAME = "all.txt"

    UPDATE_MODE = "update"
    ALL_MODE = "all"

    def __init__(
        self,
        data_path: str,
        dump_dir: str,
        backup_dir: str = None,
        freq: str = "day",
        max_workers: int = 16,
        date_field_name: str = "date",
        file_suffix: str = ".csv",
        symbol_field_name: str = "symbol",
        exclude_fields: str = "",
        include_fields: str = "",
        limit_nums: int = None,
        store_type: str = "fs",
    ):
        """Configure the source files, storage backend and output layout.

        Parameters
        ----------
        data_path: str
            stock data path or directory (for non-fs stores: a bucket/prefix)
        dump_dir: str
            target directory for generated binary files
        backup_dir: str, default None
            if backup_dir is not None, backup dump_dir to backup_dir
        freq: str, default "day"
            transaction frequency
        max_workers: int, default 16
            number of workers
        date_field_name: str, default "date"
            the name of the date field in the csv
        file_suffix: str, default ".csv"
            file suffix
        symbol_field_name: str, default "symbol"
            symbol field name
        include_fields: str
            comma-separated fields to dump
        exclude_fields: str
            comma-separated fields not dumped
        limit_nums: int
            Use when debugging, default None
        store_type: str
            the storage backend type
        """
        from .storage import get_storage
        self.store = get_storage(store_type, dump_dir)
        self.store_type = store_type

        # data_path might be bucket/prefix in GCS
        if store_type == "fs":
            data_path_obj = Path(data_path).expanduser()
            if data_path_obj.is_dir():
                self.df_files = sorted([str(p) for p in data_path_obj.glob(f"*{file_suffix}")])
            else:
                self.df_files = [str(data_path_obj)]
        else:
            self.df_files = sorted(self.store.glob(data_path, f"*{file_suffix}"))
            # eliminate any entries that don't actually end in the expected suffix
            self.df_files = [f for f in self.df_files if pathlib_suffix(f) == file_suffix]
            if not self.df_files:
                # if the user provided a direct file path matching suffix, honor it;
                # otherwise treat this as an error since no matching files were found.
                if pathlib_suffix(data_path) == file_suffix:
                    self.df_files = [data_path]
                else:
                    raise FileNotFoundError(
                        f"no files with suffix '{file_suffix}' found under '{data_path}' on store '{store_type}'"
                    )

        # comma-separated strings become tuples of stripped, non-empty names
        if isinstance(exclude_fields, str):
            exclude_fields = exclude_fields.split(",")
        if isinstance(include_fields, str):
            include_fields = include_fields.split(",")
        self._exclude_fields = tuple(filter(lambda x: len(x) > 0, map(str.strip, exclude_fields)))
        self._include_fields = tuple(filter(lambda x: len(x) > 0, map(str.strip, include_fields)))
        self.file_suffix = file_suffix
        self.symbol_field_name = symbol_field_name

        if limit_nums is not None:
            self.df_files = self.df_files[: int(limit_nums)]

        self.dump_dir = str(Path(dump_dir).expanduser()) if store_type == "fs" else dump_dir

        if backup_dir is not None:
            self.backup_dir = str(Path(backup_dir).expanduser()) if store_type == "fs" else backup_dir
            self._backup_dir(self.backup_dir)
        else:
            self.backup_dir = None

        self.freq = freq
        self.calendar_format = self.DAILY_FORMAT if self.freq == "day" else self.HIGH_FREQ_FORMAT

        self.works = max_workers
        self.date_field_name = date_field_name

        self._calendars_dir = self.store.joinpath(self.dump_dir, self.CALENDARS_DIR_NAME)
        self._features_dir = self.store.joinpath(self.dump_dir, self.FEATURES_DIR_NAME)
        self._instruments_dir = self.store.joinpath(self.dump_dir, self.INSTRUMENTS_DIR_NAME)

        self._calendars_list = []

        # subclasses flip _mode to UPDATE_MODE; _kwargs is scratch space
        # shared between the dump phases (see DumpDataAll._get_all_date)
        self._mode = self.ALL_MODE
        self._kwargs = {}

    def _backup_dir(self, target_dir: str):
        """Copy the existing dump_dir tree to target_dir (local fs only)."""
        if self.store_type == "fs":
            shutil.copytree(str(Path(self.dump_dir).resolve()), str(Path(target_dir).resolve()))
        else:
            logger.warning("backup_dir is not yet fully supported for non-fs storage")

    def _format_datetime(self, datetime_d: Union[str, pd.Timestamp]):
        """Render a date-like value using the calendar format for self.freq."""
        datetime_d = pd.Timestamp(datetime_d)
        return datetime_d.strftime(self.calendar_format)

    def _get_date(
        self, file_or_df: Union[str, pd.DataFrame], *, is_begin_end: bool = False, as_set: bool = False
    ) -> Iterable[pd.Timestamp]:
        """Extract dates from a source file path or an already-loaded frame.

        Depending on the flags, returns the date list, a set of dates,
        a (min, max) pair, or the pair together with the set.
        """
        # guard against invalid file paths that slipped through
        if isinstance(file_or_df, str) and not file_or_df:
            # empty string, nothing to read
            if is_begin_end and as_set:
                return (None, None), set()
            if is_begin_end:
                return None, None
            if as_set:
                return set()
            return []
        if not isinstance(file_or_df, pd.DataFrame):
            df = self._get_source_data(file_or_df)
        else:
            df = file_or_df
        if df.empty or self.date_field_name not in df.columns.tolist():
            # empty placeholder so min()/max() yield NaN rather than raising
            _calendars = pd.Series(dtype=np.float32)
        else:
            _calendars = df[self.date_field_name]

        if is_begin_end and as_set:
            return (_calendars.min(), _calendars.max()), set(_calendars)
        elif is_begin_end:
            return _calendars.min(), _calendars.max()
        elif as_set:
            return set(_calendars)
        else:
            return _calendars.tolist()

    def _get_source_data(self, file_path: str) -> pd.DataFrame:
        """Read one source file via the configured store, parsing dates."""
        df = read_as_df(file_path, store=self.store, low_memory=False)
        if self.date_field_name in df.columns:
            df[self.date_field_name] = pd.to_datetime(df[self.date_field_name])
        # df.drop_duplicates([self.date_field_name], inplace=True)
        return df

    def get_symbol_from_file(self, file_path: str) -> str:
        """Derive the (lowercase) stock code from a source file's basename."""
        # file_path is string
        stem = str(file_path).split("/")[-1].split(".")[0]
        return fname_to_code(stem.strip().lower())

    def get_dump_fields(self, df_columns: Iterable[str]) -> Iterable[str]:
        """Select columns to dump: include-list wins, else all minus the
        exclude-list, else every column. NOTE: may return a tuple, a set, or
        the original iterable depending on which branch is taken."""
        return (
            self._include_fields
            if self._include_fields
            else set(df_columns) - set(self._exclude_fields) if self._exclude_fields else df_columns
        )

    def _read_calendars(self, calendar_path: str) -> List[pd.Timestamp]:
        """Load an existing calendar file as a sorted list of Timestamps."""
        import io
        if self.store_type == "fs":
            df = pd.read_csv(calendar_path, header=None)
        else:
            df = pd.read_csv(io.BytesIO(self.store.read_bytes(calendar_path)), header=None)

        return sorted(
            map(
                pd.Timestamp,
                df.loc[:, 0].tolist(),
            )
        )

    def _read_instruments(self, instrument_path: str) -> pd.DataFrame:
        """Load an existing instruments file (symbol, start, end columns)."""
        import io
        if self.store_type == "fs":
            df = pd.read_csv(
                instrument_path,
                sep=self.INSTRUMENTS_SEP,
                names=[
                    self.symbol_field_name,
                    self.INSTRUMENTS_START_FIELD,
                    self.INSTRUMENTS_END_FIELD,
                ],
            )
        else:
            df = pd.read_csv(
                io.BytesIO(self.store.read_bytes(instrument_path)),
                sep=self.INSTRUMENTS_SEP,
                names=[
                    self.symbol_field_name,
                    self.INSTRUMENTS_START_FIELD,
                    self.INSTRUMENTS_END_FIELD,
                ],
            )

        return df

    def save_calendars(self, calendars_data: list):
        """Write calendars/<freq>.txt, one formatted datetime per line."""
        self.store.mkdir(self._calendars_dir, parents=True, exist_ok=True)
        calendars_path = self.store.joinpath(self._calendars_dir, f"{self.freq}.txt")
        result_calendars_list = [self._format_datetime(x) for x in calendars_data]
        if self.store_type == "fs":
            np.savetxt(calendars_path, result_calendars_list, fmt="%s", encoding="utf-8")
        else:
            import io
            bio = io.BytesIO()
            np.savetxt(bio, result_calendars_list, fmt="%s", encoding="utf-8")
            self.store.write_bytes(calendars_path, bio.getvalue())

    def save_instruments(self, instruments_data: Union[list, pd.DataFrame]):
        """Write instruments/all.txt from either pre-joined lines (list) or
        a DataFrame with symbol/start/end columns (symbols uppercased)."""
        self.store.mkdir(self._instruments_dir, parents=True, exist_ok=True)
        instruments_path = self.store.joinpath(self._instruments_dir, self.INSTRUMENTS_FILE_NAME)
        import io
        if isinstance(instruments_data, pd.DataFrame):
            _df_fields = [self.symbol_field_name, self.INSTRUMENTS_START_FIELD, self.INSTRUMENTS_END_FIELD]
            instruments_data = instruments_data.loc[:, _df_fields]
            instruments_data[self.symbol_field_name] = instruments_data[self.symbol_field_name].apply(
                lambda x: fname_to_code(x.lower()).upper()
            )
            if self.store_type == "fs":
                instruments_data.to_csv(instruments_path, header=False, sep=self.INSTRUMENTS_SEP, index=False)
            else:
                bio = io.BytesIO()
                instruments_data.to_csv(bio, header=False, sep=self.INSTRUMENTS_SEP, index=False)
                self.store.write_bytes(instruments_path, bio.getvalue())
        else:
            if self.store_type == "fs":
                np.savetxt(instruments_path, instruments_data, fmt="%s", encoding="utf-8")
            else:
                bio = io.BytesIO()
                np.savetxt(bio, instruments_data, fmt="%s", encoding="utf-8")
                self.store.write_bytes(instruments_path, bio.getvalue())

    def data_merge_calendar(self, df: pd.DataFrame, calendars_list: List[pd.Timestamp]) -> pd.DataFrame:
        """Reindex df onto the calendar slice covering its own date range,
        so every calendar date gets a row (NaN where the symbol has no data).
        NOTE: mutates df in place by setting the date column as its index."""
        # calendars
        calendars_df = pd.DataFrame(data=calendars_list, columns=[self.date_field_name])
        calendars_df[self.date_field_name] = calendars_df[self.date_field_name].astype("datetime64[ns]")
        cal_df = calendars_df[
            (calendars_df[self.date_field_name] >= df[self.date_field_name].min())
            & (calendars_df[self.date_field_name] <= df[self.date_field_name].max())
        ]
        # align index
        cal_df.set_index(self.date_field_name, inplace=True)
        df.set_index(self.date_field_name, inplace=True)
        r_df = df.reindex(cal_df.index)
        return r_df

    @staticmethod
    def get_datetime_index(df: pd.DataFrame, calendar_list: List[pd.Timestamp]) -> int:
        """Offset of the frame's first date within the full calendar; stored
        as the leading value of each newly created .bin file."""
        return calendar_list.index(df.index.min())

    def _data_to_bin(self, df: pd.DataFrame, calendar_list: List[pd.Timestamp], features_dir: str, code: str):
        """Write one .bin file per dump field for a single symbol.

        Binary layout: little-endian float32; a newly created file starts
        with the calendar offset of the first row, followed by the field
        values. In UPDATE_MODE, values are appended to an existing file.
        """
        if df.empty:
            logger.warning(f"{code} data is None or empty")
            return
        if not calendar_list:
            logger.warning("calendar_list is empty")
            return
        # align index
        _df = self.data_merge_calendar(df, calendar_list)
        if _df.empty:
            logger.warning(f"{code} data is not in calendars")
            return
        # used when creating a bin file
        date_index = self.get_datetime_index(_df, calendar_list)
        for field in self.get_dump_fields(_df.columns):
            bin_path = self.store.joinpath(features_dir, f"{field.lower()}.{self.freq}{self.DUMP_FILE_SUFFIX}")
            if field not in _df.columns:
                continue
            if self.store.exists(bin_path) and self._mode == self.UPDATE_MODE:
                # update
                if self.store_type == "fs":
                    with Path(bin_path).open("ab") as fp:
                        np.array(_df[field]).astype("<f").tofile(fp)
                else:
                    self.store.append_bytes(bin_path, np.array(_df[field]).astype("<f").tobytes())
                logger.info(f"updated bin file: {bin_path} (symbol={code}, field={field})")
            else:
                # append; self._mode == self.ALL_MODE or not bin_path.exists()
                if self.store_type == "fs":
                    np.hstack([date_index, _df[field]]).astype("<f").tofile(str(Path(bin_path).resolve()))
                else:
                    self.store.write_bytes(bin_path, np.hstack([date_index, _df[field]]).astype("<f").tobytes())
                logger.info(f"created/overwritten bin file: {bin_path} (symbol={code}, field={field})")

    def _dump_bin(self, file_or_data: Union[str, pd.DataFrame], calendar_list: List[pd.Timestamp]):
        """Dump one symbol's data (from a file path or a pre-loaded frame)
        into its features directory."""
        if not calendar_list:
            logger.warning("calendar_list is empty")
            return
        if isinstance(file_or_data, pd.DataFrame):
            if file_or_data.empty:
                return
            code = fname_to_code(str(file_or_data.iloc[0][self.symbol_field_name]).lower())
            df = file_or_data
        elif isinstance(file_or_data, (str, Path)):
            code = self.get_symbol_from_file(file_or_data)
            df = self._get_source_data(file_or_data)
        else:
            raise ValueError(f"not support {type(file_or_data)}")
        if df is None or df.empty:
            logger.warning(f"{code} data is None or empty")
            return

        # try to remove dup rows or it will cause exception when reindex.
        df = df.drop_duplicates(self.date_field_name)

        # features save dir
        features_dir = self.store.joinpath(self._features_dir, code_to_fname(code).lower())
        self.store.mkdir(features_dir, parents=True, exist_ok=True)
        self._data_to_bin(df, calendar_list, features_dir, code)

    @abc.abstractmethod
    def dump(self):
        raise NotImplementedError("dump not implemented!")

    def __call__(self, *args, **kwargs):
        self.dump()
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
class DumpDataAll(DumpDataBase):
    """Full dump: scan every source file, then write the union calendar,
    the instruments list, and all feature .bin files from scratch."""

    def _get_all_date(self):
        """Scan every source file once, collecting the union of all trading
        datetimes and each symbol's (start, end) range into self._kwargs."""
        logger.info("start get all date......")
        all_datetime = set()
        date_range_list = []
        _fun = partial(self._get_date, as_set=True, is_begin_end=True)
        # non-fs store objects may not survive pickling into worker
        # processes, so fall back to threads for non-fs backends
        executor_class = ProcessPoolExecutor if self.store_type == "fs" else ThreadPoolExecutor
        with tqdm(total=len(self.df_files)) as p_bar:
            with executor_class(max_workers=self.works) as executor:
                for file_path, ((_begin_time, _end_time), _set_calendars) in zip(
                    self.df_files, executor.map(_fun, self.df_files)
                ):
                    all_datetime = all_datetime | _set_calendars
                    # only symbols with a valid date range enter the instruments list
                    if isinstance(_begin_time, pd.Timestamp) and isinstance(_end_time, pd.Timestamp):
                        _begin_time = self._format_datetime(_begin_time)
                        _end_time = self._format_datetime(_end_time)
                        symbol = self.get_symbol_from_file(file_path)
                        _inst_fields = [symbol.upper(), _begin_time, _end_time]
                        date_range_list.append(f"{self.INSTRUMENTS_SEP.join(_inst_fields)}")
                    p_bar.update()
        self._kwargs["all_datetime_set"] = all_datetime
        self._kwargs["date_range_list"] = date_range_list
        logger.info("end of get all date.\n")

    def _dump_calendars(self):
        """Sort the collected datetimes and write calendars/<freq>.txt."""
        logger.info("start dump calendars......")
        self._calendars_list = sorted(map(pd.Timestamp, self._kwargs["all_datetime_set"]))
        self.save_calendars(self._calendars_list)
        logger.info("end of calendars dump.\n")

    def _dump_instruments(self):
        """Write the instrument ranges collected by _get_all_date."""
        logger.info("start dump instruments......")
        self.save_instruments(self._kwargs["date_range_list"])
        logger.info("end of instruments dump.\n")

    def _dump_features(self):
        """Dump every source file's fields to .bin, aligned to the calendar."""
        logger.info("start dump features......")
        _dump_func = partial(self._dump_bin, calendar_list=self._calendars_list)
        # same executor selection rationale as in _get_all_date
        executor_class = ProcessPoolExecutor if self.store_type == "fs" else ThreadPoolExecutor
        with tqdm(total=len(self.df_files)) as p_bar:
            with executor_class(max_workers=self.works) as executor:
                for _ in executor.map(_dump_func, self.df_files):
                    p_bar.update()

        logger.info("end of features dump.\n")

    def dump(self):
        # phases are order-dependent: features need the calendar built first
        self._get_all_date()
        self._dump_calendars()
        self._dump_instruments()
        self._dump_features()
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
class DumpDataFix(DumpDataAll):
    """Fix-up dump: against an existing dump, add features and instrument
    entries only for symbols that are not yet present. The calendar is
    loaded as-is and not extended."""

    def _dump_instruments(self):
        """Append (start, end) ranges for symbols missing from the existing
        instruments file, then rewrite instruments/all.txt."""
        logger.info("start dump instruments......")
        _fun = partial(self._get_date, is_begin_end=True)
        new_stock_files = sorted(
            filter(
                lambda x: self.get_symbol_from_file(x).upper() not in self._old_instruments,
                self.df_files,
            )
        )
        # BUGFIX/consistency: previously this always used ProcessPoolExecutor;
        # mirror DumpDataAll's selection, since non-fs store objects held on
        # `self` may not be picklable into worker processes.
        executor_class = ProcessPoolExecutor if self.store_type == "fs" else ThreadPoolExecutor
        with tqdm(total=len(new_stock_files)) as p_bar:
            with executor_class(max_workers=self.works) as execute:
                for file_path, (_begin_time, _end_time) in zip(new_stock_files, execute.map(_fun, new_stock_files)):
                    # only symbols with a valid date range are recorded
                    if isinstance(_begin_time, pd.Timestamp) and isinstance(_end_time, pd.Timestamp):
                        symbol = self.get_symbol_from_file(file_path).upper()
                        _dt_map = self._old_instruments.setdefault(symbol, dict())
                        _dt_map[self.INSTRUMENTS_START_FIELD] = self._format_datetime(_begin_time)
                        _dt_map[self.INSTRUMENTS_END_FIELD] = self._format_datetime(_end_time)
                    p_bar.update()
        _inst_df = pd.DataFrame.from_dict(self._old_instruments, orient="index")
        _inst_df.index.names = [self.symbol_field_name]
        self.save_instruments(_inst_df.reset_index())
        logger.info("end of instruments dump.\n")

    def dump(self):
        """Load the existing calendar and instruments, then dump instruments
        and features for the not-yet-present symbols."""
        cal_path = self.store.joinpath(self._calendars_dir, f"{self.freq}.txt")
        self._calendars_list = self._read_calendars(cal_path)
        # noinspection PyAttributeOutsideInit
        inst_path = self.store.joinpath(self._instruments_dir, self.INSTRUMENTS_FILE_NAME)
        self._old_instruments = (
            self._read_instruments(inst_path)
            .set_index([self.symbol_field_name])
            .to_dict(orient="index")
        )  # type: dict
        self._dump_instruments()
        self._dump_features()
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
class DumpDataUpdate(DumpDataBase):
    """Incremental dump: load all source data into memory, extend the
    calendar with dates newer than the last known one, append new rows to
    existing symbols' .bin files and create files for new symbols."""

    def __init__(
        self,
        data_path: str,
        dump_dir: str,
        backup_dir: str = None,
        freq: str = "day",
        max_workers: int = 16,
        date_field_name: str = "date",
        file_suffix: str = ".csv",
        symbol_field_name: str = "symbol",
        exclude_fields: str = "",
        include_fields: str = "",
        limit_nums: int = None,
        store_type: str = "fs",
    ):
        """Parameters are identical to :class:`DumpDataBase`; additionally
        loads the existing calendar/instruments and all source data eagerly.
        """
        super().__init__(
            data_path,
            dump_dir,
            backup_dir,
            freq,
            max_workers,
            date_field_name,
            file_suffix,
            symbol_field_name,
            exclude_fields,
            include_fields,
            limit_nums,
            store_type=store_type,
        )
        self._mode = self.UPDATE_MODE
        cal_path = self.store.joinpath(self._calendars_dir, f"{self.freq}.txt")
        self._old_calendar_list = self._read_calendars(cal_path)
        # NOTE: all.txt only exists once for each stock
        # NOTE: if a stock corresponds to multiple different time ranges, user need to modify self._update_instruments
        inst_path = self.store.joinpath(self._instruments_dir, self.INSTRUMENTS_FILE_NAME)
        self._update_instruments = (
            self._read_instruments(inst_path)
            .set_index([self.symbol_field_name])
            .to_dict(orient="index")
        )  # type: dict

        # load all csv files
        self._all_data = self._load_all_source_data()  # type: pd.DataFrame
        # ROBUSTNESS: guard against an empty existing calendar file; previously
        # self._old_calendar_list[-1] raised IndexError in that case.
        if self._old_calendar_list:
            _latest = self._old_calendar_list[-1]
            _new_dates = [x for x in self._all_data[self.date_field_name].unique() if x > _latest]
        else:
            _new_dates = list(self._all_data[self.date_field_name].unique())
        self._new_calendar_list = self._old_calendar_list + sorted(_new_dates)

    def _load_all_source_data(self):
        """Read every source file into one concatenated DataFrame.

        NOTE: Need more memory -- the entire dataset is held at once.
        """
        logger.info("start load all source data....")
        all_df = []

        def _read_df(file_path: str):
            # BUGFIX: pass the configured storage backend through; previously
            # read_as_df(file_path) silently fell back to the local filesystem
            # for non-fs stores.
            _df = read_as_df(file_path, store=self.store)
            if self.date_field_name in _df.columns and not np.issubdtype(
                _df[self.date_field_name].dtype, np.datetime64
            ):
                _df[self.date_field_name] = pd.to_datetime(_df[self.date_field_name])
            # fill in the symbol column from the file name when absent
            if self.symbol_field_name not in _df.columns:
                _df[self.symbol_field_name] = self.get_symbol_from_file(file_path)
            return _df

        # threads (not processes): _read_df is a closure over self and I/O-bound
        with tqdm(total=len(self.df_files)) as p_bar:
            with ThreadPoolExecutor(max_workers=self.works) as executor:
                for df in executor.map(_read_df, self.df_files):
                    if not df.empty:
                        all_df.append(df)
                    p_bar.update()

        logger.info("end of load all data.\n")
        return pd.concat(all_df, sort=False)

    def _dump_calendars(self):
        # the extended calendar is written directly in dump()
        pass

    def _dump_instruments(self):
        # the updated instruments dict is written directly in dump()
        pass

    def _dump_features(self):
        """Per symbol: append rows newer than the recorded end date for
        existing stocks, or dump the full history for new stocks."""
        logger.info("start dump features......")
        error_code = {}
        # BUGFIX/consistency: mirror DumpDataAll's executor selection; a
        # non-fs store held on `self` may not pickle into worker processes.
        executor_class = ProcessPoolExecutor if self.store_type == "fs" else ThreadPoolExecutor
        with executor_class(max_workers=self.works) as executor:
            futures = {}
            for _code, _df in self._all_data.groupby(self.symbol_field_name, group_keys=False):
                _code = fname_to_code(str(_code).lower()).upper()
                _start, _end = self._get_date(_df, is_begin_end=True)
                if not (isinstance(_start, pd.Timestamp) and isinstance(_end, pd.Timestamp)):
                    continue
                if _code in self._update_instruments:
                    # exists stock, will append data
                    _update_calendars = (
                        _df[_df[self.date_field_name] > self._update_instruments[_code][self.INSTRUMENTS_END_FIELD]][
                            self.date_field_name
                        ]
                        .sort_values()
                        .to_list()
                    )
                    if _update_calendars:
                        self._update_instruments[_code][self.INSTRUMENTS_END_FIELD] = self._format_datetime(_end)
                        futures[executor.submit(self._dump_bin, _df, _update_calendars)] = _code
                else:
                    # new stock
                    _dt_range = self._update_instruments.setdefault(_code, dict())
                    _dt_range[self.INSTRUMENTS_START_FIELD] = self._format_datetime(_start)
                    _dt_range[self.INSTRUMENTS_END_FIELD] = self._format_datetime(_end)
                    futures[executor.submit(self._dump_bin, _df, self._new_calendar_list)] = _code

            with tqdm(total=len(futures)) as p_bar:
                for _future in as_completed(futures):
                    try:
                        _future.result()
                    except Exception:
                        # record and continue: one bad symbol must not abort the run
                        error_code[futures[_future]] = traceback.format_exc()
                    p_bar.update()
            logger.info(f"dump bin errors: {error_code}")

        logger.info("end of features dump.\n")

    def dump(self):
        # write the extended calendar first so readers see consistent offsets
        self.save_calendars(self._new_calendar_list)
        self._dump_features()
        df = pd.DataFrame.from_dict(self._update_instruments, orient="index")
        df.index.names = [self.symbol_field_name]
        self.save_instruments(df.reset_index())
|
|
665
|
+
|
|
666
|
+
|
|
667
|
+
if __name__ == "__main__":
    # expose the three dump workflows as CLI subcommands
    _commands = {
        "dump_all": DumpDataAll,
        "dump_fix": DumpDataFix,
        "dump_update": DumpDataUpdate,
    }
    fire.Fire(_commands)
|