lidb 1.2.0__py3-none-any.whl → 2.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lidb/__init__.py +2 -1
- lidb/dataset.py +303 -103
- lidb/decorator.py +50 -0
- lidb/init.py +5 -2
- lidb/parse.py +4 -0
- lidb/qdf/qdf.py +4 -2
- lidb/table.py +25 -16
- lidb-2.0.6.dist-info/METADATA +282 -0
- {lidb-1.2.0.dist-info → lidb-2.0.6.dist-info}/RECORD +11 -10
- lidb-1.2.0.dist-info/METADATA +0 -18
- {lidb-1.2.0.dist-info → lidb-2.0.6.dist-info}/WHEEL +0 -0
- {lidb-1.2.0.dist-info → lidb-2.0.6.dist-info}/top_level.txt +0 -0
lidb/__init__.py
CHANGED
|
@@ -22,9 +22,10 @@ from .database import (
|
|
|
22
22
|
|
|
23
23
|
from .table import Table, TableMode
|
|
24
24
|
from .dataset import Dataset, DataLoader
|
|
25
|
+
from .decorator import dataset
|
|
25
26
|
from .qdf import from_polars, Expr
|
|
26
27
|
from .svc import DataService, D
|
|
27
28
|
|
|
28
29
|
from .parse import parse_hive_partition_structure
|
|
29
30
|
|
|
30
|
-
__version__ = "
|
|
31
|
+
__version__ = "2.0.6"
|
lidb/dataset.py
CHANGED
|
@@ -5,21 +5,25 @@
|
|
|
5
5
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
|
+
import shutil
|
|
8
9
|
from collections import defaultdict
|
|
9
10
|
from enum import Enum
|
|
10
11
|
from functools import partial
|
|
11
12
|
from typing import Callable, Literal
|
|
12
13
|
|
|
13
14
|
import logair
|
|
15
|
+
import pandas as pd
|
|
14
16
|
import polars as pl
|
|
15
17
|
import polars.selectors as cs
|
|
16
18
|
import xcals
|
|
17
19
|
import ygo
|
|
20
|
+
from varname import varname
|
|
18
21
|
|
|
19
22
|
from .database import put, tb_path, scan, DB_PATH
|
|
20
23
|
from .parse import parse_hive_partition_structure
|
|
21
|
-
|
|
24
|
+
import inspect
|
|
22
25
|
|
|
26
|
+
DEFAULT_DS_PATH = DB_PATH / "datasets"
|
|
23
27
|
|
|
24
28
|
class InstrumentType(Enum):
|
|
25
29
|
STOCK = "Stock" # 股票
|
|
@@ -30,7 +34,7 @@ class InstrumentType(Enum):
|
|
|
30
34
|
def complete_data(fn, date, save_path, partitions):
|
|
31
35
|
logger = logair.get_logger(__name__)
|
|
32
36
|
try:
|
|
33
|
-
data = fn(
|
|
37
|
+
data = fn()
|
|
34
38
|
if data is None:
|
|
35
39
|
# 保存数据的逻辑在fn中实现了
|
|
36
40
|
return
|
|
@@ -44,59 +48,107 @@ def complete_data(fn, date, save_path, partitions):
|
|
|
44
48
|
cols = data.columns
|
|
45
49
|
if "date" not in cols:
|
|
46
50
|
data = data.with_columns(pl.lit(date).alias("date")).select("date", *cols)
|
|
47
|
-
|
|
51
|
+
else:
|
|
52
|
+
data = data.cast({"date": pl.Utf8})
|
|
53
|
+
data = data.filter(date=date)
|
|
54
|
+
if "time" in data.columns:
|
|
55
|
+
if data["time"].n_unique() < 2:
|
|
56
|
+
data = data.drop("time")
|
|
48
57
|
put(data, save_path, partitions=partitions)
|
|
49
58
|
except Exception as e:
|
|
50
|
-
logger.error(f"{save_path}: Error when complete data for {date}")
|
|
51
|
-
logger.warning(e)
|
|
59
|
+
logger.error(f"{save_path}: Error when complete data for {date}\n", exc_info=e)
|
|
52
60
|
|
|
53
61
|
|
|
54
62
|
class Dataset:
|
|
55
63
|
|
|
56
64
|
def __init__(self,
|
|
57
|
-
|
|
58
|
-
|
|
65
|
+
*depends: Dataset,
|
|
66
|
+
fn: Callable[..., pl.DataFrame | pl.LazyFrame],
|
|
67
|
+
tb: str = "",
|
|
59
68
|
update_time: str = "",
|
|
69
|
+
window: str = "1d",
|
|
60
70
|
partitions: list[str] = None,
|
|
61
|
-
|
|
62
|
-
|
|
71
|
+
is_hft: bool = False,
|
|
72
|
+
data_name: str = "",
|
|
73
|
+
frame: int = 1):
|
|
63
74
|
"""
|
|
64
75
|
|
|
65
76
|
Parameters
|
|
66
77
|
----------
|
|
78
|
+
depends: Dataset
|
|
79
|
+
底层依赖数据集
|
|
67
80
|
fn: str
|
|
68
|
-
|
|
81
|
+
数据集计算函数。如果要用到底层依赖数据集,则必须显示定义形参 `depend`
|
|
69
82
|
tb: str
|
|
70
|
-
|
|
83
|
+
数据集保存表格, 如果没有指定,默认 {lidb.DB_PATH}/datasets/<module>
|
|
71
84
|
update_time: str
|
|
72
85
|
更新时间: 默认没有-实时更新,也就是可以取到当天值
|
|
86
|
+
更新时间只允许三种情况:
|
|
87
|
+
- 1. 盘前时间点:比如 08:00:00, 09:00:00, 09:15:00 ...
|
|
88
|
+
- 2. 盘中时间点:归为实时更新,使用空值 ""
|
|
89
|
+
- 3. 盘后时间点:比如 15:00:00, 16:30:00, 20:00:00 ...
|
|
73
90
|
partitions: list[str]
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
91
|
+
分区: 如果指定为 None, 则自动从 fn 参数推断,如果不需要分区,应该将其设定为空列表: []
|
|
92
|
+
is_hft: bool
|
|
93
|
+
是否是高频数据,如果是,则会按照asset进行分区存储,默认 False
|
|
94
|
+
hft定义为:时间步长 < 1min
|
|
95
|
+
window: str
|
|
96
|
+
配合depends使用,在取depends时,会回看window周期,最小单位为`d`。不足 `d` 的会往上取整为`1d`
|
|
97
|
+
data_name: str
|
|
98
|
+
数据名,默认为空,会自动推断,如果指定了,则使用指定名
|
|
99
|
+
frame: int
|
|
100
|
+
用于自动推断 数据名
|
|
79
101
|
"""
|
|
102
|
+
self._depends = list(depends)
|
|
103
|
+
self._name = ""
|
|
80
104
|
self.fn = fn
|
|
81
105
|
self.fn_params_sig = ygo.fn_signature_params(fn)
|
|
82
|
-
self.
|
|
83
|
-
self.
|
|
84
|
-
self.
|
|
85
|
-
|
|
86
|
-
|
|
106
|
+
self._is_depend = "depend" in self.fn_params_sig and len(self._depends) > 0
|
|
107
|
+
self._is_hft = is_hft
|
|
108
|
+
self._frame = frame
|
|
109
|
+
self.data_name = data_name
|
|
110
|
+
if not self.data_name:
|
|
111
|
+
try:
|
|
112
|
+
self.data_name = varname(frame, strict=False)
|
|
113
|
+
except Exception as e:
|
|
114
|
+
pass
|
|
115
|
+
if self.data_name:
|
|
116
|
+
self.data_name = self.data_name.replace('ds_', '')
|
|
117
|
+
fn_params = ygo.fn_params(self.fn)
|
|
118
|
+
self.fn_params = {k: v for (k, v) in fn_params}
|
|
119
|
+
# 更新底层依赖数据集的同名参数
|
|
120
|
+
self._update_depends()
|
|
121
|
+
|
|
122
|
+
if pd.Timedelta(window).days < 1:
|
|
123
|
+
window = "1d"
|
|
124
|
+
window_td = pd.Timedelta(window)
|
|
125
|
+
self._window = window
|
|
126
|
+
self._days = window_td.days
|
|
127
|
+
if window_td.seconds > 0:
|
|
128
|
+
self._days += 1
|
|
129
|
+
# 检测是否高频数据:如果是高频数据,则按照标的进行分区,高频的定义为时间差 < 60s
|
|
130
|
+
self._append_partitions = ["asset", "date"] if is_hft else ["date", ]
|
|
87
131
|
if partitions is not None:
|
|
88
132
|
partitions = [k for k in partitions if k not in self._append_partitions]
|
|
89
133
|
partitions = [*partitions, *self._append_partitions]
|
|
90
134
|
else:
|
|
91
|
-
partitions = self._append_partitions
|
|
135
|
+
# partitions = self._append_partitions
|
|
136
|
+
partitions = [k for k in self.fn_params_sig if k not in self._append_partitions and k != "depend"]
|
|
137
|
+
partitions = [*partitions, *self._append_partitions]
|
|
92
138
|
self.partitions = partitions
|
|
93
139
|
self._type_asset = "asset" in self.fn_params_sig
|
|
140
|
+
if "09:30:00" < update_time < "15:00:00":
|
|
141
|
+
update_time = ""
|
|
94
142
|
self.update_time = update_time
|
|
143
|
+
# 根据底层依赖调整update_time
|
|
144
|
+
if update_time and self._depends:
|
|
145
|
+
dep_ut = [ds.update_time for ds in self._depends]
|
|
146
|
+
dep_ut.append(update_time)
|
|
147
|
+
self.update_time = max(dep_ut)
|
|
148
|
+
mod = inspect.getmodule(fn)
|
|
149
|
+
self.tb = tb if tb else DEFAULT_DS_PATH / mod.__name__ /f"{self.data_name}"
|
|
150
|
+
self.save_path = tb_path(self.tb)
|
|
95
151
|
|
|
96
|
-
self.tb = tb
|
|
97
|
-
self.save_path = tb_path(tb)
|
|
98
|
-
fn_params = ygo.fn_params(self.fn)
|
|
99
|
-
self.fn_params = {k: v for (k, v) in fn_params}
|
|
100
152
|
self.constraints = dict()
|
|
101
153
|
for k in self.partitions[:-len(self._append_partitions)]:
|
|
102
154
|
if k in self.fn_params:
|
|
@@ -106,20 +158,35 @@ class Dataset:
|
|
|
106
158
|
self.constraints[k] = v
|
|
107
159
|
self.save_path = self.save_path / f"{k}={v}"
|
|
108
160
|
|
|
161
|
+
def _update_depends(self):
|
|
162
|
+
new_deps = list()
|
|
163
|
+
for dep in self._depends:
|
|
164
|
+
new_dep = dep(**self.fn_params)
|
|
165
|
+
new_deps.append(new_dep)
|
|
166
|
+
self._depends = new_deps
|
|
167
|
+
|
|
109
168
|
def is_empty(self, path) -> bool:
|
|
110
169
|
return not any(path.rglob("*.parquet"))
|
|
111
170
|
|
|
112
171
|
def __call__(self, *fn_args, **fn_kwargs):
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
172
|
+
"""赋值时也会同步更新底层依赖数据集的同名参数"""
|
|
173
|
+
|
|
174
|
+
fn = ygo.delay(self.fn)(*fn_args, **fn_kwargs)
|
|
175
|
+
ds = Dataset(*self._depends,
|
|
176
|
+
fn=fn,
|
|
116
177
|
tb=self.tb,
|
|
117
178
|
partitions=self.partitions,
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
179
|
+
update_time=self.update_time,
|
|
180
|
+
is_hft=self._is_hft,
|
|
181
|
+
window=self._window,
|
|
182
|
+
frame=self._frame+1)
|
|
183
|
+
ds.data_name = self.data_name
|
|
121
184
|
return ds
|
|
122
185
|
|
|
186
|
+
def alias(self, new_name: str):
|
|
187
|
+
self._name = new_name
|
|
188
|
+
return self
|
|
189
|
+
|
|
123
190
|
def get_value(self, date, eager: bool = True, **constraints):
|
|
124
191
|
"""
|
|
125
192
|
取值: 不保证未来数据
|
|
@@ -135,6 +202,7 @@ class Dataset:
|
|
|
135
202
|
-------
|
|
136
203
|
|
|
137
204
|
"""
|
|
205
|
+
logger = logair.get_logger(f"{__name__}.{self.__class__.__name__}")
|
|
138
206
|
_constraints = {k: v for k, v in constraints.items() if k in self.partitions}
|
|
139
207
|
_limits = {k: v for k, v in constraints.items() if k not in self.partitions}
|
|
140
208
|
search_path = self.save_path
|
|
@@ -144,9 +212,22 @@ class Dataset:
|
|
|
144
212
|
search_path = search_path / f"{k}={v}"
|
|
145
213
|
search_path = search_path / f"date={date}"
|
|
146
214
|
|
|
215
|
+
# 处理空文件
|
|
216
|
+
for file_path in search_path.rglob("*.parquet"):
|
|
217
|
+
if file_path.stat().st_size == 0:
|
|
218
|
+
# 删除
|
|
219
|
+
logger.warning(f"{file_path}: Deleting empty file.")
|
|
220
|
+
file_path.unlink()
|
|
221
|
+
|
|
147
222
|
if not self.is_empty(search_path):
|
|
148
223
|
lf = scan(search_path).cast({"date": pl.Utf8})
|
|
149
|
-
|
|
224
|
+
try:
|
|
225
|
+
schema = lf.collect_schema()
|
|
226
|
+
except:
|
|
227
|
+
logger.warning(f"{search_path}: Failed to collect schema.")
|
|
228
|
+
# 删除该文件夹
|
|
229
|
+
shutil.rmtree(search_path)
|
|
230
|
+
return self.get_value(date=date, eager=eager, **constraints)
|
|
150
231
|
_limits = {k: v for k, v in constraints.items() if schema.get(k) is not None}
|
|
151
232
|
lf = lf.filter(date=date, **_limits)
|
|
152
233
|
if not eager:
|
|
@@ -156,7 +237,10 @@ class Dataset:
|
|
|
156
237
|
return data
|
|
157
238
|
fn = self.fn
|
|
158
239
|
save_path = self.save_path
|
|
159
|
-
|
|
240
|
+
if self._is_depend:
|
|
241
|
+
fn = partial(fn, depend=self._get_depends(date,))
|
|
242
|
+
else:
|
|
243
|
+
fn = partial(fn, date=date)
|
|
160
244
|
if self._type_asset:
|
|
161
245
|
if "asset" in _constraints:
|
|
162
246
|
fn = ygo.delay(self.fn)(asset=_constraints["asset"])
|
|
@@ -169,7 +253,6 @@ class Dataset:
|
|
|
169
253
|
params[k] = v
|
|
170
254
|
save_path = save_path / f"{k}={v}"
|
|
171
255
|
fn = ygo.delay(self.fn)(**params)
|
|
172
|
-
logger = logair.get_logger(__name__)
|
|
173
256
|
|
|
174
257
|
today = xcals.today()
|
|
175
258
|
now = xcals.now()
|
|
@@ -198,7 +281,7 @@ class Dataset:
|
|
|
198
281
|
def get_history(self,
|
|
199
282
|
dateList: list[str],
|
|
200
283
|
n_jobs: int = 5,
|
|
201
|
-
backend: Literal["threading", "multiprocessing", "loky"] = "
|
|
284
|
+
backend: Literal["threading", "multiprocessing", "loky"] = "threading",
|
|
202
285
|
eager: bool = True,
|
|
203
286
|
rep_asset: str = "000001", # 默认 000001
|
|
204
287
|
**constraints):
|
|
@@ -227,6 +310,14 @@ class Dataset:
|
|
|
227
310
|
missing_dates = set(dateList).difference(set(exist_dates))
|
|
228
311
|
missing_dates = sorted(list(missing_dates))
|
|
229
312
|
if missing_dates:
|
|
313
|
+
# 先逐个补齐 depends
|
|
314
|
+
_end_date = max(missing_dates)
|
|
315
|
+
_beg_date = min(missing_dates)
|
|
316
|
+
if self._days > 1:
|
|
317
|
+
_beg_date = xcals.shift_tradeday(_beg_date, -(self._days-1))
|
|
318
|
+
_depend_dates = xcals.get_tradingdays(_beg_date, _end_date)
|
|
319
|
+
for depend in self._depends:
|
|
320
|
+
depend.get_history(_depend_dates, eager=False)
|
|
230
321
|
fn = self.fn
|
|
231
322
|
save_path = self.save_path
|
|
232
323
|
|
|
@@ -250,12 +341,17 @@ class Dataset:
|
|
|
250
341
|
except:
|
|
251
342
|
pass
|
|
252
343
|
for date in missing_dates:
|
|
253
|
-
|
|
254
|
-
fn=fn,
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
344
|
+
if self._is_depend:
|
|
345
|
+
fn = partial(fn, depend=self._get_depends(date))
|
|
346
|
+
else:
|
|
347
|
+
fn = partial(fn, date=date)
|
|
348
|
+
go.submit(complete_data,
|
|
349
|
+
job_name=f"Completing",
|
|
350
|
+
postfix=info_path,
|
|
351
|
+
leave=False)(fn=fn,
|
|
352
|
+
date=date,
|
|
353
|
+
save_path=save_path,
|
|
354
|
+
partitions=self._append_partitions, )
|
|
259
355
|
go.do()
|
|
260
356
|
data = scan(search_path, ).cast({"date": pl.Utf8}).filter(pl.col("date").is_in(dateList), **constraints)
|
|
261
357
|
data = data.sort("date")
|
|
@@ -263,6 +359,26 @@ class Dataset:
|
|
|
263
359
|
return data.collect()
|
|
264
360
|
return data
|
|
265
361
|
|
|
362
|
+
def _get_depends(self, date: str) -> pl.LazyFrame | None:
|
|
363
|
+
# 获取依赖数据集数据
|
|
364
|
+
if not self._depends:
|
|
365
|
+
return None
|
|
366
|
+
end_date = date
|
|
367
|
+
beg_date = date
|
|
368
|
+
if self._days > 1:
|
|
369
|
+
beg_date = xcals.shift_tradeday(beg_date, -(self._days-1))
|
|
370
|
+
params = {
|
|
371
|
+
"ds_conf": dict(depend=self._depends),
|
|
372
|
+
"beg_date": beg_date,
|
|
373
|
+
"end_date": end_date,
|
|
374
|
+
"times": [self.update_time, ],
|
|
375
|
+
"show_progress": False,
|
|
376
|
+
"eager": False,
|
|
377
|
+
"process_time": False, # 不处理时间
|
|
378
|
+
}
|
|
379
|
+
res = load_ds(**params)
|
|
380
|
+
return res["depend"]
|
|
381
|
+
|
|
266
382
|
|
|
267
383
|
def loader(data_name: str,
|
|
268
384
|
ds: Dataset,
|
|
@@ -270,35 +386,80 @@ def loader(data_name: str,
|
|
|
270
386
|
prev_date_list: list[str],
|
|
271
387
|
prev_date_mapping: dict[str, str],
|
|
272
388
|
time: str,
|
|
389
|
+
process_time: bool,
|
|
273
390
|
**constraints) -> pl.LazyFrame:
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
391
|
+
"""
|
|
392
|
+
Parameters
|
|
393
|
+
----------
|
|
394
|
+
data_name
|
|
395
|
+
ds
|
|
396
|
+
date_list
|
|
397
|
+
prev_date_list
|
|
398
|
+
prev_date_mapping
|
|
399
|
+
time
|
|
400
|
+
process_time: bool
|
|
401
|
+
是否处理源数据的时间: 根据实参 time. 用于应对不同场景
|
|
402
|
+
场景1:依赖因子不处理,底层数据是什么就返回什么
|
|
403
|
+
场景2:zoo.load 用来加载测试日内不同时间点的数据,就应该处理
|
|
404
|
+
constraints
|
|
405
|
+
|
|
406
|
+
Returns
|
|
407
|
+
-------
|
|
408
|
+
|
|
409
|
+
"""
|
|
410
|
+
if time:
|
|
411
|
+
if time < ds.update_time:
|
|
412
|
+
if len(prev_date_list) > 1:
|
|
413
|
+
lf = ds.get_history(prev_date_list, eager=False, **constraints)
|
|
414
|
+
else:
|
|
415
|
+
lf = ds.get_value(prev_date_list[0], eager=False, **constraints)
|
|
277
416
|
else:
|
|
278
|
-
|
|
417
|
+
if len(date_list) > 1:
|
|
418
|
+
lf = ds.get_history(date_list, eager=False, **constraints)
|
|
419
|
+
else:
|
|
420
|
+
lf = ds.get_value(date_list[0], eager=False, **constraints)
|
|
279
421
|
else:
|
|
280
|
-
if
|
|
281
|
-
|
|
422
|
+
if ds.update_time > "09:30:00":
|
|
423
|
+
# 盘后因子:取上一天的值
|
|
424
|
+
if len(prev_date_list) > 1:
|
|
425
|
+
lf = ds.get_history(prev_date_list, eager=False, **constraints)
|
|
426
|
+
else:
|
|
427
|
+
lf = ds.get_value(prev_date_list[0], eager=False, **constraints)
|
|
282
428
|
else:
|
|
283
|
-
|
|
429
|
+
if len(date_list) > 1:
|
|
430
|
+
lf = ds.get_history(date_list, eager=False, **constraints)
|
|
431
|
+
else:
|
|
432
|
+
lf = ds.get_value(date_list[0], eager=False, **constraints)
|
|
433
|
+
|
|
284
434
|
schema = lf.collect_schema()
|
|
285
435
|
include_time = schema.get("time") is not None
|
|
286
|
-
if
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
436
|
+
if process_time and time:
|
|
437
|
+
if include_time:
|
|
438
|
+
lf = lf.filter(time=time)
|
|
439
|
+
else:
|
|
440
|
+
lf = lf.with_columns(time=pl.lit(time))
|
|
290
441
|
if time < ds.update_time:
|
|
291
442
|
lf = lf.with_columns(date=pl.col("date").replace(prev_date_mapping))
|
|
443
|
+
keep = {"date", "time", "asset"}
|
|
444
|
+
if ds._name:
|
|
445
|
+
columns = lf.collect_schema().names()
|
|
446
|
+
rename_cols = set(columns).difference(keep)
|
|
447
|
+
if len(rename_cols) > 1:
|
|
448
|
+
lf = lf.rename({k: f"{ds._name}.{k}" for k in rename_cols})
|
|
449
|
+
else:
|
|
450
|
+
lf = lf.rename({k: ds._name for k in rename_cols})
|
|
292
451
|
return data_name, lf
|
|
293
452
|
|
|
294
453
|
|
|
295
454
|
def load_ds(ds_conf: dict[str, list[Dataset]],
|
|
296
455
|
beg_date: str,
|
|
297
456
|
end_date: str,
|
|
298
|
-
|
|
457
|
+
times: list[str],
|
|
299
458
|
n_jobs: int = 7,
|
|
300
459
|
backend: Literal["threading", "multiprocessing", "loky"] = "threading",
|
|
460
|
+
show_progress: bool = True,
|
|
301
461
|
eager: bool = False,
|
|
462
|
+
process_time: bool = True,
|
|
302
463
|
**constraints) -> dict[str, pl.DataFrame | pl.LazyFrame]:
|
|
303
464
|
"""
|
|
304
465
|
加载数据集
|
|
@@ -310,15 +471,20 @@ def load_ds(ds_conf: dict[str, list[Dataset]],
|
|
|
310
471
|
开始日期
|
|
311
472
|
end_date: str
|
|
312
473
|
结束日期
|
|
313
|
-
|
|
474
|
+
times: list[str]
|
|
314
475
|
取值时间
|
|
315
476
|
n_jobs: int
|
|
316
477
|
并发数量
|
|
317
478
|
backend: str
|
|
479
|
+
show_progress: bool
|
|
318
480
|
eager: bool
|
|
319
481
|
是否返回 DataFrame
|
|
320
482
|
- True: 返回DataFrame
|
|
321
483
|
- False: 返回LazyFrame
|
|
484
|
+
process_time: bool
|
|
485
|
+
是否处理源数据的时间: 根据实参 time. 用于应对不同场景
|
|
486
|
+
场景1:依赖因子不处理,底层数据是什么就返回什么
|
|
487
|
+
场景2:zoo.load 用来加载测试日内不同时间点的数据,就应该处理
|
|
322
488
|
constraints
|
|
323
489
|
限制条件,比如 asset='000001'
|
|
324
490
|
Returns
|
|
@@ -332,31 +498,79 @@ def load_ds(ds_conf: dict[str, list[Dataset]],
|
|
|
332
498
|
raise ValueError("beg_date must be less than end_date")
|
|
333
499
|
date_list = xcals.get_tradingdays(beg_date, end_date)
|
|
334
500
|
beg_date, end_date = date_list[0], date_list[-1]
|
|
335
|
-
prev_date_list = xcals.get_tradingdays(xcals.shift_tradeday(beg_date, -1),
|
|
501
|
+
prev_date_list = xcals.get_tradingdays(xcals.shift_tradeday(beg_date, -1),
|
|
502
|
+
xcals.shift_tradeday(end_date, -1))
|
|
336
503
|
prev_date_mapping = {prev_date: date_list[i] for i, prev_date in enumerate(prev_date_list)}
|
|
337
504
|
results = defaultdict(list)
|
|
338
|
-
|
|
505
|
+
index = ("date", "time", "asset")
|
|
506
|
+
_index = ("date", "asset")
|
|
507
|
+
with ygo.pool(n_jobs=n_jobs,
|
|
508
|
+
backend=backend,
|
|
509
|
+
show_progress=show_progress) as go:
|
|
339
510
|
for data_name, ds_list in ds_conf.items():
|
|
340
511
|
for ds in ds_list:
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
512
|
+
_data_name = f"{data_name}:{ds.tb}"
|
|
513
|
+
for time in times:
|
|
514
|
+
go.submit(loader,
|
|
515
|
+
job_name="Loading",
|
|
516
|
+
postfix=data_name, )(data_name=_data_name,
|
|
517
|
+
ds=ds,
|
|
518
|
+
date_list=date_list,
|
|
519
|
+
prev_date_list=prev_date_list,
|
|
520
|
+
prev_date_mapping=prev_date_mapping,
|
|
521
|
+
time=time,
|
|
522
|
+
process_time=process_time,
|
|
523
|
+
**constraints)
|
|
350
524
|
for name, lf in go.do():
|
|
351
525
|
results[name].append(lf)
|
|
352
|
-
|
|
353
|
-
|
|
526
|
+
# _LFs = {
|
|
527
|
+
# name: (pl.concat(lfList, )
|
|
528
|
+
# .select(*index,
|
|
529
|
+
# cs.exclude(index))
|
|
530
|
+
# )
|
|
531
|
+
# for name, lfList in results.items()}
|
|
532
|
+
_LFs_with_time = {}
|
|
533
|
+
_LFs_without_time = {}
|
|
534
|
+
for name, lfList in results.items():
|
|
535
|
+
lf = pl.concat(lfList)
|
|
536
|
+
# print(lf)
|
|
537
|
+
if "time" not in lf.collect_schema().names():
|
|
538
|
+
_LFs_without_time[name] = lf
|
|
539
|
+
else:
|
|
540
|
+
_LFs_with_time[name] = lf
|
|
541
|
+
LFs_with_time = defaultdict(list)
|
|
542
|
+
LFs_without_time = defaultdict(list)
|
|
543
|
+
for name, lf in _LFs_with_time.items():
|
|
544
|
+
dn, _ = name.split(":")
|
|
545
|
+
LFs_with_time[dn].append(lf)
|
|
546
|
+
for name, lf in _LFs_without_time.items():
|
|
547
|
+
dn, _ = name.split(":")
|
|
548
|
+
LFs_without_time[dn].append(lf)
|
|
549
|
+
LFs_with_time = {
|
|
354
550
|
name: (pl.concat(lfList, how="align")
|
|
355
551
|
.sort(index)
|
|
356
552
|
.select(*index,
|
|
357
553
|
cs.exclude(index))
|
|
358
554
|
)
|
|
359
|
-
for name, lfList in
|
|
555
|
+
for name, lfList in LFs_with_time.items()}
|
|
556
|
+
LFs_without_time = {
|
|
557
|
+
name: (pl.concat(lfList, how="align")
|
|
558
|
+
.sort(_index)
|
|
559
|
+
.select(*_index,
|
|
560
|
+
cs.exclude(_index))
|
|
561
|
+
)
|
|
562
|
+
for name, lfList in LFs_without_time.items()}
|
|
563
|
+
dns = list(LFs_with_time.keys()) if LFs_with_time else list(LFs_without_time.keys())
|
|
564
|
+
LFs = dict()
|
|
565
|
+
for dn in dns:
|
|
566
|
+
_lf_with_time = LFs_with_time.get(dn)
|
|
567
|
+
_lf_without_time = LFs_without_time.get(dn)
|
|
568
|
+
if _lf_with_time is not None:
|
|
569
|
+
LFs[dn] = _lf_with_time
|
|
570
|
+
if _lf_without_time is not None:
|
|
571
|
+
LFs[dn] = LFs[dn].join(_lf_without_time, on=["date", "asset"], how="left")
|
|
572
|
+
else:
|
|
573
|
+
LFs[dn] = _lf_without_time
|
|
360
574
|
if not eager:
|
|
361
575
|
return LFs
|
|
362
576
|
return {
|
|
@@ -364,21 +578,20 @@ def load_ds(ds_conf: dict[str, list[Dataset]],
|
|
|
364
578
|
for name, lf in LFs.items()
|
|
365
579
|
}
|
|
366
580
|
|
|
367
|
-
|
|
368
581
|
class DataLoader:
|
|
369
582
|
|
|
370
583
|
def __init__(self, name: str):
|
|
371
584
|
self._name = name
|
|
372
|
-
self._lf: pl.LazyFrame = None
|
|
373
|
-
self._df: pl.DataFrame = None
|
|
374
585
|
self._index: tuple[str] = ("date", "time", "asset")
|
|
375
|
-
self.
|
|
376
|
-
self.
|
|
586
|
+
self._df: pl.LazyFrame | pl.DataFrame = None
|
|
587
|
+
# self._db: QDF = None
|
|
377
588
|
|
|
378
589
|
def get(self,
|
|
379
590
|
ds_list: list[Dataset],
|
|
380
591
|
beg_date: str,
|
|
381
592
|
end_date: str,
|
|
593
|
+
times: list[str],
|
|
594
|
+
eager: bool = False,
|
|
382
595
|
n_jobs: int = 11,
|
|
383
596
|
backend: Literal["threading", "multiprocessing", "loky"] = "threading",
|
|
384
597
|
**constraints):
|
|
@@ -389,6 +602,9 @@ class DataLoader:
|
|
|
389
602
|
ds_list: list[Dataset]
|
|
390
603
|
beg_date: str
|
|
391
604
|
end_date: str
|
|
605
|
+
times: list[str]
|
|
606
|
+
加载的时间列表
|
|
607
|
+
eager: bool
|
|
392
608
|
n_jobs: int
|
|
393
609
|
backend: str
|
|
394
610
|
constraints
|
|
@@ -402,41 +618,25 @@ class DataLoader:
|
|
|
402
618
|
end_date=end_date,
|
|
403
619
|
n_jobs=n_jobs,
|
|
404
620
|
backend=backend,
|
|
405
|
-
|
|
621
|
+
times=times,
|
|
622
|
+
eager=eager,
|
|
623
|
+
process_time=True,
|
|
406
624
|
**constraints)
|
|
407
|
-
self.
|
|
408
|
-
self._df = None
|
|
409
|
-
self._db = from_polars(self._lf, self._index, align=True)
|
|
410
|
-
dateList = xcals.get_tradingdays(beg_date, end_date)
|
|
411
|
-
_data_name = f"{self._name}(one_day)"
|
|
412
|
-
self._one = load_ds(ds_conf={_data_name: ds_list},
|
|
413
|
-
beg_date=dateList[0],
|
|
414
|
-
end_date=dateList[0],
|
|
415
|
-
n_jobs=n_jobs,
|
|
416
|
-
backend=backend,
|
|
417
|
-
eager=False,
|
|
418
|
-
**constraints).get(_data_name).collect()
|
|
625
|
+
self._df = lf[self._name]
|
|
419
626
|
|
|
420
627
|
@property
|
|
421
628
|
def name(self) -> str:
|
|
422
629
|
return self._name
|
|
423
630
|
|
|
424
631
|
@property
|
|
425
|
-
def
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
def schema(self) -> pl.Schema:
|
|
430
|
-
return self._one.schema
|
|
431
|
-
|
|
432
|
-
@property
|
|
433
|
-
def columns(self) -> list[str]:
|
|
434
|
-
return self._one.columns
|
|
435
|
-
|
|
436
|
-
def collect(self) -> pl.DataFrame:
|
|
437
|
-
if self._df is None:
|
|
438
|
-
self._df = self._lf.collect()
|
|
632
|
+
def data(self) -> pl.DataFrame | None:
|
|
633
|
+
"""返回全量数据"""
|
|
634
|
+
if isinstance(self._df, pl.LazyFrame):
|
|
635
|
+
self._df = self._df.collect()
|
|
439
636
|
return self._df
|
|
440
637
|
|
|
441
|
-
def
|
|
442
|
-
|
|
638
|
+
def add_data(self, df: pl.DataFrame | pl.LazyFrame):
|
|
639
|
+
"""添加dataframe, index 保持为原有的 _df.index"""
|
|
640
|
+
if isinstance(df, pl.LazyFrame):
|
|
641
|
+
df = df.collect()
|
|
642
|
+
self._df = pl.concat([self._df, df], how="align").sort(self._index)
|
lidb/decorator.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# Copyright (c) ZhangYundi.
|
|
2
|
+
# Licensed under the MIT License.
|
|
3
|
+
# Created on 2025/12/31 10:58
|
|
4
|
+
# Description:
|
|
5
|
+
|
|
6
|
+
from .dataset import Dataset
|
|
7
|
+
from typing import Callable, TypeVar, cast
|
|
8
|
+
|
|
9
|
+
F = TypeVar('F', bound=Callable)
|
|
10
|
+
|
|
11
|
+
def dataset(*depends: Dataset,
|
|
12
|
+
tb: str = "",
|
|
13
|
+
update_time: str = "",
|
|
14
|
+
window: str = "1d",
|
|
15
|
+
partitions: list[str] = None,
|
|
16
|
+
is_hft: bool = False) -> Callable[[F], Dataset]:
|
|
17
|
+
"""
|
|
18
|
+
装饰器:将函数转换为Dataset对象
|
|
19
|
+
|
|
20
|
+
Parameters
|
|
21
|
+
----------
|
|
22
|
+
depends: Dataset
|
|
23
|
+
底层依赖数据集
|
|
24
|
+
tb: str
|
|
25
|
+
数据集保存表格, 如果没有指定,默认 {DEFAULT_DS_PATH}/
|
|
26
|
+
update_time: str
|
|
27
|
+
更新时间: 默认没有-实时更新,也就是可以取到当天值
|
|
28
|
+
window: str
|
|
29
|
+
配合depends使用,在取depends时,会回看window周期,最小单位为`d`。不足 `d` 的会往上取整为`1d`
|
|
30
|
+
partitions: list[str]
|
|
31
|
+
分区: 如果指定为 None, 则自动从 fn 参数推断,如果不需要分区,应该将其设定为空列表: []
|
|
32
|
+
is_hft: bool
|
|
33
|
+
是否是高频数据,如果是,则会按照asset进行分区存储,默认 False
|
|
34
|
+
hft定义为:时间步长 < 1min
|
|
35
|
+
"""
|
|
36
|
+
def decorator(fn: F):
|
|
37
|
+
# 创建Dataset实例
|
|
38
|
+
ds = Dataset(
|
|
39
|
+
*depends,
|
|
40
|
+
fn=fn,
|
|
41
|
+
tb=tb,
|
|
42
|
+
update_time=update_time,
|
|
43
|
+
window=window,
|
|
44
|
+
partitions=partitions,
|
|
45
|
+
is_hft=is_hft,
|
|
46
|
+
data_name=fn.__name__,
|
|
47
|
+
frame=1
|
|
48
|
+
)
|
|
49
|
+
return ds
|
|
50
|
+
return decorator
|
lidb/init.py
CHANGED
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from dynaconf import Dynaconf
|
|
8
8
|
import logair
|
|
9
|
+
import os
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
USERHOME = Path("~").expanduser() # 用户家目录
|
|
@@ -22,8 +23,7 @@ if not CONFIG_PATH.exists():
|
|
|
22
23
|
except Exception as e:
|
|
23
24
|
logger.error(f"Failed to create settings file: {e}")
|
|
24
25
|
with open(CONFIG_PATH, "w") as f:
|
|
25
|
-
template_content = f'[
|
|
26
|
-
with open(CONFIG_PATH, "w") as f:
|
|
26
|
+
template_content = f'[GLOBAL]\npath="{DB_PATH}"\n\n[POLARS]\nmax_threads=32\n'
|
|
27
27
|
f.write(template_content)
|
|
28
28
|
logger.info(f"Settings file created: {CONFIG_PATH}")
|
|
29
29
|
|
|
@@ -38,5 +38,8 @@ def get_settings():
|
|
|
38
38
|
_settiings = get_settings()
|
|
39
39
|
if _settiings is not None:
|
|
40
40
|
setting_db_path = _settiings.get(f"global.path", "")
|
|
41
|
+
# 配置 polars
|
|
42
|
+
setting_polars_threads = _settiings.get("polars.max_threads", 32)
|
|
43
|
+
os.environ["POLARS_MAX_THREADS"] = str(setting_polars_threads)
|
|
41
44
|
if setting_db_path:
|
|
42
45
|
DB_PATH = Path(setting_db_path)
|
lidb/parse.py
CHANGED
|
@@ -86,6 +86,10 @@ def parse_hive_partition_structure(root_path: Path | str, file_pattern: str = "*
|
|
|
86
86
|
partition_combinations = set()
|
|
87
87
|
|
|
88
88
|
for file_path in root_path.rglob(file_pattern):
|
|
89
|
+
if file_path.stat().st_size == 0:
|
|
90
|
+
# 删除
|
|
91
|
+
file_path.unlink()
|
|
92
|
+
continue
|
|
89
93
|
relative_path = file_path.relative_to(root_path)
|
|
90
94
|
|
|
91
95
|
# 收集分区信息
|
lidb/qdf/qdf.py
CHANGED
|
@@ -118,7 +118,7 @@ class QDF:
|
|
|
118
118
|
except Exception as error:
|
|
119
119
|
raise CompileError(message=f"{e.fn_name}({', '.join([str(arg) for arg in args])})\n{error}") from error
|
|
120
120
|
|
|
121
|
-
def sql(self, *exprs: str, show_progress: bool = False) -> pl.DataFrame:
|
|
121
|
+
def sql(self, *exprs: str, show_progress: bool = False, leave: bool = False) -> pl.DataFrame:
|
|
122
122
|
"""
|
|
123
123
|
表达式查询
|
|
124
124
|
Parameters
|
|
@@ -127,6 +127,8 @@ class QDF:
|
|
|
127
127
|
表达式,比如 "ts_mean(close, 5) as close_ma5"
|
|
128
128
|
show_progress: bool
|
|
129
129
|
是否展示进度条
|
|
130
|
+
leave: bool
|
|
131
|
+
是否保留进度条
|
|
130
132
|
Returns
|
|
131
133
|
-------
|
|
132
134
|
polars.DataFrame
|
|
@@ -146,7 +148,7 @@ class QDF:
|
|
|
146
148
|
pbar = None
|
|
147
149
|
lvl_num = len(lvls)
|
|
148
150
|
if show_progress:
|
|
149
|
-
pbar = tqdm(total=lvl_num, desc=f"{len(exprs)}")
|
|
151
|
+
pbar = tqdm(total=lvl_num, desc=f"{len(exprs)}", leave=leave)
|
|
150
152
|
for i, batch_exprs in enumerate(lvls):
|
|
151
153
|
if show_progress:
|
|
152
154
|
pbar.set_postfix_str(f"level-{i + 1}:{len(batch_exprs)}")
|
lidb/table.py
CHANGED
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
|
+
import sys
|
|
8
9
|
from collections.abc import Callable
|
|
9
10
|
from enum import Enum
|
|
10
11
|
|
|
@@ -24,8 +25,6 @@ class TableMode(Enum):
|
|
|
24
25
|
F = "full" # 全量更新
|
|
25
26
|
I = "increment" # 增量更新
|
|
26
27
|
|
|
27
|
-
|
|
28
|
-
|
|
29
28
|
class Table:
|
|
30
29
|
|
|
31
30
|
def __init__(self,
|
|
@@ -58,10 +57,10 @@ class Table:
|
|
|
58
57
|
"""获取数据并且保存数据"""
|
|
59
58
|
data = ygo.delay(self.fn)(this=self)()
|
|
60
59
|
if data is None:
|
|
61
|
-
self.logger.error("No data.")
|
|
60
|
+
self.logger.error(f"{self.tb}: No data.")
|
|
62
61
|
return
|
|
63
62
|
if data.is_empty():
|
|
64
|
-
self.logger.warning("No data.")
|
|
63
|
+
self.logger.warning(f"{self.tb}: No data.")
|
|
65
64
|
return
|
|
66
65
|
if self.mode == TableMode.I:
|
|
67
66
|
time_uuid = uuid.uuid1()
|
|
@@ -77,33 +76,43 @@ class Table:
|
|
|
77
76
|
def update(self, verbose: bool = False):
|
|
78
77
|
"""更新最新数据: 全量更新, 覆盖旧数据"""
|
|
79
78
|
self.verbose = verbose
|
|
79
|
+
if self._need_update(date=xcals.today()):
|
|
80
|
+
self._log("Updating.", "info")
|
|
81
|
+
self._do_job()
|
|
82
|
+
|
|
83
|
+
def _need_update(self, date: str) -> bool:
|
|
84
|
+
"""是否需要更新"""
|
|
80
85
|
existed = self._data_dir.exists()
|
|
81
86
|
if not existed:
|
|
82
87
|
self._data_dir.mkdir(parents=True, exist_ok=True)
|
|
83
|
-
|
|
84
|
-
self._do_job()
|
|
88
|
+
return True
|
|
85
89
|
else:
|
|
86
90
|
modified_time = self.modified_time
|
|
87
91
|
if modified_time is not None:
|
|
88
92
|
modified_datetime = modified_time.strftime("%Y-%m-%d %H:%M:%S")
|
|
89
93
|
modified_d, modified_t = modified_datetime.split(" ")
|
|
90
|
-
if self._updated(data_date=modified_d, data_time=modified_t):
|
|
91
|
-
return
|
|
92
|
-
|
|
93
|
-
self._do_job()
|
|
94
|
-
self._log("Updated.", "info")
|
|
94
|
+
if self._updated(date, data_date=modified_d, data_time=modified_t):
|
|
95
|
+
return False
|
|
96
|
+
return True
|
|
95
97
|
|
|
96
|
-
def get_value(self, eager: bool = True) -> pl.DataFrame | pl.LazyFrame:
|
|
98
|
+
def get_value(self, date: str, eager: bool = True) -> pl.DataFrame | pl.LazyFrame:
|
|
97
99
|
"""获取数据"""
|
|
98
|
-
self.update(verbose=True)
|
|
100
|
+
# self.update(verbose=True)
|
|
101
|
+
if not date:
|
|
102
|
+
date = xcals.today()
|
|
103
|
+
self.verbose = True
|
|
104
|
+
if self._need_update(date):
|
|
105
|
+
self._log("Update first plz.", "warning")
|
|
106
|
+
sys.exit()
|
|
107
|
+
|
|
99
108
|
df = scan(self._data_dir)
|
|
100
109
|
if eager:
|
|
101
110
|
return df.collect()
|
|
102
111
|
return df
|
|
103
112
|
|
|
104
|
-
def _updated(self, data_date: str, data_time: str) -> bool:
|
|
105
|
-
"""
|
|
106
|
-
recent_tradeday = xcals.get_recent_tradeday()
|
|
113
|
+
def _updated(self, date: str, data_date: str, data_time: str) -> bool:
|
|
114
|
+
"""判断是否已经更新数据"""
|
|
115
|
+
recent_tradeday = xcals.get_recent_tradeday(date)
|
|
107
116
|
prev_tradeday = xcals.shift_tradeday(recent_tradeday, -1)
|
|
108
117
|
now = xcals.now()
|
|
109
118
|
latest_update_date = recent_tradeday if now >= self.update_time else prev_tradeday
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lidb
|
|
3
|
+
Version: 2.0.6
|
|
4
|
+
Summary: Light database for quantor
|
|
5
|
+
Requires-Python: >=3.12
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: dynaconf>=3.2.11
|
|
8
|
+
Requires-Dist: polars>=1.31.0
|
|
9
|
+
Requires-Dist: sqlparse>=0.5.3
|
|
10
|
+
Requires-Dist: logair>=1.0.8
|
|
11
|
+
Requires-Dist: clickhouse-df>=0.1.5
|
|
12
|
+
Requires-Dist: connectorx>=0.4.3
|
|
13
|
+
Requires-Dist: pymysql>=1.1.2
|
|
14
|
+
Requires-Dist: xcals>=0.0.4
|
|
15
|
+
Requires-Dist: ygo>=1.2.9
|
|
16
|
+
Requires-Dist: lark>=1.3.1
|
|
17
|
+
Requires-Dist: numpy>=2.3.1
|
|
18
|
+
Requires-Dist: tqdm>=4.67.1
|
|
19
|
+
Requires-Dist: varname>=0.15.1
|
|
20
|
+
|
|
21
|
+
## lidb
|
|
22
|
+
|
|
23
|
+
### 项目简介
|
|
24
|
+
lidb 是一个基于 Polars 的数据管理和分析库,专为金融量化研究设计。它提供了高效的数据存储、查询和表达式计算功能,支持多种时间序列和横截面数据分析操作。
|
|
25
|
+
|
|
26
|
+
### 功能特性
|
|
27
|
+
- **多数据源支持**: 本地 Parquet 存储、MySQL、ClickHouse 等数据库连接
|
|
28
|
+
- **高效数据存储**: 基于 Parquet 格式的分区存储机制
|
|
29
|
+
- **SQL 查询接口**: 支持标准 SQL 语法进行数据查询
|
|
30
|
+
- **表达式计算引擎**: 提供丰富的 UDF 函数库,包括时间序列、横截面、维度等分析函数
|
|
31
|
+
- **数据集管理**: 自动化数据补全、历史数据加载和 PIT(Point-in-Time)数据处理
|
|
32
|
+
- **数据服务**: 在后台异步预加载并缓存数据,适用于数据密集型任务(如大量标的的高频数据)
|
|
33
|
+
|
|
34
|
+
### 安装
|
|
35
|
+
```bash
|
|
36
|
+
pip install -U lidb
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### 快速开始
|
|
40
|
+
|
|
41
|
+
#### 基础数据操作
|
|
42
|
+
```python
|
|
43
|
+
import lidb
|
|
44
|
+
import polars as pl
|
|
45
|
+
|
|
46
|
+
df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
|
|
47
|
+
|
|
48
|
+
# 写入数据
|
|
49
|
+
lidb.put(df, "my_table")
|
|
50
|
+
|
|
51
|
+
# sql 查询
|
|
52
|
+
res = lidb.sql("select * from my_table;")
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
#### 数据集使用
|
|
56
|
+
```python
|
|
57
|
+
import lidb
|
|
58
|
+
from lidb import Dataset, dataset
|
|
59
|
+
import polars as pl
|
|
60
|
+
|
|
61
|
+
# 定义一个tick级别的高频数据集: 高频成交量
|
|
62
|
+
def hft_vol(date: str, num: int) -> pl.DataFrame | pl.LazyFrame | None:
|
|
63
|
+
# 假设上游tick行情表在clickhouse
|
|
64
|
+
quote_query = f"select * from quote where date = '{date}'"
|
|
65
|
+
quote = lidb.read_ck(quote_query, db_conf="databases.ck")
|
|
66
|
+
# 特征计算: 比如过去20根tick的成交量总和, 使用表达式引擎计算
|
|
67
|
+
return lidb.from_polars(quote).sql(f"itd_sum(volume, {num}) as vol_s20")
|
|
68
|
+
|
|
69
|
+
ds_hft_vol = Dataset(fn=hft_vol,
|
|
70
|
+
tb="path/to/hft_vol",
|
|
71
|
+
partitions=["num"], # 默认值 None, 会自动识别 num
|
|
72
|
+
update_time="", # 实时更新
|
|
73
|
+
is_hft=True, # 根据asset_id进行分区
|
|
74
|
+
)(num=20)
|
|
75
|
+
|
|
76
|
+
# 获取历史数据
|
|
77
|
+
history_data = ds_hft_vol.get_history(["2023-01-01", "2023-01-02", ...])
|
|
78
|
+
|
|
79
|
+
# 更加便捷的创建数据集方式:通过dataset装饰器
|
|
80
|
+
@dataset()
|
|
81
|
+
def hft_vol(date: str, num: int) -> pl.DataFrame | pl.LazyFrame | None:
|
|
82
|
+
# 假设上游tick行情表在clickhouse
|
|
83
|
+
quote_query = f"select * from quote where date = '{date}'"
|
|
84
|
+
quote = lidb.read_ck(quote_query, db_conf="databases.ck")
|
|
85
|
+
# 特征计算: 比如过去20根tick的成交量总和, 使用表达式引擎计算
|
|
86
|
+
return lidb.from_polars(quote).sql(f"itd_sum(volume, {num}) as vol_s20")
|
|
87
|
+
|
|
88
|
+
hft_vol.get_value("2025-05-15")
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
#### `Table`
|
|
92
|
+
除了 `Dataset` 类用于管理复杂的、可分区的历史数据集之外,lidb 还提供了一个更轻量级的 `Table` 类。
|
|
93
|
+
它适用于那些不需要复杂分区逻辑,且通常以单一文件形式存储的表格数据。`Table` 类同样支持基于更新时间的自动化数据管理和加载。
|
|
94
|
+
##### 特性
|
|
95
|
+
- **简化数据管理**: 专为单表数据设计,无需复杂的分区结构。
|
|
96
|
+
- **灵活更新策略**:
|
|
97
|
+
- **全量更新(`TableMode.F`)**: 每次更新时覆盖旧数据,仅保留最新的数据文件(0.parquet)。
|
|
98
|
+
- **增量更新(`TableMode.I`)**: 每次更新时生成一个新的带时间戳的文件(`<uuid>.parquet`),保留历史版本。
|
|
99
|
+
- **自动更新检查**: 根据设定的 `update_time` 和文件修改时间,自动判断是否需要更新数据。
|
|
100
|
+
|
|
101
|
+
##### 使用示例
|
|
102
|
+
```python
|
|
103
|
+
from lidb import Table, TableMode
|
|
104
|
+
import polars as pl
|
|
105
|
+
|
|
106
|
+
# 1. 定义一个数据获取函数
|
|
107
|
+
def fetch_latest_stock_list() -> pl.DataFrame:
|
|
108
|
+
# 模拟从某个API或数据库获取最新的股票列表
|
|
109
|
+
import time
|
|
110
|
+
time.sleep(1) # 模拟网络延迟
|
|
111
|
+
return pl.DataFrame({
|
|
112
|
+
"symbol": ["AAPL", "GOOGL", "MSFT"],
|
|
113
|
+
"name": ["Apple Inc.", "Alphabet Inc.", "Microsoft Corp."],
|
|
114
|
+
"sector": ["Technology", "Communication Services", "Technology"]
|
|
115
|
+
})
|
|
116
|
+
|
|
117
|
+
# 2. 创建 Table 实例
|
|
118
|
+
# 假设此表每天上午9点更新
|
|
119
|
+
stock_list_table = Table(
|
|
120
|
+
fn=fetch_latest_stock_list,
|
|
121
|
+
tb="stock_list",
|
|
122
|
+
update_time="09:00:00",
|
|
123
|
+
mode=TableMode.F # 使用全量更新模式
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
# 3. 更新数据 (首次使用或数据过期时必须先执行;get_value 只检查并提示,不会自动更新)
|
|
127
|
+
# stock_list_table.update(verbose=True)
|
|
128
|
+
|
|
129
|
+
# 4. 获取数据
|
|
130
|
+
# 如果数据过期,get_value 会打印警告并退出,提示先调用 update()
|
|
131
|
+
df = stock_list_table.get_value(date="2023-10-27")
|
|
132
|
+
print(df)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
#### 表达式计算
|
|
137
|
+
```python
|
|
138
|
+
import lidb
|
|
139
|
+
|
|
140
|
+
date = "2025-05-15"
|
|
141
|
+
quote_query = f"select * from quote where date = '{date}'"
|
|
142
|
+
quote = lidb.read_ck(quote_query, db_conf="databases.ck")
|
|
143
|
+
|
|
144
|
+
qdf = lidb.from_polars(quote)
|
|
145
|
+
|
|
146
|
+
# 使用 QDF 进行表达式计算
|
|
147
|
+
res = qdf.sql(
|
|
148
|
+
"ts_mean(close, 5) as c_m5",
|
|
149
|
+
"cs_rank(volume) as vol_rank",
|
|
150
|
+
)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
#### 数据服务
|
|
154
|
+
lidb 提供了一个名为 `D` 的全局 `DataService` 实例。
|
|
155
|
+
它用于在后台线程中预加载数据并缓存,从而提升数据密集型任务的性能。
|
|
156
|
+
这对于需要提前准备大量数据的应用非常有用,例如回测系统或实时数据处理流水线。
|
|
157
|
+
##### 启动数据服务
|
|
158
|
+
你可以通过调用 `D.start()` 方法来启动数据服务,指定一个数据加载函数、需要加载的键列表以及迭代配置。
|
|
159
|
+
```python
|
|
160
|
+
from lidb import D
|
|
161
|
+
import polars as pl
|
|
162
|
+
|
|
163
|
+
# 定义一个模拟的数据加载函数
|
|
164
|
+
def mock_data_loader(key: str, iterables: list[str]) -> pl.DataFrame:
|
|
165
|
+
# 模拟耗时操作
|
|
166
|
+
import time
|
|
167
|
+
time.sleep(1)
|
|
168
|
+
|
|
169
|
+
# 返回简单的 DataFrame 示例
|
|
170
|
+
return pl.DataFrame({
|
|
171
|
+
"key": [key],
|
|
172
|
+
"value": [sum(len(s) for s in iterables)]
|
|
173
|
+
})
|
|
174
|
+
|
|
175
|
+
# 启动数据服务
|
|
176
|
+
D.start(
|
|
177
|
+
fn=mock_data_loader,
|
|
178
|
+
keys=["2023-01-01", "2023-01-02", "2023-01-03"],
|
|
179
|
+
iter_conf={"data_source_a": ["a", "b"], "data_source_b": ["x", "y"]},
|
|
180
|
+
max_cache_size=3
|
|
181
|
+
)
|
|
182
|
+
```
|
|
183
|
+
##### 消费数据
|
|
184
|
+
一旦数据服务启动,你就可以通过 `D.do()` 来消费已加载的数据。
|
|
185
|
+
这个方法接受一个消费者函数作为参数,每当有新数据可用时就会被调用。
|
|
186
|
+
```python
|
|
187
|
+
def data_consumer(data_package: dict):
|
|
188
|
+
print(f"Consumed data for key: {data_package['key']}")
|
|
189
|
+
for name, df in data_package['data'].items():
|
|
190
|
+
print(f" Data from {name}:")
|
|
191
|
+
print(df)
|
|
192
|
+
|
|
193
|
+
# 开始消费数据
|
|
194
|
+
D.do(consumer=data_consumer, wait_secs=1)
|
|
195
|
+
```
|
|
196
|
+
##### 停止数据服务
|
|
197
|
+
当你需要停止数据服务时,你可以调用 `D.stop()` 方法。
|
|
198
|
+
##### 完整示例
|
|
199
|
+
以下是一个完整的示例,演示了如何使用 D 进行异步数据加载与消费:
|
|
200
|
+
```python
|
|
201
|
+
import lidb
|
|
202
|
+
from lidb import D
|
|
203
|
+
import polars as pl
|
|
204
|
+
import time
|
|
205
|
+
|
|
206
|
+
def fetch_market_data(key: str, iterables: list[str]) -> pl.DataFrame:
|
|
207
|
+
# 模拟网络请求或复杂计算
|
|
208
|
+
time.sleep(0.5)
|
|
209
|
+
return pl.DataFrame({
|
|
210
|
+
"date": [key],
|
|
211
|
+
"symbol_count": [len(iterables)],
|
|
212
|
+
"total_volume": [sum(ord(c) for s in iterables for c in s)] # Dummy volume
|
|
213
|
+
})
|
|
214
|
+
|
|
215
|
+
# 启动服务
|
|
216
|
+
D.start(
|
|
217
|
+
fn=fetch_market_data,
|
|
218
|
+
keys=["2023-01-01", "2023-01-02", "2023-01-03"],
|
|
219
|
+
iter_conf={"symbols": ["AAPL", "GOOGL", "MSFT"]},
|
|
220
|
+
max_cache_size=2
|
|
221
|
+
)
|
|
222
|
+
|
|
223
|
+
# 消费者函数
|
|
224
|
+
def handle_data(data_package: dict):
|
|
225
|
+
print(f"\nReceived data for {data_package['key']}:")
|
|
226
|
+
print(data_package['data']['market_data'])
|
|
227
|
+
|
|
228
|
+
# 启动消费过程
|
|
229
|
+
try:
|
|
230
|
+
D.do(consumer=handle_data, wait_secs=1)
|
|
231
|
+
except KeyboardInterrupt:
|
|
232
|
+
print("\nShutting down data service...")
|
|
233
|
+
finally:
|
|
234
|
+
D.stop()
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
### 核心模块
|
|
238
|
+
|
|
239
|
+
#### 数据库操作(`database.py`)
|
|
240
|
+
- `put`: 将 `polars.DataFrame` 写入指定表
|
|
241
|
+
- `sql`: 执行 `SQL` 查询
|
|
242
|
+
- `has`: 检查表是否存在
|
|
243
|
+
- `read_mysql`,`write_mysql`: mysql 数据读写
|
|
244
|
+
- `read_ck`: clickhouse 数据读取
|
|
245
|
+
|
|
246
|
+
#### 数据服务(`svc/data.py`)
|
|
247
|
+
- `DataService`: 数据服务管理
|
|
248
|
+
- `D`: `DataService` 全局实例
|
|
249
|
+
|
|
250
|
+
#### 数据集管理(`dataset.py`)
|
|
251
|
+
- `Dataset`: 数据集定义和管理
|
|
252
|
+
- `DataLoader`: 数据加载器
|
|
253
|
+
|
|
254
|
+
#### 表达式计算(`qdf/`)
|
|
255
|
+
- `QDF`: 表达式数据库
|
|
256
|
+
- `Expr`: 表达式解析器
|
|
257
|
+
- `UDF 函数库`:
|
|
258
|
+
- `base_udf`: 基础运算函数
|
|
259
|
+
- `ts_udf`: 时间序列函数
|
|
260
|
+
- `cs_udf`: 横截面函数
|
|
261
|
+
- `d_udf`: 日期维度函数
|
|
262
|
+
- `itd_udf`: 日内函数
|
|
263
|
+
|
|
264
|
+
#### 配置管理(`init.py`)
|
|
265
|
+
- 自动创建配置文件
|
|
266
|
+
- 支持自定义数据存储路径
|
|
267
|
+
- `polars` 线程配置
|
|
268
|
+
#### 配置说明
|
|
269
|
+
首次运行会在 `~/.config/lidb/settings.toml` 创建配置文件:
|
|
270
|
+
```toml
|
|
271
|
+
[GLOBAL]
|
|
272
|
+
path = "~/lidb" # 数据存储路径
|
|
273
|
+
|
|
274
|
+
[POLARS]
|
|
275
|
+
max_threads = 32 # Polars 最大线程数
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
### 许可证
|
|
279
|
+
本项目采用 MIT 许可证,详见项目根目录下的许可证文件。
|
|
280
|
+
|
|
281
|
+
### 联系方式
|
|
282
|
+
Zhangyundi - yundi.xxii@outlook.com
|
|
@@ -1,15 +1,16 @@
|
|
|
1
|
-
lidb/__init__.py,sha256
|
|
1
|
+
lidb/__init__.py,sha256=WuGdkD4QzcCkIG3zbXupaXJV0b3o8gvaMGhs6MhVa_c,536
|
|
2
2
|
lidb/database.py,sha256=DnPXRXvUO6g0kuMo3LPl6eKo_HbD3JNW1qzoaJ14Sgo,7533
|
|
3
|
-
lidb/dataset.py,sha256=
|
|
4
|
-
lidb/
|
|
5
|
-
lidb/
|
|
6
|
-
lidb/
|
|
3
|
+
lidb/dataset.py,sha256=rZGUmvRwaIdynWbTFF-D1fPE1NyAbhDLVxJ3J0y1MYo,24363
|
|
4
|
+
lidb/decorator.py,sha256=bFnUPcJED6F95nBxHq1a8j5pM2JF9rjFtNvxIQUs9_I,1605
|
|
5
|
+
lidb/init.py,sha256=N_PiBGZO3hKUhQQYzly3GKHgSf4eJVO7xyxjX-chUpQ,1327
|
|
6
|
+
lidb/parse.py,sha256=6awnc14OK7XBkkSrAJFOCZOQ0JUHmm6yDI9F3kkLwcQ,3494
|
|
7
|
+
lidb/table.py,sha256=NeqOU0EJU3DA0yz-1T2GVLpKASu1_1fdOLK3yxf7DtA,4494
|
|
7
8
|
lidb/qdf/__init__.py,sha256=gYiSxijoPQZmbgATQX4GsutjolPpN82Kea0eQz6zGyg,1037
|
|
8
9
|
lidb/qdf/errors.py,sha256=lJhhjDRdQOOKUFGlLQ9ELK4AexXBwYQSYus_V-kc5K8,1180
|
|
9
10
|
lidb/qdf/expr.py,sha256=kBzXwjL_PVsJUL9FIHJ2W_G_OVRqFR-kS2mUHTt9thM,10412
|
|
10
11
|
lidb/qdf/lazy.py,sha256=I08IvSkSC84qJkgtZ7nwvG_4UH07jaHBKRp7qQnwqbs,6937
|
|
11
12
|
lidb/qdf/lazy2.py,sha256=ADKQaxmo-BlndhLY-idWCFypZF1icxKNHNMWEfmWy-Q,6294
|
|
12
|
-
lidb/qdf/qdf.py,sha256=
|
|
13
|
+
lidb/qdf/qdf.py,sha256=UWG9G1GI0YdG4dMz5uTV731ETEcZelHqnb0QUGrmHPM,6324
|
|
13
14
|
lidb/qdf/udf/__init__.py,sha256=yIySmkWjtJ-Lj_PMP5O4EnXGDjMAPQL40NmFCekKXBw,313
|
|
14
15
|
lidb/qdf/udf/base_udf.py,sha256=ZjRF2UIrZFgznbm1gxFpdf4V92oO84IaakLeeSNF44U,3444
|
|
15
16
|
lidb/qdf/udf/cs_udf.py,sha256=qlBZd2c1enIdGp_DrNyQWzH3cth4ZpLBIE1hGZuJXbA,3528
|
|
@@ -18,7 +19,7 @@ lidb/qdf/udf/itd_udf.py,sha256=O_OOdSTEaeCoqjtlKnpvNF-_10QoamJL_tw2xEZCYVw,6747
|
|
|
18
19
|
lidb/qdf/udf/ts_udf.py,sha256=Ag6-ffhmIugkA-st2QY-GP4hclQZcRG8SB-bVa7k5cc,5674
|
|
19
20
|
lidb/svc/__init__.py,sha256=9vQo7gCm5LRgWSiq_UU2hlbwvXi0FlGYt2UDVZixx_U,141
|
|
20
21
|
lidb/svc/data.py,sha256=tLOI_YylnsVejyqv9l-KgPetkPO0QzybOf1PEeFSZNI,4380
|
|
21
|
-
lidb-
|
|
22
|
-
lidb-
|
|
23
|
-
lidb-
|
|
24
|
-
lidb-
|
|
22
|
+
lidb-2.0.6.dist-info/METADATA,sha256=ldndXJNXi7y_k1rh5fRPbBVF4a97LqRykzW2gEk8lEM,9087
|
|
23
|
+
lidb-2.0.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
24
|
+
lidb-2.0.6.dist-info/top_level.txt,sha256=NgXJNwt6ld6oLXtW1vOPaEh-VO5R0JEX_KmGIJR4ueE,5
|
|
25
|
+
lidb-2.0.6.dist-info/RECORD,,
|
lidb-1.2.0.dist-info/METADATA
DELETED
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: lidb
|
|
3
|
-
Version: 1.2.0
|
|
4
|
-
Summary: Light database for quantor
|
|
5
|
-
Requires-Python: >=3.12
|
|
6
|
-
Description-Content-Type: text/markdown
|
|
7
|
-
Requires-Dist: dynaconf>=3.2.11
|
|
8
|
-
Requires-Dist: polars>=1.31.0
|
|
9
|
-
Requires-Dist: sqlparse>=0.5.3
|
|
10
|
-
Requires-Dist: logair>=1.0.1
|
|
11
|
-
Requires-Dist: clickhouse-df>=0.1.5
|
|
12
|
-
Requires-Dist: connectorx>=0.4.3
|
|
13
|
-
Requires-Dist: pymysql>=1.1.2
|
|
14
|
-
Requires-Dist: xcals>=0.0.4
|
|
15
|
-
Requires-Dist: ygo>=1.2.8
|
|
16
|
-
Requires-Dist: lark>=1.3.1
|
|
17
|
-
Requires-Dist: numpy>=2.3.1
|
|
18
|
-
Requires-Dist: tqdm>=4.67.1
|
|
File without changes
|
|
File without changes
|