lidb 1.3.6__py3-none-any.whl → 2.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lidb/__init__.py CHANGED
@@ -22,9 +22,10 @@ from .database import (
 
 from .table import Table, TableMode
 from .dataset import Dataset, DataLoader
+from .decorator import dataset
 from .qdf import from_polars, Expr
 from .svc import DataService, D
 
 from .parse import parse_hive_partition_structure
 
-__version__ = "1.3.6"
+__version__ = "2.0.6"
lidb/dataset.py CHANGED
@@ -21,6 +21,7 @@ from varname import varname
 
 from .database import put, tb_path, scan, DB_PATH
 from .parse import parse_hive_partition_structure
+import inspect
 
 DEFAULT_DS_PATH = DB_PATH / "datasets"
 
@@ -50,6 +51,9 @@ def complete_data(fn, date, save_path, partitions):
         else:
             data = data.cast({"date": pl.Utf8})
             data = data.filter(date=date)
+        if "time" in data.columns:
+            if data["time"].n_unique() < 2:
+                data = data.drop("time")
         put(data, save_path, partitions=partitions)
     except Exception as e:
         logger.error(f"{save_path}: Error when complete data for {date}\n", exc_info=e)
@@ -65,6 +69,7 @@ class Dataset:
                  window: str = "1d",
                  partitions: list[str] = None,
                  is_hft: bool = False,
+                 data_name: str = "",
                  frame: int = 1):
         """
 
@@ -75,16 +80,22 @@ class Dataset:
         fn: str
             Dataset computation function. If the underlying dependency datasets are used, the parameter `depend` must be explicitly declared.
         tb: str
-            Table where the dataset is saved; if not specified, defaults to {lidb.DB_PATH}/datasets/
+            Table where the dataset is saved; if not specified, defaults to {lidb.DB_PATH}/datasets/<module>
         update_time: str
             Update time. Empty by default, meaning real-time updates, i.e. the current day's value is available.
+            Only three kinds of update time are allowed:
+            - 1. Pre-market times, e.g. 08:00:00, 09:00:00, 09:15:00 ...
+            - 2. Intraday times: treated as real-time updates, represented by the empty string ""
+            - 3. Post-market times, e.g. 15:00:00, 16:30:00, 20:00:00 ...
         partitions: list[str]
-            Partitions
+            Partitions. If None, they are inferred automatically from the parameters of fn; if no partitioning is needed, set this to an empty list: []
         is_hft: bool
             Whether this is high-frequency data; if so, storage is partitioned by asset. Defaults to False.
             HFT is defined as: time step < 1min
         window: str
             Used together with depends: when fetching depends, the window period is looked back over, with `d` as the smallest unit. Anything below `d` is rounded up to `1d`.
+        data_name: str
+            Data name. Empty by default, in which case it is inferred automatically; if specified, the given name is used.
         frame: int
             Used to automatically infer the data name
         """
@@ -95,13 +106,19 @@ class Dataset:
         self._is_depend = "depend" in self.fn_params_sig and len(self._depends) > 0
         self._is_hft = is_hft
         self._frame = frame
-        self.data_name = ""
-        try:
-            self.data_name = varname(frame, strict=False)
-        except Exception as e:
-            pass
+        self.data_name = data_name
+        if not self.data_name:
+            try:
+                self.data_name = varname(frame, strict=False)
+            except Exception as e:
+                pass
         if self.data_name:
             self.data_name = self.data_name.replace('ds_', '')
+        fn_params = ygo.fn_params(self.fn)
+        self.fn_params = {k: v for (k, v) in fn_params}
+        # Propagate same-named parameters to the underlying dependency datasets
+        self._update_depends()
+
 
         if pd.Timedelta(window).days < 1:
             window = "1d"
         window_td = pd.Timedelta(window)
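Reviewer note: the name fallback relies on the varname package: when `data_name` is not supplied, the name is read from the assignment target at construction time and any `ds_` prefix is stripped. A standalone illustration of that inference, outside the library:

```python
from varname import varname

class Named:
    def __init__(self):
        try:
            # one frame up is the assignment line `ds_close = Named()`
            self.data_name = varname(1, strict=False)
        except Exception:
            self.data_name = ""
        if self.data_name:
            self.data_name = self.data_name.replace('ds_', '')

ds_close = Named()
print(ds_close.data_name)  # -> "close"
```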
@@ -120,17 +137,18 @@ class Dataset:
         partitions = [*partitions, *self._append_partitions]
         self.partitions = partitions
         self._type_asset = "asset" in self.fn_params_sig
+        if "09:30:00" < update_time < "15:00:00":
+            update_time = ""
         self.update_time = update_time
         # Adjust update_time according to the underlying dependencies
-        if self._depends:
+        if update_time and self._depends:
             dep_ut = [ds.update_time for ds in self._depends]
             dep_ut.append(update_time)
             self.update_time = max(dep_ut)
-
-        self.tb = tb if tb else DEFAULT_DS_PATH / f"{self.data_name}"
+        mod = inspect.getmodule(fn)
+        self.tb = tb if tb else DEFAULT_DS_PATH / mod.__name__ / f"{self.data_name}"
         self.save_path = tb_path(self.tb)
-        fn_params = ygo.fn_params(self.fn)
-        self.fn_params = {k: v for (k, v) in fn_params}
+
 
         self.constraints = dict()
         for k in self.partitions[:-len(self._append_partitions)]:
@@ -140,12 +158,20 @@ class Dataset:
                 self.constraints[k] = v
                 self.save_path = self.save_path / f"{k}={v}"
 
+    def _update_depends(self):
+        new_deps = list()
+        for dep in self._depends:
+            new_dep = dep(**self.fn_params)
+            new_deps.append(new_dep)
+        self._depends = new_deps
+
     def is_empty(self, path) -> bool:
         return not any(path.rglob("*.parquet"))
 
     def __call__(self, *fn_args, **fn_kwargs):
-        # self.fn =
-        fn = partial(self.fn, *fn_args, **fn_kwargs)
+        """Binding arguments also propagates same-named parameters to the dependency datasets."""
+
+        fn = ygo.delay(self.fn)(*fn_args, **fn_kwargs)
         ds = Dataset(*self._depends,
                      fn=fn,
                      tb=self.tb,
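Reviewer note: `_update_depends` and the reworked `__call__` make parameter binding recursive: binding an argument on a dataset rebinds same-named parameters on every dependency, because each `dep(**self.fn_params)` goes through `Dataset.__call__` again. A simplified standalone model of that propagation (not the library's classes):

```python
class Node:
    def __init__(self, *deps, params=None):
        self.params = dict(params or {})
        self.deps = [d(**self.params) for d in deps]  # same-named params flow down

    def __call__(self, **kwargs):
        # rebinding returns a new node; known keys override, unknown ones are dropped
        merged = {**self.params, **{k: v for k, v in kwargs.items() if k in self.params}}
        return Node(*self.deps, params=merged)

base = Node(params={"num": 10})
top = Node(base, params={"num": 10})
print(top(num=20).deps[0].params)  # {'num': 20}: the dependency was rebound too
```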
@@ -212,7 +238,7 @@ class Dataset:
         fn = self.fn
         save_path = self.save_path
         if self._is_depend:
-            fn = partial(fn, depend=self._get_depends(date))
+            fn = partial(fn, depend=self._get_depends(date,))
         else:
             fn = partial(fn, date=date)
         if self._type_asset:
@@ -288,7 +314,7 @@ class Dataset:
             _end_date = max(missing_dates)
             _beg_date = min(missing_dates)
             if self._days > 1:
-                _beg_date = xcals.shift_tradeday(_beg_date, -self._days)
+                _beg_date = xcals.shift_tradeday(_beg_date, -(self._days-1))
             _depend_dates = xcals.get_tradingdays(_beg_date, _end_date)
             for depend in self._depends:
                 depend.get_history(_depend_dates, eager=False)
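Reviewer note: this fixes an off-by-one in the look-back window. For a window of `n` days ending on date D, the range must start `n-1` trading days before D; shifting by `-n` loaded one extra day. A runnable illustration with a gap-free stand-in for `xcals.shift_tradeday` (the real function walks an exchange calendar):

```python
from datetime import date, timedelta

def shift_tradeday(d: date, n: int) -> date:  # hypothetical stand-in
    return d + timedelta(days=n)

days = 3  # window = "3d"
end = date(2024, 1, 10)
old_beg = shift_tradeday(end, -days)        # 2024-01-07: four dates inclusive
new_beg = shift_tradeday(end, -(days - 1))  # 2024-01-08: exactly three dates
assert (end - new_beg).days + 1 == days
```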
@@ -333,19 +359,22 @@ class Dataset:
             return data.collect()
         return data
 
-    def _get_depends(self, date: str) -> pl.DataFrame | None:
+    def _get_depends(self, date: str) -> pl.LazyFrame | None:
         # Fetch data from the dependency datasets
         if not self._depends:
             return None
         end_date = date
-        beg_date = xcals.shift_tradeday(date, -self._days)
+        beg_date = date
+        if self._days > 1:
+            beg_date = xcals.shift_tradeday(beg_date, -(self._days-1))
         params = {
             "ds_conf": dict(depend=self._depends),
             "beg_date": beg_date,
             "end_date": end_date,
-            "times": [self.update_time],
+            "times": [self.update_time, ],
             "show_progress": False,
-            "eager": True,
+            "eager": False,
+            "process_time": False,  # do not process time
         }
         res = load_ds(**params)
         return res["depend"]
@@ -357,20 +386,54 @@ def loader(data_name: str,
            prev_date_list: list[str],
            prev_date_mapping: dict[str, str],
            time: str,
+           process_time: bool,
            **constraints) -> pl.LazyFrame:
-    if time < ds.update_time:
-        if len(prev_date_list) > 1:
-            lf = ds.get_history(prev_date_list, eager=False, **constraints)
+    """
+    Parameters
+    ----------
+    data_name
+    ds
+    date_list
+    prev_date_list
+    prev_date_mapping
+    time
+    process_time: bool
+        Whether to process the time of the source data, based on the actual `time` argument. Covers different scenarios:
+        Scenario 1: dependency factors are not processed; whatever the underlying data holds is returned as-is
+        Scenario 2: zoo.load, which loads data at different intraday times for testing, should process it
+    constraints
+
+    Returns
+    -------
+
+
+    """
+    if time:
+        if time < ds.update_time:
+            if len(prev_date_list) > 1:
+                lf = ds.get_history(prev_date_list, eager=False, **constraints)
+            else:
+                lf = ds.get_value(prev_date_list[0], eager=False, **constraints)
         else:
-            lf = ds.get_value(prev_date_list[0], eager=False, **constraints)
+            if len(date_list) > 1:
+                lf = ds.get_history(date_list, eager=False, **constraints)
+            else:
+                lf = ds.get_value(date_list[0], eager=False, **constraints)
     else:
-        if len(date_list) > 1:
-            lf = ds.get_history(date_list, eager=False, **constraints)
+        if ds.update_time > "09:30:00":
+            # Post-market factor: take the previous day's value
+            if len(prev_date_list) > 1:
+                lf = ds.get_history(prev_date_list, eager=False, **constraints)
+            else:
+                lf = ds.get_value(prev_date_list[0], eager=False, **constraints)
         else:
-            lf = ds.get_value(date_list[0], eager=False, **constraints)
+            if len(date_list) > 1:
+                lf = ds.get_history(date_list, eager=False, **constraints)
+            else:
+                lf = ds.get_value(date_list[0], eager=False, **constraints)
+
     schema = lf.collect_schema()
     include_time = schema.get("time") is not None
-    if time:
+    if process_time and time:
         if include_time:
             lf = lf.filter(time=time)
         else:
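Reviewer note: the branch structure above reduces to one question, namely which dates can be read without look-ahead. A hedged summary as a small pure function (not the library's API; empty strings model real-time datasets):

```python
def pick_dates(time: str, update_time: str,
               date_list: list[str], prev_date_list: list[str]) -> list[str]:
    if time:
        # explicit query time: same-day data is visible once update_time has passed
        return prev_date_list if time < update_time else date_list
    # no query time: post-market datasets lag one day, real-time ones do not
    return prev_date_list if update_time > "09:30:00" else date_list

assert pick_dates("10:00:00", "15:00:00", ["d1"], ["d0"]) == ["d0"]  # post-market factor queried intraday
assert pick_dates("", "", ["d1"], ["d0"]) == ["d1"]                  # real-time dataset
```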
@@ -396,6 +459,7 @@ def load_ds(ds_conf: dict[str, list[Dataset]],
             backend: Literal["threading", "multiprocessing", "loky"] = "threading",
             show_progress: bool = True,
             eager: bool = False,
+            process_time: bool = True,
             **constraints) -> dict[str, pl.DataFrame | pl.LazyFrame]:
     """
     Load datasets
@@ -417,6 +481,10 @@ def load_ds(ds_conf: dict[str, list[Dataset]],
         Whether to return a DataFrame
         - True: return a DataFrame
         - False: return a LazyFrame
+    process_time: bool
+        Whether to process the time of the source data, based on the actual `time` argument. Covers different scenarios:
+        Scenario 1: dependency factors are not processed; whatever the underlying data holds is returned as-is
+        Scenario 2: zoo.load, which loads data at different intraday times for testing, should process it
     constraints
         Constraints, e.g. asset='000001'
     Returns
@@ -435,6 +503,7 @@ def load_ds(ds_conf: dict[str, list[Dataset]],
     prev_date_mapping = {prev_date: date_list[i] for i, prev_date in enumerate(prev_date_list)}
     results = defaultdict(list)
     index = ("date", "time", "asset")
+    _index = ("date", "asset")
     with ygo.pool(n_jobs=n_jobs,
                   backend=backend,
                   show_progress=show_progress) as go:
@@ -450,27 +519,58 @@ def load_ds(ds_conf: dict[str, list[Dataset]],
                      prev_date_list=prev_date_list,
                      prev_date_mapping=prev_date_mapping,
                      time=time,
+                     process_time=process_time,
                      **constraints)
         for name, lf in go.do():
             results[name].append(lf)
-    _LFs = {
-        name: (pl.concat(lfList, )
-               .select(*index,
-                       cs.exclude(index))
-               )
-        for name, lfList in results.items()}
-    LFs = defaultdict(list)
-    for name, lf in _LFs.items():
+    # _LFs = {
+    #     name: (pl.concat(lfList, )
+    #            .select(*index,
+    #                    cs.exclude(index))
+    #            )
+    #     for name, lfList in results.items()}
+    _LFs_with_time = {}
+    _LFs_without_time = {}
+    for name, lfList in results.items():
+        lf = pl.concat(lfList)
+        # print(lf)
+        if "time" not in lf.collect_schema().names():
+            _LFs_without_time[name] = lf
+        else:
+            _LFs_with_time[name] = lf
+    LFs_with_time = defaultdict(list)
+    LFs_without_time = defaultdict(list)
+    for name, lf in _LFs_with_time.items():
         dn, _ = name.split(":")
-        LFs[dn].append(lf)
-    LFs = {
+        LFs_with_time[dn].append(lf)
+    for name, lf in _LFs_without_time.items():
+        dn, _ = name.split(":")
+        LFs_without_time[dn].append(lf)
+    LFs_with_time = {
         name: (pl.concat(lfList, how="align")
                .sort(index)
                .select(*index,
                        cs.exclude(index))
                )
-        for name, lfList in LFs.items()}
-
+        for name, lfList in LFs_with_time.items()}
+    LFs_without_time = {
+        name: (pl.concat(lfList, how="align")
+               .sort(_index)
+               .select(*_index,
+                       cs.exclude(_index))
+               )
+        for name, lfList in LFs_without_time.items()}
+    dns = list(LFs_with_time.keys()) if LFs_with_time else list(LFs_without_time.keys())
+    LFs = dict()
+    for dn in dns:
+        _lf_with_time = LFs_with_time.get(dn)
+        _lf_without_time = LFs_without_time.get(dn)
+        if _lf_with_time is not None:
+            LFs[dn] = _lf_with_time
+            if _lf_without_time is not None:
+                LFs[dn] = LFs[dn].join(_lf_without_time, on=["date", "asset"], how="left")
+        else:
+            LFs[dn] = _lf_without_time
     if not eager:
         return LFs
     return {
@@ -478,7 +578,6 @@ def load_ds(ds_conf: dict[str, list[Dataset]],
         for name, lf in LFs.items()
     }
 
-
 class DataLoader:
 
     def __init__(self, name: str):
@@ -521,6 +620,7 @@ class DataLoader:
                     backend=backend,
                     times=times,
                     eager=eager,
+                    process_time=True,
                     **constraints)
         self._df = lf[self._name]
 
lidb/decorator.py ADDED
@@ -0,0 +1,50 @@
+# Copyright (c) ZhangYundi.
+# Licensed under the MIT License.
+# Created on 2025/12/31 10:58
+# Description:
+
+from .dataset import Dataset
+from typing import Callable, TypeVar, cast
+
+F = TypeVar('F', bound=Callable)
+
+def dataset(*depends: Dataset,
+            tb: str = "",
+            update_time: str = "",
+            window: str = "1d",
+            partitions: list[str] = None,
+            is_hft: bool = False) -> Callable[[F], Dataset]:
+    """
+    Decorator: converts a function into a Dataset object
+
+    Parameters
+    ----------
+    depends: Dataset
+        Underlying dependency datasets
+    tb: str
+        Table where the dataset is saved; if not specified, defaults to {DEFAULT_DS_PATH}/
+    update_time: str
+        Update time. Empty by default, meaning real-time updates, i.e. the current day's value is available.
+    window: str
+        Used together with depends: when fetching depends, the window period is looked back over, with `d` as the smallest unit. Anything below `d` is rounded up to `1d`.
+    partitions: list[str]
+        Partitions. If None, they are inferred automatically from the parameters of fn; if no partitioning is needed, set this to an empty list: []
+    is_hft: bool
+        Whether this is high-frequency data; if so, storage is partitioned by asset. Defaults to False.
+        HFT is defined as: time step < 1min
+    """
+    def decorator(fn: F):
+        # Create the Dataset instance
+        ds = Dataset(
+            *depends,
+            fn=fn,
+            tb=tb,
+            update_time=update_time,
+            window=window,
+            partitions=partitions,
+            is_hft=is_hft,
+            data_name=fn.__name__,
+            frame=1
+        )
+        return ds
+    return decorator
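Reviewer note: since the decorator passes `data_name=fn.__name__`, the decorated function is replaced by a Dataset addressed by the function's own name. A hedged usage sketch (`my_factor` and its columns are hypothetical):

```python
import polars as pl
from lidb import dataset

@dataset(update_time="15:00:00", partitions=[])
def my_factor(date: str) -> pl.DataFrame:
    # hypothetical post-market factor computed for a single date
    return pl.DataFrame({"date": [date], "asset": ["000001"], "val": [1.0]})

# my_factor is now a Dataset named "my_factor", not a function
df = my_factor.get_value("2025-05-15")
```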
lidb.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lidb
-Version: 1.3.6
+Version: 2.0.6
 Summary: Light database for quantor
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
@@ -55,7 +55,7 @@ res = lidb.sql("select * from my_table;")
 #### Dataset usage
 ```python
 import lidb
-from lidb import Dataset
+from lidb import Dataset, dataset
 import polars as pl
 
 # Define a tick-level high-frequency dataset: high-frequency volume
@@ -68,13 +68,24 @@ def hft_vol(date: str, num: int) -> pl.DataFrame | pl.LazyFrame | None:
 
 ds_hft_vol = Dataset(fn=hft_vol,
                      tb="path/to/hft_vol",
-                     partitions=["num"],
+                     partitions=["num"],  # default None, in which case num is detected automatically
                      update_time="",  # real-time updates
-                     by_asset=True,  # partition by asset_id
+                     is_hft=True,  # partition by asset_id
                      )(num=20)
 
 # Fetch historical data
 history_data = ds_hft_vol.get_history(["2023-01-01", "2023-01-02", ...])
+
+# A more convenient way to create a dataset: the dataset decorator
+@dataset()
+def hft_vol(date: str, num: int) -> pl.DataFrame | pl.LazyFrame | None:
+    # Assume the upstream tick quote table lives in ClickHouse
+    quote_query = f"select * from quote where date = '{date}'"
+    quote = lidb.read_ck(quote_query, db_conf="databases.ck")
+    # Feature computation: e.g. the sum of volume over the past 20 ticks, computed with the expression engine
+    return lidb.from_polars(quote).sql(f"itd_sum(volume, {num}) as vol_s20")
+
+hft_vol.get_value("2025-05-15")
 ```
 
 #### `Table`
@@ -239,7 +250,6 @@ finally:
 #### Dataset management (`dataset.py`)
 - `Dataset`: dataset definition and management
 - `DataLoader`: data loader
-- `zoo`: alpha factor data management
 
 #### Expression computation (`qdf/`)
 - `QDF`: expression database
lidb.dist-info/RECORD CHANGED
@@ -1,6 +1,7 @@
-lidb/__init__.py,sha256=tcPBOs0UltwU3tCqjIBHrklITYQcAeWkdrV3_SYCu1I,505
+lidb/__init__.py,sha256=WuGdkD4QzcCkIG3zbXupaXJV0b3o8gvaMGhs6MhVa_c,536
 lidb/database.py,sha256=DnPXRXvUO6g0kuMo3LPl6eKo_HbD3JNW1qzoaJ14Sgo,7533
-lidb/dataset.py,sha256=hC2D2uJ7xV6yRB-j9TOYxb0aqZQME_5_BbXND2NPBK0,20254
+lidb/dataset.py,sha256=rZGUmvRwaIdynWbTFF-D1fPE1NyAbhDLVxJ3J0y1MYo,24363
+lidb/decorator.py,sha256=bFnUPcJED6F95nBxHq1a8j5pM2JF9rjFtNvxIQUs9_I,1605
 lidb/init.py,sha256=N_PiBGZO3hKUhQQYzly3GKHgSf4eJVO7xyxjX-chUpQ,1327
 lidb/parse.py,sha256=6awnc14OK7XBkkSrAJFOCZOQ0JUHmm6yDI9F3kkLwcQ,3494
 lidb/table.py,sha256=NeqOU0EJU3DA0yz-1T2GVLpKASu1_1fdOLK3yxf7DtA,4494
@@ -18,7 +19,7 @@ lidb/qdf/udf/itd_udf.py,sha256=O_OOdSTEaeCoqjtlKnpvNF-_10QoamJL_tw2xEZCYVw,6747
 lidb/qdf/udf/ts_udf.py,sha256=Ag6-ffhmIugkA-st2QY-GP4hclQZcRG8SB-bVa7k5cc,5674
 lidb/svc/__init__.py,sha256=9vQo7gCm5LRgWSiq_UU2hlbwvXi0FlGYt2UDVZixx_U,141
 lidb/svc/data.py,sha256=tLOI_YylnsVejyqv9l-KgPetkPO0QzybOf1PEeFSZNI,4380
-lidb-1.3.6.dist-info/METADATA,sha256=0f7wFU6CZwD_jiqmJjzc_HNCx48mKA24_JBUREiEfSs,8558
-lidb-1.3.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-lidb-1.3.6.dist-info/top_level.txt,sha256=NgXJNwt6ld6oLXtW1vOPaEh-VO5R0JEX_KmGIJR4ueE,5
-lidb-1.3.6.dist-info/RECORD,,
+lidb-2.0.6.dist-info/METADATA,sha256=ldndXJNXi7y_k1rh5fRPbBVF4a97LqRykzW2gEk8lEM,9087
+lidb-2.0.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+lidb-2.0.6.dist-info/top_level.txt,sha256=NgXJNwt6ld6oLXtW1vOPaEh-VO5R0JEX_KmGIJR4ueE,5
+lidb-2.0.6.dist-info/RECORD,,