lidb 1.2.0__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lidb/__init__.py CHANGED
@@ -27,4 +27,4 @@ from .svc import DataService, D
 
 from .parse import parse_hive_partition_structure
 
-__version__ = "1.2.0"
+__version__ = "1.3.6"
lidb/dataset.py CHANGED
@@ -5,21 +5,24 @@
 
 from __future__ import annotations
 
+import shutil
 from collections import defaultdict
 from enum import Enum
 from functools import partial
 from typing import Callable, Literal
 
 import logair
+import pandas as pd
 import polars as pl
 import polars.selectors as cs
 import xcals
 import ygo
+from varname import varname
 
 from .database import put, tb_path, scan, DB_PATH
 from .parse import parse_hive_partition_structure
-from .qdf import QDF, from_polars
 
+DEFAULT_DS_PATH = DB_PATH / "datasets"
 
 class InstrumentType(Enum):
     STOCK = "Stock"  # stock
@@ -30,7 +33,7 @@ class InstrumentType(Enum):
 def complete_data(fn, date, save_path, partitions):
     logger = logair.get_logger(__name__)
     try:
-        data = fn(date=date)
+        data = fn()
         if data is None:
             # the data-saving logic is implemented inside fn
             return
@@ -44,57 +47,88 @@ def complete_data(fn, date, save_path, partitions):
         cols = data.columns
         if "date" not in cols:
             data = data.with_columns(pl.lit(date).alias("date")).select("date", *cols)
-
+        else:
+            data = data.cast({"date": pl.Utf8})
+            data = data.filter(date=date)
         put(data, save_path, partitions=partitions)
     except Exception as e:
-        logger.error(f"{save_path}: Error when complete data for {date}")
-        logger.warning(e)
+        logger.error(f"{save_path}: Error when complete data for {date}\n", exc_info=e)
 
 
 class Dataset:
 
     def __init__(self,
-                 fn: Callable[..., pl.DataFrame],
-                 tb: str,
+                 *depends: Dataset,
+                 fn: Callable[..., pl.DataFrame | pl.LazyFrame],
+                 tb: str = "",
                  update_time: str = "",
+                 window: str = "1d",
                  partitions: list[str] = None,
-                 by_asset: bool = True,
-                 by_time: bool = False):
         """
 
         Parameters
         ----------
+        depends: Dataset
+            Underlying dependency datasets
         fn: str
-            Dataset computation function
+            Dataset computation function. To use the underlying dependency datasets, the formal parameter `depend` must be declared explicitly
         tb: str
-            Table the dataset is saved to
+            Table the dataset is saved to; if not specified, defaults to {lidb.DB_PATH}/datasets/
         update_time: str
             Update time: empty by default, meaning real-time updates, i.e. the value for the current day can be fetched
         partitions: list[str]
             Partitions
-        by_asset: bool
-            Whether to partition by asset, defaults to True
-        by_time: bool
-            Whether to partition by time, defaults to False
+        is_hft: bool
+            Whether the data is high-frequency; if so, storage is partitioned by asset. Defaults to False
+            HFT is defined as: time step < 1min
+        window: str
+            Used together with depends: when fetching depends, looks back over `window`, whose smallest unit is `d`. Anything shorter than a `d` is rounded up to `1d`
+        frame: int
+            Used to automatically infer the data name
         """
+        self._depends = list(depends)
+        self._name = ""
         self.fn = fn
         self.fn_params_sig = ygo.fn_signature_params(fn)
-        self._by_asset = by_asset
-        self._by_time = by_time
-        self._append_partitions = ["asset", "date"] if by_asset else ["date", ]
-        if by_time:
-            self._append_partitions.append("time")
+        self._is_depend = "depend" in self.fn_params_sig and len(self._depends) > 0
+        self._is_hft = is_hft
+        self._frame = frame
+        self.data_name = ""
+        try:
+            self.data_name = varname(frame, strict=False)
+        except Exception as e:
+            pass
+        if self.data_name:
+            self.data_name = self.data_name.replace('ds_', '')
+        if pd.Timedelta(window).days < 1:
+            window = "1d"
+        window_td = pd.Timedelta(window)
+        self._window = window
+        self._days = window_td.days
+        if window_td.seconds > 0:
+            self._days += 1
+        # Detect high-frequency data: if HFT, partition by asset; HFT means a time step < 60s
+        self._append_partitions = ["asset", "date"] if is_hft else ["date", ]
         if partitions is not None:
             partitions = [k for k in partitions if k not in self._append_partitions]
             partitions = [*partitions, *self._append_partitions]
         else:
-            partitions = self._append_partitions
+            # partitions = self._append_partitions
+            partitions = [k for k in self.fn_params_sig if k not in self._append_partitions and k != "depend"]
+            partitions = [*partitions, *self._append_partitions]
         self.partitions = partitions
         self._type_asset = "asset" in self.fn_params_sig
         self.update_time = update_time
-
-        self.tb = tb
-        self.save_path = tb_path(tb)
+        # Adjust update_time according to the underlying dependencies
+        if self._depends:
+            dep_ut = [ds.update_time for ds in self._depends]
+            dep_ut.append(update_time)
+            self.update_time = max(dep_ut)
+
+        self.tb = tb if tb else DEFAULT_DS_PATH / f"{self.data_name}"
+        self.save_path = tb_path(self.tb)
         fn_params = ygo.fn_params(self.fn)
         self.fn_params = {k: v for (k, v) in fn_params}
         self.constraints = dict()
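
The `window` rounding introduced above can be checked in isolation. A minimal sketch of the same pandas-based rule, separate from the lidb code:

```python
import pandas as pd

def lookback_days(window: str) -> int:
    # Anything shorter than one day is bumped up to "1d",
    # mirroring the Dataset constructor above.
    if pd.Timedelta(window).days < 1:
        window = "1d"
    td = pd.Timedelta(window)
    days = td.days
    if td.seconds > 0:
        days += 1  # partial days round up
    return days

assert lookback_days("30min") == 1
assert lookback_days("36h") == 2
assert lookback_days("3d") == 3
```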
@@ -112,14 +146,21 @@ class Dataset:
     def __call__(self, *fn_args, **fn_kwargs):
         # self.fn =
         fn = partial(self.fn, *fn_args, **fn_kwargs)
-        ds = Dataset(fn=fn,
+        ds = Dataset(*self._depends,
+                     fn=fn,
                      tb=self.tb,
                      partitions=self.partitions,
-                     by_asset=self._by_asset,
-                     by_time=self._by_time,
-                     update_time=self.update_time)
+                     update_time=self.update_time,
+                     is_hft=self._is_hft,
+                     window=self._window,
+                     frame=self._frame+1)
+        ds.data_name = self.data_name
         return ds
 
+    def alias(self, new_name: str):
+        self._name = new_name
+        return self
+
     def get_value(self, date, eager: bool = True, **constraints):
         """
         Fetch values (no guarantee against future data)
@@ -135,6 +176,7 @@ class Dataset:
         -------
 
         """
+        logger = logair.get_logger(f"{__name__}.{self.__class__.__name__}")
         _constraints = {k: v for k, v in constraints.items() if k in self.partitions}
         _limits = {k: v for k, v in constraints.items() if k not in self.partitions}
         search_path = self.save_path
@@ -144,9 +186,22 @@ class Dataset:
             search_path = search_path / f"{k}={v}"
         search_path = search_path / f"date={date}"
 
+        # Handle empty files
+        for file_path in search_path.rglob("*.parquet"):
+            if file_path.stat().st_size == 0:
+                # delete it
+                logger.warning(f"{file_path}: Deleting empty file.")
+                file_path.unlink()
+
         if not self.is_empty(search_path):
             lf = scan(search_path).cast({"date": pl.Utf8})
-            schema = lf.collect_schema()
+            try:
+                schema = lf.collect_schema()
+            except:
+                logger.warning(f"{search_path}: Failed to collect schema.")
+                # remove the directory
+                shutil.rmtree(search_path)
+                return self.get_value(date=date, eager=eager, **constraints)
             _limits = {k: v for k, v in constraints.items() if schema.get(k) is not None}
             lf = lf.filter(date=date, **_limits)
             if not eager:
@@ -156,7 +211,10 @@ class Dataset:
             return data
         fn = self.fn
         save_path = self.save_path
-
+        if self._is_depend:
+            fn = partial(fn, depend=self._get_depends(date))
+        else:
+            fn = partial(fn, date=date)
         if self._type_asset:
             if "asset" in _constraints:
                 fn = ygo.delay(self.fn)(asset=_constraints["asset"])
@@ -169,7 +227,6 @@ class Dataset:
                 params[k] = v
                 save_path = save_path / f"{k}={v}"
             fn = ygo.delay(self.fn)(**params)
-        logger = logair.get_logger(__name__)
 
         today = xcals.today()
         now = xcals.now()
@@ -198,7 +255,7 @@ class Dataset:
     def get_history(self,
                     dateList: list[str],
                     n_jobs: int = 5,
-                    backend: Literal["threading", "multiprocessing", "loky"] = "loky",
+                    backend: Literal["threading", "multiprocessing", "loky"] = "threading",
                     eager: bool = True,
                     rep_asset: str = "000001",  # default: 000001
                     **constraints):
@@ -227,6 +284,14 @@ class Dataset:
         missing_dates = set(dateList).difference(set(exist_dates))
         missing_dates = sorted(list(missing_dates))
         if missing_dates:
+            # First backfill the depends one by one
+            _end_date = max(missing_dates)
+            _beg_date = min(missing_dates)
+            if self._days > 1:
+                _beg_date = xcals.shift_tradeday(_beg_date, -self._days)
+            _depend_dates = xcals.get_tradingdays(_beg_date, _end_date)
+            for depend in self._depends:
+                depend.get_history(_depend_dates, eager=False)
             fn = self.fn
             save_path = self.save_path
 
@@ -250,12 +315,17 @@ class Dataset:
                 except:
                     pass
             for date in missing_dates:
-                go.submit(complete_data, job_name=f"Completing {info_path}")(
-                    fn=fn,
-                    date=date,
-                    save_path=save_path,
-                    partitions=self._append_partitions,
-                )
+                if self._is_depend:
+                    fn = partial(fn, depend=self._get_depends(date))
+                else:
+                    fn = partial(fn, date=date)
+                go.submit(complete_data,
+                          job_name=f"Completing",
+                          postfix=info_path,
+                          leave=False)(fn=fn,
+                                       date=date,
+                                       save_path=save_path,
+                                       partitions=self._append_partitions, )
             go.do()
         data = scan(search_path, ).cast({"date": pl.Utf8}).filter(pl.col("date").is_in(dateList), **constraints)
         data = data.sort("date")
@@ -263,6 +333,23 @@ class Dataset:
             return data.collect()
         return data
 
+    def _get_depends(self, date: str) -> pl.DataFrame | None:
+        # Fetch data from the dependency datasets
+        if not self._depends:
+            return None
+        end_date = date
+        beg_date = xcals.shift_tradeday(date, -self._days)
+        params = {
+            "ds_conf": dict(depend=self._depends),
+            "beg_date": beg_date,
+            "end_date": end_date,
+            "times": [self.update_time],
+            "show_progress": False,
+            "eager": True,
+        }
+        res = load_ds(**params)
+        return res["depend"]
+
 
 
 def loader(data_name: str,
            ds: Dataset,
@@ -283,21 +370,31 @@ def loader(data_name: str,
     lf = ds.get_value(date_list[0], eager=False, **constraints)
     schema = lf.collect_schema()
     include_time = schema.get("time") is not None
-    if include_time:
-        lf = lf.filter(time=time)
-    else:
-        lf = lf.with_columns(time=pl.lit(time))
+    if time:
+        if include_time:
+            lf = lf.filter(time=time)
+        else:
+            lf = lf.with_columns(time=pl.lit(time))
     if time < ds.update_time:
         lf = lf.with_columns(date=pl.col("date").replace(prev_date_mapping))
+    keep = {"date", "time", "asset"}
+    if ds._name:
+        columns = lf.collect_schema().names()
+        rename_cols = set(columns).difference(keep)
+        if len(rename_cols) > 1:
+            lf = lf.rename({k: f"{ds._name}.{k}" for k in rename_cols})
+        else:
+            lf = lf.rename({k: ds._name for k in rename_cols})
     return data_name, lf
 
 
 def load_ds(ds_conf: dict[str, list[Dataset]],
             beg_date: str,
             end_date: str,
-            time: str,
+            times: list[str],
             n_jobs: int = 7,
             backend: Literal["threading", "multiprocessing", "loky"] = "threading",
+            show_progress: bool = True,
             eager: bool = False,
             **constraints) -> dict[str, pl.DataFrame | pl.LazyFrame]:
     """
@@ -310,11 +407,12 @@ def load_ds(ds_conf: dict[str, list[Dataset]],
         Start date
     end_date: str
         End date
-    time: str
+    times: list[str]
         Times at which values are taken
     n_jobs: int
         Number of concurrent workers
     backend: str
+    show_progress: bool
     eager: bool
         Whether to return a DataFrame
         - True: return a DataFrame
@@ -332,31 +430,47 @@ def load_ds(ds_conf: dict[str, list[Dataset]],
        raise ValueError("beg_date must be less than end_date")
    date_list = xcals.get_tradingdays(beg_date, end_date)
    beg_date, end_date = date_list[0], date_list[-1]
-    prev_date_list = xcals.get_tradingdays(xcals.shift_tradeday(beg_date, -1), xcals.shift_tradeday(end_date, -1))
+    prev_date_list = xcals.get_tradingdays(xcals.shift_tradeday(beg_date, -1),
+                                           xcals.shift_tradeday(end_date, -1))
    prev_date_mapping = {prev_date: date_list[i] for i, prev_date in enumerate(prev_date_list)}
    results = defaultdict(list)
-    with ygo.pool(n_jobs=n_jobs, backend=backend) as go:
+    index = ("date", "time", "asset")
+    with ygo.pool(n_jobs=n_jobs,
+                  backend=backend,
+                  show_progress=show_progress) as go:
        for data_name, ds_list in ds_conf.items():
            for ds in ds_list:
-                go.submit(loader,
-                          job_name="Loading",
-                          postfix=data_name)(data_name=data_name,
-                                             ds=ds,
-                                             date_list=date_list,
-                                             prev_date_list=prev_date_list,
-                                             prev_date_mapping=prev_date_mapping,
-                                             time=time,
-                                             **constraints)
+                _data_name = f"{data_name}:{ds.tb}"
+                for time in times:
+                    go.submit(loader,
+                              job_name="Loading",
+                              postfix=data_name, )(data_name=_data_name,
+                                                   ds=ds,
+                                                   date_list=date_list,
+                                                   prev_date_list=prev_date_list,
+                                                   prev_date_mapping=prev_date_mapping,
+                                                   time=time,
+                                                   **constraints)
        for name, lf in go.do():
            results[name].append(lf)
-    index = ("date", "time", "asset")
+    _LFs = {
+        name: (pl.concat(lfList, )
+               .select(*index,
+                       cs.exclude(index))
+               )
+        for name, lfList in results.items()}
+    LFs = defaultdict(list)
+    for name, lf in _LFs.items():
+        dn, _ = name.split(":")
+        LFs[dn].append(lf)
    LFs = {
        name: (pl.concat(lfList, how="align")
               .sort(index)
               .select(*index,
                       cs.exclude(index))
               )
-        for name, lfList in results.items()}
+        for name, lfList in LFs.items()}
+
    if not eager:
        return LFs
    return {
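
With `times` now a list and `alias()` feeding the rename logic in `loader`, a call might look like the following minimal sketch (the `ds_hft_vol` dataset is the hypothetical one from the README):

```python
lfs = load_ds(ds_conf={"features": [ds_hft_vol.alias("vol")]},
              beg_date="2023-01-01",
              end_date="2023-01-31",
              times=["10:00:00", "14:30:00"],  # one loader job per (dataset, time)
              eager=False)
lf = lfs["features"]  # LazyFrame indexed by ("date", "time", "asset");
                      # with the alias set, a single value column is renamed to "vol",
                      # multiple value columns become "vol.<col>"
```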
@@ -369,16 +483,16 @@ class DataLoader:
 
     def __init__(self, name: str):
         self._name = name
-        self._lf: pl.LazyFrame = None
-        self._df: pl.DataFrame = None
         self._index: tuple[str] = ("date", "time", "asset")
-        self._db: QDF = None
-        self._one: pl.DataFrame = None
+        self._df: pl.LazyFrame | pl.DataFrame = None
+        # self._db: QDF = None
 
     def get(self,
             ds_list: list[Dataset],
             beg_date: str,
             end_date: str,
+            times: list[str],
+            eager: bool = False,
             n_jobs: int = 11,
             backend: Literal["threading", "multiprocessing", "loky"] = "threading",
             **constraints):
@@ -389,6 +503,9 @@ class DataLoader:
         ds_list: list[Dataset]
         beg_date: str
         end_date: str
+        times: list[str]
+            List of times to load
+        eager: bool
         n_jobs: int
         backend: str
         constraints
@@ -402,41 +519,24 @@ class DataLoader:
                      end_date=end_date,
                      n_jobs=n_jobs,
                      backend=backend,
-                     eager=False,
+                     times=times,
+                     eager=eager,
                      **constraints)
-        self._lf = lf.get(self._name)
-        self._df = None
-        self._db = from_polars(self._lf, self._index, align=True)
-        dateList = xcals.get_tradingdays(beg_date, end_date)
-        _data_name = f"{self._name}(one_day)"
-        self._one = load_ds(ds_conf={_data_name: ds_list},
-                            beg_date=dateList[0],
-                            end_date=dateList[0],
-                            n_jobs=n_jobs,
-                            backend=backend,
-                            eager=False,
-                            **constraints).get(_data_name).collect()
+        self._df = lf[self._name]
 
     @property
     def name(self) -> str:
         return self._name
 
     @property
-    def one_day(self) -> pl.DataFrame:
-        return self._one
-
-    @property
-    def schema(self) -> pl.Schema:
-        return self._one.schema
-
-    @property
-    def columns(self) -> list[str]:
-        return self._one.columns
-
-    def collect(self) -> pl.DataFrame:
-        if self._df is None:
-            self._df = self._lf.collect()
+    def data(self) -> pl.DataFrame | None:
+        """Return the full data."""
+        if isinstance(self._df, pl.LazyFrame):
+            self._df = self._df.collect()
         return self._df
 
-    def sql(self, *exprs: str) -> pl.DataFrame:
-        return self._db.sql(*exprs)
+    def add_data(self, df: pl.DataFrame | pl.LazyFrame):
+        """Append a dataframe; the index stays the original _df index."""
+        if isinstance(df, pl.LazyFrame):
+            df = df.collect()
+        self._df = pl.concat([self._df, df], how="align").sort(self._index)
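
Taken together, the reworked `DataLoader` defers collection until `data` is first accessed. A minimal usage sketch (dataset and frame names hypothetical):

```python
loader = DataLoader("features")
loader.get(ds_list=[ds_hft_vol],
           beg_date="2023-01-01",
           end_date="2023-01-31",
           times=["10:00:00"],
           eager=False)          # keep a LazyFrame internally
df = loader.data                 # first access collects and caches the DataFrame
loader.add_data(extra_df)        # align-concat another frame on (date, time, asset)
```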
lidb/init.py CHANGED
@@ -6,6 +6,7 @@
 from pathlib import Path
 from dynaconf import Dynaconf
 import logair
+import os
 
 
 USERHOME = Path("~").expanduser()  # user home directory
@@ -22,8 +23,7 @@ if not CONFIG_PATH.exists():
     except Exception as e:
         logger.error(f"Failed to create settings file: {e}")
     with open(CONFIG_PATH, "w") as f:
-        template_content = f'[global]\npath="{DB_PATH}"\n'
-    with open(CONFIG_PATH, "w") as f:
+        template_content = f'[GLOBAL]\npath="{DB_PATH}"\n\n[POLARS]\nmax_threads=32\n'
         f.write(template_content)
     logger.info(f"Settings file created: {CONFIG_PATH}")
 
@@ -38,5 +38,8 @@ def get_settings():
 _settiings = get_settings()
 if _settiings is not None:
     setting_db_path = _settiings.get(f"global.path", "")
+    # Configure polars
+    setting_polars_threads = _settiings.get("polars.max_threads", 32)
+    os.environ["POLARS_MAX_THREADS"] = str(setting_polars_threads)
     if setting_db_path:
         DB_PATH = Path(setting_db_path)
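
Note that `POLARS_MAX_THREADS` only takes effect if it is set before polars is first imported, which is why init.py applies it while reading the settings. A quick check (sketch):

```python
import os
os.environ["POLARS_MAX_THREADS"] = "32"  # must be set before importing polars

import polars as pl
print(pl.thread_pool_size())  # reports the size of the polars thread pool
```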
lidb/parse.py CHANGED
@@ -86,6 +86,10 @@ def parse_hive_partition_structure(root_path: Path | str, file_pattern: str = "*
     partition_combinations = set()
 
     for file_path in root_path.rglob(file_pattern):
+        if file_path.stat().st_size == 0:
+            # delete empty files
+            file_path.unlink()
+            continue
         relative_path = file_path.relative_to(root_path)
 
         # Collect partition info
lidb/qdf/qdf.py CHANGED
@@ -118,7 +118,7 @@ class QDF:
         except Exception as error:
             raise CompileError(message=f"{e.fn_name}({', '.join([str(arg) for arg in args])})\n{error}") from error
 
-    def sql(self, *exprs: str, show_progress: bool = False) -> pl.DataFrame:
+    def sql(self, *exprs: str, show_progress: bool = False, leave: bool = False) -> pl.DataFrame:
         """
         Expression query
         Parameters
@@ -127,6 +127,8 @@ class QDF:
             Expressions, e.g. "ts_mean(close, 5) as close_ma5"
         show_progress: bool
             Whether to show a progress bar
+        leave: bool
+            Whether to keep the progress bar after completion
         Returns
         -------
         polars.DataFrame
@@ -146,7 +148,7 @@ class QDF:
         pbar = None
         lvl_num = len(lvls)
         if show_progress:
-            pbar = tqdm(total=lvl_num, desc=f"{len(exprs)}")
+            pbar = tqdm(total=lvl_num, desc=f"{len(exprs)}", leave=leave)
         for i, batch_exprs in enumerate(lvls):
             if show_progress:
                 pbar.set_postfix_str(f"level-{i + 1}:{len(batch_exprs)}")
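
The new `leave` flag is passed straight through to tqdm, so callers can decide whether the bar stays on screen after the query finishes; for a hypothetical `qdf` instance:

```python
res = qdf.sql("ts_mean(close, 5) as c_m5", show_progress=True, leave=True)
```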
lidb/table.py CHANGED
@@ -5,6 +5,7 @@
 
 from __future__ import annotations
 
+import sys
 from collections.abc import Callable
 from enum import Enum
 
@@ -24,8 +25,6 @@ class TableMode(Enum):
     F = "full"  # full update
     I = "increment"  # incremental update
 
-
-
 class Table:
 
     def __init__(self,
@@ -58,10 +57,10 @@ class Table:
         """Fetch data and save it."""
         data = ygo.delay(self.fn)(this=self)()
         if data is None:
-            self.logger.error("No data.")
+            self.logger.error(f"{self.tb}: No data.")
             return
         if data.is_empty():
-            self.logger.warning("No data.")
+            self.logger.warning(f"{self.tb}: No data.")
             return
         if self.mode == TableMode.I:
             time_uuid = uuid.uuid1()
@@ -77,33 +76,43 @@ class Table:
     def update(self, verbose: bool = False):
         """Update to the latest data: full update, overwriting old data."""
         self.verbose = verbose
+        if self._need_update(date=xcals.today()):
+            self._log("Updating.", "info")
+            self._do_job()
+
+    def _need_update(self, date: str) -> bool:
+        """Whether an update is needed."""
         existed = self._data_dir.exists()
         if not existed:
             self._data_dir.mkdir(parents=True, exist_ok=True)
-            self._log("Creating new data.", "info")
-            self._do_job()
+            return True
         else:
             modified_time = self.modified_time
             if modified_time is not None:
                 modified_datetime = modified_time.strftime("%Y-%m-%d %H:%M:%S")
                 modified_d, modified_t = modified_datetime.split(" ")
-                if self._updated(data_date=modified_d, data_time=modified_t):
-                    return
-            self._log("Updating.", "info")
-            self._do_job()
-            self._log("Updated.", "info")
+                if self._updated(date, data_date=modified_d, data_time=modified_t):
+                    return False
+            return True
 
-    def get_value(self, eager: bool = True) -> pl.DataFrame | pl.LazyFrame:
+    def get_value(self, date: str, eager: bool = True) -> pl.DataFrame | pl.LazyFrame:
         """Fetch data."""
-        self.update(verbose=True)
+        # self.update(verbose=True)
+        if not date:
+            date = xcals.today()
+        self.verbose = True
+        if self._need_update(date):
+            self._log("Update first plz.", "warning")
+            sys.exit()
+
         df = scan(self._data_dir)
         if eager:
             return df.collect()
         return df
 
-    def _updated(self, data_date: str, data_time: str) -> bool:
-        """Check whether the data needs updating"""
-        recent_tradeday = xcals.get_recent_tradeday()
+    def _updated(self, date: str, data_date: str, data_time: str) -> bool:
+        """Check whether the data is already up to date"""
+        recent_tradeday = xcals.get_recent_tradeday(date)
         prev_tradeday = xcals.shift_tradeday(recent_tradeday, -1)
         now = xcals.now()
         latest_update_date = recent_tradeday if now >= self.update_time else prev_tradeday
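
Net effect of the refactor: `update()` now delegates the staleness check to `_need_update()`, and `get_value()` no longer updates implicitly; it warns and exits when the table is stale. A sketch reusing the README's hypothetical table:

```python
stock_list_table.update(verbose=True)               # runs _do_job() only if _need_update() is True
df = stock_list_table.get_value(date="2025-05-15")  # warns and exits if the data is still stale
```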
lidb-1.3.6.dist-info/METADATA ADDED
@@ -0,0 +1,272 @@
Metadata-Version: 2.4
Name: lidb
Version: 1.3.6
Summary: Light database for quantor
Requires-Python: >=3.12
Description-Content-Type: text/markdown
Requires-Dist: dynaconf>=3.2.11
Requires-Dist: polars>=1.31.0
Requires-Dist: sqlparse>=0.5.3
Requires-Dist: logair>=1.0.8
Requires-Dist: clickhouse-df>=0.1.5
Requires-Dist: connectorx>=0.4.3
Requires-Dist: pymysql>=1.1.2
Requires-Dist: xcals>=0.0.4
Requires-Dist: ygo>=1.2.9
Requires-Dist: lark>=1.3.1
Requires-Dist: numpy>=2.3.1
Requires-Dist: tqdm>=4.67.1
Requires-Dist: varname>=0.15.1

## lidb

### Overview
lidb is a Polars-based data management and analysis library designed for quantitative finance research. It provides efficient data storage, querying, and expression computation, and supports a wide range of time-series and cross-sectional analysis operations.

### Features
- **Multiple data sources**: local Parquet storage, plus MySQL, ClickHouse, and other database connections
- **Efficient data storage**: partitioned storage built on the Parquet format
- **SQL query interface**: query data using standard SQL syntax
- **Expression engine**: a rich UDF library with time-series, cross-sectional, dimensional, and other analysis functions
- **Dataset management**: automated data backfilling, historical data loading, and PIT (Point-in-Time) data handling
- **Data service**: asynchronous data loading for data-intensive tasks (e.g. high-frequency data for a large universe of instruments)

### Installation
```bash
pip install -U lidb
```

### Quick start

#### Basic data operations
```python
import lidb
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# Write data
lidb.put(df, "my_table")

# SQL query
res = lidb.sql("select * from my_table;")
```

#### Using datasets
```python
import lidb
from lidb import Dataset
import polars as pl

# Define a tick-level high-frequency dataset: high-frequency volume
def hft_vol(date: str, num: int) -> pl.DataFrame | pl.LazyFrame | None:
    # Assume the upstream tick quote table lives in ClickHouse
    quote_query = f"select * from quote where date = '{date}'"
    quote = lidb.read_ck(quote_query, db_conf="databases.ck")
    # Feature computation, e.g. the volume sum over the last 20 ticks, via the expression engine
    return lidb.from_polars(quote).sql(f"itd_sum(volume, {num}) as vol_s20")

ds_hft_vol = Dataset(fn=hft_vol,
                     tb="path/to/hft_vol",
                     partitions=["num"],
                     update_time="",  # real-time updates
                     is_hft=True,     # high-frequency data, stored partitioned by asset
                     )(num=20)

# Fetch historical data
history_data = ds_hft_vol.get_history(["2023-01-01", "2023-01-02", ...])
```
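
New in 1.3.6, a dataset can also be built on top of other datasets. A minimal sketch, assuming the `ds_hft_vol` dataset above: positional `depends` are passed to the constructor, and the computation function receives their data through an explicitly declared `depend` parameter, looking back over `window`:

```python
# Derived dataset: declare the formal parameter `depend` to receive dependency data
def vol_ma(depend: pl.DataFrame) -> pl.DataFrame:
    return (depend.group_by("asset")
                  .agg(pl.col("vol_s20").mean().alias("vol_ma5")))

ds_vol_ma = Dataset(ds_hft_vol,   # positional dependency
                    fn=vol_ma,
                    window="5d")  # look back 5 trading days when fetching `depend`
```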

#### `Table`
Besides the `Dataset` class for managing complex, partitionable historical datasets, lidb also provides a lighter-weight `Table` class.
It suits tabular data that needs no complex partitioning logic and is usually stored as a single file. `Table` likewise supports automated, update-time-driven data management and loading.
##### Features
- **Simplified data management**: designed for single-table data, with no complex partition structure required.
- **Flexible update strategies**:
  - **Full update (`TableMode.F`)**: each update overwrites the old data, keeping only the latest data file (0.parquet).
  - **Incremental update (`TableMode.I`)**: each update produces a new timestamped file (<uuid>.parquet), preserving historical versions.
- **Automatic update checks**: based on the configured `update_time` and the file modification time, decides automatically whether the data needs updating.

##### Example
```python
from lidb import Table, TableMode
import polars as pl

# 1. Define a data-fetching function
def fetch_latest_stock_list() -> pl.DataFrame:
    # Simulate fetching the latest stock list from an API or database
    import time
    time.sleep(1)  # simulate network latency
    return pl.DataFrame({
        "symbol": ["AAPL", "GOOGL", "MSFT"],
        "name": ["Apple Inc.", "Alphabet Inc.", "Microsoft Corp."],
        "sector": ["Technology", "Communication Services", "Technology"]
    })

# 2. Create the Table instance
# Assume this table is updated every day at 9:00 a.m.
stock_list_table = Table(
    fn=fetch_latest_stock_list,
    tb="stock_list",
    update_time="09:00:00",
    mode=TableMode.F  # full-update mode
)

# 3. Update the data (optional; get_value checks automatically and prompts for an update)
# stock_list_table.update(verbose=True)

# 4. Fetch the data
# If the data is stale, get_value prints a warning and exits, prompting you to call update() first
df = stock_list_table.get_value(date="2023-10-27")
print(df)
```


#### Expression computation
```python
import lidb

date = "2025-05-15"
quote_query = f"select * from quote where date = '{date}'"
quote = lidb.read_ck(quote_query, db_conf="databases.ck")

qdf = lidb.from_polars(quote)

# Expression computation with QDF
res = qdf.sql(
    "ts_mean(close, 5) as c_m5",
    "cs_rank(volume) as vol_rank",
)
```

#### Data service
lidb provides a global `DataService` instance named `D`.
It preloads and caches data in background threads, improving the performance of data-intensive tasks.
This is very useful for applications that must prepare large amounts of data in advance, such as backtesting systems or real-time data-processing pipelines.
##### Starting the data service
Start the data service by calling `D.start()`, specifying a data-loading function, the list of keys to load, and an iteration config.
```python
from lidb import D
import polars as pl

# Define a mock data-loading function
def mock_data_loader(key: str, iterables: list[str]) -> pl.DataFrame:
    # Simulate a slow operation
    import time
    time.sleep(1)

    # Return a simple example DataFrame
    return pl.DataFrame({
        "key": [key],
        "value": [sum(len(s) for s in iterables)]
    })

# Start the data service
D.start(
    fn=mock_data_loader,
    keys=["2023-01-01", "2023-01-02", "2023-01-03"],
    iter_conf={"data_source_a": ["a", "b"], "data_source_b": ["x", "y"]},
    max_cache_size=3
)
```
##### Consuming data
Once the data service is running, consume the loaded data with `D.do()`.
This method takes a consumer function as an argument, which is called whenever new data becomes available.
```python
def data_consumer(data_package: dict):
    print(f"Consumed data for key: {data_package['key']}")
    for name, df in data_package['data'].items():
        print(f"  Data from {name}:")
        print(df)

# Start consuming data
D.do(consumer=data_consumer, wait_secs=1)
```
##### Stopping the data service
When you need to stop the data service, call `D.stop()`.
##### Full example
A complete example showing asynchronous data loading and consumption with D:
```python
import lidb
from lidb import D
import polars as pl
import time

def fetch_market_data(key: str, iterables: list[str]) -> pl.DataFrame:
    # Simulate a network request or a heavy computation
    time.sleep(0.5)
    return pl.DataFrame({
        "date": [key],
        "symbol_count": [len(iterables)],
        "total_volume": [sum(ord(c) for s in iterables for c in s)]  # dummy volume
    })

# Start the service
D.start(
    fn=fetch_market_data,
    keys=["2023-01-01", "2023-01-02", "2023-01-03"],
    iter_conf={"symbols": ["AAPL", "GOOGL", "MSFT"]},
    max_cache_size=2
)

# Consumer function
def handle_data(data_package: dict):
    print(f"\nReceived data for {data_package['key']}:")
    print(data_package['data']['market_data'])

# Start the consumption loop
try:
    D.do(consumer=handle_data, wait_secs=1)
except KeyboardInterrupt:
    print("\nShutting down data service...")
finally:
    D.stop()
```

### Core modules

#### Database operations (`database.py`)
- `put`: write a `polars.DataFrame` to a given table
- `sql`: run a `SQL` query
- `has`: check whether a table exists
- `read_mysql`, `write_mysql`: MySQL read/write
- `read_ck`: ClickHouse reads

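A short round-trip with these helpers (a sketch; the `has` call is assumed from its description above):

```python
import lidb
import polars as pl

df = pl.DataFrame({"date": ["2025-05-15"], "close": [10.2]})
lidb.put(df, "demo_table")            # write to the local Parquet store
if lidb.has("demo_table"):            # check that the table exists
    res = lidb.sql("select * from demo_table;")
```
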
#### Data service (`svc/data.py`)
- `DataService`: data-service management
- `D`: the global `DataService` instance

#### Dataset management (`dataset.py`)
- `Dataset`: dataset definition and management
- `DataLoader`: data loader
- `zoo`: alpha-factor data management

#### Expression computation (`qdf/`)
- `QDF`: expression database
- `Expr`: expression parser
- `UDF libraries`:
  - `base_udf`: basic arithmetic functions
  - `ts_udf`: time-series functions
  - `cs_udf`: cross-sectional functions
  - `d_udf`: date-dimension functions
  - `itd_udf`: intraday functions

#### Configuration management (`init.py`)
- Creates the settings file automatically
- Supports a custom data storage path
- `polars` thread configuration

#### Configuration
The first run creates a settings file at `~/.config/lidb/settings.toml`:
```toml
[GLOBAL]
path = "~/lidb"   # data storage path

[POLARS]
max_threads = 32  # maximum number of Polars threads
```

### License
This project is released under the MIT License; see the project root directory.

### Contact
Zhangyundi - yundi.xxii@outlook.com
lidb-1.2.0.dist-info/RECORD → lidb-1.3.6.dist-info/RECORD CHANGED
@@ -1,15 +1,15 @@
-lidb/__init__.py,sha256=-EUd5pO1o7cBs__gvTsxquuHRBpiVn7mVGkL9miyc2k,504
+lidb/__init__.py,sha256=tcPBOs0UltwU3tCqjIBHrklITYQcAeWkdrV3_SYCu1I,505
 lidb/database.py,sha256=DnPXRXvUO6g0kuMo3LPl6eKo_HbD3JNW1qzoaJ14Sgo,7533
-lidb/dataset.py,sha256=j3yFtokbNILVhjV-etAJunnbgfxYAu68Dkr2cgtCYSc,15766
-lidb/init.py,sha256=jLHpeL5mIM4YjdMYAndZlDilMiKXJMr_51Ke3ZSJWCM,1170
-lidb/parse.py,sha256=f7vfj6Nguw1WzUVEUb7fs2Oh-_2YQzB_atJhm3WGC28,3379
-lidb/table.py,sha256=-85U2N1ECDtZTTCJtgOM8XBKyueIgBmYRF5DocPvkh8,4167
+lidb/dataset.py,sha256=hC2D2uJ7xV6yRB-j9TOYxb0aqZQME_5_BbXND2NPBK0,20254
+lidb/init.py,sha256=N_PiBGZO3hKUhQQYzly3GKHgSf4eJVO7xyxjX-chUpQ,1327
+lidb/parse.py,sha256=6awnc14OK7XBkkSrAJFOCZOQ0JUHmm6yDI9F3kkLwcQ,3494
+lidb/table.py,sha256=NeqOU0EJU3DA0yz-1T2GVLpKASu1_1fdOLK3yxf7DtA,4494
 lidb/qdf/__init__.py,sha256=gYiSxijoPQZmbgATQX4GsutjolPpN82Kea0eQz6zGyg,1037
 lidb/qdf/errors.py,sha256=lJhhjDRdQOOKUFGlLQ9ELK4AexXBwYQSYus_V-kc5K8,1180
 lidb/qdf/expr.py,sha256=kBzXwjL_PVsJUL9FIHJ2W_G_OVRqFR-kS2mUHTt9thM,10412
 lidb/qdf/lazy.py,sha256=I08IvSkSC84qJkgtZ7nwvG_4UH07jaHBKRp7qQnwqbs,6937
 lidb/qdf/lazy2.py,sha256=ADKQaxmo-BlndhLY-idWCFypZF1icxKNHNMWEfmWy-Q,6294
-lidb/qdf/qdf.py,sha256=tfPnnQvh8uQZT4aOqJi6bDyDoJwLObvQrFeM2Ilz6vM,6236
+lidb/qdf/qdf.py,sha256=UWG9G1GI0YdG4dMz5uTV731ETEcZelHqnb0QUGrmHPM,6324
 lidb/qdf/udf/__init__.py,sha256=yIySmkWjtJ-Lj_PMP5O4EnXGDjMAPQL40NmFCekKXBw,313
 lidb/qdf/udf/base_udf.py,sha256=ZjRF2UIrZFgznbm1gxFpdf4V92oO84IaakLeeSNF44U,3444
 lidb/qdf/udf/cs_udf.py,sha256=qlBZd2c1enIdGp_DrNyQWzH3cth4ZpLBIE1hGZuJXbA,3528
@@ -18,7 +18,7 @@ lidb/qdf/udf/itd_udf.py,sha256=O_OOdSTEaeCoqjtlKnpvNF-_10QoamJL_tw2xEZCYVw,6747
 lidb/qdf/udf/ts_udf.py,sha256=Ag6-ffhmIugkA-st2QY-GP4hclQZcRG8SB-bVa7k5cc,5674
 lidb/svc/__init__.py,sha256=9vQo7gCm5LRgWSiq_UU2hlbwvXi0FlGYt2UDVZixx_U,141
 lidb/svc/data.py,sha256=tLOI_YylnsVejyqv9l-KgPetkPO0QzybOf1PEeFSZNI,4380
-lidb-1.2.0.dist-info/METADATA,sha256=fj1SvELa0jivjl6dcyut8IHbE7V00h5o6mGJkZa04S0,506
-lidb-1.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-lidb-1.2.0.dist-info/top_level.txt,sha256=NgXJNwt6ld6oLXtW1vOPaEh-VO5R0JEX_KmGIJR4ueE,5
-lidb-1.2.0.dist-info/RECORD,,
+lidb-1.3.6.dist-info/METADATA,sha256=0f7wFU6CZwD_jiqmJjzc_HNCx48mKA24_JBUREiEfSs,8558
+lidb-1.3.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+lidb-1.3.6.dist-info/top_level.txt,sha256=NgXJNwt6ld6oLXtW1vOPaEh-VO5R0JEX_KmGIJR4ueE,5
+lidb-1.3.6.dist-info/RECORD,,
lidb-1.2.0.dist-info/METADATA DELETED
@@ -1,18 +0,0 @@
-Metadata-Version: 2.4
-Name: lidb
-Version: 1.2.0
-Summary: Light database for quantor
-Requires-Python: >=3.12
-Description-Content-Type: text/markdown
-Requires-Dist: dynaconf>=3.2.11
-Requires-Dist: polars>=1.31.0
-Requires-Dist: sqlparse>=0.5.3
-Requires-Dist: logair>=1.0.1
-Requires-Dist: clickhouse-df>=0.1.5
-Requires-Dist: connectorx>=0.4.3
-Requires-Dist: pymysql>=1.1.2
-Requires-Dist: xcals>=0.0.4
-Requires-Dist: ygo>=1.2.8
-Requires-Dist: lark>=1.3.1
-Requires-Dist: numpy>=2.3.1
-Requires-Dist: tqdm>=4.67.1
WHEEL, top_level.txt: files without changes