silars 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
silars-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,11 @@
1
+ Metadata-Version: 2.4
2
+ Name: silars
3
+ Version: 0.1.0
4
+ License-Expression: MIT
5
+ Requires-Python: >=3.12
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: numpy>=2.3.1
8
+ Requires-Dist: pandas>=2.3.1
9
+ Requires-Dist: polars>=1.31.0
10
+ Requires-Dist: scikit-learn>=1.7.0
11
+ Requires-Dist: ygo>=1.1.6
silars-0.1.0/README.md ADDED
File without changes
@@ -0,0 +1,22 @@
1
+ [project]
2
+ name = "silars"
3
+ version = "0.1.0"
4
+ description = ""
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ license = "MIT"
8
+ dependencies = [
9
+ "numpy>=2.3.1",
10
+ "pandas>=2.3.1",
11
+ "polars>=1.31.0",
12
+ "scikit-learn>=1.7.0",
13
+ "ygo>=1.1.6",
14
+ ]
15
+
16
+ [build-system]
17
+ requires = ["setuptools>=42", "wheel"]
18
+ build-backend = "setuptools.build_meta"
19
+
20
+ [tool.setuptools.packages.find]
21
+ where = ["."]
22
+ include = ["silars", "silars.*"]
silars-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,4 @@
1
+ # Copyright (c) ZhangYundi.
2
+ # Licensed under the MIT License.
3
+ # Created on 2025/7/17 10:53
4
+ # Description:
@@ -0,0 +1,29 @@
1
+ # Copyright (c) ZhangYundi.
2
+ # Licensed under the MIT License.
3
+ # Created on 2025/7/17 10:53
4
+ # Description:
5
+
6
+ from .factory import (
7
+ Function,
8
+ Cast,
9
+ Imputer,
10
+ Replace,
11
+ Target,
12
+ DropNull,
13
+ TargetFromDifferentTag,
14
+ Reindex,
15
+ )
16
+
17
+ from .partial import StandardScaler
18
+
19
+ __all__ = [
20
+ "Function",
21
+ "Cast",
22
+ "Imputer",
23
+ "Replace",
24
+ "Target",
25
+ "DropNull",
26
+ "StandardScaler",
27
+ "TargetFromDifferentTag",
28
+ "Reindex",
29
+ ]
@@ -0,0 +1,354 @@
1
+ # Copyright (c) ZhangYundi.
2
+ # Licensed under the MIT License.
3
+ # Created on 2025/7/17 10:53
4
+ # Description:
5
+
6
+ import inspect
7
+ import math
8
+ from collections.abc import Sequence, Iterable
9
+ from typing import Literal
10
+
11
+ import numpy as np
12
+ import polars as pl
13
+ import ygo
14
+ from sklearn.base import BaseEstimator, TransformerMixin
15
+
16
# Lookup table that turns the string spelling of a special value into the
# Python object it stands for. All null-like spellings (including "nan")
# resolve to None; "int" resolves to the builtin ``int`` type.
SPECIAL_VALUES = {
    **dict.fromkeys(("null", "none", "nan")),
    "inf": np.inf,
    "-inf": -np.inf,
    "int": int,
}
24
+
25
+
26
class Function(BaseEstimator, TransformerMixin):
    """
    Wrap an arbitrary callable as an estimator that supports ``set_params``.

    Parameters
    ----------
    fn: callable
        The function object to wrap.
    kwargs: dict
        Keyword arguments for ``fn``.

    Examples
    --------
    >>> import polars as pl
    >>> func = Function(lambda date, ds_path: pl.scan_parquet("/path/to/your/data.parquet"))
    >>> func.set_params(ds_path="mc/stock_kline_day")
    """

    def __init__(self, fn: callable, **kwargs):
        delayed = ygo.delay(fn)(**kwargs)
        self.fn = delayed
        # Every signature parameter starts out marked as empty; the ones that
        # were supplied are then overwritten with their stored values.
        self.params = dict.fromkeys(
            ygo.fn_signature_params(delayed), inspect.Parameter.empty
        )
        self.params.update(delayed.stored_kwargs)

    def set_params(self, **params):
        """Re-wrap the function with ``params`` and refresh the parameter view."""
        rebound = ygo.delay(self.fn)(**params)
        self.fn = rebound
        self.params.update(rebound.stored_kwargs)
        return self

    def get_params(self, deep=True):
        """Return the parameter dictionary (``deep`` is accepted but unused)."""
        return self.params

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X):
        """Call the wrapped function on ``X`` with every non-empty stored kwarg."""
        if self.fn is None:
            raise ValueError("Function not provided.")
        empty = inspect.Parameter.empty
        bound = {}
        for name, value in self.fn.stored_kwargs.items():
            if value is not empty:
                bound[name] = value
        return self.fn(X, **bound)
65
+
66
+
67
class DropNull(BaseEstimator, TransformerMixin):
    """
    Drop all rows that contain one or more null values.

    Parameters
    ----------
    subset: Sequence[str] | str | None
        Columns (SQL expressions are accepted) to inspect for nulls.
        ``None`` (the default) inspects every column. An explicitly empty
        subset leaves the frame untouched.
    """

    def __init__(self, subset: Sequence[str] | str | None = None):
        self.subset = subset

    def fit(self, X=None, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X: pl.LazyFrame):
        """Return ``X`` without the rows that hold nulls in the selected columns."""
        if self.subset is None:
            # No subset given: honour the documented contract and consider
            # every column instead of silently returning X unchanged.
            return X.drop_nulls()
        if not self.subset:
            # An explicitly empty subset selects nothing, so nothing is dropped.
            return X
        return X.drop_nulls(pl.sql_expr(self.subset))
88
+
89
+
90
class Cast(BaseEstimator, TransformerMixin):
    """
    Type conversion.

    Parameters
    ----------
    old: str | polars.DataType | polars.Expr
        The original type, or the column(s) that should be converted.
        A string that is a key of ``SPECIAL_VALUES`` is replaced by the
        mapped object during ``fit``.
    new: polars.DataType
        The target type of the conversion.

    Examples
    --------
    >>> Cast("asset", pl.UInt16)

    """

    def __init__(self, old: pl.Expr | str | pl.DataType = None, new=None):
        self.old = old
        self.new = new
        self.old_expr: pl.Expr = None

    def fit(self, X, y=None):
        """Resolve ``old`` into the expression that ``transform`` will cast."""
        source = self.old
        if isinstance(source, str) and source in SPECIAL_VALUES:
            source = SPECIAL_VALUES[source]
            self.old = source
        if isinstance(source, pl.Expr):
            self.old_expr = source
        else:
            self.old_expr = pl.col(source)
        return self

    def transform(self, X):
        """Cast the selected column(s) of ``X`` to ``new``."""
        casted = self.old_expr.cast(self.new)
        return X.with_columns(casted)
121
+
122
+
123
class Imputer(BaseEstimator, TransformerMixin):
    """
    Fill null values.

    Parameters
    ----------
    strategy
        Fill strategy passed to ``polars.Expr.fill_null``.
    over_spec
        Keyword arguments for ``polars.Expr.over``; when given, the fill is
        evaluated per window (used for rolling / grouped imputation).
    columns
        Columns to process. A single column name is accepted as well. When
        omitted, every column of the incoming frame is processed.
    """

    def __init__(self,
                 strategy: Literal[
                               "forward", "backward", "mean", "zero", "max", "min", "one"] | None = "forward",
                 over_spec: dict | None = None,
                 columns: Sequence[str] | None = None):
        self.strategy = strategy
        self.over_spec = over_spec
        self.columns = [columns, ] if isinstance(columns, str) else columns
        self._exprs: list[pl.Expr] = list()

    def fit(self, X=None, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X: pl.LazyFrame):
        """Return ``X`` with nulls of the selected columns filled."""
        cols = self.columns if self.columns else X.collect_schema().names()
        exprs = [pl.col(col).fill_null(strategy=self.strategy) for col in cols]
        if self.over_spec:
            # Use the resolved column list here as well: iterating
            # ``self.columns`` directly fails when no columns were configured.
            exprs = [expr.over(**self.over_spec) for expr in exprs]
        self._exprs = exprs
        return X.with_columns(exprs)
157
+
158
+
159
class Replace(BaseEstimator, TransformerMixin):
    """
    Value replacer.

    Parameters
    ----------
    columns: list
        Target columns in which values are replaced.
    old
        Old value; may be a list such as ``[np.inf, -np.inf]``. Strings that
        are keys of ``SPECIAL_VALUES`` are translated during ``fit``.
    new
        New value.
    """

    def __init__(self, columns: list | None = None, old=None, new=None):
        self.columns = columns
        self.old = old
        self.new = new

    def fit(self, X, y=None):
        """Translate special-value spellings in ``old`` and default ``columns``."""
        current = self.old
        if isinstance(current, str):
            # A plain string is looked up as a whole, never character by character.
            self.old = SPECIAL_VALUES.get(current, current)
        elif isinstance(current, Iterable):
            self.old = [SPECIAL_VALUES.get(item, item) for item in current]
        if self.columns is None:
            self.columns = []
        return self

    def transform(self, X):
        """Return ``X`` with ``old`` replaced by ``new`` in every target column."""
        replacements = (
            pl.col(name).replace(self.old, self.new) for name in self.columns
        )
        return X.with_columns(replacements)
191
+
192
+
193
class Target(BaseEstimator, TransformerMixin):
    """
    Add a target-return column.

    Parameters
    ----------
    price_tag
        Price column; SQL syntax is supported, e.g.
        `if(ask_p1 > 0, bid_p1 > 0, (ask_p1+bid_p1)/2, null) as price`
    frequency
        Sampling frequency of the dataset, default `3s`. Must be a positive
        duration.
    target
        Target horizon, default `5min`. For horizons of one day or more the
        exit price is the last row of the window.
    gap
        Extra delay, default `3s`. When positive, the target column is shifted
        by a further ``ceil(gap / frequency)`` rows.
    partition_by
        Partition columns of the window.
    order_by
        Ordering columns of the window.
    alias
        Name of the resulting column; defaults to ``target``.
    """

    def __init__(self,
                 price_tag: str = "price",
                 frequency: str = "3s",
                 target: str = "5min",
                 gap: str = "3s",
                 partition_by: Sequence[str] | str = "asset",
                 order_by: Sequence[str] | str | None = "time",
                 alias: str = None):
        self.price_tag = price_tag
        self.frequency = frequency
        self.target = target
        self.gap = gap
        self.partition_by = partition_by
        self.order_by = order_by
        self.alias = alias
        self._exprs = list()

    @staticmethod
    def _n_periods(span, frequency) -> int:
        """Return the number of ``frequency`` steps needed to cover ``span``, rounded up."""
        return math.ceil(span / frequency)

    def fit(self, X=None, y=None):
        """Build the expressions that ``transform`` applies.

        Raises
        ------
        ValueError
            If ``frequency`` is not a positive duration.
        """
        from pandas import Timedelta
        self.alias = self.alias if self.alias else self.target
        over_spec = {"partition_by": self.partition_by}
        if self.order_by:
            over_spec["order_by"] = self.order_by

        # Work with full durations: ``Timedelta.seconds`` is only the seconds
        # component, so it is 0 for "1d" or "500ms" and drops whole days.
        frequency = Timedelta(self.frequency)
        if frequency <= Timedelta(0):
            raise ValueError(f"frequency must be a positive duration, got {self.frequency!r}")
        target_timedelta = Timedelta(self.target)

        price = pl.sql_expr(self.price_tag)
        if target_timedelta.days > 0:
            # Horizon of a day or more: exit at the last row of the window.
            exit_price = price.last()
        else:
            exit_price = price.shift(-self._n_periods(target_timedelta, frequency))
        expr_target = (exit_price.over(**over_spec) / price - 1).alias(self.alias)
        # Infinite returns (division by a zero price) become null.
        expr_inf = pl.col(self.alias).cast(pl.Float32).replace([float("inf"), float("-inf")], None)

        self._exprs = [expr_target, expr_inf]
        gap_timedelta = Timedelta(self.gap)
        if gap_timedelta > Timedelta(0):
            expr_gap = (
                pl.col(self.alias)
                .shift(-self._n_periods(gap_timedelta, frequency))
                .over(**over_spec)
            )
            # The gap shift runs after the return is computed and before the
            # infinity clean-up.
            self._exprs.insert(1, expr_gap)
        return self

    def transform(self, X: pl.LazyFrame):
        """Apply the prepared expressions one after another."""
        for expr in self._exprs:
            X = X.with_columns(expr)
        return X
269
+
270
+
271
class TargetFromDifferentTag(BaseEstimator, TransformerMixin):
    """
    Add a target-return column, using different price columns for entry and exit.

    Parameters
    ----------
    enter_price_tag: str
        Entry price column; SQL syntax is supported, e.g.
        `if(ask_p1 > 0, bid_p1 > 0, (ask_p1+bid_p1)/2, null) as price`
    exit_price_tag: str
        Exit price column.
    frequency
        Sampling frequency of the dataset, default `3s`. Must be a positive
        duration.
    target
        Target horizon, default `5min`.
    gap
        Extra delay, default `3s`. When positive, the target column is shifted
        by a further ``ceil(gap / frequency)`` rows.
    partition_by
        Partition columns of the window.
    order_by
        Ordering columns of the window.
    alias: str
        Name of the resulting column; defaults to ``target``.
    """

    def __init__(self,
                 enter_price_tag: str,
                 exit_price_tag: str,
                 frequency: str = "3s",
                 target: str = "5min",
                 gap: str = "3s",
                 partition_by: Sequence[str] | str = "asset",
                 order_by: Sequence[str] | str | None = "time",
                 alias: str = None):
        self.enter_price_tag = enter_price_tag
        self.exit_price_tag = exit_price_tag
        self.frequency = frequency
        self.target = target
        self.gap = gap
        self.partition_by = partition_by
        self.order_by = order_by
        self.alias = alias
        self._exprs = list()

    @staticmethod
    def _n_periods(span, frequency) -> int:
        """Return the number of ``frequency`` steps needed to cover ``span``, rounded up."""
        return math.ceil(span / frequency)

    def fit(self, X=None, y=None):
        """Build the expressions that ``transform`` applies.

        Raises
        ------
        ValueError
            If ``frequency`` is not a positive duration.
        """
        from pandas import Timedelta
        self.alias = self.alias if self.alias else self.target
        over_spec = {"partition_by": self.partition_by}
        if self.order_by:
            over_spec["order_by"] = self.order_by

        # Work with full durations: ``Timedelta.seconds`` is only the seconds
        # component, so it is 0 for "1d" or "500ms" and drops whole days.
        frequency = Timedelta(self.frequency)
        if frequency <= Timedelta(0):
            raise ValueError(f"frequency must be a positive duration, got {self.frequency!r}")

        expr_target = (
            (
                    pl.sql_expr(self.exit_price_tag)
                    .shift(-self._n_periods(Timedelta(self.target), frequency))
                    .over(**over_spec) / pl.sql_expr(self.enter_price_tag) - 1
            ).alias(self.alias)
        )
        # Infinite returns (division by a zero price) become null.
        expr_inf = pl.col(self.alias).cast(pl.Float32).replace([float("inf"), float("-inf")], None)

        self._exprs = [expr_target, expr_inf]
        gap_timedelta = Timedelta(self.gap)
        if gap_timedelta > Timedelta(0):
            expr_gap = (
                pl.col(self.alias)
                .shift(-self._n_periods(gap_timedelta, frequency))
                .over(**over_spec)
            )
            # The gap shift runs after the return is computed and before the
            # infinity clean-up.
            self._exprs.insert(1, expr_gap)
        return self

    def transform(self, X: pl.LazyFrame):
        """Apply the prepared expressions one after another."""
        for expr in self._exprs:
            X = X.with_columns(expr)
        return X
343
+
344
class Reindex(BaseEstimator, TransformerMixin):
    """
    Align incoming data to a fixed index frame with a left join.

    Parameters
    ----------
    new_index: polars.LazyFrame | polars.DataFrame
        Frame whose rows define the result; all of its columns are used as
        join keys.
    """

    def __init__(self, new_index: pl.LazyFrame | pl.DataFrame):
        self.new_index = new_index

    def fit(self, X, y=None):
        """Convert the stored index frame to its lazy form."""
        lazy_index = self.new_index.lazy()
        self.new_index = lazy_index
        return self

    def transform(self, X: pl.LazyFrame):
        """Left-join ``X`` onto the index frame, keyed on every index column."""
        index = self.new_index
        keys = index.collect_schema().names()
        return index.join(X, on=keys, how="left")
@@ -0,0 +1,77 @@
1
+ # Copyright (c) ZhangYundi.
2
+ # Licensed under the MIT License.
3
+ # Created on 2025/7/17 10:53
4
+ # Description:
5
+
6
+ import polars as pl
7
+ from sklearn.base import BaseEstimator, TransformerMixin
8
+
9
+
10
class StandardScaler(BaseEstimator, TransformerMixin):
    """
    Standardise columns with running statistics that can be updated batch by batch.

    Parameters
    ----------
    subset: list[str]
        Columns to standardise.
    ddof: int
        Delta degrees of freedom (0 for the population standard deviation,
        1 for the sample standard deviation).
    """

    def __init__(self, subset: list[str], ddof=1):
        self.subset = subset
        self.ddof = ddof
        # reset() creates every piece of running state (_count/_mean/_M2 and
        # the count/mean/M2 frames), so it is not duplicated here.
        self.reset()

    def reset(self):
        """Reset the running statistics to zero."""
        self._count = {col: 0 for col in self.subset}
        self._mean = {col: 0 for col in self.subset}
        self._M2 = {col: 0 for col in self.subset}
        self.count = pl.DataFrame(self._count)
        self.mean = pl.DataFrame(self._mean)
        self.M2 = pl.DataFrame(self._M2)

    def fit(self, X: pl.DataFrame, y=None):
        """Discard previous statistics and fit on ``X`` alone."""
        self.reset()
        self.partial_fit(X, y)
        return self

    def partial_fit(self, X: pl.DataFrame, y=None):
        """Merge the statistics of batch ``X`` into the running statistics."""
        data = X.select(self.subset)
        mu_new = data.mean()
        # DataFrame.count() already returns the number of non-null values per
        # column; subtracting null_count() again would remove nulls twice.
        n_new = data.count()
        m2_new = data.select(((pl.col(c) - mu_new[c]) ** 2).sum() for c in self.subset)
        # Difference between the batch mean and the running mean.
        delta = mu_new - self.mean
        # Combined count and mean.
        new_count = self.count + n_new
        new_mean = (self.mean * self.count + mu_new * n_new) / new_count
        # Combined M2 = old M2 + batch M2 + between-batch correction term.
        correction_term = delta.select(pl.col(c) ** 2 * (self.count[c] * n_new[c]) / new_count[c] for c in self.subset)

        new_M2 = self.M2 + m2_new + correction_term
        # Update the state.
        self.count = new_count
        self.mean = new_mean
        self.M2 = new_M2
        return self

    def get_std(self, col: str):
        """Return the running standard deviation of ``col`` as a polars Series."""
        return (self.M2[col] / (self.count[col] - self.ddof)) ** 0.5

    def transform(self, X: pl.DataFrame):
        """Return ``X`` with the subset columns centred and scaled.

        The scaled columns are cast to Float32 and infinite values are
        replaced by null.
        """
        if not self.subset:
            return X
        return (X
                .with_columns((pl.col(c) - self.mean[c]) / self.get_std(c) for c in self.subset)
                .with_columns(pl.col(c).cast(pl.Float32).replace([float("inf"), float("-inf")], None) for c in self.subset))

    def partial_fit_transform(self, X: pl.DataFrame, y=None):
        """Update the running statistics with ``X`` and return the transformed ``X``."""
        if not self.subset:
            return X
        return self.partial_fit(X).transform(X)
@@ -0,0 +1,11 @@
1
+ Metadata-Version: 2.4
2
+ Name: silars
3
+ Version: 0.1.0
4
+ License-Expression: MIT
5
+ Requires-Python: >=3.12
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: numpy>=2.3.1
8
+ Requires-Dist: pandas>=2.3.1
9
+ Requires-Dist: polars>=1.31.0
10
+ Requires-Dist: scikit-learn>=1.7.0
11
+ Requires-Dist: ygo>=1.1.6
@@ -0,0 +1,11 @@
1
+ README.md
2
+ pyproject.toml
3
+ silars/__init__.py
4
+ silars.egg-info/PKG-INFO
5
+ silars.egg-info/SOURCES.txt
6
+ silars.egg-info/dependency_links.txt
7
+ silars.egg-info/requires.txt
8
+ silars.egg-info/top_level.txt
9
+ silars/transformer/__init__.py
10
+ silars/transformer/factory.py
11
+ silars/transformer/partial.py
@@ -0,0 +1,5 @@
1
+ numpy>=2.3.1
2
+ pandas>=2.3.1
3
+ polars>=1.31.0
4
+ scikit-learn>=1.7.0
5
+ ygo>=1.1.6
@@ -0,0 +1 @@
1
+ silars