vizflow 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vizflow/__init__.py +10 -5
- vizflow/config.py +40 -44
- vizflow/io.py +134 -161
- vizflow/ops.py +11 -3
- vizflow/schema_evolution.py +394 -0
- {vizflow-0.5.0.dist-info → vizflow-0.5.2.dist-info}/METADATA +1 -1
- vizflow-0.5.2.dist-info/RECORD +10 -0
- vizflow/presets.py +0 -87
- vizflow-0.5.0.dist-info/RECORD +0 -10
- {vizflow-0.5.0.dist-info → vizflow-0.5.2.dist-info}/WHEEL +0 -0
vizflow/__init__.py
CHANGED
|
@@ -5,13 +5,11 @@ Usage:
|
|
|
5
5
|
import vizflow as vf
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
-
__version__ = "0.5.
|
|
8
|
+
__version__ = "0.5.2"
|
|
9
9
|
|
|
10
|
-
from .config import
|
|
10
|
+
from .config import Config, get_config, set_config
|
|
11
11
|
from .io import (
|
|
12
|
-
load_alpha,
|
|
13
12
|
load_calendar,
|
|
14
|
-
load_trade,
|
|
15
13
|
scan_alpha,
|
|
16
14
|
scan_alphas,
|
|
17
15
|
scan_trade,
|
|
@@ -19,4 +17,11 @@ from .io import (
|
|
|
19
17
|
)
|
|
20
18
|
from .market import CN, CRYPTO, Market, Session
|
|
21
19
|
from .ops import aggregate, bin, forward_return, parse_time
|
|
22
|
-
from .
|
|
20
|
+
from .schema_evolution import (
|
|
21
|
+
JYAO_V20251114,
|
|
22
|
+
SCHEMAS,
|
|
23
|
+
YLIN_V20251204,
|
|
24
|
+
ColumnSpec,
|
|
25
|
+
SchemaEvolution,
|
|
26
|
+
get_schema,
|
|
27
|
+
)
|
vizflow/config.py
CHANGED
|
@@ -4,25 +4,28 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from dataclasses import dataclass, field
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import Any
|
|
7
|
+
from typing import TYPE_CHECKING, Any
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from .schema_evolution import SchemaEvolution
|
|
8
11
|
|
|
9
12
|
# Global config instance
|
|
10
13
|
_global_config: Config | None = None
|
|
11
14
|
|
|
12
15
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
"""Schema for a column with type casting.
|
|
16
|
+
def _validate_date(date: str) -> None:
|
|
17
|
+
"""Validate date string format to prevent path traversal.
|
|
16
18
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
+
Args:
|
|
20
|
+
date: Date string to validate
|
|
19
21
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
ColumnSchema(cast_to=pl.Int64)
|
|
22
|
+
Raises:
|
|
23
|
+
ValueError: If date is not exactly 8 digits (YYYYMMDD format)
|
|
23
24
|
"""
|
|
24
|
-
|
|
25
|
-
|
|
25
|
+
if not (len(date) == 8 and date.isdigit()):
|
|
26
|
+
raise ValueError(
|
|
27
|
+
f"Invalid date format: {date!r}. Expected YYYYMMDD (8 digits)."
|
|
28
|
+
)
|
|
26
29
|
|
|
27
30
|
|
|
28
31
|
@dataclass
|
|
@@ -38,14 +41,20 @@ class Config:
|
|
|
38
41
|
replay_dir: Directory for FIFO replay output (materialization 1)
|
|
39
42
|
aggregate_dir: Directory for aggregation output (materialization 2)
|
|
40
43
|
market: Market identifier, e.g. "CN"
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
alpha_schema: Schema evolution for alpha columns
|
|
44
|
-
trade_schema: Schema evolution for trade columns
|
|
44
|
+
trade_schema: Schema evolution for trade data (name or SchemaEvolution)
|
|
45
|
+
alpha_schema: Schema evolution for alpha data (name or SchemaEvolution)
|
|
45
46
|
binwidths: Mapping from column names to bin widths
|
|
46
47
|
group_by: Columns to group by in aggregation
|
|
47
48
|
horizons: List of forward return horizons in seconds
|
|
48
49
|
time_cutoff: Optional time cutoff (e.g. 143000000 for 14:30:00)
|
|
50
|
+
|
|
51
|
+
Example:
|
|
52
|
+
>>> config = vf.Config(
|
|
53
|
+
... trade_dir=Path("data/ylin/trade"),
|
|
54
|
+
... trade_pattern="{date}.meords",
|
|
55
|
+
... trade_schema="ylin_v20251204", # Use registered schema by name
|
|
56
|
+
... market="CN",
|
|
57
|
+
... )
|
|
49
58
|
"""
|
|
50
59
|
|
|
51
60
|
# === Input Paths ===
|
|
@@ -62,17 +71,10 @@ class Config:
|
|
|
62
71
|
# === Market ===
|
|
63
72
|
market: str = "CN"
|
|
64
73
|
|
|
65
|
-
# === Column Mapping ===
|
|
66
|
-
alpha_columns: dict[str, str] = field(default_factory=dict)
|
|
67
|
-
trade_columns: dict[str, str] = field(default_factory=dict)
|
|
68
|
-
|
|
69
74
|
# === Schema Evolution ===
|
|
70
|
-
|
|
71
|
-
trade_schema:
|
|
72
|
-
|
|
73
|
-
# === Column Mapping Presets ===
|
|
74
|
-
trade_preset: str | None = None # "ylin" or None
|
|
75
|
-
alpha_preset: str | None = None # "jyao_v20251114" or None
|
|
75
|
+
# Can be a string (schema name) or SchemaEvolution instance
|
|
76
|
+
trade_schema: str | SchemaEvolution | None = None
|
|
77
|
+
alpha_schema: str | SchemaEvolution | None = None
|
|
76
78
|
|
|
77
79
|
# === Aggregation ===
|
|
78
80
|
binwidths: dict[str, float] = field(default_factory=dict)
|
|
@@ -83,7 +85,11 @@ class Config:
|
|
|
83
85
|
time_cutoff: int | None = None
|
|
84
86
|
|
|
85
87
|
def __post_init__(self):
|
|
86
|
-
"""Convert paths to Path objects
|
|
88
|
+
"""Convert string paths to Path objects.
|
|
89
|
+
|
|
90
|
+
Note: String values for path fields (alpha_dir, trade_dir, calendar_path,
|
|
91
|
+
replay_dir, aggregate_dir) are automatically converted to Path objects.
|
|
92
|
+
"""
|
|
87
93
|
if isinstance(self.alpha_dir, str):
|
|
88
94
|
self.alpha_dir = Path(self.alpha_dir)
|
|
89
95
|
if isinstance(self.trade_dir, str):
|
|
@@ -95,20 +101,6 @@ class Config:
|
|
|
95
101
|
if isinstance(self.aggregate_dir, str):
|
|
96
102
|
self.aggregate_dir = Path(self.aggregate_dir)
|
|
97
103
|
|
|
98
|
-
def col(self, semantic: str, source: str = "trade") -> str:
|
|
99
|
-
"""Get actual column name from semantic name.
|
|
100
|
-
|
|
101
|
-
Args:
|
|
102
|
-
semantic: Semantic column name (e.g. "timestamp", "price")
|
|
103
|
-
source: "alpha" or "trade"
|
|
104
|
-
|
|
105
|
-
Returns:
|
|
106
|
-
Actual column name, or the semantic name if no mapping exists
|
|
107
|
-
"""
|
|
108
|
-
if source == "alpha":
|
|
109
|
-
return self.alpha_columns.get(semantic, semantic)
|
|
110
|
-
return self.trade_columns.get(semantic, semantic)
|
|
111
|
-
|
|
112
104
|
def get_alpha_path(self, date: str) -> Path:
|
|
113
105
|
"""Get alpha file path for a date.
|
|
114
106
|
|
|
@@ -119,8 +111,9 @@ class Config:
|
|
|
119
111
|
Full path to alpha file
|
|
120
112
|
|
|
121
113
|
Raises:
|
|
122
|
-
ValueError: If alpha_dir is not set
|
|
114
|
+
ValueError: If alpha_dir is not set or date format is invalid
|
|
123
115
|
"""
|
|
116
|
+
_validate_date(date)
|
|
124
117
|
if self.alpha_dir is None:
|
|
125
118
|
raise ValueError("alpha_dir is not set in Config")
|
|
126
119
|
return self.alpha_dir / self.alpha_pattern.format(date=date)
|
|
@@ -135,8 +128,9 @@ class Config:
|
|
|
135
128
|
Full path to trade file
|
|
136
129
|
|
|
137
130
|
Raises:
|
|
138
|
-
ValueError: If trade_dir is not set
|
|
131
|
+
ValueError: If trade_dir is not set or date format is invalid
|
|
139
132
|
"""
|
|
133
|
+
_validate_date(date)
|
|
140
134
|
if self.trade_dir is None:
|
|
141
135
|
raise ValueError("trade_dir is not set in Config")
|
|
142
136
|
return self.trade_dir / self.trade_pattern.format(date=date)
|
|
@@ -152,8 +146,9 @@ class Config:
|
|
|
152
146
|
Full path to replay output file
|
|
153
147
|
|
|
154
148
|
Raises:
|
|
155
|
-
ValueError: If replay_dir is not set
|
|
149
|
+
ValueError: If replay_dir is not set or date format is invalid
|
|
156
150
|
"""
|
|
151
|
+
_validate_date(date)
|
|
157
152
|
if self.replay_dir is None:
|
|
158
153
|
raise ValueError("replay_dir is not set in Config")
|
|
159
154
|
return self.replay_dir / f"{date}{suffix}"
|
|
@@ -169,8 +164,9 @@ class Config:
|
|
|
169
164
|
Full path to aggregate output file
|
|
170
165
|
|
|
171
166
|
Raises:
|
|
172
|
-
ValueError: If aggregate_dir is not set
|
|
167
|
+
ValueError: If aggregate_dir is not set or date format is invalid
|
|
173
168
|
"""
|
|
169
|
+
_validate_date(date)
|
|
174
170
|
if self.aggregate_dir is None:
|
|
175
171
|
raise ValueError("aggregate_dir is not set in Config")
|
|
176
172
|
return self.aggregate_dir / f"{date}{suffix}"
|
vizflow/io.py
CHANGED
|
@@ -2,110 +2,42 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
from typing import TYPE_CHECKING
|
|
6
|
-
|
|
7
5
|
import polars as pl
|
|
8
6
|
|
|
9
7
|
from .config import Config, get_config
|
|
8
|
+
from .schema_evolution import SchemaEvolution, get_schema
|
|
10
9
|
|
|
11
|
-
if TYPE_CHECKING:
|
|
12
|
-
pass
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def load_alpha(date: str, config: Config | None = None) -> pl.LazyFrame:
|
|
16
|
-
"""Load alpha data for a date with automatic schema evolution.
|
|
17
|
-
|
|
18
|
-
Args:
|
|
19
|
-
date: Date string, e.g. "20241001"
|
|
20
|
-
config: Config to use, or get_config() if None
|
|
21
|
-
|
|
22
|
-
Returns:
|
|
23
|
-
LazyFrame with schema evolution applied
|
|
24
|
-
|
|
25
|
-
Example:
|
|
26
|
-
>>> config = vf.Config(
|
|
27
|
-
... output_dir=Path("/data/output"),
|
|
28
|
-
... alpha_dir=Path("/data/alpha"),
|
|
29
|
-
... alpha_schema={"qty": vf.ColumnSchema(cast_to=pl.Int64)}
|
|
30
|
-
... )
|
|
31
|
-
>>> vf.set_config(config)
|
|
32
|
-
>>> alpha = vf.load_alpha("20241001")
|
|
33
|
-
"""
|
|
34
|
-
config = config or get_config()
|
|
35
|
-
path = config.get_alpha_path(date)
|
|
36
|
-
df = pl.scan_ipc(path)
|
|
37
|
-
|
|
38
|
-
# Apply schema evolution (type casting)
|
|
39
|
-
for col_name, schema in config.alpha_schema.items():
|
|
40
|
-
df = df.with_columns(pl.col(col_name).cast(schema.cast_to))
|
|
41
|
-
|
|
42
|
-
return df
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
def load_trade(date: str, config: Config | None = None) -> pl.LazyFrame:
|
|
46
|
-
"""Load trade data for a date with automatic schema evolution.
|
|
47
|
-
|
|
48
|
-
Args:
|
|
49
|
-
date: Date string, e.g. "20241001"
|
|
50
|
-
config: Config to use, or get_config() if None
|
|
51
|
-
|
|
52
|
-
Returns:
|
|
53
|
-
LazyFrame with schema evolution applied
|
|
54
|
-
|
|
55
|
-
Example:
|
|
56
|
-
>>> config = vf.Config(
|
|
57
|
-
... output_dir=Path("/data/output"),
|
|
58
|
-
... trade_dir=Path("/data/trade"),
|
|
59
|
-
... trade_schema={"qty": vf.ColumnSchema(cast_to=pl.Int64)}
|
|
60
|
-
... )
|
|
61
|
-
>>> vf.set_config(config)
|
|
62
|
-
>>> trade = vf.load_trade("20241001") # qty: 1.00000002 → 1
|
|
63
|
-
"""
|
|
64
|
-
config = config or get_config()
|
|
65
|
-
path = config.get_trade_path(date)
|
|
66
|
-
df = pl.scan_ipc(path)
|
|
67
10
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
return df
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
def load_calendar(config: Config | None = None) -> pl.DataFrame:
|
|
76
|
-
"""Load trading calendar.
|
|
11
|
+
def _resolve_schema(
|
|
12
|
+
schema_ref: str | SchemaEvolution | None,
|
|
13
|
+
) -> SchemaEvolution | None:
|
|
14
|
+
"""Resolve schema reference to SchemaEvolution instance.
|
|
77
15
|
|
|
78
16
|
Args:
|
|
79
|
-
|
|
17
|
+
schema_ref: Schema name string, SchemaEvolution instance, or None.
|
|
80
18
|
|
|
81
19
|
Returns:
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
Raises:
|
|
85
|
-
ValueError: If calendar_path is not set in config
|
|
86
|
-
|
|
87
|
-
Example:
|
|
88
|
-
>>> config = vf.Config(
|
|
89
|
-
... output_dir=Path("/data/output"),
|
|
90
|
-
... calendar_path=Path("/data/calendar.parquet")
|
|
91
|
-
... )
|
|
92
|
-
>>> vf.set_config(config)
|
|
93
|
-
>>> calendar = vf.load_calendar()
|
|
20
|
+
SchemaEvolution instance or None.
|
|
94
21
|
"""
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
22
|
+
if schema_ref is None:
|
|
23
|
+
return None
|
|
24
|
+
if isinstance(schema_ref, SchemaEvolution):
|
|
25
|
+
return schema_ref
|
|
26
|
+
return get_schema(schema_ref)
|
|
99
27
|
|
|
100
28
|
|
|
101
|
-
def _scan_file(
|
|
102
|
-
|
|
29
|
+
def _scan_file(
|
|
30
|
+
path,
|
|
31
|
+
schema: SchemaEvolution | None = None,
|
|
32
|
+
) -> pl.LazyFrame:
|
|
33
|
+
"""Scan a file based on its extension with optional schema.
|
|
103
34
|
|
|
104
35
|
Args:
|
|
105
|
-
path: Path to file
|
|
36
|
+
path: Path to file.
|
|
37
|
+
schema: Optional SchemaEvolution for CSV parsing options.
|
|
106
38
|
|
|
107
39
|
Returns:
|
|
108
|
-
LazyFrame from the file
|
|
40
|
+
LazyFrame from the file.
|
|
109
41
|
|
|
110
42
|
Supported formats:
|
|
111
43
|
- .feather, .ipc, .arrow: IPC format (pl.scan_ipc)
|
|
@@ -117,58 +49,114 @@ def _scan_file(path) -> pl.LazyFrame:
|
|
|
117
49
|
if suffix in ("feather", "ipc", "arrow"):
|
|
118
50
|
return pl.scan_ipc(path)
|
|
119
51
|
elif suffix in ("csv", "meords"):
|
|
120
|
-
|
|
52
|
+
csv_kwargs = {}
|
|
53
|
+
if schema:
|
|
54
|
+
schema_overrides = schema.get_schema_overrides()
|
|
55
|
+
if schema_overrides:
|
|
56
|
+
csv_kwargs["schema_overrides"] = schema_overrides
|
|
57
|
+
null_values = schema.get_null_values()
|
|
58
|
+
if null_values:
|
|
59
|
+
csv_kwargs["null_values"] = null_values
|
|
60
|
+
return pl.scan_csv(path, **csv_kwargs)
|
|
121
61
|
elif suffix == "parquet":
|
|
122
62
|
return pl.scan_parquet(path)
|
|
123
63
|
else:
|
|
124
|
-
|
|
125
|
-
|
|
64
|
+
raise ValueError(
|
|
65
|
+
f"Unsupported file format: .{suffix}. "
|
|
66
|
+
"Supported: .feather, .ipc, .arrow, .csv, .meords, .parquet"
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _apply_schema_evolution(
|
|
71
|
+
df: pl.LazyFrame,
|
|
72
|
+
schema: SchemaEvolution,
|
|
73
|
+
) -> pl.LazyFrame:
|
|
74
|
+
"""Apply full schema evolution: drop, rename, cast.
|
|
75
|
+
|
|
76
|
+
Args:
|
|
77
|
+
df: LazyFrame to transform.
|
|
78
|
+
schema: SchemaEvolution with transformation rules.
|
|
79
|
+
|
|
80
|
+
Returns:
|
|
81
|
+
Transformed LazyFrame.
|
|
82
|
+
"""
|
|
83
|
+
existing = set(df.collect_schema().names())
|
|
84
|
+
|
|
85
|
+
# Step 1: Drop excluded columns
|
|
86
|
+
drop_cols = schema.get_drop_columns()
|
|
87
|
+
to_drop = [c for c in drop_cols if c in existing]
|
|
88
|
+
if to_drop:
|
|
89
|
+
df = df.drop(to_drop)
|
|
90
|
+
existing -= set(to_drop)
|
|
91
|
+
|
|
92
|
+
# Step 2: Rename columns
|
|
93
|
+
rename_map = schema.get_rename_map()
|
|
94
|
+
to_rename = {k: v for k, v in rename_map.items() if k in existing}
|
|
95
|
+
if to_rename:
|
|
96
|
+
df = df.rename(to_rename)
|
|
97
|
+
# Update existing names after rename
|
|
98
|
+
for old, new in to_rename.items():
|
|
99
|
+
existing.discard(old)
|
|
100
|
+
existing.add(new)
|
|
101
|
+
|
|
102
|
+
# Step 3: Cast columns (using FINAL names after rename)
|
|
103
|
+
cast_map = schema.get_cast_map()
|
|
104
|
+
for col_name, dtype in cast_map.items():
|
|
105
|
+
if col_name in existing:
|
|
106
|
+
df = df.with_columns(pl.col(col_name).cast(dtype))
|
|
107
|
+
|
|
108
|
+
return df
|
|
126
109
|
|
|
127
110
|
|
|
128
111
|
def scan_trade(date: str, config: Config | None = None) -> pl.LazyFrame:
|
|
129
|
-
"""Scan single date trade file with
|
|
112
|
+
"""Scan single date trade file with schema evolution.
|
|
130
113
|
|
|
131
|
-
Supports
|
|
114
|
+
Supports IPC/feather, CSV (including .meords), and Parquet formats.
|
|
132
115
|
|
|
133
116
|
Args:
|
|
134
117
|
date: Date string, e.g. "20241001"
|
|
135
118
|
config: Config to use, or get_config() if None
|
|
136
119
|
|
|
137
120
|
Returns:
|
|
138
|
-
LazyFrame with
|
|
121
|
+
LazyFrame with schema evolution applied
|
|
139
122
|
|
|
140
123
|
Example:
|
|
141
124
|
>>> config = vf.Config(
|
|
142
|
-
... trade_dir=Path("/data/
|
|
125
|
+
... trade_dir=Path("/data/ylin/trade"),
|
|
143
126
|
... trade_pattern="{date}.meords",
|
|
144
|
-
...
|
|
127
|
+
... trade_schema="ylin_v20251204",
|
|
145
128
|
... )
|
|
146
129
|
>>> vf.set_config(config)
|
|
147
130
|
>>> df = vf.scan_trade("20241001")
|
|
148
131
|
"""
|
|
149
132
|
config = config or get_config()
|
|
150
133
|
path = config.get_trade_path(date)
|
|
151
|
-
|
|
152
|
-
|
|
134
|
+
schema = _resolve_schema(config.trade_schema)
|
|
135
|
+
|
|
136
|
+
df = _scan_file(path, schema=schema)
|
|
137
|
+
if schema:
|
|
138
|
+
df = _apply_schema_evolution(df, schema)
|
|
139
|
+
|
|
140
|
+
return df
|
|
153
141
|
|
|
154
142
|
|
|
155
143
|
def scan_trades(config: Config | None = None) -> pl.LazyFrame:
|
|
156
|
-
"""Scan all trade files with
|
|
144
|
+
"""Scan all trade files with schema evolution.
|
|
157
145
|
|
|
158
146
|
Args:
|
|
159
147
|
config: Config to use, or get_config() if None
|
|
160
148
|
|
|
161
149
|
Returns:
|
|
162
|
-
LazyFrame with
|
|
150
|
+
LazyFrame with schema evolution applied
|
|
163
151
|
|
|
164
152
|
Raises:
|
|
165
153
|
ValueError: If trade_dir is not set or no files found
|
|
166
154
|
|
|
167
155
|
Example:
|
|
168
156
|
>>> config = vf.Config(
|
|
169
|
-
... trade_dir=Path("/data/
|
|
170
|
-
... trade_pattern="{date}.
|
|
171
|
-
...
|
|
157
|
+
... trade_dir=Path("/data/ylin/trade"),
|
|
158
|
+
... trade_pattern="{date}.meords",
|
|
159
|
+
... trade_schema="ylin_v20251204",
|
|
172
160
|
... )
|
|
173
161
|
>>> vf.set_config(config)
|
|
174
162
|
>>> df = vf.scan_trades()
|
|
@@ -182,86 +170,56 @@ def scan_trades(config: Config | None = None) -> pl.LazyFrame:
|
|
|
182
170
|
if not files:
|
|
183
171
|
raise ValueError(f"No files found matching {pattern} in {config.trade_dir}")
|
|
184
172
|
|
|
173
|
+
schema = _resolve_schema(config.trade_schema)
|
|
174
|
+
|
|
185
175
|
# Concatenate all files using lazy scanning
|
|
186
|
-
dfs = [_scan_file(f) for f in files]
|
|
176
|
+
dfs = [_scan_file(f, schema=schema) for f in files]
|
|
187
177
|
df = pl.concat(dfs)
|
|
188
|
-
return _apply_trade_mapping(df, config)
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
def _apply_trade_mapping(df: pl.LazyFrame, config: Config) -> pl.LazyFrame:
|
|
192
|
-
"""Apply column rename + schema evolution for trade data."""
|
|
193
|
-
df = _apply_rename(df, config.trade_preset)
|
|
194
|
-
for col_name, schema in config.trade_schema.items():
|
|
195
|
-
df = df.with_columns(pl.col(col_name).cast(schema.cast_to))
|
|
196
|
-
return df
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
def _apply_alpha_mapping(df: pl.LazyFrame, config: Config) -> pl.LazyFrame:
|
|
200
|
-
"""Apply column rename + schema evolution for alpha data."""
|
|
201
|
-
df = _apply_rename(df, config.alpha_preset)
|
|
202
|
-
for col_name, schema in config.alpha_schema.items():
|
|
203
|
-
df = df.with_columns(pl.col(col_name).cast(schema.cast_to))
|
|
204
|
-
return df
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
def _apply_rename(df: pl.LazyFrame, preset: str | None) -> pl.LazyFrame:
|
|
208
|
-
"""Apply column rename from preset name.
|
|
209
|
-
|
|
210
|
-
Args:
|
|
211
|
-
df: LazyFrame to rename columns
|
|
212
|
-
preset: Preset name (e.g., "ylin", "jyao_v20251114") or None
|
|
213
|
-
"""
|
|
214
|
-
# Drop record type prefix column if present (from CSV files)
|
|
215
|
-
existing = set(df.collect_schema().names())
|
|
216
|
-
if "#HFTORD" in existing:
|
|
217
|
-
df = df.drop("#HFTORD")
|
|
218
|
-
existing.remove("#HFTORD")
|
|
219
|
-
|
|
220
|
-
# Get rename map from preset
|
|
221
|
-
rename_map = _get_rename_map(preset)
|
|
222
178
|
|
|
223
|
-
if
|
|
224
|
-
|
|
225
|
-
to_rename = {k: v for k, v in rename_map.items() if k in existing}
|
|
226
|
-
if to_rename:
|
|
227
|
-
df = df.rename(to_rename)
|
|
179
|
+
if schema:
|
|
180
|
+
df = _apply_schema_evolution(df, schema)
|
|
228
181
|
|
|
229
182
|
return df
|
|
230
183
|
|
|
231
184
|
|
|
232
185
|
def scan_alpha(date: str, config: Config | None = None) -> pl.LazyFrame:
|
|
233
|
-
"""Scan single date alpha file with
|
|
186
|
+
"""Scan single date alpha file with schema evolution.
|
|
234
187
|
|
|
235
188
|
Args:
|
|
236
189
|
date: Date string, e.g. "20241001"
|
|
237
190
|
config: Config to use, or get_config() if None
|
|
238
191
|
|
|
239
192
|
Returns:
|
|
240
|
-
LazyFrame with
|
|
193
|
+
LazyFrame with schema evolution applied
|
|
241
194
|
|
|
242
195
|
Example:
|
|
243
196
|
>>> config = vf.Config(
|
|
244
197
|
... alpha_dir=Path("/data/jyao/alpha"),
|
|
245
198
|
... alpha_pattern="alpha_{date}.feather",
|
|
246
|
-
...
|
|
199
|
+
... alpha_schema="jyao_v20251114",
|
|
247
200
|
... )
|
|
248
201
|
>>> vf.set_config(config)
|
|
249
202
|
>>> df = vf.scan_alpha("20251114")
|
|
250
203
|
"""
|
|
251
204
|
config = config or get_config()
|
|
252
205
|
path = config.get_alpha_path(date)
|
|
253
|
-
|
|
254
|
-
|
|
206
|
+
schema = _resolve_schema(config.alpha_schema)
|
|
207
|
+
|
|
208
|
+
df = _scan_file(path, schema=schema)
|
|
209
|
+
if schema:
|
|
210
|
+
df = _apply_schema_evolution(df, schema)
|
|
211
|
+
|
|
212
|
+
return df
|
|
255
213
|
|
|
256
214
|
|
|
257
215
|
def scan_alphas(config: Config | None = None) -> pl.LazyFrame:
|
|
258
|
-
"""Scan all alpha files with
|
|
216
|
+
"""Scan all alpha files with schema evolution.
|
|
259
217
|
|
|
260
218
|
Args:
|
|
261
219
|
config: Config to use, or get_config() if None
|
|
262
220
|
|
|
263
221
|
Returns:
|
|
264
|
-
LazyFrame with
|
|
222
|
+
LazyFrame with schema evolution applied
|
|
265
223
|
|
|
266
224
|
Raises:
|
|
267
225
|
ValueError: If alpha_dir is not set or no files found
|
|
@@ -275,22 +233,37 @@ def scan_alphas(config: Config | None = None) -> pl.LazyFrame:
|
|
|
275
233
|
if not files:
|
|
276
234
|
raise ValueError(f"No files found matching {pattern} in {config.alpha_dir}")
|
|
277
235
|
|
|
278
|
-
|
|
236
|
+
schema = _resolve_schema(config.alpha_schema)
|
|
237
|
+
|
|
238
|
+
dfs = [_scan_file(f, schema=schema) for f in files]
|
|
279
239
|
df = pl.concat(dfs)
|
|
280
|
-
return _apply_alpha_mapping(df, config)
|
|
281
240
|
|
|
241
|
+
if schema:
|
|
242
|
+
df = _apply_schema_evolution(df, schema)
|
|
243
|
+
|
|
244
|
+
return df
|
|
282
245
|
|
|
283
|
-
|
|
284
|
-
|
|
246
|
+
|
|
247
|
+
def load_calendar(config: Config | None = None) -> pl.DataFrame:
|
|
248
|
+
"""Load trading calendar.
|
|
285
249
|
|
|
286
250
|
Args:
|
|
287
|
-
|
|
251
|
+
config: Config to use, or get_config() if None
|
|
288
252
|
|
|
289
253
|
Returns:
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
from .presets import PRESETS
|
|
254
|
+
DataFrame with date, prev_date, next_date columns
|
|
255
|
+
|
|
256
|
+
Raises:
|
|
257
|
+
ValueError: If calendar_path is not set in config
|
|
295
258
|
|
|
296
|
-
|
|
259
|
+
Example:
|
|
260
|
+
>>> config = vf.Config(
|
|
261
|
+
... calendar_path=Path("/data/calendar.parquet")
|
|
262
|
+
... )
|
|
263
|
+
>>> vf.set_config(config)
|
|
264
|
+
>>> calendar = vf.load_calendar()
|
|
265
|
+
"""
|
|
266
|
+
config = config or get_config()
|
|
267
|
+
if config.calendar_path is None:
|
|
268
|
+
raise ValueError("calendar_path is not set in Config")
|
|
269
|
+
return pl.read_parquet(config.calendar_path)
|
vizflow/ops.py
CHANGED
|
@@ -157,6 +157,7 @@ def forward_return(
|
|
|
157
157
|
alpha_time_col: str = "elapsed_ticktime",
|
|
158
158
|
price_col: str = "mid",
|
|
159
159
|
symbol_col: str = "ukey",
|
|
160
|
+
tolerance_ms: int = 5000,
|
|
160
161
|
) -> pl.LazyFrame:
|
|
161
162
|
"""Merge alpha's future price to trade and calculate forward returns.
|
|
162
163
|
|
|
@@ -177,6 +178,7 @@ def forward_return(
|
|
|
177
178
|
alpha_time_col: Time column in alpha df (default: "elapsed_ticktime")
|
|
178
179
|
price_col: Column name for price in both dfs (default: "mid")
|
|
179
180
|
symbol_col: Symbol column for grouping (default: "ukey")
|
|
181
|
+
tolerance_ms: Max time difference in ms for asof join (default: 5000)
|
|
180
182
|
|
|
181
183
|
Returns:
|
|
182
184
|
Trade LazyFrame with forward_* and y_* columns added
|
|
@@ -210,6 +212,9 @@ def forward_return(
|
|
|
210
212
|
(pl.col(trade_time_col) + horizon_ms).alias("_forward_time")
|
|
211
213
|
)
|
|
212
214
|
|
|
215
|
+
# Sort by join columns (required for asof join)
|
|
216
|
+
trade = trade.sort([symbol_col, "_forward_time"])
|
|
217
|
+
|
|
213
218
|
# Asof join: find alpha price at forward_time
|
|
214
219
|
joined = trade.join_asof(
|
|
215
220
|
alpha_lookup.rename({alpha_time_col: "_alpha_time", price_col: "_forward_price"}),
|
|
@@ -217,13 +222,16 @@ def forward_return(
|
|
|
217
222
|
right_on="_alpha_time",
|
|
218
223
|
by=symbol_col,
|
|
219
224
|
strategy="nearest",
|
|
220
|
-
tolerance=
|
|
225
|
+
tolerance=tolerance_ms,
|
|
221
226
|
)
|
|
222
227
|
|
|
223
|
-
# Add forward price and calculate return
|
|
228
|
+
# Add forward price and calculate return (guard against zero price)
|
|
224
229
|
trade = joined.with_columns([
|
|
225
230
|
pl.col("_forward_price").alias(forward_col),
|
|
226
|
-
|
|
231
|
+
pl.when(pl.col(price_col) != 0)
|
|
232
|
+
.then((pl.col("_forward_price") - pl.col(price_col)) / pl.col(price_col))
|
|
233
|
+
.otherwise(pl.lit(None))
|
|
234
|
+
.alias(return_col),
|
|
227
235
|
]).drop(["_forward_time", "_alpha_time", "_forward_price"])
|
|
228
236
|
|
|
229
237
|
return trade.lazy()
|
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
"""Schema Evolution for VizFlow.
|
|
2
|
+
|
|
3
|
+
This module defines how raw data evolves into standard format through:
|
|
4
|
+
- Column renaming (raw names → standard names)
|
|
5
|
+
- Parse-time type specification (for CSV parsing)
|
|
6
|
+
- Post-load type casting (e.g., Float64 → Int64)
|
|
7
|
+
- Null value handling
|
|
8
|
+
- Column exclusion
|
|
9
|
+
|
|
10
|
+
Example:
|
|
11
|
+
>>> schema = SchemaEvolution(
|
|
12
|
+
... columns={
|
|
13
|
+
... "fillQty": ColumnSpec(
|
|
14
|
+
... rename_to="order_filled_qty",
|
|
15
|
+
... parse_dtype=pl.Float64, # Parse as float (catch decimals)
|
|
16
|
+
... cast_dtype=pl.Int64, # Then cast to int
|
|
17
|
+
... ),
|
|
18
|
+
... },
|
|
19
|
+
... null_values=["", "NA"],
|
|
20
|
+
... drop=["#HFTORD"],
|
|
21
|
+
... )
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
from dataclasses import dataclass, field
|
|
27
|
+
from typing import Any
|
|
28
|
+
|
|
29
|
+
import polars as pl
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
|
|
33
|
+
class ColumnSpec:
|
|
34
|
+
"""Specification for a single column's parsing and transformation.
|
|
35
|
+
|
|
36
|
+
Attributes:
|
|
37
|
+
rename_to: Standard column name after rename. None keeps original name.
|
|
38
|
+
parse_dtype: Type to use when parsing CSV. None uses Polars inference.
|
|
39
|
+
cast_dtype: Final type after post-load casting. None keeps parse type.
|
|
40
|
+
|
|
41
|
+
Examples:
|
|
42
|
+
# Rename only (most common)
|
|
43
|
+
ColumnSpec(rename_to="ukey")
|
|
44
|
+
|
|
45
|
+
# Parse as Float64, cast to Int64 (handle decimal errors in qty)
|
|
46
|
+
ColumnSpec(rename_to="order_filled_qty",
|
|
47
|
+
parse_dtype=pl.Float64,
|
|
48
|
+
cast_dtype=pl.Int64)
|
|
49
|
+
|
|
50
|
+
# Parse as specific type, no cast (trusted integer)
|
|
51
|
+
ColumnSpec(rename_to="timestamp", parse_dtype=pl.Int64)
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
rename_to: str | None = None
|
|
55
|
+
parse_dtype: Any = None # pl.DataType
|
|
56
|
+
cast_dtype: Any = None # pl.DataType
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass
|
|
60
|
+
class SchemaEvolution:
|
|
61
|
+
"""Defines how raw data evolves into standard format.
|
|
62
|
+
|
|
63
|
+
Combines column renaming, parse-time types, post-load casting,
|
|
64
|
+
null value handling, and column exclusion into a single structure.
|
|
65
|
+
|
|
66
|
+
Attributes:
|
|
67
|
+
columns: Mapping from original column name to ColumnSpec.
|
|
68
|
+
null_values: Strings to treat as null at parse time.
|
|
69
|
+
drop: Column names to exclude from output.
|
|
70
|
+
parent: Optional parent schema for version inheritance.
|
|
71
|
+
|
|
72
|
+
Example:
|
|
73
|
+
>>> YLIN_V20251204 = SchemaEvolution(
|
|
74
|
+
... columns={
|
|
75
|
+
... "symbol": ColumnSpec(rename_to="ukey", parse_dtype=pl.Int64),
|
|
76
|
+
... "fillQty": ColumnSpec(
|
|
77
|
+
... rename_to="order_filled_qty",
|
|
78
|
+
... parse_dtype=pl.Float64,
|
|
79
|
+
... cast_dtype=pl.Int64,
|
|
80
|
+
... ),
|
|
81
|
+
... },
|
|
82
|
+
... null_values=["", "NA", "null"],
|
|
83
|
+
... drop=["#HFTORD"],
|
|
84
|
+
... )
|
|
85
|
+
"""
|
|
86
|
+
|
|
87
|
+
columns: dict[str, ColumnSpec] = field(default_factory=dict)
|
|
88
|
+
null_values: list[str] = field(default_factory=lambda: ["", "NA", "null"])
|
|
89
|
+
drop: list[str] = field(default_factory=list)
|
|
90
|
+
parent: SchemaEvolution | None = None
|
|
91
|
+
|
|
92
|
+
def get_schema_overrides(self) -> dict[str, Any]:
|
|
93
|
+
"""Get schema_overrides dict for pl.scan_csv().
|
|
94
|
+
|
|
95
|
+
Returns:
|
|
96
|
+
Mapping from original column name to Polars dtype.
|
|
97
|
+
"""
|
|
98
|
+
result = {}
|
|
99
|
+
if self.parent:
|
|
100
|
+
result.update(self.parent.get_schema_overrides())
|
|
101
|
+
for col_name, spec in self.columns.items():
|
|
102
|
+
if spec.parse_dtype is not None:
|
|
103
|
+
result[col_name] = spec.parse_dtype
|
|
104
|
+
return result
|
|
105
|
+
|
|
106
|
+
def get_rename_map(self) -> dict[str, str]:
|
|
107
|
+
"""Get rename mapping dict for df.rename().
|
|
108
|
+
|
|
109
|
+
Returns:
|
|
110
|
+
Mapping from original column name to new name.
|
|
111
|
+
"""
|
|
112
|
+
result = {}
|
|
113
|
+
if self.parent:
|
|
114
|
+
result.update(self.parent.get_rename_map())
|
|
115
|
+
for col_name, spec in self.columns.items():
|
|
116
|
+
if spec.rename_to is not None:
|
|
117
|
+
result[col_name] = spec.rename_to
|
|
118
|
+
return result
|
|
119
|
+
|
|
120
|
+
def get_cast_map(self) -> dict[str, Any]:
|
|
121
|
+
"""Get post-load cast mapping dict.
|
|
122
|
+
|
|
123
|
+
Returns:
|
|
124
|
+
Mapping from FINAL column name (after rename) to cast dtype.
|
|
125
|
+
"""
|
|
126
|
+
result = {}
|
|
127
|
+
if self.parent:
|
|
128
|
+
result.update(self.parent.get_cast_map())
|
|
129
|
+
for col_name, spec in self.columns.items():
|
|
130
|
+
if spec.cast_dtype is not None:
|
|
131
|
+
final_name = spec.rename_to or col_name
|
|
132
|
+
result[final_name] = spec.cast_dtype
|
|
133
|
+
return result
|
|
134
|
+
|
|
135
|
+
def get_drop_columns(self) -> set[str]:
|
|
136
|
+
"""Get set of columns to drop.
|
|
137
|
+
|
|
138
|
+
Returns:
|
|
139
|
+
Set of original column names to exclude.
|
|
140
|
+
"""
|
|
141
|
+
result = set()
|
|
142
|
+
if self.parent:
|
|
143
|
+
result.update(self.parent.get_drop_columns())
|
|
144
|
+
result.update(self.drop)
|
|
145
|
+
return result
|
|
146
|
+
|
|
147
|
+
def get_null_values(self) -> list[str]:
|
|
148
|
+
"""Get list of null value strings.
|
|
149
|
+
|
|
150
|
+
Returns:
|
|
151
|
+
List of strings to treat as null at parse time.
|
|
152
|
+
"""
|
|
153
|
+
return self.null_values
|
|
154
|
+
|
|
155
|
+
def validate(self) -> list[str]:
    """Validate schema configuration.

    Checks performed:
    - ``cast_dtype`` without ``parse_dtype``: Polars may infer the wrong
      type at parse time, making the later cast fail.
    - Duplicate rename targets: two source columns mapping to the same
      final name would collide after ``df.rename()`` (new check).

    Returns:
        List of human-readable warnings; empty if no issues found.
    """
    warnings = []
    # final name -> first source column that claimed it
    seen_targets: dict[str, str] = {}
    for col_name, spec in self.columns.items():
        if spec.cast_dtype is not None and spec.parse_dtype is None:
            warnings.append(
                f"{col_name}: cast_dtype without parse_dtype may fail "
                "if Polars infers wrong type"
            )
        final_name = spec.rename_to or col_name
        if final_name in seen_targets:
            warnings.append(
                f"{col_name}: renames to {final_name!r}, which collides "
                f"with {seen_targets[final_name]}"
            )
        else:
            seen_targets[final_name] = col_name
    return warnings
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# =============================================================================
# YLIN Trade Format (v2025-12-04)
# =============================================================================

# Schema for ylin's trade file format, version 2025-12-04.
# Maps raw camelCase column names to vizflow's snake_case conventions.
# "pl.*" dtypes are Polars dtypes (see validate()'s warning text); the module
# is imported at the top of this file, outside this chunk.
# NOTE(review): integer-valued quantity/size columns are parsed as Float64 and
# cast to Int64 post-load — presumably because the raw files may contain
# decimal-formatted integers; confirm against a sample file.
YLIN_V20251204 = SchemaEvolution(
    columns={
        # === Order columns (18) ===
        "symbol": ColumnSpec(rename_to="ukey", parse_dtype=pl.Int64),
        "orderId": ColumnSpec(rename_to="order_id", parse_dtype=pl.Int64),
        "orderSide": ColumnSpec(rename_to="order_side", parse_dtype=pl.String),
        "orderQty": ColumnSpec(
            rename_to="order_qty",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "orderPrice": ColumnSpec(rename_to="order_price", parse_dtype=pl.Float64),
        "priceType": ColumnSpec(rename_to="order_price_type", parse_dtype=pl.String),
        "fillQty": ColumnSpec(
            rename_to="order_filled_qty",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "fillPrice": ColumnSpec(rename_to="fill_price", parse_dtype=pl.Float64),
        # Timestamp columns: kept as raw Int64 (epoch-based values assumed —
        # unit/timezone not visible here; confirm with the producer).
        "lastExchangeTs": ColumnSpec(rename_to="update_exchange_ts", parse_dtype=pl.Int64),
        "createdTs": ColumnSpec(rename_to="create_exchange_ts", parse_dtype=pl.Int64),
        "localTs": ColumnSpec(rename_to="create_local_ts", parse_dtype=pl.Int64),
        "qtyAhead": ColumnSpec(
            rename_to="qty_ahead",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "qtyBehind": ColumnSpec(
            rename_to="qty_behind",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "orderStatus": ColumnSpec(rename_to="order_curr_state", parse_dtype=pl.String),
        "orderTposType": ColumnSpec(rename_to="order_tpos_type", parse_dtype=pl.String),
        "alphaTs": ColumnSpec(rename_to="alpha_ts", parse_dtype=pl.Int64),
        "event": ColumnSpec(rename_to="event_type", parse_dtype=pl.String),
        "cumFilledNotional": ColumnSpec(
            rename_to="order_filled_notional",
            parse_dtype=pl.Float64,
        ),
        # === Quote columns (20) ===
        # Price ladder: raw "bid"/"ask" are level 1, "bid2".."bid5" levels 2-5;
        # renamed to 0-indexed bid_px0..bid_px4 / ask_px0..ask_px4.
        "bid": ColumnSpec(rename_to="bid_px0", parse_dtype=pl.Float64),
        "bid2": ColumnSpec(rename_to="bid_px1", parse_dtype=pl.Float64),
        "bid3": ColumnSpec(rename_to="bid_px2", parse_dtype=pl.Float64),
        "bid4": ColumnSpec(rename_to="bid_px3", parse_dtype=pl.Float64),
        "bid5": ColumnSpec(rename_to="bid_px4", parse_dtype=pl.Float64),
        "ask": ColumnSpec(rename_to="ask_px0", parse_dtype=pl.Float64),
        "ask2": ColumnSpec(rename_to="ask_px1", parse_dtype=pl.Float64),
        "ask3": ColumnSpec(rename_to="ask_px2", parse_dtype=pl.Float64),
        "ask4": ColumnSpec(rename_to="ask_px3", parse_dtype=pl.Float64),
        "ask5": ColumnSpec(rename_to="ask_px4", parse_dtype=pl.Float64),
        # Size ladder, same 0-indexing convention as prices.
        "bsize": ColumnSpec(
            rename_to="bid_size0",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "bsize2": ColumnSpec(
            rename_to="bid_size1",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "bsize3": ColumnSpec(
            rename_to="bid_size2",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "bsize4": ColumnSpec(
            rename_to="bid_size3",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "bsize5": ColumnSpec(
            rename_to="bid_size4",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "asize": ColumnSpec(
            rename_to="ask_size0",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "asize2": ColumnSpec(
            rename_to="ask_size1",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "asize3": ColumnSpec(
            rename_to="ask_size2",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "asize4": ColumnSpec(
            rename_to="ask_size3",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "asize5": ColumnSpec(
            rename_to="ask_size4",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        # NOTE(review): parsed as String rather than Boolean — raw encoding of
        # this flag is not visible here; confirm whether it is "true"/"false".
        "isRebasedQuote": ColumnSpec(rename_to="is_rebased", parse_dtype=pl.String),
        "quoteSeqNum": ColumnSpec(rename_to="seq_num", parse_dtype=pl.Int64),
        "quoteTs": ColumnSpec(rename_to="timestamp", parse_dtype=pl.Int64),
        # === Position columns (11) ===
        "startPos": ColumnSpec(
            rename_to="init_net_pos",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "pos": ColumnSpec(
            rename_to="current_net_pos",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "realizedPos": ColumnSpec(
            rename_to="current_realized_net_pos",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "openBuyPos": ColumnSpec(
            rename_to="open_buy",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "openSellPos": ColumnSpec(
            rename_to="open_sell",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "cumBuy": ColumnSpec(
            rename_to="cum_buy",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "cumSell": ColumnSpec(
            rename_to="cum_sell",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "cashFlow": ColumnSpec(rename_to="cash_flow", parse_dtype=pl.Float64),
        "frozenCash": ColumnSpec(rename_to="frozen_cash", parse_dtype=pl.Float64),
        "globalCumBuyNotional": ColumnSpec(
            rename_to="cum_buy_filled_notional",
            parse_dtype=pl.Float64,
        ),
        "globalCumSellNotional": ColumnSpec(
            rename_to="cum_sell_filled_notional",
            parse_dtype=pl.Float64,
        ),
    },
    # Strings treated as null at parse time.
    null_values=["", "NA", "null", "NULL"],
    # "#HFTORD" — presumably a record-type marker column in the raw files;
    # dropped before processing. TODO confirm against a sample file.
    drop=["#HFTORD"],
)
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
# =============================================================================
# JYAO Alpha Format (v2025-11-14)
# =============================================================================

# Schema for jyao's alpha file format, version 2025-11-14.
# Maps raw PascalCase quote/time columns and ad-hoc predictor names to
# vizflow's snake_case conventions. "pl.*" dtypes are Polars dtypes; the
# module is imported at the top of this file, outside this chunk.
JYAO_V20251114 = SchemaEvolution(
    columns={
        # Symbol column
        "ukey": ColumnSpec(parse_dtype=pl.Int64),  # No rename, just parse type
        # Quote columns (top-of-book only in this format)
        "BidPrice1": ColumnSpec(rename_to="bid_px0", parse_dtype=pl.Float64),
        "AskPrice1": ColumnSpec(rename_to="ask_px0", parse_dtype=pl.Float64),
        # Sizes parsed as Float64 then cast to Int64 post-load — presumably
        # because the raw files may format integers with decimals; confirm.
        "BidVolume1": ColumnSpec(
            rename_to="bid_size0",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        "AskVolume1": ColumnSpec(
            rename_to="ask_size0",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        # Time columns (raw Int64 values; epoch unit/timezone not visible
        # here — confirm with the producer)
        "TimeStamp": ColumnSpec(rename_to="timestamp", parse_dtype=pl.Int64),
        "GlobalExTime": ColumnSpec(rename_to="global_exchange_ts", parse_dtype=pl.Int64),
        "DataDate": ColumnSpec(rename_to="data_date", parse_dtype=pl.String),
        # Volume
        "Volume": ColumnSpec(
            rename_to="volume",
            parse_dtype=pl.Float64,
            cast_dtype=pl.Int64,
        ),
        # Predictor columns (x_* = alpha predictions)
        # Naming rule for the horizon suffix: ≤60s → s, >60s → m
        # NOTE(review): "alpha1"/"alpha2" → 3m/30m horizons presumably come
        # from the upstream producer's spec; verify before relying on them.
        "x10s": ColumnSpec(rename_to="x_10s", parse_dtype=pl.Float64),
        "x60s": ColumnSpec(rename_to="x_60s", parse_dtype=pl.Float64),
        "alpha1": ColumnSpec(rename_to="x_3m", parse_dtype=pl.Float64),
        "alpha2": ColumnSpec(rename_to="x_30m", parse_dtype=pl.Float64),
    },
    # Strings treated as null at parse time.
    null_values=["", "NA"],
)
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
# =============================================================================
# Schema Registry
# =============================================================================

# Registry of all known schema versions, keyed by lowercase name.
# Looked up (case-insensitively) via get_schema(); register new format
# versions here.
SCHEMAS: dict[str, SchemaEvolution] = {
    "ylin_v20251204": YLIN_V20251204,
    "jyao_v20251114": JYAO_V20251114,
}
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def get_schema(name: str | None) -> SchemaEvolution | None:
    """Look up a registered SchemaEvolution by name (case-insensitive).

    Args:
        name: Schema name (e.g., "ylin_v20251204") or None.

    Returns:
        SchemaEvolution or None if name is None/empty or not registered.
    """
    return SCHEMAS.get(name.lower()) if name else None
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
vizflow/__init__.py,sha256=nmZ9_4DkT6ndFefemioNGNw9ELWFCQsQASxBxLHoAZs,529
|
|
2
|
+
vizflow/config.py,sha256=y4vRvjVTa1H5AdQf0q_XhYr-3EBDJst8BJq52ODN3uk,6456
|
|
3
|
+
vizflow/io.py,sha256=eheqyLHGiSh69erxMk98FK-GYycbSheqkrIYRYGFy3A,7687
|
|
4
|
+
vizflow/market.py,sha256=MtNz_nnZxC66Aq-i2PXEwaFCTknijFWYZUUv6798k2s,2493
|
|
5
|
+
vizflow/ops.py,sha256=4UwxOTPhvZ1_4PI3pcxbXfLAYsn1Ecj6nyBtBBr7KS8,7761
|
|
6
|
+
vizflow/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
vizflow/schema_evolution.py,sha256=7ZgybN7aS6SWxteTnX_uXZWn-IfosIPzr42_f3BlFv8,13909
|
|
8
|
+
vizflow-0.5.2.dist-info/METADATA,sha256=DQdMKOm4yHQ4REnaOptw5avVjlaCpuXbQLFwHbH9_Gk,388
|
|
9
|
+
vizflow-0.5.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
10
|
+
vizflow-0.5.2.dist-info/RECORD,,
|
vizflow/presets.py
DELETED
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
"""Column mapping presets for VizFlow."""
|
|
2
|
-
|
|
3
|
-
# ylin's trade format (v2025-12-04)
|
|
4
|
-
YLIN_V20251204 = {
|
|
5
|
-
# Order columns (18)
|
|
6
|
-
"symbol": "ukey",
|
|
7
|
-
"orderId": "order_id",
|
|
8
|
-
"orderSide": "order_side",
|
|
9
|
-
"orderQty": "order_qty",
|
|
10
|
-
"orderPrice": "order_price",
|
|
11
|
-
"priceType": "order_price_type",
|
|
12
|
-
"fillQty": "order_filled_qty",
|
|
13
|
-
"fillPrice": "fill_price",
|
|
14
|
-
"lastExchangeTs": "update_exchange_ts",
|
|
15
|
-
"createdTs": "create_exchange_ts",
|
|
16
|
-
"localTs": "create_local_ts",
|
|
17
|
-
"qtyAhead": "qty_ahead",
|
|
18
|
-
"qtyBehind": "qty_behind",
|
|
19
|
-
"orderStatus": "order_curr_state",
|
|
20
|
-
"orderTposType": "order_tpos_type",
|
|
21
|
-
"alphaTs": "alpha_ts",
|
|
22
|
-
"event": "event_type",
|
|
23
|
-
"cumFilledNotional": "order_filled_notional",
|
|
24
|
-
# Quote columns (15)
|
|
25
|
-
"bid": "bid_px0",
|
|
26
|
-
"bid2": "bid_px1",
|
|
27
|
-
"bid3": "bid_px2",
|
|
28
|
-
"bid4": "bid_px3",
|
|
29
|
-
"bid5": "bid_px4",
|
|
30
|
-
"ask": "ask_px0",
|
|
31
|
-
"ask2": "ask_px1",
|
|
32
|
-
"ask3": "ask_px2",
|
|
33
|
-
"ask4": "ask_px3",
|
|
34
|
-
"ask5": "ask_px4",
|
|
35
|
-
"bsize": "bid_size0",
|
|
36
|
-
"bsize2": "bid_size1",
|
|
37
|
-
"bsize3": "bid_size2",
|
|
38
|
-
"bsize4": "bid_size3",
|
|
39
|
-
"bsize5": "bid_size4",
|
|
40
|
-
"asize": "ask_size0",
|
|
41
|
-
"asize2": "ask_size1",
|
|
42
|
-
"asize3": "ask_size2",
|
|
43
|
-
"asize4": "ask_size3",
|
|
44
|
-
"asize5": "ask_size4",
|
|
45
|
-
"isRebasedQuote": "is_rebased",
|
|
46
|
-
"quoteSeqNum": "seq_num",
|
|
47
|
-
"quoteTs": "timestamp",
|
|
48
|
-
# Position columns (11)
|
|
49
|
-
"startPos": "init_net_pos",
|
|
50
|
-
"pos": "current_net_pos",
|
|
51
|
-
"realizedPos": "current_realized_net_pos",
|
|
52
|
-
"openBuyPos": "open_buy",
|
|
53
|
-
"openSellPos": "open_sell",
|
|
54
|
-
"cumBuy": "cum_buy",
|
|
55
|
-
"cumSell": "cum_sell",
|
|
56
|
-
"cashFlow": "cash_flow",
|
|
57
|
-
"frozenCash": "frozen_cash",
|
|
58
|
-
"globalCumBuyNotional": "cum_buy_filled_notional",
|
|
59
|
-
"globalCumSellNotional": "cum_sell_filled_notional",
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
# jyao's alpha format (v2025-11-14)
|
|
63
|
-
JYAO_V20251114 = {
|
|
64
|
-
# Quote columns
|
|
65
|
-
"BidPrice1": "bid_px0",
|
|
66
|
-
"AskPrice1": "ask_px0",
|
|
67
|
-
"BidVolume1": "bid_size0",
|
|
68
|
-
"AskVolume1": "ask_size0",
|
|
69
|
-
# Time columns
|
|
70
|
-
"TimeStamp": "timestamp",
|
|
71
|
-
"GlobalExTime": "global_exchange_ts",
|
|
72
|
-
"DataDate": "data_date",
|
|
73
|
-
# Volume
|
|
74
|
-
"Volume": "volume",
|
|
75
|
-
# Predictor columns (x_* = alpha predictions)
|
|
76
|
-
# Rule: ≤60s → s, >60s → m
|
|
77
|
-
"x10s": "x_10s",
|
|
78
|
-
"x60s": "x_60s",
|
|
79
|
-
"alpha1": "x_3m",
|
|
80
|
-
"alpha2": "x_30m",
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
# Preset registry for dynamic lookup
|
|
84
|
-
PRESETS: dict[str, dict[str, str]] = {
|
|
85
|
-
"ylin_v20251204": YLIN_V20251204,
|
|
86
|
-
"jyao_v20251114": JYAO_V20251114,
|
|
87
|
-
}
|
vizflow-0.5.0.dist-info/RECORD
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
vizflow/__init__.py,sha256=ZIMYQ-Yzh3eEAd7MSqNA00SlSuj45bEE6NsXM9Qc6O0,496
|
|
2
|
-
vizflow/config.py,sha256=zSZnhdHzgXSqhDenHcHKm4CDGrMpKAdkNNRoUYYF1uc,6530
|
|
3
|
-
vizflow/io.py,sha256=zmN0fFQOTmSBEBKangMExr0Q5mC2gajZM6GgdAyWkw4,8824
|
|
4
|
-
vizflow/market.py,sha256=MtNz_nnZxC66Aq-i2PXEwaFCTknijFWYZUUv6798k2s,2493
|
|
5
|
-
vizflow/ops.py,sha256=6hKOjJowFrw1b6z4y8Liea9KTp8Fgy6kCGPZ6t15PVk,7426
|
|
6
|
-
vizflow/presets.py,sha256=h91NZoOH4YAx0bbsaNigECf9WOcWh1QZavguunWkaLE,2452
|
|
7
|
-
vizflow/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
-
vizflow-0.5.0.dist-info/METADATA,sha256=cIIvBPZo2U6Sp46Wxgyu_tCVPWF4DGsgvapbavEBGl8,388
|
|
9
|
-
vizflow-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
10
|
-
vizflow-0.5.0.dist-info/RECORD,,
|
|
File without changes
|