vizflow 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vizflow/__init__.py CHANGED
@@ -5,13 +5,11 @@ Usage:
5
5
  import vizflow as vf
6
6
  """
7
7
 
8
- __version__ = "0.5.1"
8
+ __version__ = "0.5.2"
9
9
 
10
- from .config import ColumnSchema, Config, get_config, set_config
10
+ from .config import Config, get_config, set_config
11
11
  from .io import (
12
- load_alpha,
13
12
  load_calendar,
14
- load_trade,
15
13
  scan_alpha,
16
14
  scan_alphas,
17
15
  scan_trade,
@@ -19,4 +17,11 @@ from .io import (
19
17
  )
20
18
  from .market import CN, CRYPTO, Market, Session
21
19
  from .ops import aggregate, bin, forward_return, parse_time
22
- from .presets import JYAO_V20251114, PRESETS, YLIN_V20251204
20
+ from .schema_evolution import (
21
+ JYAO_V20251114,
22
+ SCHEMAS,
23
+ YLIN_V20251204,
24
+ ColumnSpec,
25
+ SchemaEvolution,
26
+ get_schema,
27
+ )
vizflow/config.py CHANGED
@@ -4,7 +4,10 @@ from __future__ import annotations
4
4
 
5
5
  from dataclasses import dataclass, field
6
6
  from pathlib import Path
7
- from typing import Any
7
+ from typing import TYPE_CHECKING, Any
8
+
9
+ if TYPE_CHECKING:
10
+ from .schema_evolution import SchemaEvolution
8
11
 
9
12
  # Global config instance
10
13
  _global_config: Config | None = None
@@ -25,21 +28,6 @@ def _validate_date(date: str) -> None:
25
28
  )
26
29
 
27
30
 
28
- @dataclass
29
- class ColumnSchema:
30
- """Schema for a column with type casting.
31
-
32
- Attributes:
33
- cast_to: Target type after casting (e.g. pl.Int64)
34
-
35
- Example:
36
- # Handle float precision errors: 1.00000002 → 1
37
- ColumnSchema(cast_to=pl.Int64)
38
- """
39
-
40
- cast_to: Any # pl.DataType, but avoid import for now
41
-
42
-
43
31
  @dataclass
44
32
  class Config:
45
33
  """Central configuration for a pipeline run.
@@ -53,14 +41,20 @@ class Config:
53
41
  replay_dir: Directory for FIFO replay output (materialization 1)
54
42
  aggregate_dir: Directory for aggregation output (materialization 2)
55
43
  market: Market identifier, e.g. "CN"
56
- alpha_columns: Mapping from semantic names to alpha column names
57
- trade_columns: Mapping from semantic names to trade column names
58
- alpha_schema: Schema evolution for alpha columns
59
- trade_schema: Schema evolution for trade columns
44
+ trade_schema: Schema evolution for trade data (name or SchemaEvolution)
45
+ alpha_schema: Schema evolution for alpha data (name or SchemaEvolution)
60
46
  binwidths: Mapping from column names to bin widths
61
47
  group_by: Columns to group by in aggregation
62
48
  horizons: List of forward return horizons in seconds
63
49
  time_cutoff: Optional time cutoff (e.g. 143000000 for 14:30:00)
50
+
51
+ Example:
52
+ >>> config = vf.Config(
53
+ ... trade_dir=Path("data/ylin/trade"),
54
+ ... trade_pattern="{date}.meords",
55
+ ... trade_schema="ylin_v20251204", # Use registered schema by name
56
+ ... market="CN",
57
+ ... )
64
58
  """
65
59
 
66
60
  # === Input Paths ===
@@ -77,17 +71,10 @@ class Config:
77
71
  # === Market ===
78
72
  market: str = "CN"
79
73
 
80
- # === Column Mapping ===
81
- alpha_columns: dict[str, str] = field(default_factory=dict)
82
- trade_columns: dict[str, str] = field(default_factory=dict)
83
-
84
74
  # === Schema Evolution ===
85
- alpha_schema: dict[str, ColumnSchema] = field(default_factory=dict)
86
- trade_schema: dict[str, ColumnSchema] = field(default_factory=dict)
87
-
88
- # === Column Mapping Presets ===
89
- trade_preset: str | None = None # "ylin" or None
90
- alpha_preset: str | None = None # "jyao_v20251114" or None
75
+ # Can be a string (schema name) or SchemaEvolution instance
76
+ trade_schema: str | SchemaEvolution | None = None
77
+ alpha_schema: str | SchemaEvolution | None = None
91
78
 
92
79
  # === Aggregation ===
93
80
  binwidths: dict[str, float] = field(default_factory=dict)
@@ -114,20 +101,6 @@ class Config:
114
101
  if isinstance(self.aggregate_dir, str):
115
102
  self.aggregate_dir = Path(self.aggregate_dir)
116
103
 
117
- def col(self, semantic: str, source: str = "trade") -> str:
118
- """Get actual column name from semantic name.
119
-
120
- Args:
121
- semantic: Semantic column name (e.g. "timestamp", "price")
122
- source: "alpha" or "trade"
123
-
124
- Returns:
125
- Actual column name, or the semantic name if no mapping exists
126
- """
127
- if source == "alpha":
128
- return self.alpha_columns.get(semantic, semantic)
129
- return self.trade_columns.get(semantic, semantic)
130
-
131
104
  def get_alpha_path(self, date: str) -> Path:
132
105
  """Get alpha file path for a date.
133
106
 
vizflow/io.py CHANGED
@@ -4,108 +4,40 @@ from __future__ import annotations
4
4
 
5
5
  import polars as pl
6
6
 
7
- from .config import ColumnSchema, Config, get_config
7
+ from .config import Config, get_config
8
+ from .schema_evolution import SchemaEvolution, get_schema
8
9
 
9
10
 
10
- def _apply_schema(df: pl.LazyFrame, schema: dict[str, ColumnSchema]) -> pl.LazyFrame:
11
- """Apply type casting from schema definition.
11
+ def _resolve_schema(
12
+ schema_ref: str | SchemaEvolution | None,
13
+ ) -> SchemaEvolution | None:
14
+ """Resolve schema reference to SchemaEvolution instance.
12
15
 
13
16
  Args:
14
- df: LazyFrame to apply schema to
15
- schema: Mapping from column names to ColumnSchema
17
+ schema_ref: Schema name string, SchemaEvolution instance, or None.
16
18
 
17
19
  Returns:
18
- LazyFrame with type casts applied
20
+ SchemaEvolution instance or None.
19
21
  """
20
- for col_name, col_schema in schema.items():
21
- df = df.with_columns(pl.col(col_name).cast(col_schema.cast_to))
22
- return df
23
-
24
-
25
- def load_alpha(date: str, config: Config | None = None) -> pl.LazyFrame:
26
- """Load alpha data for a date with automatic schema evolution.
27
-
28
- Args:
29
- date: Date string, e.g. "20241001"
30
- config: Config to use, or get_config() if None
31
-
32
- Returns:
33
- LazyFrame with schema evolution applied
34
-
35
- Example:
36
- >>> config = vf.Config(
37
- ... output_dir=Path("/data/output"),
38
- ... alpha_dir=Path("/data/alpha"),
39
- ... alpha_schema={"qty": vf.ColumnSchema(cast_to=pl.Int64)}
40
- ... )
41
- >>> vf.set_config(config)
42
- >>> alpha = vf.load_alpha("20241001")
43
- """
44
- config = config or get_config()
45
- path = config.get_alpha_path(date)
46
- df = pl.scan_ipc(path)
47
- return _apply_schema(df, config.alpha_schema)
22
+ if schema_ref is None:
23
+ return None
24
+ if isinstance(schema_ref, SchemaEvolution):
25
+ return schema_ref
26
+ return get_schema(schema_ref)
48
27
 
49
28
 
50
- def load_trade(date: str, config: Config | None = None) -> pl.LazyFrame:
51
- """Load trade data for a date with automatic schema evolution.
29
+ def _scan_file(
30
+ path,
31
+ schema: SchemaEvolution | None = None,
32
+ ) -> pl.LazyFrame:
33
+ """Scan a file based on its extension with optional schema.
52
34
 
53
35
  Args:
54
- date: Date string, e.g. "20241001"
55
- config: Config to use, or get_config() if None
36
+ path: Path to file.
37
+ schema: Optional SchemaEvolution for CSV parsing options.
56
38
 
57
39
  Returns:
58
- LazyFrame with schema evolution applied
59
-
60
- Example:
61
- >>> config = vf.Config(
62
- ... output_dir=Path("/data/output"),
63
- ... trade_dir=Path("/data/trade"),
64
- ... trade_schema={"qty": vf.ColumnSchema(cast_to=pl.Int64)}
65
- ... )
66
- >>> vf.set_config(config)
67
- >>> trade = vf.load_trade("20241001") # qty: 1.00000002 → 1
68
- """
69
- config = config or get_config()
70
- path = config.get_trade_path(date)
71
- df = pl.scan_ipc(path)
72
- return _apply_schema(df, config.trade_schema)
73
-
74
-
75
- def load_calendar(config: Config | None = None) -> pl.DataFrame:
76
- """Load trading calendar.
77
-
78
- Args:
79
- config: Config to use, or get_config() if None
80
-
81
- Returns:
82
- DataFrame with date, prev_date, next_date columns
83
-
84
- Raises:
85
- ValueError: If calendar_path is not set in config
86
-
87
- Example:
88
- >>> config = vf.Config(
89
- ... output_dir=Path("/data/output"),
90
- ... calendar_path=Path("/data/calendar.parquet")
91
- ... )
92
- >>> vf.set_config(config)
93
- >>> calendar = vf.load_calendar()
94
- """
95
- config = config or get_config()
96
- if config.calendar_path is None:
97
- raise ValueError("calendar_path is not set in Config")
98
- return pl.read_parquet(config.calendar_path)
99
-
100
-
101
- def _scan_file(path) -> pl.LazyFrame:
102
- """Scan a file based on its extension.
103
-
104
- Args:
105
- path: Path to file
106
-
107
- Returns:
108
- LazyFrame from the file
40
+ LazyFrame from the file.
109
41
 
110
42
  Supported formats:
111
43
  - .feather, .ipc, .arrow: IPC format (pl.scan_ipc)
@@ -117,7 +49,15 @@ def _scan_file(path) -> pl.LazyFrame:
117
49
  if suffix in ("feather", "ipc", "arrow"):
118
50
  return pl.scan_ipc(path)
119
51
  elif suffix in ("csv", "meords"):
120
- return pl.scan_csv(path)
52
+ csv_kwargs = {}
53
+ if schema:
54
+ schema_overrides = schema.get_schema_overrides()
55
+ if schema_overrides:
56
+ csv_kwargs["schema_overrides"] = schema_overrides
57
+ null_values = schema.get_null_values()
58
+ if null_values:
59
+ csv_kwargs["null_values"] = null_values
60
+ return pl.scan_csv(path, **csv_kwargs)
121
61
  elif suffix == "parquet":
122
62
  return pl.scan_parquet(path)
123
63
  else:
@@ -127,50 +67,96 @@ def _scan_file(path) -> pl.LazyFrame:
127
67
  )
128
68
 
129
69
 
70
+ def _apply_schema_evolution(
71
+ df: pl.LazyFrame,
72
+ schema: SchemaEvolution,
73
+ ) -> pl.LazyFrame:
74
+ """Apply full schema evolution: drop, rename, cast.
75
+
76
+ Args:
77
+ df: LazyFrame to transform.
78
+ schema: SchemaEvolution with transformation rules.
79
+
80
+ Returns:
81
+ Transformed LazyFrame.
82
+ """
83
+ existing = set(df.collect_schema().names())
84
+
85
+ # Step 1: Drop excluded columns
86
+ drop_cols = schema.get_drop_columns()
87
+ to_drop = [c for c in drop_cols if c in existing]
88
+ if to_drop:
89
+ df = df.drop(to_drop)
90
+ existing -= set(to_drop)
91
+
92
+ # Step 2: Rename columns
93
+ rename_map = schema.get_rename_map()
94
+ to_rename = {k: v for k, v in rename_map.items() if k in existing}
95
+ if to_rename:
96
+ df = df.rename(to_rename)
97
+ # Update existing names after rename
98
+ for old, new in to_rename.items():
99
+ existing.discard(old)
100
+ existing.add(new)
101
+
102
+ # Step 3: Cast columns (using FINAL names after rename)
103
+ cast_map = schema.get_cast_map()
104
+ for col_name, dtype in cast_map.items():
105
+ if col_name in existing:
106
+ df = df.with_columns(pl.col(col_name).cast(dtype))
107
+
108
+ return df
109
+
110
+
130
111
  def scan_trade(date: str, config: Config | None = None) -> pl.LazyFrame:
131
- """Scan single date trade file with column mapping.
112
+ """Scan single date trade file with schema evolution.
132
113
 
133
- Supports both IPC/feather format and CSV format (including .meords files).
114
+ Supports IPC/feather, CSV (including .meords), and Parquet formats.
134
115
 
135
116
  Args:
136
117
  date: Date string, e.g. "20241001"
137
118
  config: Config to use, or get_config() if None
138
119
 
139
120
  Returns:
140
- LazyFrame with column mapping and schema evolution applied
121
+ LazyFrame with schema evolution applied
141
122
 
142
123
  Example:
143
124
  >>> config = vf.Config(
144
- ... trade_dir=Path("/data/yuanzhao/"),
125
+ ... trade_dir=Path("/data/ylin/trade"),
145
126
  ... trade_pattern="{date}.meords",
146
- ... trade_preset="ylin_v20251204",
127
+ ... trade_schema="ylin_v20251204",
147
128
  ... )
148
129
  >>> vf.set_config(config)
149
130
  >>> df = vf.scan_trade("20241001")
150
131
  """
151
132
  config = config or get_config()
152
133
  path = config.get_trade_path(date)
153
- df = _scan_file(path)
154
- return _apply_trade_mapping(df, config)
134
+ schema = _resolve_schema(config.trade_schema)
135
+
136
+ df = _scan_file(path, schema=schema)
137
+ if schema:
138
+ df = _apply_schema_evolution(df, schema)
139
+
140
+ return df
155
141
 
156
142
 
157
143
  def scan_trades(config: Config | None = None) -> pl.LazyFrame:
158
- """Scan all trade files with column mapping.
144
+ """Scan all trade files with schema evolution.
159
145
 
160
146
  Args:
161
147
  config: Config to use, or get_config() if None
162
148
 
163
149
  Returns:
164
- LazyFrame with column mapping and schema evolution applied
150
+ LazyFrame with schema evolution applied
165
151
 
166
152
  Raises:
167
153
  ValueError: If trade_dir is not set or no files found
168
154
 
169
155
  Example:
170
156
  >>> config = vf.Config(
171
- ... trade_dir=Path("/data/yuanzhao/"),
172
- ... trade_pattern="{date}.feather",
173
- ... trade_preset="ylin_v20251204",
157
+ ... trade_dir=Path("/data/ylin/trade"),
158
+ ... trade_pattern="{date}.meords",
159
+ ... trade_schema="ylin_v20251204",
174
160
  ... )
175
161
  >>> vf.set_config(config)
176
162
  >>> df = vf.scan_trades()
@@ -184,82 +170,56 @@ def scan_trades(config: Config | None = None) -> pl.LazyFrame:
184
170
  if not files:
185
171
  raise ValueError(f"No files found matching {pattern} in {config.trade_dir}")
186
172
 
173
+ schema = _resolve_schema(config.trade_schema)
174
+
187
175
  # Concatenate all files using lazy scanning
188
- dfs = [_scan_file(f) for f in files]
176
+ dfs = [_scan_file(f, schema=schema) for f in files]
189
177
  df = pl.concat(dfs)
190
- return _apply_trade_mapping(df, config)
191
-
192
-
193
- def _apply_trade_mapping(df: pl.LazyFrame, config: Config) -> pl.LazyFrame:
194
- """Apply column rename + schema evolution for trade data."""
195
- df = _apply_rename(df, config.trade_preset)
196
- return _apply_schema(df, config.trade_schema)
197
-
198
-
199
- def _apply_alpha_mapping(df: pl.LazyFrame, config: Config) -> pl.LazyFrame:
200
- """Apply column rename + schema evolution for alpha data."""
201
- df = _apply_rename(df, config.alpha_preset)
202
- return _apply_schema(df, config.alpha_schema)
203
-
204
-
205
- def _apply_rename(df: pl.LazyFrame, preset: str | None) -> pl.LazyFrame:
206
- """Apply column rename from preset name.
207
-
208
- Args:
209
- df: LazyFrame to rename columns
210
- preset: Preset name (e.g., "ylin", "jyao_v20251114") or None
211
- """
212
- # Drop record type prefix column if present (from CSV files)
213
- existing = set(df.collect_schema().names())
214
- if "#HFTORD" in existing:
215
- df = df.drop("#HFTORD")
216
- existing.remove("#HFTORD")
217
178
 
218
- # Get rename map from preset
219
- rename_map = _get_rename_map(preset)
220
-
221
- if rename_map:
222
- existing = set(df.collect_schema().names())
223
- to_rename = {k: v for k, v in rename_map.items() if k in existing}
224
- if to_rename:
225
- df = df.rename(to_rename)
179
+ if schema:
180
+ df = _apply_schema_evolution(df, schema)
226
181
 
227
182
  return df
228
183
 
229
184
 
230
185
  def scan_alpha(date: str, config: Config | None = None) -> pl.LazyFrame:
231
- """Scan single date alpha file with column mapping.
186
+ """Scan single date alpha file with schema evolution.
232
187
 
233
188
  Args:
234
189
  date: Date string, e.g. "20241001"
235
190
  config: Config to use, or get_config() if None
236
191
 
237
192
  Returns:
238
- LazyFrame with column mapping and schema evolution applied
193
+ LazyFrame with schema evolution applied
239
194
 
240
195
  Example:
241
196
  >>> config = vf.Config(
242
197
  ... alpha_dir=Path("/data/jyao/alpha"),
243
198
  ... alpha_pattern="alpha_{date}.feather",
244
- ... alpha_preset="jyao_v20251114",
199
+ ... alpha_schema="jyao_v20251114",
245
200
  ... )
246
201
  >>> vf.set_config(config)
247
202
  >>> df = vf.scan_alpha("20251114")
248
203
  """
249
204
  config = config or get_config()
250
205
  path = config.get_alpha_path(date)
251
- df = _scan_file(path)
252
- return _apply_alpha_mapping(df, config)
206
+ schema = _resolve_schema(config.alpha_schema)
207
+
208
+ df = _scan_file(path, schema=schema)
209
+ if schema:
210
+ df = _apply_schema_evolution(df, schema)
211
+
212
+ return df
253
213
 
254
214
 
255
215
  def scan_alphas(config: Config | None = None) -> pl.LazyFrame:
256
- """Scan all alpha files with column mapping.
216
+ """Scan all alpha files with schema evolution.
257
217
 
258
218
  Args:
259
219
  config: Config to use, or get_config() if None
260
220
 
261
221
  Returns:
262
- LazyFrame with column mapping and schema evolution applied
222
+ LazyFrame with schema evolution applied
263
223
 
264
224
  Raises:
265
225
  ValueError: If alpha_dir is not set or no files found
@@ -273,22 +233,37 @@ def scan_alphas(config: Config | None = None) -> pl.LazyFrame:
273
233
  if not files:
274
234
  raise ValueError(f"No files found matching {pattern} in {config.alpha_dir}")
275
235
 
276
- dfs = [_scan_file(f) for f in files]
236
+ schema = _resolve_schema(config.alpha_schema)
237
+
238
+ dfs = [_scan_file(f, schema=schema) for f in files]
277
239
  df = pl.concat(dfs)
278
- return _apply_alpha_mapping(df, config)
279
240
 
241
+ if schema:
242
+ df = _apply_schema_evolution(df, schema)
280
243
 
281
- def _get_rename_map(preset: str | None) -> dict[str, str]:
282
- """Get rename map from preset name.
244
+ return df
245
+
246
+
247
+ def load_calendar(config: Config | None = None) -> pl.DataFrame:
248
+ """Load trading calendar.
283
249
 
284
250
  Args:
285
- preset: Preset name (e.g., "ylin_v20251204", "jyao_v20251114") or None
251
+ config: Config to use, or get_config() if None
286
252
 
287
253
  Returns:
288
- Dict mapping old column names to new names
289
- """
290
- if not preset:
291
- return {}
292
- from .presets import PRESETS
254
+ DataFrame with date, prev_date, next_date columns
293
255
 
294
- return PRESETS.get(preset.lower(), {})
256
+ Raises:
257
+ ValueError: If calendar_path is not set in config
258
+
259
+ Example:
260
+ >>> config = vf.Config(
261
+ ... calendar_path=Path("/data/calendar.parquet")
262
+ ... )
263
+ >>> vf.set_config(config)
264
+ >>> calendar = vf.load_calendar()
265
+ """
266
+ config = config or get_config()
267
+ if config.calendar_path is None:
268
+ raise ValueError("calendar_path is not set in Config")
269
+ return pl.read_parquet(config.calendar_path)
@@ -0,0 +1,394 @@
1
+ """Schema Evolution for VizFlow.
2
+
3
+ This module defines how raw data evolves into standard format through:
4
+ - Column renaming (raw names → standard names)
5
+ - Parse-time type specification (for CSV parsing)
6
+ - Post-load type casting (e.g., Float64 → Int64)
7
+ - Null value handling
8
+ - Column exclusion
9
+
10
+ Example:
11
+ >>> schema = SchemaEvolution(
12
+ ... columns={
13
+ ... "fillQty": ColumnSpec(
14
+ ... rename_to="order_filled_qty",
15
+ ... parse_dtype=pl.Float64, # Parse as float (catch decimals)
16
+ ... cast_dtype=pl.Int64, # Then cast to int
17
+ ... ),
18
+ ... },
19
+ ... null_values=["", "NA"],
20
+ ... drop=["#HFTORD"],
21
+ ... )
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from dataclasses import dataclass, field
27
+ from typing import Any
28
+
29
+ import polars as pl
30
+
31
+
32
+ @dataclass
33
+ class ColumnSpec:
34
+ """Specification for a single column's parsing and transformation.
35
+
36
+ Attributes:
37
+ rename_to: Standard column name after rename. None keeps original name.
38
+ parse_dtype: Type to use when parsing CSV. None uses Polars inference.
39
+ cast_dtype: Final type after post-load casting. None keeps parse type.
40
+
41
+ Examples:
42
+ # Rename only (most common)
43
+ ColumnSpec(rename_to="ukey")
44
+
45
+ # Parse as Float64, cast to Int64 (handle decimal errors in qty)
46
+ ColumnSpec(rename_to="order_filled_qty",
47
+ parse_dtype=pl.Float64,
48
+ cast_dtype=pl.Int64)
49
+
50
+ # Parse as specific type, no cast (trusted integer)
51
+ ColumnSpec(rename_to="timestamp", parse_dtype=pl.Int64)
52
+ """
53
+
54
+ rename_to: str | None = None
55
+ parse_dtype: Any = None # pl.DataType
56
+ cast_dtype: Any = None # pl.DataType
57
+
58
+
59
+ @dataclass
60
+ class SchemaEvolution:
61
+ """Defines how raw data evolves into standard format.
62
+
63
+ Combines column renaming, parse-time types, post-load casting,
64
+ null value handling, and column exclusion into a single structure.
65
+
66
+ Attributes:
67
+ columns: Mapping from original column name to ColumnSpec.
68
+ null_values: Strings to treat as null at parse time.
69
+ drop: Column names to exclude from output.
70
+ parent: Optional parent schema for version inheritance.
71
+
72
+ Example:
73
+ >>> YLIN_V20251204 = SchemaEvolution(
74
+ ... columns={
75
+ ... "symbol": ColumnSpec(rename_to="ukey", parse_dtype=pl.Int64),
76
+ ... "fillQty": ColumnSpec(
77
+ ... rename_to="order_filled_qty",
78
+ ... parse_dtype=pl.Float64,
79
+ ... cast_dtype=pl.Int64,
80
+ ... ),
81
+ ... },
82
+ ... null_values=["", "NA", "null"],
83
+ ... drop=["#HFTORD"],
84
+ ... )
85
+ """
86
+
87
+ columns: dict[str, ColumnSpec] = field(default_factory=dict)
88
+ null_values: list[str] = field(default_factory=lambda: ["", "NA", "null"])
89
+ drop: list[str] = field(default_factory=list)
90
+ parent: SchemaEvolution | None = None
91
+
92
+ def get_schema_overrides(self) -> dict[str, Any]:
93
+ """Get schema_overrides dict for pl.scan_csv().
94
+
95
+ Returns:
96
+ Mapping from original column name to Polars dtype.
97
+ """
98
+ result = {}
99
+ if self.parent:
100
+ result.update(self.parent.get_schema_overrides())
101
+ for col_name, spec in self.columns.items():
102
+ if spec.parse_dtype is not None:
103
+ result[col_name] = spec.parse_dtype
104
+ return result
105
+
106
+ def get_rename_map(self) -> dict[str, str]:
107
+ """Get rename mapping dict for df.rename().
108
+
109
+ Returns:
110
+ Mapping from original column name to new name.
111
+ """
112
+ result = {}
113
+ if self.parent:
114
+ result.update(self.parent.get_rename_map())
115
+ for col_name, spec in self.columns.items():
116
+ if spec.rename_to is not None:
117
+ result[col_name] = spec.rename_to
118
+ return result
119
+
120
+ def get_cast_map(self) -> dict[str, Any]:
121
+ """Get post-load cast mapping dict.
122
+
123
+ Returns:
124
+ Mapping from FINAL column name (after rename) to cast dtype.
125
+ """
126
+ result = {}
127
+ if self.parent:
128
+ result.update(self.parent.get_cast_map())
129
+ for col_name, spec in self.columns.items():
130
+ if spec.cast_dtype is not None:
131
+ final_name = spec.rename_to or col_name
132
+ result[final_name] = spec.cast_dtype
133
+ return result
134
+
135
+ def get_drop_columns(self) -> set[str]:
136
+ """Get set of columns to drop.
137
+
138
+ Returns:
139
+ Set of original column names to exclude.
140
+ """
141
+ result = set()
142
+ if self.parent:
143
+ result.update(self.parent.get_drop_columns())
144
+ result.update(self.drop)
145
+ return result
146
+
147
+ def get_null_values(self) -> list[str]:
148
+ """Get list of null value strings.
149
+
150
+ Returns:
151
+ List of strings to treat as null at parse time.
152
+ """
153
+ return self.null_values
154
+
155
+ def validate(self) -> list[str]:
156
+ """Validate schema configuration.
157
+
158
+ Returns:
159
+ List of warnings about potential issues.
160
+ """
161
+ warnings = []
162
+ for col_name, spec in self.columns.items():
163
+ if spec.cast_dtype is not None and spec.parse_dtype is None:
164
+ warnings.append(
165
+ f"{col_name}: cast_dtype without parse_dtype may fail "
166
+ "if Polars infers wrong type"
167
+ )
168
+ return warnings
169
+
170
+
171
+ # =============================================================================
172
+ # YLIN Trade Format (v2025-12-04)
173
+ # =============================================================================
174
+
175
+ YLIN_V20251204 = SchemaEvolution(
176
+ columns={
177
+ # === Order columns (18) ===
178
+ "symbol": ColumnSpec(rename_to="ukey", parse_dtype=pl.Int64),
179
+ "orderId": ColumnSpec(rename_to="order_id", parse_dtype=pl.Int64),
180
+ "orderSide": ColumnSpec(rename_to="order_side", parse_dtype=pl.String),
181
+ "orderQty": ColumnSpec(
182
+ rename_to="order_qty",
183
+ parse_dtype=pl.Float64,
184
+ cast_dtype=pl.Int64,
185
+ ),
186
+ "orderPrice": ColumnSpec(rename_to="order_price", parse_dtype=pl.Float64),
187
+ "priceType": ColumnSpec(rename_to="order_price_type", parse_dtype=pl.String),
188
+ "fillQty": ColumnSpec(
189
+ rename_to="order_filled_qty",
190
+ parse_dtype=pl.Float64,
191
+ cast_dtype=pl.Int64,
192
+ ),
193
+ "fillPrice": ColumnSpec(rename_to="fill_price", parse_dtype=pl.Float64),
194
+ "lastExchangeTs": ColumnSpec(rename_to="update_exchange_ts", parse_dtype=pl.Int64),
195
+ "createdTs": ColumnSpec(rename_to="create_exchange_ts", parse_dtype=pl.Int64),
196
+ "localTs": ColumnSpec(rename_to="create_local_ts", parse_dtype=pl.Int64),
197
+ "qtyAhead": ColumnSpec(
198
+ rename_to="qty_ahead",
199
+ parse_dtype=pl.Float64,
200
+ cast_dtype=pl.Int64,
201
+ ),
202
+ "qtyBehind": ColumnSpec(
203
+ rename_to="qty_behind",
204
+ parse_dtype=pl.Float64,
205
+ cast_dtype=pl.Int64,
206
+ ),
207
+ "orderStatus": ColumnSpec(rename_to="order_curr_state", parse_dtype=pl.String),
208
+ "orderTposType": ColumnSpec(rename_to="order_tpos_type", parse_dtype=pl.String),
209
+ "alphaTs": ColumnSpec(rename_to="alpha_ts", parse_dtype=pl.Int64),
210
+ "event": ColumnSpec(rename_to="event_type", parse_dtype=pl.String),
211
+ "cumFilledNotional": ColumnSpec(
212
+ rename_to="order_filled_notional",
213
+ parse_dtype=pl.Float64,
214
+ ),
215
+ # === Quote columns (20) ===
216
+ "bid": ColumnSpec(rename_to="bid_px0", parse_dtype=pl.Float64),
217
+ "bid2": ColumnSpec(rename_to="bid_px1", parse_dtype=pl.Float64),
218
+ "bid3": ColumnSpec(rename_to="bid_px2", parse_dtype=pl.Float64),
219
+ "bid4": ColumnSpec(rename_to="bid_px3", parse_dtype=pl.Float64),
220
+ "bid5": ColumnSpec(rename_to="bid_px4", parse_dtype=pl.Float64),
221
+ "ask": ColumnSpec(rename_to="ask_px0", parse_dtype=pl.Float64),
222
+ "ask2": ColumnSpec(rename_to="ask_px1", parse_dtype=pl.Float64),
223
+ "ask3": ColumnSpec(rename_to="ask_px2", parse_dtype=pl.Float64),
224
+ "ask4": ColumnSpec(rename_to="ask_px3", parse_dtype=pl.Float64),
225
+ "ask5": ColumnSpec(rename_to="ask_px4", parse_dtype=pl.Float64),
226
+ "bsize": ColumnSpec(
227
+ rename_to="bid_size0",
228
+ parse_dtype=pl.Float64,
229
+ cast_dtype=pl.Int64,
230
+ ),
231
+ "bsize2": ColumnSpec(
232
+ rename_to="bid_size1",
233
+ parse_dtype=pl.Float64,
234
+ cast_dtype=pl.Int64,
235
+ ),
236
+ "bsize3": ColumnSpec(
237
+ rename_to="bid_size2",
238
+ parse_dtype=pl.Float64,
239
+ cast_dtype=pl.Int64,
240
+ ),
241
+ "bsize4": ColumnSpec(
242
+ rename_to="bid_size3",
243
+ parse_dtype=pl.Float64,
244
+ cast_dtype=pl.Int64,
245
+ ),
246
+ "bsize5": ColumnSpec(
247
+ rename_to="bid_size4",
248
+ parse_dtype=pl.Float64,
249
+ cast_dtype=pl.Int64,
250
+ ),
251
+ "asize": ColumnSpec(
252
+ rename_to="ask_size0",
253
+ parse_dtype=pl.Float64,
254
+ cast_dtype=pl.Int64,
255
+ ),
256
+ "asize2": ColumnSpec(
257
+ rename_to="ask_size1",
258
+ parse_dtype=pl.Float64,
259
+ cast_dtype=pl.Int64,
260
+ ),
261
+ "asize3": ColumnSpec(
262
+ rename_to="ask_size2",
263
+ parse_dtype=pl.Float64,
264
+ cast_dtype=pl.Int64,
265
+ ),
266
+ "asize4": ColumnSpec(
267
+ rename_to="ask_size3",
268
+ parse_dtype=pl.Float64,
269
+ cast_dtype=pl.Int64,
270
+ ),
271
+ "asize5": ColumnSpec(
272
+ rename_to="ask_size4",
273
+ parse_dtype=pl.Float64,
274
+ cast_dtype=pl.Int64,
275
+ ),
276
+ "isRebasedQuote": ColumnSpec(rename_to="is_rebased", parse_dtype=pl.String),
277
+ "quoteSeqNum": ColumnSpec(rename_to="seq_num", parse_dtype=pl.Int64),
278
+ "quoteTs": ColumnSpec(rename_to="timestamp", parse_dtype=pl.Int64),
279
+ # === Position columns (11) ===
280
+ "startPos": ColumnSpec(
281
+ rename_to="init_net_pos",
282
+ parse_dtype=pl.Float64,
283
+ cast_dtype=pl.Int64,
284
+ ),
285
+ "pos": ColumnSpec(
286
+ rename_to="current_net_pos",
287
+ parse_dtype=pl.Float64,
288
+ cast_dtype=pl.Int64,
289
+ ),
290
+ "realizedPos": ColumnSpec(
291
+ rename_to="current_realized_net_pos",
292
+ parse_dtype=pl.Float64,
293
+ cast_dtype=pl.Int64,
294
+ ),
295
+ "openBuyPos": ColumnSpec(
296
+ rename_to="open_buy",
297
+ parse_dtype=pl.Float64,
298
+ cast_dtype=pl.Int64,
299
+ ),
300
+ "openSellPos": ColumnSpec(
301
+ rename_to="open_sell",
302
+ parse_dtype=pl.Float64,
303
+ cast_dtype=pl.Int64,
304
+ ),
305
+ "cumBuy": ColumnSpec(
306
+ rename_to="cum_buy",
307
+ parse_dtype=pl.Float64,
308
+ cast_dtype=pl.Int64,
309
+ ),
310
+ "cumSell": ColumnSpec(
311
+ rename_to="cum_sell",
312
+ parse_dtype=pl.Float64,
313
+ cast_dtype=pl.Int64,
314
+ ),
315
+ "cashFlow": ColumnSpec(rename_to="cash_flow", parse_dtype=pl.Float64),
316
+ "frozenCash": ColumnSpec(rename_to="frozen_cash", parse_dtype=pl.Float64),
317
+ "globalCumBuyNotional": ColumnSpec(
318
+ rename_to="cum_buy_filled_notional",
319
+ parse_dtype=pl.Float64,
320
+ ),
321
+ "globalCumSellNotional": ColumnSpec(
322
+ rename_to="cum_sell_filled_notional",
323
+ parse_dtype=pl.Float64,
324
+ ),
325
+ },
326
+ null_values=["", "NA", "null", "NULL"],
327
+ drop=["#HFTORD"],
328
+ )
329
+
330
+
331
+ # =============================================================================
332
+ # JYAO Alpha Format (v2025-11-14)
333
+ # =============================================================================
334
+
335
+ JYAO_V20251114 = SchemaEvolution(
336
+ columns={
337
+ # Symbol column
338
+ "ukey": ColumnSpec(parse_dtype=pl.Int64), # No rename, just parse type
339
+ # Quote columns
340
+ "BidPrice1": ColumnSpec(rename_to="bid_px0", parse_dtype=pl.Float64),
341
+ "AskPrice1": ColumnSpec(rename_to="ask_px0", parse_dtype=pl.Float64),
342
+ "BidVolume1": ColumnSpec(
343
+ rename_to="bid_size0",
344
+ parse_dtype=pl.Float64,
345
+ cast_dtype=pl.Int64,
346
+ ),
347
+ "AskVolume1": ColumnSpec(
348
+ rename_to="ask_size0",
349
+ parse_dtype=pl.Float64,
350
+ cast_dtype=pl.Int64,
351
+ ),
352
+ # Time columns
353
+ "TimeStamp": ColumnSpec(rename_to="timestamp", parse_dtype=pl.Int64),
354
+ "GlobalExTime": ColumnSpec(rename_to="global_exchange_ts", parse_dtype=pl.Int64),
355
+ "DataDate": ColumnSpec(rename_to="data_date", parse_dtype=pl.String),
356
+ # Volume
357
+ "Volume": ColumnSpec(
358
+ rename_to="volume",
359
+ parse_dtype=pl.Float64,
360
+ cast_dtype=pl.Int64,
361
+ ),
362
+ # Predictor columns (x_* = alpha predictions)
363
+ # Rule: ≤60s → s, >60s → m
364
+ "x10s": ColumnSpec(rename_to="x_10s", parse_dtype=pl.Float64),
365
+ "x60s": ColumnSpec(rename_to="x_60s", parse_dtype=pl.Float64),
366
+ "alpha1": ColumnSpec(rename_to="x_3m", parse_dtype=pl.Float64),
367
+ "alpha2": ColumnSpec(rename_to="x_30m", parse_dtype=pl.Float64),
368
+ },
369
+ null_values=["", "NA"],
370
+ )
371
+
372
+
373
+ # =============================================================================
374
+ # Schema Registry
375
+ # =============================================================================
376
+
377
+ SCHEMAS: dict[str, SchemaEvolution] = {
378
+ "ylin_v20251204": YLIN_V20251204,
379
+ "jyao_v20251114": JYAO_V20251114,
380
+ }
381
+
382
+
383
+ def get_schema(name: str | None) -> SchemaEvolution | None:
384
+ """Get SchemaEvolution by name.
385
+
386
+ Args:
387
+ name: Schema name (e.g., "ylin_v20251204") or None.
388
+
389
+ Returns:
390
+ SchemaEvolution or None if name is None or not found.
391
+ """
392
+ if not name:
393
+ return None
394
+ return SCHEMAS.get(name.lower())
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vizflow
3
- Version: 0.5.1
3
+ Version: 0.5.2
4
4
  Requires-Python: >=3.9
5
5
  Requires-Dist: polars>=0.20.0
6
6
  Provides-Extra: dev
@@ -0,0 +1,10 @@
1
+ vizflow/__init__.py,sha256=nmZ9_4DkT6ndFefemioNGNw9ELWFCQsQASxBxLHoAZs,529
2
+ vizflow/config.py,sha256=y4vRvjVTa1H5AdQf0q_XhYr-3EBDJst8BJq52ODN3uk,6456
3
+ vizflow/io.py,sha256=eheqyLHGiSh69erxMk98FK-GYycbSheqkrIYRYGFy3A,7687
4
+ vizflow/market.py,sha256=MtNz_nnZxC66Aq-i2PXEwaFCTknijFWYZUUv6798k2s,2493
5
+ vizflow/ops.py,sha256=4UwxOTPhvZ1_4PI3pcxbXfLAYsn1Ecj6nyBtBBr7KS8,7761
6
+ vizflow/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ vizflow/schema_evolution.py,sha256=7ZgybN7aS6SWxteTnX_uXZWn-IfosIPzr42_f3BlFv8,13909
8
+ vizflow-0.5.2.dist-info/METADATA,sha256=DQdMKOm4yHQ4REnaOptw5avVjlaCpuXbQLFwHbH9_Gk,388
9
+ vizflow-0.5.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
10
+ vizflow-0.5.2.dist-info/RECORD,,
vizflow/presets.py DELETED
@@ -1,87 +0,0 @@
1
- """Column mapping presets for VizFlow."""
2
-
3
- # ylin's trade format (v2025-12-04)
4
- YLIN_V20251204 = {
5
- # Order columns (18)
6
- "symbol": "ukey",
7
- "orderId": "order_id",
8
- "orderSide": "order_side",
9
- "orderQty": "order_qty",
10
- "orderPrice": "order_price",
11
- "priceType": "order_price_type",
12
- "fillQty": "order_filled_qty",
13
- "fillPrice": "fill_price",
14
- "lastExchangeTs": "update_exchange_ts",
15
- "createdTs": "create_exchange_ts",
16
- "localTs": "create_local_ts",
17
- "qtyAhead": "qty_ahead",
18
- "qtyBehind": "qty_behind",
19
- "orderStatus": "order_curr_state",
20
- "orderTposType": "order_tpos_type",
21
- "alphaTs": "alpha_ts",
22
- "event": "event_type",
23
- "cumFilledNotional": "order_filled_notional",
24
- # Quote columns (15)
25
- "bid": "bid_px0",
26
- "bid2": "bid_px1",
27
- "bid3": "bid_px2",
28
- "bid4": "bid_px3",
29
- "bid5": "bid_px4",
30
- "ask": "ask_px0",
31
- "ask2": "ask_px1",
32
- "ask3": "ask_px2",
33
- "ask4": "ask_px3",
34
- "ask5": "ask_px4",
35
- "bsize": "bid_size0",
36
- "bsize2": "bid_size1",
37
- "bsize3": "bid_size2",
38
- "bsize4": "bid_size3",
39
- "bsize5": "bid_size4",
40
- "asize": "ask_size0",
41
- "asize2": "ask_size1",
42
- "asize3": "ask_size2",
43
- "asize4": "ask_size3",
44
- "asize5": "ask_size4",
45
- "isRebasedQuote": "is_rebased",
46
- "quoteSeqNum": "seq_num",
47
- "quoteTs": "timestamp",
48
- # Position columns (11)
49
- "startPos": "init_net_pos",
50
- "pos": "current_net_pos",
51
- "realizedPos": "current_realized_net_pos",
52
- "openBuyPos": "open_buy",
53
- "openSellPos": "open_sell",
54
- "cumBuy": "cum_buy",
55
- "cumSell": "cum_sell",
56
- "cashFlow": "cash_flow",
57
- "frozenCash": "frozen_cash",
58
- "globalCumBuyNotional": "cum_buy_filled_notional",
59
- "globalCumSellNotional": "cum_sell_filled_notional",
60
- }
61
-
62
- # jyao's alpha format (v2025-11-14)
63
- JYAO_V20251114 = {
64
- # Quote columns
65
- "BidPrice1": "bid_px0",
66
- "AskPrice1": "ask_px0",
67
- "BidVolume1": "bid_size0",
68
- "AskVolume1": "ask_size0",
69
- # Time columns
70
- "TimeStamp": "timestamp",
71
- "GlobalExTime": "global_exchange_ts",
72
- "DataDate": "data_date",
73
- # Volume
74
- "Volume": "volume",
75
- # Predictor columns (x_* = alpha predictions)
76
- # Rule: ≤60s → s, >60s → m
77
- "x10s": "x_10s",
78
- "x60s": "x_60s",
79
- "alpha1": "x_3m",
80
- "alpha2": "x_30m",
81
- }
82
-
83
- # Preset registry for dynamic lookup
84
- PRESETS: dict[str, dict[str, str]] = {
85
- "ylin_v20251204": YLIN_V20251204,
86
- "jyao_v20251114": JYAO_V20251114,
87
- }
@@ -1,10 +0,0 @@
1
- vizflow/__init__.py,sha256=EJ8qF4o2grf4aSochaasaaf0unyXV5yhrMs6rAhyp7k,496
2
- vizflow/config.py,sha256=JNW5-TshQ1v-Ft3-VV0JYJ5PdC3Yhgy4fW0AV0RWzkE,7322
3
- vizflow/io.py,sha256=ypiEuuPoHFKSt6VnhXcEI7u7dyVjKORunjqGpkFauXM,8877
4
- vizflow/market.py,sha256=MtNz_nnZxC66Aq-i2PXEwaFCTknijFWYZUUv6798k2s,2493
5
- vizflow/ops.py,sha256=4UwxOTPhvZ1_4PI3pcxbXfLAYsn1Ecj6nyBtBBr7KS8,7761
6
- vizflow/presets.py,sha256=h91NZoOH4YAx0bbsaNigECf9WOcWh1QZavguunWkaLE,2452
7
- vizflow/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
- vizflow-0.5.1.dist-info/METADATA,sha256=DHzwPBvYuj7Rc4BawcXD2juS5iR5UD1FnAxt3cgvpo4,388
9
- vizflow-0.5.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
10
- vizflow-0.5.1.dist-info/RECORD,,