litequant 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1379 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import datetime
4
+ import json
5
+ import os
6
+ import re
7
+ import shutil
8
+ import tempfile
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+ from typing import Callable, Iterable, List, Optional, Tuple
11
+
12
+ import pandas as pd
13
+ from tqdm import tqdm
14
+
15
+ from .exceptions import CategoryNotFoundError, LocalStorageError
16
+ from .log import GetLogger
17
+
18
+
19
def DFSaveToFile(df: pd.DataFrame, filepath: str, file_type: Optional[str] = None, debug: bool = False) -> None:
    """Atomically write a DataFrame to ``filepath``.

    The frame is serialised to a temporary file inside the destination
    directory and then moved into place with ``os.replace``, so a reader
    never sees a partially written target file.

    Args:
        df: Frame to persist.
        filepath: Destination path; missing parent directories are created.
        file_type: ``"pickle"`` or ``"parquet"`` (case-insensitive). When
            omitted it is taken from the extension of the file name.
        debug: Print a confirmation line after a successful write.

    Raises:
        LocalStorageError: If the format is unsupported or the write fails.
    """
    filepath = os.path.abspath(filepath)
    directory = os.path.dirname(filepath)
    os.makedirs(directory, exist_ok=True)

    if file_type is None:
        # Inspect only the file name, so a dot in a parent directory
        # (e.g. "v1.0/data") is never mistaken for an extension.
        _, dot, extension = os.path.basename(filepath).rpartition(".")
        file_type = extension if dot else ""
    # The inferred and the explicit value are normalised the same way.
    file_type = str(file_type).lower()

    if file_type not in ("pickle", "parquet"):
        # Reject before creating a temp file; callers see the same error
        # type as for any other failed write.
        raise LocalStorageError() from None

    fd, temp_filepath = tempfile.mkstemp(
        prefix=".tmp_",
        suffix=f".{file_type}",
        dir=directory,
    )
    # pandas reopens the file by path, so the raw descriptor is not needed.
    os.close(fd)

    try:
        if file_type == "pickle":
            df.to_pickle(temp_filepath)
        else:
            df.to_parquet(temp_filepath)

        os.replace(temp_filepath, filepath)

        if debug:
            print(f"File saved successfully: {filepath}")

    except Exception:
        # Never leave a stray temp file behind after a failed write.
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)
        raise LocalStorageError() from None
56
+
57
+
58
# Sub-directories of the manager's root directory, one per storage layout.
# Each category is stored in a folder of its own beneath the layout folder.
PIVOT_MONTHLY_SUBDIR = "data_pivot_monthly_partition"  # pivot data, one parquet file per month
PIVOT_DAILY_SUBDIR = "data_pivot_daily_partition"  # pivot data, one parquet file per day
UNSTACK_SUBDIR = "data_unstack"  # unstack data, a single parquet file per category
CATEGORY_META_SUBDIR = "category_metadata"  # one JSON metadata file per category
62
+
63
+
64
+ class ParquetDataManager:
65
def __init__(self, root_dir: str, max_workers: int = 8, logger=None):
    """Bind the manager to ``root_dir`` and make sure the directory exists.

    Args:
        root_dir: Base directory of the store; kept as an absolute path.
        max_workers: Worker count handed to the concurrent helpers.
        logger: Logger to use; a truthy value is kept as-is, otherwise the
            package logger ``litequant.parquet`` is requested.
    """
    absolute_root = os.path.abspath(root_dir)
    self.root_dir = absolute_root
    self.max_workers = max_workers
    self.logger = logger if logger else GetLogger("litequant.parquet")
    os.makedirs(absolute_root, exist_ok=True)
70
+
71
def _get_typed_category_path(self, category: str, category_type: str, raise_err: bool = False) -> str:
    """Return the directory of ``category`` under a specific storage layout.

    Args:
        category: Category name (used as the folder name).
        category_type: ``"pivot_monthly"``, ``"pivot_daily"`` or
            ``"unstack"``; any other value falls back to the monthly layout.
        raise_err: Raise instead of returning a path that does not exist.

    Raises:
        CategoryNotFoundError: If ``raise_err`` is set and the directory is
            missing.
    """
    if category_type == "pivot_daily":
        layout_dir = PIVOT_DAILY_SUBDIR
    elif category_type == "unstack":
        layout_dir = UNSTACK_SUBDIR
    else:
        # "pivot_monthly" and every unrecognised type share this layout.
        layout_dir = PIVOT_MONTHLY_SUBDIR

    target = os.path.join(self.root_dir, layout_dir, category)
    if not raise_err or os.path.exists(target):
        return target
    raise CategoryNotFoundError(f"Directory not exists: {target}")
82
+
83
def _get_category_path(self, category: str, raise_err: bool = False) -> str:
    """Locate the directory of ``category`` by probing each layout.

    The layouts are checked in the order daily, monthly, unstack and the
    first existing directory wins. When none exists the monthly path is
    returned, or ``CategoryNotFoundError`` is raised if ``raise_err`` is set.
    """
    probe_order = (PIVOT_DAILY_SUBDIR, PIVOT_MONTHLY_SUBDIR, UNSTACK_SUBDIR)
    candidates = (os.path.join(self.root_dir, layout, category) for layout in probe_order)
    found = next((path for path in candidates if os.path.exists(path)), None)
    if found is not None:
        return found

    fallback = os.path.join(self.root_dir, PIVOT_MONTHLY_SUBDIR, category)
    if raise_err and not os.path.exists(fallback):
        raise CategoryNotFoundError(f"Directory not exists: {fallback}")
    return fallback
93
+
94
def _get_all_categories(self) -> List[str]:
    """Return the sorted, de-duplicated names of all stored categories.

    Every sub-folder of the monthly, daily and unstack layout directories
    counts as a category. A category that exists under more than one layout
    is reported once.
    """
    names = set()
    for layout in (PIVOT_MONTHLY_SUBDIR, PIVOT_DAILY_SUBDIR, UNSTACK_SUBDIR):
        base = os.path.join(self.root_dir, layout)
        # isdir (not exists) so a stray file with the layout name is skipped
        # instead of making listdir fail.
        if not os.path.isdir(base):
            continue
        names.update(
            entry for entry in os.listdir(base)
            if os.path.isdir(os.path.join(base, entry))
        )
    return sorted(names)
104
+
105
def _if_exists_unstack_category(self, category: str) -> bool:
    """Return True when the unstack parquet file of ``category`` exists."""
    unstack_file = self._get_unstack_file_path(category)
    return os.path.exists(unstack_file)
107
+
108
def _if_exists_pivot_category(self, category: str) -> bool:
    """Return True when ``category`` has at least one pivot partition file.

    Both the monthly and the daily layout are inspected; a partition file is
    a ``.parquet`` file whose name ends in ``=YYYY-MM`` or ``=YYYY-MM-DD``.
    """
    partition_patterns = (
        r"=\d{4}-\d{2}\.parquet$",
        r"=\d{4}-\d{2}-\d{2}\.parquet$",
    )
    for layout in ("pivot_monthly", "pivot_daily"):
        folder = self._get_typed_category_path(category, layout, raise_err=False)
        if not os.path.exists(folder):
            continue
        for name in os.listdir(folder):
            if name.endswith(".parquet") and any(re.search(p, name) for p in partition_patterns):
                return True
    return False
123
+
124
+ @staticmethod
125
+ def ExtractYearMonth(input_string: str) -> Tuple[Optional[str], Optional[str]]:
126
+ match = re.search(r"=(\d{4})-(\d{1,2})\.parquet$", input_string)
127
+ if match:
128
+ year = match.group(1)
129
+ month = f"{int(match.group(2)):02d}"
130
+ return year, month
131
+ return None, None
132
+
133
+ @staticmethod
134
+ def ExtractYearMonthDay(input_string: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
135
+ match = re.search(r"=(?:\(')?(\d{4})-(\d{2})-(\d{2})(?:',\))?\.parquet$", input_string)
136
+ if match:
137
+ return match.group(1), match.group(2), match.group(3)
138
+ return None, None, None
139
+
140
def _get_pivot_year_month(self, category: str) -> List[Tuple[str, str]]:
    """List the ``(year, month)`` pairs of the monthly partitions on disk.

    Raises:
        CategoryNotFoundError: If the monthly directory does not exist.
    """
    folder = self._get_typed_category_path(category, "pivot_monthly", raise_err=True)
    parsed = (self.ExtractYearMonth(name) for name in os.listdir(folder))
    # Names that are not monthly partitions parse to (None, None).
    return sorted((year, month) for year, month in parsed if year is not None)
148
+
149
def _get_pivot_dates(self, category: str) -> List[str]:
    """List the ``YYYY-MM-DD`` dates of the daily partitions on disk.

    Raises:
        CategoryNotFoundError: If the daily directory does not exist.
    """
    folder = self._get_typed_category_path(category, "pivot_daily", raise_err=True)
    parsed = (self.ExtractYearMonthDay(name) for name in os.listdir(folder))
    # Names that are not daily partitions parse to (None, None, None).
    return sorted(f"{y}-{m}-{d}" for y, m, d in parsed if y is not None)
157
+
158
def _get_pivot_file_path(self, category: str, year: int, month: int) -> str:
    """Build the path of the monthly partition file for ``year``/``month``."""
    file_name = f"{category}={int(year):04d}-{int(month):02d}.parquet"
    folder = self._get_typed_category_path(category, "pivot_monthly")
    return os.path.join(folder, file_name)
161
+
162
def _get_pivot_daily_file_path(self, category: str, date_str: str) -> str:
    """Build the path of the daily partition file for ``date_str``."""
    file_name = f"{category}={date_str}.parquet"
    folder = self._get_typed_category_path(category, "pivot_daily")
    return os.path.join(folder, file_name)
165
+
166
def _get_unstack_file_path(self, category: str) -> str:
    """Build the path of the single parquet file of an unstack category."""
    file_name = f"{category}.parquet"
    folder = self._get_typed_category_path(category, "unstack")
    return os.path.join(folder, file_name)
169
+
170
+ @staticmethod
171
+ def _infer_category_type(category: str) -> str:
172
+ if "pivot#" in category:
173
+ return "pivot"
174
+ if "unstack#" in category:
175
+ return "unstack"
176
+ return "unknown"
177
+
178
+ @staticmethod
179
+ def _safe_meta_filename(category: str) -> str:
180
+ safe_name = re.sub(r'[<>:"/\\|?*]', "_", category)
181
+ return f"{safe_name}.json"
182
+
183
def _get_category_meta_dir(self) -> str:
    """Return the directory that holds the per-category metadata files."""
    meta_dir = os.path.join(self.root_dir, CATEGORY_META_SUBDIR)
    return meta_dir
185
+
186
def _get_category_meta_file_path(self, category: str) -> str:
    """Return the path of the JSON metadata file of ``category``."""
    file_name = self._safe_meta_filename(category)
    return os.path.join(self._get_category_meta_dir(), file_name)
188
+
189
+ @staticmethod
190
+ def _write_json_atomic(obj: dict, file_path: str) -> None:
191
+ directory = os.path.dirname(os.path.abspath(file_path))
192
+ os.makedirs(directory, exist_ok=True)
193
+
194
+ fd, temp_filepath = tempfile.mkstemp(
195
+ prefix=".tmp_",
196
+ suffix=".json",
197
+ dir=directory,
198
+ )
199
+ os.close(fd)
200
+
201
+ try:
202
+ with open(temp_filepath, "w", encoding="utf-8") as f:
203
+ json.dump(obj, f, ensure_ascii=False, sort_keys=True, indent=2)
204
+ f.write("\n")
205
+ os.replace(temp_filepath, file_path)
206
+ except Exception:
207
+ if os.path.exists(temp_filepath):
208
+ os.remove(temp_filepath)
209
+ raise LocalStorageError() from None
210
+
211
def get_category_meta(self, category: str, default: Optional[dict] = None) -> dict:
    """Load the metadata dictionary of ``category``.

    Args:
        category: Category name.
        default: Returned (as a shallow copy) when no metadata file exists;
            ``None`` means an empty dict.

    Raises:
        LocalStorageError: If the file cannot be read or parsed, or if its
            top-level JSON value is not an object.
    """
    meta_path = self._get_category_meta_file_path(category)
    if os.path.exists(meta_path):
        try:
            with open(meta_path, "r", encoding="utf-8") as handle:
                loaded = json.load(handle)
        except Exception:
            raise LocalStorageError() from None

        if isinstance(loaded, dict):
            return loaded
        raise LocalStorageError() from None

    return dict(default or {})
225
+
226
def set_category_meta(self, category: str, meta: Optional[dict], merge: bool = True) -> dict:
    """Persist metadata for ``category`` and return what was written.

    Args:
        category: Category name; always stored under the ``category`` key.
        meta: Fields to store; ``None`` is treated as an empty dict.
        merge: Layer ``meta`` over the stored metadata instead of replacing
            it.

    Returns:
        The stored dictionary. ``category_type`` is inferred from the name
        when absent, and ``partition_mode`` is normalised: pivot categories
        keep ``daily``/``monthly`` (default ``monthly``), unstack categories
        always get ``full``.

    Raises:
        LocalStorageError: If ``meta`` is not a dict or the write fails.
    """
    updates = {} if meta is None else meta
    if not isinstance(updates, dict):
        raise LocalStorageError() from None

    merged = dict(self.get_category_meta(category, default={})) if merge else {}
    merged.update(updates)
    merged["category"] = category
    if "category_type" not in merged:
        merged["category_type"] = self._infer_category_type(category)

    kind = merged["category_type"]
    if kind == "pivot":
        mode = self._first_meta_value(merged, ("partition_mode", "storage.partition_mode"))
        merged["partition_mode"] = mode if mode in ("daily", "monthly") else "monthly"
    elif kind == "unstack":
        # Whatever mode was stored, unstack data is kept as one full file.
        merged["partition_mode"] = "full"

    self._write_json_atomic(merged, self._get_category_meta_file_path(category))
    return merged
247
+
248
def ensure_category_meta(self, category: str, **fields) -> dict:
    """Merge the non-``None`` keyword fields into the stored metadata.

    Returns the dictionary written by ``set_category_meta``.
    """
    combined = dict(self.get_category_meta(category, default={}))
    for key, value in fields.items():
        # None means "no opinion": leave any stored value untouched.
        if value is not None:
            combined[key] = value
    return self.set_category_meta(category, combined, merge=True)
253
+
254
+ @staticmethod
255
+ def _get_nested_value(meta: dict, path: str, default=None):
256
+ current = meta
257
+ for part in path.split("."):
258
+ if not isinstance(current, dict) or part not in current:
259
+ return default
260
+ current = current[part]
261
+ return current
262
+
263
+ @staticmethod
264
+ def _first_meta_value(meta: dict, paths: Iterable[str], default=None):
265
+ for path in paths:
266
+ value = ParquetDataManager._get_nested_value(meta, path, default=None)
267
+ if value is not None:
268
+ return value
269
+ return default
270
+
271
def get_metadata_field(self, category: str, path: str, default=None):
    """Read one dot-path field from the stored metadata of ``category``."""
    stored = self.get_category_meta(category, default={})
    value = self._get_nested_value(stored, path, default=default)
    return value
274
+
275
def get_first_metadata_field(self, category: str, paths: Iterable[str], default=None):
    """Read the first non-``None`` field among ``paths`` from stored metadata."""
    stored = self.get_category_meta(category, default={})
    value = self._first_meta_value(stored, paths, default=default)
    return value
278
+
279
def _resolve_partition_mode(self, category: str, explicit_partition_mode: Optional[str] = None) -> Optional[str]:
    """Decide the partition mode of ``category``.

    A valid explicit mode wins; otherwise the stored metadata is consulted.
    Returns ``"daily"``, ``"monthly"``, ``"full"`` or ``None`` when neither
    source provides a valid mode.
    """
    valid_modes = ("daily", "monthly", "full")
    if explicit_partition_mode in valid_modes:
        return explicit_partition_mode

    stored = self._first_meta_value(
        self.get_category_meta(category, default={}),
        ("partition_mode", "storage.partition_mode"),
    )
    return stored if stored in valid_modes else None
287
+
288
def _resolve_calendar_category(self, category: str) -> Optional[str]:
    """Return the calendar category named in the metadata, if any.

    Several metadata locations are tried in order; a falsy result yields
    ``None``.
    """
    lookup_paths = (
        "calendar_category",
        "time.calendar_category",
        "market.calendar.calendar_category",
        "references.calendar_category",
    )
    stored = self.get_category_meta(category, default={})
    found = self._first_meta_value(stored, lookup_paths)
    if not found:
        return None
    return str(found)
300
+
301
def _resolve_duplicate_keys(self, category: str) -> List[str]:
    """Return the de-duplication key columns recorded in the metadata.

    Reads ``duplicate_keys`` (or ``schema_contract.primary_key``). Anything
    other than a list yields ``[]``; entries are stringified and stripped,
    and blank ones are dropped.
    """
    stored = self.get_category_meta(category, default={})
    keys = self._first_meta_value(
        stored,
        ("duplicate_keys", "schema_contract.primary_key"),
        default=[],
    )
    if not isinstance(keys, list):
        return []
    cleaned = (str(item).strip() for item in keys)
    return [text for text in cleaned if text]
314
+
315
def _resolve_date_column(self, category: str, df: Optional[pd.DataFrame] = None) -> Optional[str]:
    """Pick the name of the date column for ``category``.

    Order of preference:
      1. the column named in the metadata, if no frame is given or the
         frame contains it;
      2. for a non-empty frame, the first of ``trade_date``, ``cal_date``,
         ``date``, ``timestamp`` that is present, else its first column;
      3. the metadata value even though the frame lacks it, or ``None``.
    """
    stored = self.get_category_meta(category, default={})
    configured = self._first_meta_value(
        stored,
        ("date_column", "time.date_column", "time.timestamp_column"),
    )
    if configured and (df is None or str(configured) in df.columns):
        return str(configured)

    if df is not None and not df.empty:
        for candidate in ("trade_date", "cal_date", "date", "timestamp"):
            if candidate in df.columns:
                return candidate
        return df.columns[0]

    return str(configured) if configured else None
333
+
334
def _resolve_is_open_column(self, category: str, df: Optional[pd.DataFrame] = None) -> Optional[str]:
    """Pick the name of the "market is open" flag column for ``category``.

    The metadata value is used when no frame is given or the frame contains
    it; otherwise ``is_open`` is used if the frame has that column; otherwise
    the metadata value (or ``None``) is returned unchanged.
    """
    stored = self.get_category_meta(category, default={})
    configured = self._first_meta_value(
        stored,
        ("is_open_column", "market.calendar.is_open_column"),
    )
    if configured and (df is None or str(configured) in df.columns):
        return str(configured)

    frame_has_default = df is not None and "is_open" in df.columns
    if frame_has_default:
        return "is_open"

    return str(configured) if configured else None
348
+
349
+ @staticmethod
350
+ def _normalize_date_columns(df: pd.DataFrame) -> pd.DataFrame:
351
+ if df.empty:
352
+ return df.copy()
353
+ out = df.copy()
354
+ out.columns = pd.to_datetime(out.columns)
355
+ out.sort_index(axis=0, inplace=True)
356
+ out.sort_index(axis=1, inplace=True)
357
+ return out
358
+
359
+ @staticmethod
360
+ def _format_date_columns(df: pd.DataFrame, date_format: str = "%Y-%m-%d") -> pd.DataFrame:
361
+ if df.empty:
362
+ return df.copy()
363
+ out = df.copy()
364
+ out.columns = pd.to_datetime(out.columns).strftime(date_format)
365
+ return out
366
+
367
+ @staticmethod
368
+ def _drop_all_null_rows(df: pd.DataFrame) -> pd.DataFrame:
369
+ if df.empty:
370
+ return df
371
+ return df[df.notnull().sum(axis=1) > 0]
372
+
373
+ @staticmethod
374
+ def _drop_duplicate_index_columns(df: pd.DataFrame) -> pd.DataFrame:
375
+ if df.empty:
376
+ return df
377
+
378
+ out = df
379
+ if out.index.duplicated().any():
380
+ out = out[~out.index.duplicated(keep="last")]
381
+ if out.columns.duplicated().any():
382
+ out = out.loc[:, ~out.columns.duplicated(keep="last")]
383
+ return out
384
+
385
def update_pivot_category(
    self,
    df: pd.DataFrame,
    category: str,
    strf: str = "%Y-%m-%d",
    partition_mode: Optional[str] = None,
    replace_partition: bool = False,
) -> None:
    """Write a pivot frame into the per-day or per-month partition files.

    Args:
        df: Pivot frame whose column labels are parseable as dates.
        category: Category name.
        strf: Date format passed to ``UpdatePivotParquet`` for the column
            labels of the written files.
        partition_mode: ``"daily"`` or ``"monthly"``; when not valid, the
            stored metadata decides, falling back to ``"monthly"``.
        replace_partition: Passed to ``UpdatePivotParquet``; overwrite each
            touched partition instead of merging into it.
    """
    partition_mode = self._resolve_partition_mode(category, explicit_partition_mode=partition_mode)
    # Pivot data is only ever stored daily or monthly ("full"/None -> monthly).
    if partition_mode not in ("monthly", "daily"):
        partition_mode = "monthly"

    # Metadata is recorded even when there turns out to be nothing to write.
    self.ensure_category_meta(
        category,
        category_type="pivot",
        partition_mode=partition_mode,
    )

    if df is None or df.empty:
        self.logger.info("%s: input df is empty, skip update.", category)
        return

    # Work on a copy so the caller's frame keeps its original column labels.
    df = df.copy()
    df.columns = pd.to_datetime(df.columns)
    df = df.dropna(axis=1, how="all")

    if df.empty:
        self.logger.info("%s: all columns are empty after dropna, skip update.", category)
        return

    df = self._drop_duplicate_index_columns(df)

    # Build one task per partition: [file_path, sub_frame, strf, replace_partition].
    if partition_mode == "daily":
        os.makedirs(self._get_typed_category_path(category, "pivot_daily"), exist_ok=True)
        grouped = df.T.groupby([df.columns.strftime("%Y-%m-%d")])
        tasks = [
            [
                # The group key is accepted either as a 1-tuple or as a scalar.
                self._get_pivot_daily_file_path(category, date_key[0] if isinstance(date_key, tuple) else date_key),
                sub_df.T,
                strf,
                replace_partition,
            ]
            for date_key, sub_df in grouped
        ]
    else:
        os.makedirs(self._get_typed_category_path(category, "pivot_monthly"), exist_ok=True)
        grouped = df.T.groupby([df.columns.year, df.columns.month])
        tasks = [
            [self._get_pivot_file_path(category, int(year), int(month)), sub_df.T, strf, replace_partition]
            for (year, month), sub_df in grouped
        ]

    if not tasks:
        return

    # A single partition is written inline; several go through RunConcurrently.
    if len(tasks) == 1:
        ParquetDataManager.UpdatePivotParquet(*tasks[0])
    else:
        self.RunConcurrently(
            tasks=tasks,
            worker_function=ParquetDataManager.UpdatePivotParquet,
            max_workers=self.max_workers,
            task_desc=f"Updating {category}",
        )
449
+
450
@staticmethod
def UpdatePivotParquet(
    file_path: str,
    sub_df: pd.DataFrame,
    date_format: str = "%Y-%m-%d",
    replace_partition: bool = False,
) -> None:
    """Create, replace or merge-update one pivot partition file.

    Args:
        file_path: Path of the partition parquet file.
        sub_df: New data for this partition; column labels must be
            parseable as dates.
        date_format: Format of the column labels in the written file.
        replace_partition: Overwrite the file with ``sub_df`` instead of
            merging; if ``sub_df`` has no non-null row the existing file is
            deleted.

    Raises:
        LocalStorageError: If normalising the input, reading the existing
            file, or writing the result fails.
    """
    if sub_df is None or sub_df.empty:
        return

    sub_df = sub_df.copy()
    sub_df = ParquetDataManager._drop_all_null_rows(sub_df)
    if sub_df.empty:
        # Nothing but nulls: in replace mode that means the partition goes away.
        if replace_partition and os.path.exists(file_path):
            os.remove(file_path)
        return

    try:
        sub_df = ParquetDataManager._normalize_date_columns(sub_df)
        sub_df = ParquetDataManager._drop_duplicate_index_columns(sub_df)
    except Exception:
        raise LocalStorageError() from None

    if replace_partition:
        write_df = ParquetDataManager._format_date_columns(sub_df, date_format=date_format)
        DFSaveToFile(write_df, file_path, debug=False)
        return

    # No file yet: the new data is the whole partition.
    if not os.path.exists(file_path):
        write_df = ParquetDataManager._format_date_columns(sub_df, date_format=date_format)
        DFSaveToFile(write_df, file_path, debug=False)
        return

    try:
        existing_df = pd.read_parquet(file_path)
        existing_df = ParquetDataManager._normalize_date_columns(existing_df)
        existing_df = ParquetDataManager._drop_duplicate_index_columns(existing_df)
    except Exception:
        raise LocalStorageError() from None

    old_index = existing_df.index
    old_columns = existing_df.columns

    # Align both frames on the union of their row and column labels.
    common_index = old_index.union(sub_df.index)
    common_columns = old_columns.union(sub_df.columns)

    existing_df = existing_df.reindex(index=common_index, columns=common_columns)
    sub_df = sub_df.reindex(index=common_index, columns=common_columns)

    has_new_index = len(common_index.difference(old_index)) > 0
    has_new_columns = len(common_columns.difference(old_columns)) > 0

    # Only non-null incoming cells are candidates for overwriting.
    candidate_mask = sub_df.notna()
    candidate_count = int(candidate_mask.to_numpy().sum())

    if candidate_count == 0 and not has_new_index and not has_new_columns:
        return

    # Candidate cells that would actually change the stored value.
    diff_mask = candidate_mask & (existing_df.isna() | existing_df.ne(sub_df))
    diff_count = int(diff_mask.to_numpy().sum())

    # Skip the rewrite when the file already holds exactly this data.
    if diff_count == 0 and not has_new_index and not has_new_columns:
        return

    with pd.option_context("future.no_silent_downcasting", True):
        # Keep the existing value except where the new frame has a value.
        merged_df = existing_df.where(~candidate_mask, sub_df)
        merged_df = merged_df.infer_objects(copy=False)
    merged_df = ParquetDataManager._drop_all_null_rows(merged_df)
    merged_df = ParquetDataManager._drop_duplicate_index_columns(merged_df)
    merged_df.sort_index(axis=0, inplace=True)
    merged_df.sort_index(axis=1, inplace=True)

    write_df = ParquetDataManager._format_date_columns(merged_df, date_format=date_format)
    DFSaveToFile(write_df, file_path, debug=False)
524
+
525
+ @staticmethod
526
+ def _normalize_date_strings(values: Iterable) -> List[str]:
527
+ if values is None:
528
+ return []
529
+ dates = pd.to_datetime(list(values), errors="coerce")
530
+ if len(dates) == 0:
531
+ return []
532
+ valid_dates = pd.Series(dates).dropna()
533
+ if valid_dates.empty:
534
+ return []
535
+ return sorted(valid_dates.dt.strftime("%Y-%m-%d").unique().tolist())
536
+
537
+ @staticmethod
538
+ def _filter_open_dates(df: pd.DataFrame, is_open_col: Optional[str]) -> pd.DataFrame:
539
+ if not is_open_col or is_open_col not in df.columns:
540
+ return df
541
+ open_flag = df[is_open_col]
542
+ if pd.api.types.is_numeric_dtype(open_flag):
543
+ return df[open_flag.fillna(0).astype(int) == 1]
544
+ return df[open_flag.astype(str).str.strip().isin(("1", "True", "true", "\u4ea4\u6613"))]
545
+
546
+ @staticmethod
547
+ def _us_juneteenth_observed_date(year: int) -> datetime.date:
548
+ holiday = datetime.date(year, 6, 19)
549
+ if holiday.weekday() == 5:
550
+ return datetime.date(year, 6, 18)
551
+ if holiday.weekday() == 6:
552
+ return datetime.date(year, 6, 20)
553
+ return holiday
554
+
555
+ @staticmethod
556
+ def _filter_stock_calendar_dates(calendar_category: str, dates: List[str]) -> List[str]:
557
+ if not dates or "_stock_unstack#trade_date" not in calendar_category:
558
+ return dates
559
+
560
+ result = []
561
+ for date_str in dates:
562
+ try:
563
+ date_obj = datetime.datetime.strptime(date_str, "%Y-%m-%d").date()
564
+ except ValueError:
565
+ continue
566
+ if date_obj.weekday() >= 5:
567
+ continue
568
+ if (
569
+ calendar_category.startswith("us_stock_")
570
+ and date_obj.year >= 2022
571
+ and date_obj == ParquetDataManager._us_juneteenth_observed_date(date_obj.year)
572
+ ):
573
+ continue
574
+ result.append(date_str)
575
+ return result
576
+
577
def get_calendar_trade_dates(self, calendar_category: str) -> List[str]:
    """Load the trading dates stored in an unstack calendar category.

    Returns:
        Sorted unique ``YYYY-MM-DD`` strings of the rows flagged as open,
        after the stock-calendar filter; ``[]`` for an empty file.

    Raises:
        LocalStorageError: If the calendar file is missing or unreadable.
    """
    calendar_file = self._get_unstack_file_path(calendar_category)
    if not os.path.exists(calendar_file):
        raise LocalStorageError() from None

    try:
        calendar_df = pd.read_parquet(calendar_file)
    except Exception:
        raise LocalStorageError() from None

    if calendar_df.empty:
        return []

    date_col = self._resolve_date_column(calendar_category, df=calendar_df)
    if not date_col or date_col not in calendar_df.columns:
        # Fall back to a conventional name, else to the first column.
        known = [col for col in ("trade_date", "cal_date", "date") if col in calendar_df.columns]
        date_col = known[0] if known else calendar_df.columns[0]

    is_open_col = self._resolve_is_open_column(calendar_category, df=calendar_df)
    open_rows = self._filter_open_dates(calendar_df, is_open_col)
    normalized = self._normalize_date_strings(open_rows[date_col])
    return self._filter_stock_calendar_dates(calendar_category, normalized)
601
+
602
def _get_expected_trade_dates(
    self,
    category: str,
    start_date: Optional[str],
    end_date: Optional[str],
) -> Tuple[Optional[str], List[str]]:
    """Return the calendar trade dates expected for ``category`` in a range.

    Returns:
        ``(calendar_category, dates)``; ``(None, [])`` when the category has
        no calendar configured. Falsy bounds leave that side open; both
        bounds are inclusive.
    """
    calendar_category = self._resolve_calendar_category(category)
    if not calendar_category:
        return None, []

    calendar_name = str(calendar_category)
    trade_dates = self.get_calendar_trade_dates(calendar_name)
    if start_date:
        lower = pd.to_datetime(start_date).strftime("%Y-%m-%d")
        trade_dates = [day for day in trade_dates if day >= lower]
    if end_date:
        upper = pd.to_datetime(end_date).strftime("%Y-%m-%d")
        trade_dates = [day for day in trade_dates if day <= upper]
    return calendar_name, trade_dates
620
+
621
def _missing_trade_dates_from_actual(
    self,
    category: str,
    start_date: Optional[str],
    end_date: Optional[str],
    actual_dates: Iterable,
) -> Tuple[Optional[str], List[str]]:
    """Compare the dates actually present with the calendar's expectation.

    Returns:
        ``(calendar_category, missing_dates)`` where ``missing_dates`` keeps
        the calendar's order; the list is empty when no calendar is
        configured or nothing is expected in the range.
    """
    calendar_category, expected = self._get_expected_trade_dates(category, start_date, end_date)
    if not (calendar_category and expected):
        return calendar_category, []

    present = set(self._normalize_date_strings(actual_dates))
    return calendar_category, [day for day in expected if day not in present]
635
+
636
def _raise_if_pivot_missing_trade_dates(
    self,
    category: str,
    start_date: Optional[str],
    end_date: Optional[str],
    actual_dates: Iterable,
) -> None:
    """Raise ``LocalStorageError`` if any expected trade date is absent."""
    outcome = self._missing_trade_dates_from_actual(
        category=category,
        start_date=start_date,
        end_date=end_date,
        actual_dates=actual_dates,
    )
    missing = outcome[1]
    if missing:
        raise LocalStorageError() from None
651
+
652
def check_pivot_missing_trade_dates(
    self,
    category: str,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
) -> List[str]:
    """List the calendar trade dates that the stored pivot data lacks.

    The data is read without calendar validation and without raising for a
    missing category. With no data at all, the whole expected range is
    reported, but only when both bounds were given; otherwise ``[]``. Open
    bounds default to the first/last date actually present.
    """
    df = self.read_pivot_category(
        category=category,
        start_date=start_date,
        end_date=end_date,
        raise_err=False,
        validate_calendar=False,
        show_progress=False,
    )

    if df.empty:
        if not (start_date and end_date):
            return []
        return self._missing_trade_dates_from_actual(
            category=category,
            start_date=start_date,
            end_date=end_date,
            actual_dates=[],
        )[1]

    actual_dates = self._normalize_date_strings(df.columns)
    first_present = actual_dates[0] if actual_dates else None
    last_present = actual_dates[-1] if actual_dates else None
    _, missing = self._missing_trade_dates_from_actual(
        category=category,
        start_date=start_date or first_present,
        end_date=end_date or last_present,
        actual_dates=actual_dates,
    )
    return missing
687
+
688
def read_pivot_category(
    self,
    category: str,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    column_type: str = "str",
    raise_err: bool = True,
    partition_mode: Optional[str] = None,
    show_progress: bool = True,
    validate_calendar: bool = True,
) -> pd.DataFrame:
    """Read the partitions of a pivot category that overlap a date range.

    Args:
        category: Category name.
        start_date: Inclusive lower bound, any format ``pd.to_datetime``
            accepts. Defaults to the first daily partition, or
            ``"2000-01-01"`` in monthly mode.
        end_date: Inclusive upper bound. Defaults to the last partition on
            disk (month end in monthly mode).
        column_type: ``"datetime"`` for datetime column labels; anything
            else yields ``YYYY-MM-DD`` strings.
        raise_err: Raise ``CategoryNotFoundError`` when the partition
            directory does not exist (otherwise return an empty frame).
        partition_mode: Explicit mode; when not given, metadata and then
            the presence of the daily directory decide.
        show_progress: Give the concurrent reader a progress description.
        validate_calendar: Check the result against the category's trading
            calendar.

    Returns:
        The concatenated frame restricted to the range, sorted on both
        axes; an empty frame when nothing is stored for the range.

    Raises:
        CategoryNotFoundError: See ``raise_err``.
        LocalStorageError: If calendar validation finds missing dates.
    """
    requested_start_date = start_date
    requested_end_date = end_date
    partition_mode = self._resolve_partition_mode(category, explicit_partition_mode=partition_mode)

    # auto detect partition_mode by subdir existence
    if partition_mode is None:
        if os.path.exists(self._get_typed_category_path(category, "pivot_daily", raise_err=False)):
            partition_mode = "daily"
        else:
            partition_mode = "monthly"

    if partition_mode == "daily":
        directory = self._get_typed_category_path(category, "pivot_daily", raise_err=False)
        if not os.path.exists(directory):
            if validate_calendar:
                validation_start = requested_start_date or start_date
                validation_end = requested_end_date or end_date
                self._raise_if_pivot_missing_trade_dates(category, validation_start, validation_end, [])
            if raise_err:
                raise CategoryNotFoundError() from None
            return pd.DataFrame()

        exists_parquets = [f for f in os.listdir(directory) if f.endswith(".parquet")]
        # Map "YYYY-MM-DD" -> file name, since names may carry a wrapped date.
        part_file_map = {}
        for name in exists_parquets:
            y, m, d = self.ExtractYearMonthDay(name)
            if y is not None:
                part_file_map[f"{y}-{m}-{d}"] = name
        exists_parts = sorted(part_file_map)

        if not exists_parts:
            self.logger.info("数据不存在 %s", category)
            return pd.DataFrame()

        if start_date is None:
            start_date = exists_parts[0]
        if end_date is None:
            end_date = exists_parts[-1]

        # Partition keys are ISO "YYYY-MM-DD" strings and are compared as
        # text, so the bounds must be in the same form (e.g. "20240101").
        start_date = pd.to_datetime(start_date).strftime("%Y-%m-%d")
        end_date = pd.to_datetime(end_date).strftime("%Y-%m-%d")

        read_parts = [p for p in exists_parts if start_date <= p <= end_date]

        if not read_parts:
            self.logger.info("数据不存在 %s: [%s, %s]", category, start_date, end_date)
            if validate_calendar:
                validation_start = requested_start_date or start_date
                validation_end = requested_end_date or end_date
                self._raise_if_pivot_missing_trade_dates(category, validation_start, validation_end, [])
            return pd.DataFrame()

        tasks = [[os.path.join(directory, part_file_map[p])] for p in read_parts]
    else:
        directory = self._get_typed_category_path(category, "pivot_monthly", raise_err=False)
        if not os.path.exists(directory):
            if validate_calendar:
                validation_start = requested_start_date or start_date
                validation_end = requested_end_date or end_date
                self._raise_if_pivot_missing_trade_dates(category, validation_start, validation_end, [])
            if raise_err:
                raise CategoryNotFoundError() from None
            return pd.DataFrame()

        exists_parquets = [f for f in os.listdir(directory) if f.endswith(".parquet")]
        date_pattern = re.compile(r"(\d{4})-(\d{2})\.parquet$")
        exists_ms = []
        for name in exists_parquets:
            match = date_pattern.search(name)
            if match:
                exists_ms.append(f"{match.group(1)}-{match.group(2)}")
        exists_ms = sorted(set(exists_ms))

        if not exists_ms:
            self.logger.info("数据不存在 %s", category)
            return pd.DataFrame()

        if start_date is None:
            start_date = "2000-01-01"
        if end_date is None:
            # Last day of the newest month on disk.
            end_date = (pd.Timestamp(exists_ms[-1]) + pd.offsets.MonthEnd(0)).strftime("%Y-%m-%d")

        # Every month touched by the range, intersected with what exists.
        start_month = pd.to_datetime(start_date).replace(day=1)
        end_month = pd.to_datetime(end_date).replace(day=1)
        target_ms = pd.date_range(start=start_month, end=end_month, freq="MS").strftime("%Y-%m").tolist()
        read_ms = sorted(set(target_ms) & set(exists_ms))

        if not read_ms:
            self.logger.info("数据不存在 %s: [%s, %s]", category, start_date, end_date)
            if validate_calendar:
                validation_start = requested_start_date or start_date
                validation_end = requested_end_date or end_date
                self._raise_if_pivot_missing_trade_dates(category, validation_start, validation_end, [])
            return pd.DataFrame()

        tasks = [[os.path.join(directory, f"{category}={ms}.parquet")] for ms in read_ms]

    results = self.RunConcurrently(
        tasks=tasks,
        worker_function=ParquetDataManager.ReadParquet,
        max_workers=self.max_workers,
        task_desc=f"Reading {category}" if show_progress else None,
    )

    valid_results = [x for x in results if isinstance(x, pd.DataFrame) and not x.empty]
    if not valid_results:
        if validate_calendar:
            validation_start = requested_start_date or start_date
            validation_end = requested_end_date or end_date
            self._raise_if_pivot_missing_trade_dates(category, validation_start, validation_end, [])
        return pd.DataFrame()

    # Partitions hold disjoint date columns, so they are joined side by side.
    dfs = pd.concat(valid_results, axis=1)
    dfs = self._drop_duplicate_index_columns(dfs)

    if column_type == "datetime":
        dfs.columns = pd.to_datetime(dfs.columns)
        dfs.sort_index(axis=0, inplace=True)
        dfs.sort_index(axis=1, inplace=True)
        start = pd.to_datetime(start_date)
        end = pd.to_datetime(end_date)
        result_df = dfs.loc[:, (dfs.columns >= start) & (dfs.columns <= end)]
        if validate_calendar:
            # Open bounds are validated against the dates actually returned.
            actual_dates = self._normalize_date_strings(result_df.columns)
            validation_start = requested_start_date or (actual_dates[0] if actual_dates else start_date)
            validation_end = requested_end_date or (actual_dates[-1] if actual_dates else end_date)
            self._raise_if_pivot_missing_trade_dates(category, validation_start, validation_end, result_df.columns)
        return result_df

    dfs.columns = pd.to_datetime(dfs.columns).strftime("%Y-%m-%d")
    dfs.sort_index(axis=0, inplace=True)
    dfs.sort_index(axis=1, inplace=True)

    start = pd.to_datetime(start_date).strftime("%Y-%m-%d")
    end = pd.to_datetime(end_date).strftime("%Y-%m-%d")
    result_df = dfs.loc[:, (dfs.columns >= start) & (dfs.columns <= end)]
    if validate_calendar:
        # Open bounds are validated against the dates actually returned.
        actual_dates = self._normalize_date_strings(result_df.columns)
        validation_start = requested_start_date or (actual_dates[0] if actual_dates else start_date)
        validation_end = requested_end_date or (actual_dates[-1] if actual_dates else end_date)
        self._raise_if_pivot_missing_trade_dates(category, validation_start, validation_end, result_df.columns)
    return result_df
838
+
839
def ReadPivotCategoryLastNDays(
    self,
    category: str,
    n_days: int = 20,
    column_type: str = "datetime",
    raise_err: bool = True,
) -> pd.DataFrame:
    """Return the last ``n_days`` date columns of a pivot category.

    A window of ``n_days * 3`` calendar days ending at the category's latest
    date is read, then trimmed to its trailing ``n_days`` columns. An empty
    frame is returned when the category has no latest date or no data.
    """
    latest_date = self.GetCategoryLatestDate(category)
    if latest_date is None:
        return pd.DataFrame()

    latest_ts = pd.Timestamp(latest_date)
    # Over-read so non-trading days still leave enough columns.
    window_start = latest_ts - pd.Timedelta(days=n_days * 3)
    df = self.read_pivot_category(
        category=category,
        start_date=window_start.strftime("%Y-%m-%d"),
        end_date=latest_ts.strftime("%Y-%m-%d"),
        column_type=column_type,
        raise_err=raise_err,
    )
    return df if df.empty else df.iloc[:, -n_days:]
861
+
862
@staticmethod
def ReadParquet(file_path: str) -> pd.DataFrame:
    """Load the Parquet file at ``file_path`` with ``pd.read_parquet`` and return it."""
    frame = pd.read_parquet(file_path)
    return frame
865
+
866
def update_unstack_category(
    self,
    category: str,
    sub_df: pd.DataFrame,
    duplicate_keys: Optional[Iterable[str]] = None,
    key_columns: Optional[Iterable[str]] = None,
) -> None:
    """
    Create or update the single Parquet file backing an unstack category.

    Args:
        category: Unstack category name.
        sub_df: New rows to store; ``None`` or an empty frame skips the write.
        duplicate_keys: De-duplication key columns (takes precedence).
        key_columns: Alternative spelling used when ``duplicate_keys`` is None.

    The category metadata is ensured before the emptiness check, so it is
    written even when there is nothing to store.
    """
    # Key precedence: duplicate_keys, then key_columns, then whatever
    # self._resolve_duplicate_keys(category) returns.
    dedup_keys = duplicate_keys if duplicate_keys is not None else key_columns
    if dedup_keys is None:
        dedup_keys = self._resolve_duplicate_keys(category)

    resolved_key_columns = None
    if sub_df is not None:
        resolved_key_columns = self.ResolveUnstackKeyColumns(sub_df, key_columns=dedup_keys)

    self.ensure_category_meta(
        category,
        category_type="unstack",
        partition_mode="full",
        duplicate_keys=resolved_key_columns,
    )

    if sub_df is None or sub_df.empty:
        self.logger.info("%s: sub_df is empty, skip update.", category)
        return

    os.makedirs(self._get_typed_category_path(category, "unstack"), exist_ok=True)
    target = self._get_unstack_file_path(category)
    incoming = self.NormalizeUnstackDf(sub_df, key_columns=resolved_key_columns)

    # First write for this category: store the normalized frame as-is.
    if not os.path.exists(target):
        DFSaveToFile(incoming, target, debug=False)
        self.logger.info("%s: 初始化 Unstack Shape=%s", target, incoming.shape)
        return

    stored = pd.read_parquet(target)
    if stored.equals(incoming):
        self.logger.info("%s: 无需更新,DataFrames identical.", target)
        return

    merged = self.UpdateUnstackDf(stored, incoming, duplicate_keys=resolved_key_columns)
    DFSaveToFile(merged, target, debug=False)
    self.logger.info("%s: 更新 Unstack,Pre=%s After=%s", target, stored.shape, merged.shape)
910
+
911
def read_unstack_category(self, category: str, raise_err: bool = True) -> pd.DataFrame:
    """
    Read the Parquet file of an unstack category.

    Args:
        category: Unstack category name.
        raise_err: When True, any read error propagates to the caller.
            When False, the error is swallowed, a warning is logged and an
            empty DataFrame is returned instead.
    """
    target = self._get_unstack_file_path(category)
    try:
        return pd.read_parquet(target)
    except Exception:
        if raise_err:
            raise
        self.logger.warning("数据暂时不可用,请稍后重试")
        return pd.DataFrame()
921
+
922
+ @staticmethod
923
+ def ResolveUnstackKeyColumns(
924
+ df: pd.DataFrame,
925
+ key_columns: Optional[Iterable[str]] = None,
926
+ ) -> List[str]:
927
+ if key_columns is None:
928
+ columns_name = df.columns.name
929
+ if columns_name is not None and str(columns_name).startswith("keys:"):
930
+ key_part = str(columns_name)[5:]
931
+ key_columns = [x.strip() for x in key_part.split(",") if x.strip()]
932
+
933
+ if key_columns is None:
934
+ return list(df.columns)
935
+
936
+ if isinstance(key_columns, str):
937
+ key_columns = [x.strip() for x in key_columns.split(",") if x.strip()]
938
+ else:
939
+ key_columns = [str(x).strip() for x in key_columns if str(x).strip()]
940
+
941
+ missing = [col for col in key_columns if col not in df.columns]
942
+ if missing:
943
+ raise LocalStorageError()
944
+
945
+ return key_columns
946
+
947
+ @staticmethod
948
+ def NormalizeUnstackDf(
949
+ df: pd.DataFrame,
950
+ key_columns: Optional[Iterable[str]] = None,
951
+ ) -> pd.DataFrame:
952
+ if df is None or df.empty:
953
+ return df
954
+
955
+ out = df.copy()
956
+ resolved_keys = ParquetDataManager.ResolveUnstackKeyColumns(out, key_columns=key_columns)
957
+ if resolved_keys:
958
+ out = out.drop_duplicates(subset=resolved_keys, keep="last")
959
+ else:
960
+ out = out.drop_duplicates(keep="last")
961
+ return out
962
+
963
@staticmethod
def UpdateUnstackDf(
    original_df: pd.DataFrame,
    new_data: pd.DataFrame,
    duplicate_keys: Optional[Iterable[str]] = None,
    key_columns: Optional[Iterable[str]] = None,
) -> pd.DataFrame:
    """
    Merge new unstack rows into existing ones, de-duplicating on key columns.

    The two frames are concatenated row-wise (original first) and the last
    occurrence of each key is kept, so rows from ``new_data`` win.

    How the de-duplication keys are chosen:
        - ``duplicate_keys`` if given, else ``key_columns`` if given;
        - else the concatenated frame's ``columns.name`` when it starts
          with ``"keys:"`` (for example ``"keys:code,date"``);
        - else all columns.
    """
    combined = pd.concat([original_df, new_data], axis=0)

    requested = key_columns if duplicate_keys is None else duplicate_keys
    dedup_keys = ParquetDataManager.ResolveUnstackKeyColumns(combined, key_columns=requested)

    # subset=None makes drop_duplicates compare entire rows.
    subset = dedup_keys if len(dedup_keys) > 0 else None
    return combined.drop_duplicates(subset=subset, keep="last")
990
+
991
+ @staticmethod
992
+ def RunConcurrently(
993
+ tasks: List[list],
994
+ worker_function: Callable,
995
+ max_workers: int = 5,
996
+ task_desc: Optional[str] = "Processing tasks",
997
+ fail_fast: bool = True,
998
+ ) -> List:
999
+ if tasks is None or len(tasks) == 0:
1000
+ return []
1001
+
1002
+ results = [None] * len(tasks)
1003
+ errors = []
1004
+
1005
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
1006
+ future_to_idx = {
1007
+ executor.submit(worker_function, *task): idx
1008
+ for idx, task in enumerate(tasks)
1009
+ }
1010
+
1011
+ with tqdm(total=len(future_to_idx), desc=task_desc, disable=task_desc is None) as pbar:
1012
+ for future in as_completed(future_to_idx):
1013
+ idx = future_to_idx[future]
1014
+ try:
1015
+ results[idx] = future.result()
1016
+ except Exception as e:
1017
+ errors.append((idx, e))
1018
+ if fail_fast:
1019
+ raise
1020
+ finally:
1021
+ pbar.update(1)
1022
+
1023
+ if errors and fail_fast:
1024
+ raise errors[0][1]
1025
+
1026
+ return results
1027
+
1028
def GetCategoryLatestDate(self, category: str, partition_mode: Optional[str] = None):
    """
    Return the newest date column stored for a pivot category.

    Partitions are scanned from newest to oldest. The first partition file
    that exists and holds data decides the result: the maximum of its
    columns parsed as datetimes.

    Args:
        category: Pivot category name.
        partition_mode: Optional explicit mode, passed to
            ``_resolve_partition_mode``. If that returns None, the mode is
            "daily" when the daily subdirectory exists, else "monthly".

    Returns:
        The latest date, or None when the category does not exist, has no
        partitions, or no partition file yields data.
    """
    if not self._if_exists_pivot_category(category):
        return None

    mode = self._resolve_partition_mode(category, explicit_partition_mode=partition_mode)
    if mode is None:
        # Auto-detect from the presence of the daily partition subdirectory.
        daily_dir = self._get_typed_category_path(category, "pivot_daily", raise_err=False)
        mode = "daily" if os.path.exists(daily_dir) else "monthly"

    # Build a lazy newest-first stream of candidate partition files.
    if mode == "daily":
        partitions = self._get_pivot_dates(category)
        if len(partitions) == 0:
            return None
        candidate_paths = (
            self._get_pivot_daily_file_path(category, day)
            for day in reversed(partitions)
        )
    else:
        partitions = self._get_pivot_year_month(category)
        if len(partitions) == 0:
            return None
        candidate_paths = (
            self._get_pivot_file_path(category, int(year), int(month))
            for year, month in reversed(partitions)
        )

    for path in candidate_paths:
        if not os.path.exists(path):
            continue

        frame = pd.read_parquet(path)
        if frame.empty or len(frame.columns) == 0:
            continue

        return pd.to_datetime(frame.columns).max()

    return None
1075
+
1076
def get_download_days(self, category: str, template_date: List, iloc: Optional[int] = None) -> List:
    """
    Work out which dates still need to be downloaded for a category.

    Args:
        category: Data category name.
        template_date: Dates that should be present.
        iloc: Optional number of trailing columns to drop from the existing
            data before comparing, so those dates are reported again.

    Returns:
        ``[exists_df, update_dates]``: the existing data (read over the
        span of ``template_date``, after the optional trim) and the sorted
        template dates that are absent from its columns.
    """
    category_path = self._get_category_path(category, raise_err=False)

    exists_df = pd.DataFrame()
    if os.path.exists(category_path):
        exists_df = self.read_pivot_category(
            category=category,
            start_date=min(template_date),
            end_date=max(template_date),
        )

    if iloc:
        exists_df = exists_df.iloc[:, :-iloc]

    if exists_df.empty:
        already_stored = set()
    else:
        already_stored = set(pd.to_datetime(exists_df.columns))
    wanted = set(pd.to_datetime(template_date))

    return [exists_df, sorted(wanted - already_stored)]
1107
+
1108
def GetCategoryDateCoverage(self, category: str, partition_mode: Optional[str] = None):
    """
    Summarize the date range covered by a pivot category.

    Only the first and the last partition file are read: the minimum
    column date of the first and the maximum column date of the last.

    Args:
        category: Pivot category name.
        partition_mode: Optional explicit mode, passed to
            ``_resolve_partition_mode``. If that returns None, the mode is
            "daily" when the daily subdirectory exists, else "monthly".

    Returns:
        A dict with ``min_date``, ``max_date`` and ``partition_count``.
        Both dates are None and the count is 0 when the category does not
        exist or has no partitions.
    """

    def _coverage(min_date, max_date, partition_count):
        return {
            "min_date": min_date,
            "max_date": max_date,
            "partition_count": partition_count,
        }

    if not self._if_exists_pivot_category(category):
        return _coverage(None, None, 0)

    mode = self._resolve_partition_mode(category, explicit_partition_mode=partition_mode)
    if mode is None:
        # Auto-detect from the presence of the daily partition subdirectory.
        daily_dir = self._get_typed_category_path(category, "pivot_daily", raise_err=False)
        mode = "daily" if os.path.exists(daily_dir) else "monthly"

    if mode == "daily":
        partitions = self._get_pivot_dates(category)
        if len(partitions) == 0:
            return _coverage(None, None, 0)

        first_fp = self._get_pivot_daily_file_path(category, partitions[0])
        last_fp = self._get_pivot_daily_file_path(category, partitions[-1])
    else:
        partitions = self._get_pivot_year_month(category)
        if len(partitions) == 0:
            return _coverage(None, None, 0)

        first_y, first_m = partitions[0]
        last_y, last_m = partitions[-1]
        first_fp = self._get_pivot_file_path(category, int(first_y), int(first_m))
        last_fp = self._get_pivot_file_path(category, int(last_y), int(last_m))

    first_df = pd.read_parquet(first_fp)
    last_df = pd.read_parquet(last_fp)

    min_date = None
    if len(first_df.columns) > 0:
        min_date = pd.to_datetime(first_df.columns).min()

    max_date = None
    if len(last_df.columns) > 0:
        max_date = pd.to_datetime(last_df.columns).max()

    return _coverage(min_date, max_date, len(partitions))
1174
+
1175
def DeleteCategory(self, category: str) -> None:
    """
    Remove a category's directory tree from local storage.

    Logs and returns without error when the directory does not exist.
    """
    target = self._get_category_path(category, raise_err=False)
    if not os.path.exists(target):
        self.logger.info("Category not exists: %s", target)
        return

    shutil.rmtree(target)
    self.logger.info("Deleted category: %s", target)
1182
+
1183
def _get_local_trade_dates_until_today(self) -> List[str]:
    """
    Return sorted, unique trade dates up to today as ``YYYY-MM-DD`` strings.

    A fixed list of calendar categories is tried in order. The first one
    whose unstack file exists, can be read and yields at least one date
    not later than today wins. The date column is the first of
    ``trade_date`` / ``cal_date`` / ``date`` present, else the first
    column. When an ``is_open`` column exists, only open rows are kept.

    Returns:
        The list of dates, or ``[]`` when no calendar category is usable.
    """
    today = pd.Timestamp(datetime.datetime.now().date())
    calendar_categories = (
        "cn_stock_unstack#trade_date",
        "trade_date",
        "us_stock_unstack#trade_date",
        "us_trade_date",
    )
    preferred_date_columns = ("trade_date", "cal_date", "date")
    open_markers = ("1", "True", "true", "交易")

    for category in calendar_categories:
        file_path = self._get_unstack_file_path(category)
        if not os.path.exists(file_path):
            continue
        try:
            calendar = pd.read_parquet(file_path)
        except Exception:
            # Unreadable calendar file: fall through to the next category.
            continue
        if calendar.empty:
            continue

        date_col = calendar.columns[0]
        for candidate in preferred_date_columns:
            if candidate in calendar.columns:
                date_col = candidate
                break

        rows = calendar.copy()
        if "is_open" in rows.columns:
            flag = rows["is_open"]
            if pd.api.types.is_numeric_dtype(flag):
                keep = flag.fillna(0).astype(int) == 1
            else:
                keep = flag.astype(str).str.strip().isin(open_markers)
            rows = rows[keep]

        parsed = pd.to_datetime(rows[date_col].astype(str), errors="coerce")
        parsed = parsed[(parsed.notna()) & (parsed <= today)]
        if parsed.empty:
            continue
        return sorted(parsed.dt.strftime("%Y-%m-%d").unique().tolist())

    return []
1219
+
1220
def CheckPivotCategoriesDates(
    self,
    exclude_category_keywords: Optional[Iterable[str]] = None,
) -> pd.DataFrame:
    """
    Build a per-category health table for all local pivot categories.

    Every category whose name contains ``"pivot#"`` is read for the last
    30 calendar days (calendar validation disabled) and checked for
    duplicated columns or index entries.

    Args:
        exclude_category_keywords: Categories whose name contains any of
            these substrings are skipped. A single string is treated as
            one keyword.

    Returns:
        A DataFrame indexed by category with the latest date column
        (``NaT`` when nothing was read), the row count of the frame read,
        and the on-disk size of the category directory, sorted by row
        count.

    Raises:
        LocalStorageError: If a category has duplicated columns or a
            duplicated index.
    """
    pivot_categories = [x for x in self._get_all_categories() if "pivot#" in x]
    if exclude_category_keywords:
        if isinstance(exclude_category_keywords, str):
            # tuple("abc") would split the keyword into single characters.
            exclude_keywords = (exclude_category_keywords,)
        else:
            exclude_keywords = tuple(exclude_category_keywords)
        pivot_categories = [
            x for x in pivot_categories
            if not any(keyword in x for keyword in exclude_keywords)
        ]

    date_list = []
    data_nums = []
    file_sizes = []

    start_date = (datetime.datetime.now() - datetime.timedelta(days=30)).strftime("%Y-%m-%d")

    with tqdm(pivot_categories, desc="检查本地 Pivot 最新日期") as pbar:
        for category in pbar:
            pbar.set_postfix_str(f"last_process: {category}")
            df = self.read_pivot_category(
                category,
                start_date=start_date,
                show_progress=False,
                validate_calendar=False,
            )

            if df.empty:
                date_list.append(pd.NaT)
                data_nums.append(0)
            else:
                if df.columns.duplicated().any():
                    raise LocalStorageError(f"{category} columns duplicated")
                if df.index.duplicated().any():
                    raise LocalStorageError(f"{category} index duplicated")

                date_list.append(pd.to_datetime(df.columns).max())
                data_nums.append(df.shape[0])

            # Walk recursively so that files kept in subdirectories of the
            # category directory (the module defines partition subdirectory
            # names such as PIVOT_MONTHLY_SUBDIR) are counted as well.
            category_dir = self._get_category_path(category)
            total_size_bytes = 0
            for root, _dirs, files in os.walk(category_dir):
                for name in files:
                    path = os.path.join(root, name)
                    if os.path.isfile(path):
                        total_size_bytes += os.path.getsize(path)
            file_sizes.append(f"{round(total_size_bytes / 1024 / 1024, 2)} MB")

    check_df = pd.DataFrame({
        "category": pivot_categories,
        "Latest Date": date_list,
        "数据量": data_nums,
        "储存大小": file_sizes,
    })

    check_df.columns.name = "本地Pivot"
    check_df.set_index("category", inplace=True)
    check_df.sort_values("数据量", inplace=True)
    return check_df
1278
+
1279
def ShowCategoriesDateSummary(
    self,
    exclude_pivot_category_keywords: Optional[Iterable[str]] = None,
) -> None:
    """
    Print a date summary of the local Parquet data (pivot and unstack).

    Pivot categories are compared against the two most recent local trade
    dates; when no trade calendar is available, today and yesterday are
    used instead. Each category is reported as latest, previous, or
    lagging. Unstack categories are listed with row count, file size and
    file modification time.

    Args:
        exclude_pivot_category_keywords: Forwarded to
            ``CheckPivotCategoriesDates`` to skip matching pivot
            categories.
    """
    print("\n" + "="*100)
    print("本地 Parquet 数据日期汇总")
    print("="*100)

    now = datetime.datetime.now()
    today = now.strftime("%Y-%m-%d")
    yesterday = (now - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    trade_dates = self._get_local_trade_dates_until_today()
    latest_expected = trade_dates[-1] if trade_dates else today
    previous_expected = trade_dates[-2] if len(trade_dates) >= 2 else yesterday
    freshness_label = "交易日" if trade_dates else "自然日"

    # ==================== Pivot data ====================
    pivot_df = self.CheckPivotCategoriesDates(
        exclude_category_keywords=exclude_pivot_category_keywords,
    )

    if not pivot_df.empty:
        print(f"\n📊 Pivot 数据 ({len(pivot_df)} 个):")
        print("-"*100)

        latest_ok = 0
        yesterday_ok = 0
        outdated = 0

        for category, row in pivot_df.iterrows():
            latest_date = row['Latest Date']
            data_count = row['数据量']
            storage_size = row['储存大小']

            if pd.isna(latest_date):
                status = "❌ 无数据"
            else:
                latest_date_str = pd.to_datetime(latest_date).strftime("%Y-%m-%d")

                # Freshness counters use independent tests on purpose: they
                # mirror the summary definitions, not the status branches.
                if latest_date_str >= latest_expected:
                    latest_ok += 1
                if latest_date_str == previous_expected:
                    yesterday_ok += 1
                if latest_date_str < previous_expected:
                    outdated += 1

                if latest_date_str >= latest_expected:
                    status = f"✅ 最新{freshness_label} ({latest_date_str})"
                elif latest_date_str == previous_expected:
                    status = f"✅ 上一{freshness_label} ({latest_date_str})"
                else:
                    try:
                        if trade_dates:
                            latest_ts = pd.to_datetime(latest_date).normalize()
                            days_behind = sum(pd.to_datetime(x) > latest_ts for x in trade_dates)
                            status = f"⚠️ 滞后 {days_behind} 个交易日 ({latest_date_str})"
                        else:
                            days_behind = (datetime.datetime.now() - pd.to_datetime(latest_date)).days
                            status = f"⚠️ 滞后 {days_behind} 天 ({latest_date_str})"
                    except Exception:
                        # Best effort: still show the date if the lag
                        # cannot be computed.
                        status = f"⚠️ {latest_date_str}"

            print(f" {category:<50} | {status:<25} | 数据量: {data_count:<6} | {storage_size}")

        # Pivot freshness totals
        print("-"*100)
        print(f"Pivot 数据新鲜度({freshness_label}): ✅ 最新 {latest_ok} 个 | ✅ 上一日 {yesterday_ok} 个 | ⚠️ 滞后 {outdated} 个")
    else:
        print("\n📊 Pivot 数据: 未找到")

    # ==================== Unstack data ====================
    unstack_categories = [x for x in self._get_all_categories() if "unstack#" in x]

    if unstack_categories:
        print(f"\n📋 Unstack 数据 ({len(unstack_categories)} 个):")
        print("-"*100)

        for category in unstack_categories:
            file_path = self._get_unstack_file_path(category)
            if os.path.exists(file_path):
                file_size_bytes = os.path.getsize(file_path)
                file_size_mb = round(file_size_bytes / 1024 / 1024, 2)
                mod_time = datetime.datetime.fromtimestamp(os.path.getmtime(file_path)).strftime("%Y-%m-%d %H:%M")

                # Try to read the file to get its row count; -1 marks an
                # unreadable file.
                try:
                    row_count = len(pd.read_parquet(file_path))
                except Exception:
                    row_count = -1

                print(f" {category:<50} | 行数: {row_count:<8} | {file_size_mb:>8} MB | 更新于: {mod_time}")
            else:
                print(f" {category:<50} | ❌ 文件不存在")
    else:
        print("\n📋 Unstack 数据: 未找到")

    print("="*100 + "\n")