litequant 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- litequant/LiteQuantClient.py +884 -0
- litequant/ParquetManager.py +1379 -0
- litequant/__init__.py +54 -0
- litequant/exceptions.py +168 -0
- litequant/log.py +24 -0
- litequant-3.0.0.dist-info/LICENSE +21 -0
- litequant-3.0.0.dist-info/METADATA +139 -0
- litequant-3.0.0.dist-info/RECORD +10 -0
- litequant-3.0.0.dist-info/WHEEL +5 -0
- litequant-3.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1379 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
import datetime
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
import shutil
|
|
8
|
+
import tempfile
|
|
9
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
10
|
+
from typing import Callable, Iterable, List, Optional, Tuple
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from tqdm import tqdm
|
|
14
|
+
|
|
15
|
+
from .exceptions import CategoryNotFoundError, LocalStorageError
|
|
16
|
+
from .log import GetLogger
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def DFSaveToFile(df: pd.DataFrame, filepath: str, file_type: Optional[str] = None, debug: bool = False) -> None:
    """
    Atomically write a DataFrame to ``filepath``.

    The frame is serialised into a temporary file created in the destination
    directory, which is then moved over the target with ``os.replace``.

    Args:
        df: Frame to persist.
        filepath: Destination path; missing parent directories are created.
        file_type: ``"pickle"`` or ``"parquet"``. When ``None`` it is taken
            from the text after the last dot of the file name, lower-cased.
        debug: When true, print a confirmation line after a successful write.

    Raises:
        LocalStorageError: If the file type is unsupported or the write or
            replace step fails. The temporary file is removed in that case.
    """
    filepath = os.path.abspath(filepath)
    directory = os.path.dirname(filepath)
    os.makedirs(directory, exist_ok=True)

    if file_type is None:
        # Look at the final path component only, so a dot in a parent
        # directory name is never mistaken for the extension separator.
        base_name = os.path.basename(filepath)
        file_type = base_name.rpartition(".")[2].lower() if "." in base_name else ""

    if file_type not in ("pickle", "parquet"):
        # Reject unsupported formats before any temporary file is created.
        raise LocalStorageError() from None

    fd, temp_filepath = tempfile.mkstemp(
        prefix=".tmp_",
        suffix=f".{file_type}",
        dir=directory,
    )
    # pandas reopens the file by path, so the descriptor is not needed.
    os.close(fd)

    try:
        if file_type == "pickle":
            df.to_pickle(temp_filepath)
        else:
            df.to_parquet(temp_filepath)

        os.replace(temp_filepath, filepath)

        if debug:
            print(f"File saved successfully: {filepath}")

    except Exception:
        # Never leave a half-written temporary file behind.
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)
        raise LocalStorageError() from None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Names of the sub-directories, relative to the storage root, that hold each
# kind of data managed by this module.
PIVOT_MONTHLY_SUBDIR = "data_pivot_monthly_partition"  # pivot data, monthly partition files
PIVOT_DAILY_SUBDIR = "data_pivot_daily_partition"  # pivot data, daily partition files
UNSTACK_SUBDIR = "data_unstack"  # unstack data, one file per category
CATEGORY_META_SUBDIR = "category_metadata"  # per-category JSON metadata
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class ParquetDataManager:
|
|
65
|
+
def __init__(self, root_dir: str, max_workers: int = 8, logger=None):
|
|
66
|
+
self.root_dir = os.path.abspath(root_dir)
|
|
67
|
+
self.max_workers = max_workers
|
|
68
|
+
self.logger = logger or GetLogger("litequant.parquet")
|
|
69
|
+
os.makedirs(self.root_dir, exist_ok=True)
|
|
70
|
+
|
|
71
|
+
def _get_typed_category_path(self, category: str, category_type: str, raise_err: bool = False) -> str:
    """
    Build the directory path of ``category`` for an explicit layout type.

    ``category_type`` may be ``"pivot_monthly"``, ``"pivot_daily"`` or
    ``"unstack"``; any other value falls back to the monthly pivot layout.

    Raises:
        CategoryNotFoundError: If ``raise_err`` is true and the directory
            does not exist.
    """
    subdir_by_type = {
        "pivot_monthly": PIVOT_MONTHLY_SUBDIR,
        "pivot_daily": PIVOT_DAILY_SUBDIR,
        "unstack": UNSTACK_SUBDIR,
    }
    chosen_subdir = subdir_by_type.get(category_type, PIVOT_MONTHLY_SUBDIR)
    dir_path = os.path.join(self.root_dir, chosen_subdir, category)
    if not raise_err or os.path.exists(dir_path):
        return dir_path
    raise CategoryNotFoundError(f"Directory not exists: {dir_path}")
|
|
82
|
+
|
|
83
|
+
def _get_category_path(self, category: str, raise_err: bool = False) -> str:
    """
    Locate the directory of ``category`` by probing each layout in turn.

    Layouts are tried in the order daily, monthly, unstack, and the first
    existing directory is returned. If none exists, the monthly path is
    returned.

    Raises:
        CategoryNotFoundError: If ``raise_err`` is true and no directory exists.
    """
    probe_order = (PIVOT_DAILY_SUBDIR, PIVOT_MONTHLY_SUBDIR, UNSTACK_SUBDIR)
    candidates = (os.path.join(self.root_dir, subdir, category) for subdir in probe_order)
    existing = next((path for path in candidates if os.path.exists(path)), None)
    if existing is not None:
        return existing

    fallback = os.path.join(self.root_dir, PIVOT_MONTHLY_SUBDIR, category)
    if raise_err and not os.path.exists(fallback):
        raise CategoryNotFoundError(f"Directory not exists: {fallback}")
    return fallback
|
|
93
|
+
|
|
94
|
+
def _get_all_categories(self) -> List[str]:
    """
    List every category that has a directory under any storage layout.

    Returns:
        Sorted category names. A category that exists under several layouts
        is reported once.
    """
    categories = set()
    for subdir in (PIVOT_MONTHLY_SUBDIR, PIVOT_DAILY_SUBDIR, UNSTACK_SUBDIR):
        path = os.path.join(self.root_dir, subdir)
        # isdir rather than exists: a stray file at this path must not make
        # os.listdir raise.
        if not os.path.isdir(path):
            continue
        categories.update(
            name for name in os.listdir(path)
            if os.path.isdir(os.path.join(path, name))
        )
    return sorted(categories)
|
|
104
|
+
|
|
105
|
+
def _if_exists_unstack_category(self, category: str) -> bool:
    """Return whether the unstack parquet file of ``category`` exists."""
    unstack_file = self._get_unstack_file_path(category)
    return os.path.exists(unstack_file)
|
|
107
|
+
|
|
108
|
+
def _if_exists_pivot_category(self, category: str) -> bool:
    """
    Return whether ``category`` has at least one pivot partition file.

    Both the monthly and the daily directories are inspected. A file counts
    when its name ends in ``=YYYY-MM.parquet`` or ``=YYYY-MM-DD.parquet``.
    """
    for category_type in ("pivot_monthly", "pivot_daily"):
        category_path = self._get_typed_category_path(category, category_type, raise_err=False)
        if not os.path.exists(category_path):
            continue
        has_partition = any(
            name.endswith(".parquet") and (
                re.search(r"=\d{4}-\d{2}\.parquet$", name) or
                re.search(r"=\d{4}-\d{2}-\d{2}\.parquet$", name)
            )
            for name in os.listdir(category_path)
        )
        if has_partition:
            return True
    return False
|
|
123
|
+
|
|
124
|
+
@staticmethod
|
|
125
|
+
def ExtractYearMonth(input_string: str) -> Tuple[Optional[str], Optional[str]]:
|
|
126
|
+
match = re.search(r"=(\d{4})-(\d{1,2})\.parquet$", input_string)
|
|
127
|
+
if match:
|
|
128
|
+
year = match.group(1)
|
|
129
|
+
month = f"{int(match.group(2)):02d}"
|
|
130
|
+
return year, month
|
|
131
|
+
return None, None
|
|
132
|
+
|
|
133
|
+
@staticmethod
|
|
134
|
+
def ExtractYearMonthDay(input_string: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
|
|
135
|
+
match = re.search(r"=(?:\(')?(\d{4})-(\d{2})-(\d{2})(?:',\))?\.parquet$", input_string)
|
|
136
|
+
if match:
|
|
137
|
+
return match.group(1), match.group(2), match.group(3)
|
|
138
|
+
return None, None, None
|
|
139
|
+
|
|
140
|
+
def _get_pivot_year_month(self, category: str) -> List[Tuple[str, str]]:
    """
    List the ``(year, month)`` partitions present in the monthly directory.

    Raises:
        CategoryNotFoundError: If the monthly directory does not exist.
    """
    directory = self._get_typed_category_path(category, "pivot_monthly", raise_err=True)
    parsed = (self.ExtractYearMonth(name) for name in os.listdir(directory))
    return sorted((year, month) for year, month in parsed if year is not None)
|
|
148
|
+
|
|
149
|
+
def _get_pivot_dates(self, category: str) -> List[str]:
    """
    List the ``YYYY-MM-DD`` partitions present in the daily directory.

    Raises:
        CategoryNotFoundError: If the daily directory does not exist.
    """
    directory = self._get_typed_category_path(category, "pivot_daily", raise_err=True)
    parsed = (self.ExtractYearMonthDay(name) for name in os.listdir(directory))
    return sorted(f"{y}-{m}-{d}" for y, m, d in parsed if y is not None)
|
|
157
|
+
|
|
158
|
+
def _get_pivot_file_path(self, category: str, year: int, month: int) -> str:
    """Return the monthly partition path ``<dir>/<category>=YYYY-MM.parquet``."""
    monthly_dir = self._get_typed_category_path(category, "pivot_monthly")
    file_name = f"{category}={int(year):04d}-{int(month):02d}.parquet"
    return os.path.join(monthly_dir, file_name)
|
|
161
|
+
|
|
162
|
+
def _get_pivot_daily_file_path(self, category: str, date_str: str) -> str:
    """Return the daily partition path ``<dir>/<category>=<date_str>.parquet``."""
    daily_dir = self._get_typed_category_path(category, "pivot_daily")
    file_name = f"{category}={date_str}.parquet"
    return os.path.join(daily_dir, file_name)
|
|
165
|
+
|
|
166
|
+
def _get_unstack_file_path(self, category: str) -> str:
    """Return the unstack file path ``<dir>/<category>.parquet``."""
    unstack_dir = self._get_typed_category_path(category, "unstack")
    file_name = f"{category}.parquet"
    return os.path.join(unstack_dir, file_name)
|
|
169
|
+
|
|
170
|
+
@staticmethod
|
|
171
|
+
def _infer_category_type(category: str) -> str:
|
|
172
|
+
if "pivot#" in category:
|
|
173
|
+
return "pivot"
|
|
174
|
+
if "unstack#" in category:
|
|
175
|
+
return "unstack"
|
|
176
|
+
return "unknown"
|
|
177
|
+
|
|
178
|
+
@staticmethod
|
|
179
|
+
def _safe_meta_filename(category: str) -> str:
|
|
180
|
+
safe_name = re.sub(r'[<>:"/\\|?*]', "_", category)
|
|
181
|
+
return f"{safe_name}.json"
|
|
182
|
+
|
|
183
|
+
def _get_category_meta_dir(self) -> str:
    """Return the directory under the storage root that holds category metadata."""
    meta_dir = os.path.join(self.root_dir, CATEGORY_META_SUBDIR)
    return meta_dir
|
|
185
|
+
|
|
186
|
+
def _get_category_meta_file_path(self, category: str) -> str:
    """Return the path of the JSON metadata file for ``category``."""
    meta_dir = self._get_category_meta_dir()
    file_name = self._safe_meta_filename(category)
    return os.path.join(meta_dir, file_name)
|
|
188
|
+
|
|
189
|
+
@staticmethod
|
|
190
|
+
def _write_json_atomic(obj: dict, file_path: str) -> None:
|
|
191
|
+
directory = os.path.dirname(os.path.abspath(file_path))
|
|
192
|
+
os.makedirs(directory, exist_ok=True)
|
|
193
|
+
|
|
194
|
+
fd, temp_filepath = tempfile.mkstemp(
|
|
195
|
+
prefix=".tmp_",
|
|
196
|
+
suffix=".json",
|
|
197
|
+
dir=directory,
|
|
198
|
+
)
|
|
199
|
+
os.close(fd)
|
|
200
|
+
|
|
201
|
+
try:
|
|
202
|
+
with open(temp_filepath, "w", encoding="utf-8") as f:
|
|
203
|
+
json.dump(obj, f, ensure_ascii=False, sort_keys=True, indent=2)
|
|
204
|
+
f.write("\n")
|
|
205
|
+
os.replace(temp_filepath, file_path)
|
|
206
|
+
except Exception:
|
|
207
|
+
if os.path.exists(temp_filepath):
|
|
208
|
+
os.remove(temp_filepath)
|
|
209
|
+
raise LocalStorageError() from None
|
|
210
|
+
|
|
211
|
+
def get_category_meta(self, category: str, default: Optional[dict] = None) -> dict:
    """
    Load the stored metadata of ``category``.

    Returns:
        The parsed JSON object, or a shallow copy of ``default`` (an empty
        dict when ``default`` is falsy) if no metadata file exists.

    Raises:
        LocalStorageError: If the file cannot be read or parsed, or its top
            level is not a JSON object.
    """
    file_path = self._get_category_meta_file_path(category)
    if os.path.exists(file_path):
        try:
            with open(file_path, "r", encoding="utf-8") as stream:
                loaded = json.load(stream)
        except Exception:
            raise LocalStorageError() from None

        if isinstance(loaded, dict):
            return loaded
        raise LocalStorageError() from None

    return dict(default or {})
|
|
225
|
+
|
|
226
|
+
def set_category_meta(self, category: str, meta: Optional[dict], merge: bool = True) -> dict:
    """
    Persist metadata for ``category`` and return what was written.

    Args:
        category: Category name; always stored under the ``"category"`` key.
        meta: Fields to store. ``None`` is treated as an empty dict.
        merge: When true, ``meta`` is layered over the stored metadata;
            otherwise it replaces it.

    Returns:
        The metadata dict that was written.

    Raises:
        LocalStorageError: If ``meta`` is not a dict, or reading or writing
            the metadata file fails.
    """
    if meta is None:
        meta = {}
    if not isinstance(meta, dict):
        raise LocalStorageError() from None

    merged = dict(self.get_category_meta(category, default={})) if merge else {}
    merged.update(meta)
    merged["category"] = category
    # Only infer the type when neither stored nor supplied metadata names it.
    merged.setdefault("category_type", self._infer_category_type(category))

    category_type = merged["category_type"]
    if category_type == "pivot":
        # Pivot data is partitioned daily or monthly; anything else
        # normalises to monthly.
        mode = self._first_meta_value(merged, ("partition_mode", "storage.partition_mode"))
        merged["partition_mode"] = mode if mode in ("daily", "monthly") else "monthly"
    elif category_type == "unstack":
        # Unstack data has a single valid mode, so no lookup is needed.
        merged["partition_mode"] = "full"

    self._write_json_atomic(merged, self._get_category_meta_file_path(category))
    return merged
|
|
247
|
+
|
|
248
|
+
def ensure_category_meta(self, category: str, **fields) -> dict:
    """
    Update the metadata of ``category`` with the given keyword fields.

    Fields whose value is ``None`` are ignored. The result is written
    through ``set_category_meta`` and returned.
    """
    current = self.get_category_meta(category, default={})
    supplied = {key: value for key, value in fields.items() if value is not None}
    combined = {**current, **supplied}
    return self.set_category_meta(category, combined, merge=True)
|
|
253
|
+
|
|
254
|
+
@staticmethod
|
|
255
|
+
def _get_nested_value(meta: dict, path: str, default=None):
|
|
256
|
+
current = meta
|
|
257
|
+
for part in path.split("."):
|
|
258
|
+
if not isinstance(current, dict) or part not in current:
|
|
259
|
+
return default
|
|
260
|
+
current = current[part]
|
|
261
|
+
return current
|
|
262
|
+
|
|
263
|
+
@staticmethod
|
|
264
|
+
def _first_meta_value(meta: dict, paths: Iterable[str], default=None):
|
|
265
|
+
for path in paths:
|
|
266
|
+
value = ParquetDataManager._get_nested_value(meta, path, default=None)
|
|
267
|
+
if value is not None:
|
|
268
|
+
return value
|
|
269
|
+
return default
|
|
270
|
+
|
|
271
|
+
def get_metadata_field(self, category: str, path: str, default=None):
    """Read one dot-separated ``path`` from the metadata of ``category``."""
    stored = self.get_category_meta(category, default={})
    value = self._get_nested_value(stored, path, default=default)
    return value
|
|
274
|
+
|
|
275
|
+
def get_first_metadata_field(self, category: str, paths: Iterable[str], default=None):
    """Return the first non-``None`` metadata value of ``category`` among ``paths``."""
    stored = self.get_category_meta(category, default={})
    value = self._first_meta_value(stored, paths, default=default)
    return value
|
|
278
|
+
|
|
279
|
+
def _resolve_partition_mode(self, category: str, explicit_partition_mode: Optional[str] = None) -> Optional[str]:
    """
    Decide which partition mode applies to ``category``.

    A valid explicit mode (``"daily"``, ``"monthly"`` or ``"full"``) wins.
    Otherwise the stored metadata is consulted, and ``None`` is returned
    when it holds no valid mode either.
    """
    valid_modes = ("daily", "monthly", "full")
    if explicit_partition_mode in valid_modes:
        return explicit_partition_mode

    stored = self.get_category_meta(category, default={})
    stored_mode = self._first_meta_value(stored, ("partition_mode", "storage.partition_mode"))
    return stored_mode if stored_mode in valid_modes else None
|
|
287
|
+
|
|
288
|
+
def _resolve_calendar_category(self, category: str) -> Optional[str]:
    """
    Look up the calendar category referenced by the metadata of ``category``.

    Returns the first truthy value among the known metadata paths as a
    string, or ``None`` when none is set.
    """
    lookup_paths = (
        "calendar_category",
        "time.calendar_category",
        "market.calendar.calendar_category",
        "references.calendar_category",
    )
    stored = self.get_category_meta(category, default={})
    found = self._first_meta_value(stored, lookup_paths)
    if not found:
        return None
    return str(found)
|
|
300
|
+
|
|
301
|
+
def _resolve_duplicate_keys(self, category: str) -> List[str]:
    """
    Return the duplicate-key column names configured for ``category``.

    The value is read from ``duplicate_keys`` or
    ``schema_contract.primary_key``. Anything that is not a list yields an
    empty list; entries are stringified, stripped and dropped when blank.
    """
    stored = self.get_category_meta(category, default={})
    configured = self._first_meta_value(
        stored,
        ("duplicate_keys", "schema_contract.primary_key"),
        default=[],
    )
    if not isinstance(configured, list):
        return []
    stripped = (str(item).strip() for item in configured)
    return [text for text in stripped if text]
|
|
314
|
+
|
|
315
|
+
def _resolve_date_column(self, category: str, df: Optional[pd.DataFrame] = None) -> Optional[str]:
    """
    Pick the date column to use for ``category``.

    The configured column (``date_column``, ``time.date_column`` or
    ``time.timestamp_column``) is used when no frame is given or the frame
    has that column. Otherwise, for a non-empty frame, the first of
    ``trade_date``, ``cal_date``, ``date``, ``timestamp`` that exists is
    used, falling back to the frame's first column. Failing all of that,
    the configured value (or ``None``) is returned.
    """
    stored = self.get_category_meta(category, default={})
    raw = self._first_meta_value(
        stored,
        ("date_column", "time.date_column", "time.timestamp_column"),
    )
    configured = str(raw) if raw else None

    if configured is not None and (df is None or configured in df.columns):
        return configured

    if df is not None and not df.empty:
        for candidate in ("trade_date", "cal_date", "date", "timestamp"):
            if candidate in df.columns:
                return candidate
        return df.columns[0]

    return configured
|
|
333
|
+
|
|
334
|
+
def _resolve_is_open_column(self, category: str, df: Optional[pd.DataFrame] = None) -> Optional[str]:
    """
    Pick the "is open" flag column to use for ``category``.

    The configured column (``is_open_column`` or
    ``market.calendar.is_open_column``) is used when no frame is given or
    the frame has that column. Otherwise ``is_open`` is used if the frame
    has it, and failing that the configured value (or ``None``) is returned.
    """
    stored = self.get_category_meta(category, default={})
    raw = self._first_meta_value(
        stored,
        ("is_open_column", "market.calendar.is_open_column"),
    )
    configured = str(raw) if raw else None

    if configured is not None and (df is None or configured in df.columns):
        return configured
    if df is not None and "is_open" in df.columns:
        return "is_open"
    return configured
|
|
348
|
+
|
|
349
|
+
@staticmethod
|
|
350
|
+
def _normalize_date_columns(df: pd.DataFrame) -> pd.DataFrame:
|
|
351
|
+
if df.empty:
|
|
352
|
+
return df.copy()
|
|
353
|
+
out = df.copy()
|
|
354
|
+
out.columns = pd.to_datetime(out.columns)
|
|
355
|
+
out.sort_index(axis=0, inplace=True)
|
|
356
|
+
out.sort_index(axis=1, inplace=True)
|
|
357
|
+
return out
|
|
358
|
+
|
|
359
|
+
@staticmethod
|
|
360
|
+
def _format_date_columns(df: pd.DataFrame, date_format: str = "%Y-%m-%d") -> pd.DataFrame:
|
|
361
|
+
if df.empty:
|
|
362
|
+
return df.copy()
|
|
363
|
+
out = df.copy()
|
|
364
|
+
out.columns = pd.to_datetime(out.columns).strftime(date_format)
|
|
365
|
+
return out
|
|
366
|
+
|
|
367
|
+
@staticmethod
|
|
368
|
+
def _drop_all_null_rows(df: pd.DataFrame) -> pd.DataFrame:
|
|
369
|
+
if df.empty:
|
|
370
|
+
return df
|
|
371
|
+
return df[df.notnull().sum(axis=1) > 0]
|
|
372
|
+
|
|
373
|
+
@staticmethod
|
|
374
|
+
def _drop_duplicate_index_columns(df: pd.DataFrame) -> pd.DataFrame:
|
|
375
|
+
if df.empty:
|
|
376
|
+
return df
|
|
377
|
+
|
|
378
|
+
out = df
|
|
379
|
+
if out.index.duplicated().any():
|
|
380
|
+
out = out[~out.index.duplicated(keep="last")]
|
|
381
|
+
if out.columns.duplicated().any():
|
|
382
|
+
out = out.loc[:, ~out.columns.duplicated(keep="last")]
|
|
383
|
+
return out
|
|
384
|
+
|
|
385
|
+
def update_pivot_category(
    self,
    df: pd.DataFrame,
    category: str,
    strf: str = "%Y-%m-%d",
    partition_mode: Optional[str] = None,
    replace_partition: bool = False,
) -> None:
    """
    Write a pivot frame (date-like column labels) into partition files.

    The partition mode is resolved from the argument or the stored metadata
    and defaults to monthly. Metadata is recorded first. The frame is then
    split by day or by year and month, and each piece is handed to
    ``UpdatePivotParquet``; multiple pieces are dispatched through
    ``RunConcurrently``.

    Args:
        df: Frame whose column labels are parseable as dates.
        category: Target category name.
        strf: Date format used for column labels in the written files.
        partition_mode: ``"daily"`` or ``"monthly"``; other values fall back
            to metadata, then to monthly.
        replace_partition: Passed through to ``UpdatePivotParquet``.
    """
    resolved_mode = self._resolve_partition_mode(category, explicit_partition_mode=partition_mode)
    use_daily = resolved_mode == "daily"
    partition_mode = "daily" if use_daily else "monthly"

    # Metadata is recorded even when there turns out to be nothing to write.
    self.ensure_category_meta(
        category,
        category_type="pivot",
        partition_mode=partition_mode,
    )

    if df is None or df.empty:
        self.logger.info("%s: input df is empty, skip update.", category)
        return

    frame = df.copy()
    frame.columns = pd.to_datetime(frame.columns)
    frame = frame.dropna(axis=1, how="all")

    if frame.empty:
        self.logger.info("%s: all columns are empty after dropna, skip update.", category)
        return

    frame = self._drop_duplicate_index_columns(frame)

    tasks = []
    if use_daily:
        os.makedirs(self._get_typed_category_path(category, "pivot_daily"), exist_ok=True)
        by_day = frame.T.groupby([frame.columns.strftime("%Y-%m-%d")])
        for date_key, piece in by_day:
            # Grouping by a one-element list may yield 1-tuples as keys.
            day_label = date_key[0] if isinstance(date_key, tuple) else date_key
            tasks.append([
                self._get_pivot_daily_file_path(category, day_label),
                piece.T,
                strf,
                replace_partition,
            ])
    else:
        os.makedirs(self._get_typed_category_path(category, "pivot_monthly"), exist_ok=True)
        by_month = frame.T.groupby([frame.columns.year, frame.columns.month])
        for (year, month), piece in by_month:
            tasks.append([
                self._get_pivot_file_path(category, int(year), int(month)),
                piece.T,
                strf,
                replace_partition,
            ])

    if not tasks:
        return

    if len(tasks) == 1:
        # A single partition is written inline.
        ParquetDataManager.UpdatePivotParquet(*tasks[0])
        return

    self.RunConcurrently(
        tasks=tasks,
        worker_function=ParquetDataManager.UpdatePivotParquet,
        max_workers=self.max_workers,
        task_desc=f"Updating {category}",
    )
|
|
449
|
+
|
|
450
|
+
@staticmethod
|
|
451
|
+
def UpdatePivotParquet(
|
|
452
|
+
file_path: str,
|
|
453
|
+
sub_df: pd.DataFrame,
|
|
454
|
+
date_format: str = "%Y-%m-%d",
|
|
455
|
+
replace_partition: bool = False,
|
|
456
|
+
) -> None:
|
|
457
|
+
if sub_df is None or sub_df.empty:
|
|
458
|
+
return
|
|
459
|
+
|
|
460
|
+
sub_df = sub_df.copy()
|
|
461
|
+
sub_df = ParquetDataManager._drop_all_null_rows(sub_df)
|
|
462
|
+
if sub_df.empty:
|
|
463
|
+
if replace_partition and os.path.exists(file_path):
|
|
464
|
+
os.remove(file_path)
|
|
465
|
+
return
|
|
466
|
+
|
|
467
|
+
try:
|
|
468
|
+
sub_df = ParquetDataManager._normalize_date_columns(sub_df)
|
|
469
|
+
sub_df = ParquetDataManager._drop_duplicate_index_columns(sub_df)
|
|
470
|
+
except Exception:
|
|
471
|
+
raise LocalStorageError() from None
|
|
472
|
+
|
|
473
|
+
if replace_partition:
|
|
474
|
+
write_df = ParquetDataManager._format_date_columns(sub_df, date_format=date_format)
|
|
475
|
+
DFSaveToFile(write_df, file_path, debug=False)
|
|
476
|
+
return
|
|
477
|
+
|
|
478
|
+
if not os.path.exists(file_path):
|
|
479
|
+
write_df = ParquetDataManager._format_date_columns(sub_df, date_format=date_format)
|
|
480
|
+
DFSaveToFile(write_df, file_path, debug=False)
|
|
481
|
+
return
|
|
482
|
+
|
|
483
|
+
try:
|
|
484
|
+
existing_df = pd.read_parquet(file_path)
|
|
485
|
+
existing_df = ParquetDataManager._normalize_date_columns(existing_df)
|
|
486
|
+
existing_df = ParquetDataManager._drop_duplicate_index_columns(existing_df)
|
|
487
|
+
except Exception:
|
|
488
|
+
raise LocalStorageError() from None
|
|
489
|
+
|
|
490
|
+
old_index = existing_df.index
|
|
491
|
+
old_columns = existing_df.columns
|
|
492
|
+
|
|
493
|
+
common_index = old_index.union(sub_df.index)
|
|
494
|
+
common_columns = old_columns.union(sub_df.columns)
|
|
495
|
+
|
|
496
|
+
existing_df = existing_df.reindex(index=common_index, columns=common_columns)
|
|
497
|
+
sub_df = sub_df.reindex(index=common_index, columns=common_columns)
|
|
498
|
+
|
|
499
|
+
has_new_index = len(common_index.difference(old_index)) > 0
|
|
500
|
+
has_new_columns = len(common_columns.difference(old_columns)) > 0
|
|
501
|
+
|
|
502
|
+
candidate_mask = sub_df.notna()
|
|
503
|
+
candidate_count = int(candidate_mask.to_numpy().sum())
|
|
504
|
+
|
|
505
|
+
if candidate_count == 0 and not has_new_index and not has_new_columns:
|
|
506
|
+
return
|
|
507
|
+
|
|
508
|
+
diff_mask = candidate_mask & (existing_df.isna() | existing_df.ne(sub_df))
|
|
509
|
+
diff_count = int(diff_mask.to_numpy().sum())
|
|
510
|
+
|
|
511
|
+
if diff_count == 0 and not has_new_index and not has_new_columns:
|
|
512
|
+
return
|
|
513
|
+
|
|
514
|
+
with pd.option_context("future.no_silent_downcasting", True):
|
|
515
|
+
merged_df = existing_df.where(~candidate_mask, sub_df)
|
|
516
|
+
merged_df = merged_df.infer_objects(copy=False)
|
|
517
|
+
merged_df = ParquetDataManager._drop_all_null_rows(merged_df)
|
|
518
|
+
merged_df = ParquetDataManager._drop_duplicate_index_columns(merged_df)
|
|
519
|
+
merged_df.sort_index(axis=0, inplace=True)
|
|
520
|
+
merged_df.sort_index(axis=1, inplace=True)
|
|
521
|
+
|
|
522
|
+
write_df = ParquetDataManager._format_date_columns(merged_df, date_format=date_format)
|
|
523
|
+
DFSaveToFile(write_df, file_path, debug=False)
|
|
524
|
+
|
|
525
|
+
@staticmethod
|
|
526
|
+
def _normalize_date_strings(values: Iterable) -> List[str]:
|
|
527
|
+
if values is None:
|
|
528
|
+
return []
|
|
529
|
+
dates = pd.to_datetime(list(values), errors="coerce")
|
|
530
|
+
if len(dates) == 0:
|
|
531
|
+
return []
|
|
532
|
+
valid_dates = pd.Series(dates).dropna()
|
|
533
|
+
if valid_dates.empty:
|
|
534
|
+
return []
|
|
535
|
+
return sorted(valid_dates.dt.strftime("%Y-%m-%d").unique().tolist())
|
|
536
|
+
|
|
537
|
+
@staticmethod
|
|
538
|
+
def _filter_open_dates(df: pd.DataFrame, is_open_col: Optional[str]) -> pd.DataFrame:
|
|
539
|
+
if not is_open_col or is_open_col not in df.columns:
|
|
540
|
+
return df
|
|
541
|
+
open_flag = df[is_open_col]
|
|
542
|
+
if pd.api.types.is_numeric_dtype(open_flag):
|
|
543
|
+
return df[open_flag.fillna(0).astype(int) == 1]
|
|
544
|
+
return df[open_flag.astype(str).str.strip().isin(("1", "True", "true", "\u4ea4\u6613"))]
|
|
545
|
+
|
|
546
|
+
@staticmethod
|
|
547
|
+
def _us_juneteenth_observed_date(year: int) -> datetime.date:
|
|
548
|
+
holiday = datetime.date(year, 6, 19)
|
|
549
|
+
if holiday.weekday() == 5:
|
|
550
|
+
return datetime.date(year, 6, 18)
|
|
551
|
+
if holiday.weekday() == 6:
|
|
552
|
+
return datetime.date(year, 6, 20)
|
|
553
|
+
return holiday
|
|
554
|
+
|
|
555
|
+
@staticmethod
|
|
556
|
+
def _filter_stock_calendar_dates(calendar_category: str, dates: List[str]) -> List[str]:
|
|
557
|
+
if not dates or "_stock_unstack#trade_date" not in calendar_category:
|
|
558
|
+
return dates
|
|
559
|
+
|
|
560
|
+
result = []
|
|
561
|
+
for date_str in dates:
|
|
562
|
+
try:
|
|
563
|
+
date_obj = datetime.datetime.strptime(date_str, "%Y-%m-%d").date()
|
|
564
|
+
except ValueError:
|
|
565
|
+
continue
|
|
566
|
+
if date_obj.weekday() >= 5:
|
|
567
|
+
continue
|
|
568
|
+
if (
|
|
569
|
+
calendar_category.startswith("us_stock_")
|
|
570
|
+
and date_obj.year >= 2022
|
|
571
|
+
and date_obj == ParquetDataManager._us_juneteenth_observed_date(date_obj.year)
|
|
572
|
+
):
|
|
573
|
+
continue
|
|
574
|
+
result.append(date_str)
|
|
575
|
+
return result
|
|
576
|
+
|
|
577
|
+
def get_calendar_trade_dates(self, calendar_category: str) -> List[str]:
    """
    Read the trading dates stored in a calendar's unstack file.

    Rows are restricted to open days when an "is open" column can be
    resolved. Dates are then normalised to sorted unique ``YYYY-MM-DD``
    strings and passed through the stock-calendar filter.

    Raises:
        LocalStorageError: If the calendar file is missing or unreadable.
    """
    file_path = self._get_unstack_file_path(calendar_category)
    if not os.path.exists(file_path):
        raise LocalStorageError() from None

    try:
        calendar_df = pd.read_parquet(file_path)
    except Exception:
        raise LocalStorageError() from None

    if calendar_df.empty:
        return []

    date_col = self._resolve_date_column(calendar_category, df=calendar_df)
    if not date_col or date_col not in calendar_df.columns:
        # Fall back to a conventional name, then to the first column.
        for candidate in ("trade_date", "cal_date", "date"):
            if candidate in calendar_df.columns:
                date_col = candidate
                break
        else:
            date_col = calendar_df.columns[0]

    is_open_col = self._resolve_is_open_column(calendar_category, df=calendar_df)
    open_rows = self._filter_open_dates(calendar_df, is_open_col)
    date_labels = self._normalize_date_strings(open_rows[date_col])
    return self._filter_stock_calendar_dates(calendar_category, date_labels)
|
|
601
|
+
|
|
602
|
+
def _get_expected_trade_dates(
    self,
    category: str,
    start_date: Optional[str],
    end_date: Optional[str],
) -> Tuple[Optional[str], List[str]]:
    """
    Return the calendar of ``category`` and its trade dates within a range.

    Returns:
        ``(calendar_category, dates)``, where ``dates`` are ``YYYY-MM-DD``
        strings limited to the inclusive ``start_date`` and ``end_date``
        bounds that were given. ``(None, [])`` when the category has no
        calendar configured.
    """
    calendar_category = self._resolve_calendar_category(category)
    if not calendar_category:
        return None, []

    calendar_name = str(calendar_category)
    trade_dates = self.get_calendar_trade_dates(calendar_name)

    # ISO date strings compare correctly as plain text.
    if start_date:
        lower = pd.to_datetime(start_date).strftime("%Y-%m-%d")
        trade_dates = [day for day in trade_dates if day >= lower]
    if end_date:
        upper = pd.to_datetime(end_date).strftime("%Y-%m-%d")
        trade_dates = [day for day in trade_dates if day <= upper]
    return calendar_name, trade_dates
|
|
620
|
+
|
|
621
|
+
def _missing_trade_dates_from_actual(
    self,
    category: str,
    start_date: Optional[str],
    end_date: Optional[str],
    actual_dates: Iterable,
) -> Tuple[Optional[str], List[str]]:
    """
    Find the expected trade dates that are absent from ``actual_dates``.

    Returns:
        ``(calendar_category, missing_dates)``. The list is empty when no
        calendar is configured or no dates are expected in the range.
    """
    calendar_category, expected_dates = self._get_expected_trade_dates(category, start_date, end_date)
    if not (calendar_category and expected_dates):
        return calendar_category, []

    present = set(self._normalize_date_strings(actual_dates))
    absent = [day for day in expected_dates if day not in present]
    return calendar_category, absent
|
|
635
|
+
|
|
636
|
+
def _raise_if_pivot_missing_trade_dates(
    self,
    category: str,
    start_date: Optional[str],
    end_date: Optional[str],
    actual_dates: Iterable,
) -> None:
    """
    Fail when any expected trade date is missing from ``actual_dates``.

    Raises:
        LocalStorageError: If at least one expected date is missing.
    """
    result = self._missing_trade_dates_from_actual(
        category=category,
        start_date=start_date,
        end_date=end_date,
        actual_dates=actual_dates,
    )
    missing_dates = result[1]
    if missing_dates:
        raise LocalStorageError() from None
|
|
651
|
+
|
|
652
|
+
def check_pivot_missing_trade_dates(
    self,
    category: str,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
) -> List[str]:
    """
    List the expected trade dates that the stored pivot data lacks.

    The pivot data is read without calendar validation. If nothing is
    stored, every expected date in the range is reported, but only when
    both bounds were given. Otherwise a missing bound defaults to the
    first or last stored date.
    """
    df = self.read_pivot_category(
        category=category,
        start_date=start_date,
        end_date=end_date,
        raise_err=False,
        validate_calendar=False,
        show_progress=False,
    )

    if df.empty:
        if not (start_date and end_date):
            return []
        actual_dates = []
        effective_start, effective_end = start_date, end_date
    else:
        actual_dates = self._normalize_date_strings(df.columns)
        effective_start = start_date or (actual_dates[0] if actual_dates else None)
        effective_end = end_date or (actual_dates[-1] if actual_dates else None)

    _, missing_dates = self._missing_trade_dates_from_actual(
        category=category,
        start_date=effective_start,
        end_date=effective_end,
        actual_dates=actual_dates,
    )
    return missing_dates
|
|
687
|
+
|
|
688
|
+
def read_pivot_category(
    self,
    category: str,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    column_type: str = "str",
    raise_err: bool = True,
    partition_mode: Optional[str] = None,
    show_progress: bool = True,
    validate_calendar: bool = True,
) -> pd.DataFrame:
    """Read the parquet partitions of a pivot category into one DataFrame.

    Partitions are concatenated column-wise, sorted on both axes and then
    restricted to the columns between ``start_date`` and ``end_date``.

    Args:
        category: Pivot category name.
        start_date: Inclusive lower bound. When ``None`` the first daily
            partition is used in daily mode and ``"2000-01-01"`` in monthly
            mode.
        end_date: Inclusive upper bound. When ``None`` the last daily
            partition, or the month end of the last monthly partition, is used.
        column_type: ``"datetime"`` keeps columns as datetimes; any other
            value formats them as ``"%Y-%m-%d"`` strings.
        raise_err: Raise ``CategoryNotFoundError`` when the partition
            directory does not exist instead of returning an empty frame.
        partition_mode: Explicit mode; when it resolves to ``None`` the mode
            is ``"daily"`` if the ``pivot_daily`` directory exists, otherwise
            ``"monthly"``.
        show_progress: Show a progress bar while reading.
        validate_calendar: Run ``_raise_if_pivot_missing_trade_dates`` on the
            outcome (including most empty outcomes).

    Returns:
        The selected columns, or an empty DataFrame when nothing matches.

    Raises:
        CategoryNotFoundError: Directory missing and ``raise_err`` is true.
        LocalStorageError: Propagated from the calendar validation.
    """
    requested_start_date = start_date
    requested_end_date = end_date
    day_format = "%Y-%m-%d"

    def _validated_empty(current_start, current_end):
        # Shared "nothing to return" exit: validate against the caller's
        # bounds first (falling back to the effective ones), then hand back
        # an empty frame.
        if validate_calendar:
            self._raise_if_pivot_missing_trade_dates(
                category,
                requested_start_date or current_start,
                requested_end_date or current_end,
                [],
            )
        return pd.DataFrame()

    partition_mode = self._resolve_partition_mode(category, explicit_partition_mode=partition_mode)
    if partition_mode is None:
        # Auto-detect the layout from the presence of the daily directory.
        daily_dir = self._get_typed_category_path(category, "pivot_daily", raise_err=False)
        partition_mode = "daily" if os.path.exists(daily_dir) else "monthly"

    is_daily = partition_mode == "daily"
    directory = self._get_typed_category_path(
        category,
        "pivot_daily" if is_daily else "pivot_monthly",
        raise_err=False,
    )
    if not os.path.exists(directory):
        empty = _validated_empty(start_date, end_date)
        if raise_err:
            raise CategoryNotFoundError() from None
        return empty

    parquet_names = [name for name in os.listdir(directory) if name.endswith(".parquet")]

    if is_daily:
        name_by_day = {}
        for name in parquet_names:
            y, m, d = self.ExtractYearMonthDay(name)
            if y is not None:
                name_by_day[f"{y}-{m}-{d}"] = name
        available = sorted(name_by_day)

        if not available:
            self.logger.info("数据不存在 %s", category)
            return pd.DataFrame()

        if start_date is None:
            start_date = available[0]
        if end_date is None:
            end_date = available[-1]

        # Partition keys and bounds are compared as strings.
        selected = [day for day in available if start_date <= day <= end_date]
        if not selected:
            self.logger.info("数据不存在 %s: [%s, %s]", category, start_date, end_date)
            return _validated_empty(start_date, end_date)

        tasks = [[os.path.join(directory, name_by_day[day])] for day in selected]
    else:
        month_pattern = re.compile(r"(\d{4})-(\d{2})\.parquet$")
        hits = (month_pattern.search(name) for name in parquet_names)
        available = sorted({f"{hit.group(1)}-{hit.group(2)}" for hit in hits if hit})

        if not available:
            self.logger.info("数据不存在 %s", category)
            return pd.DataFrame()

        if start_date is None:
            start_date = "2000-01-01"
        if end_date is None:
            end_date = (pd.Timestamp(available[-1]) + pd.offsets.MonthEnd(0)).strftime("%Y-%m-%d")

        first_month = pd.to_datetime(start_date).replace(day=1)
        last_month = pd.to_datetime(end_date).replace(day=1)
        wanted = pd.date_range(start=first_month, end=last_month, freq="MS").strftime("%Y-%m").tolist()
        selected = sorted(set(wanted) & set(available))

        if not selected:
            self.logger.info("数据不存在 %s: [%s, %s]", category, start_date, end_date)
            return _validated_empty(start_date, end_date)

        tasks = [[os.path.join(directory, f"{category}={ms}.parquet")] for ms in selected]

    results = self.RunConcurrently(
        tasks=tasks,
        worker_function=ParquetDataManager.ReadParquet,
        max_workers=self.max_workers,
        task_desc=f"Reading {category}" if show_progress else None,
    )

    frames = [item for item in results if isinstance(item, pd.DataFrame) and not item.empty]
    if not frames:
        return _validated_empty(start_date, end_date)

    merged = pd.concat(frames, axis=1)
    merged = self._drop_duplicate_index_columns(merged)

    as_datetime = column_type == "datetime"
    parsed_columns = pd.to_datetime(merged.columns)
    merged.columns = parsed_columns if as_datetime else parsed_columns.strftime(day_format)
    merged.sort_index(axis=0, inplace=True)
    merged.sort_index(axis=1, inplace=True)

    def _bound(value):
        # Bounds use the same representation as the columns they filter.
        stamp = pd.to_datetime(value)
        return stamp if as_datetime else stamp.strftime(day_format)

    lower = _bound(start_date)
    upper = _bound(end_date)
    result_df = merged.loc[:, (merged.columns >= lower) & (merged.columns <= upper)]

    if validate_calendar:
        actual_dates = self._normalize_date_strings(result_df.columns)
        validation_start = requested_start_date or (actual_dates[0] if actual_dates else start_date)
        validation_end = requested_end_date or (actual_dates[-1] if actual_dates else end_date)
        self._raise_if_pivot_missing_trade_dates(category, validation_start, validation_end, result_df.columns)
    return result_df
|
|
838
|
+
|
|
839
|
+
def ReadPivotCategoryLastNDays(
    self,
    category: str,
    n_days: int = 20,
    column_type: str = "datetime",
    raise_err: bool = True,
) -> pd.DataFrame:
    """Return the last ``n_days`` date columns of a pivot category.

    The read window ends at the category's latest date and starts
    ``n_days * 3`` calendar days earlier; the trailing ``n_days`` columns of
    whatever that window contains are returned.

    Args:
        category: Pivot category name.
        n_days: Number of trailing columns wanted. Non-positive values yield
            an empty DataFrame.
        column_type: Forwarded to ``read_pivot_category``.
        raise_err: Forwarded to ``read_pivot_category``.

    Returns:
        A DataFrame with at most ``n_days`` columns, or an empty DataFrame
        when the category has no latest date or nothing could be read.
    """
    if n_days <= 0:
        # ``df.iloc[:, -0:]`` selects *every* column, so a zero request would
        # otherwise return data instead of nothing.
        return pd.DataFrame()

    latest_date = self.GetCategoryLatestDate(category)
    if latest_date is None:
        return pd.DataFrame()

    latest = pd.Timestamp(latest_date)
    # Over-fetch by a factor of three calendar days to leave room for
    # non-trading days inside the window.
    start_date = (latest - pd.Timedelta(days=n_days * 3)).strftime("%Y-%m-%d")
    df = self.read_pivot_category(
        category=category,
        start_date=start_date,
        end_date=latest.strftime("%Y-%m-%d"),
        column_type=column_type,
        raise_err=raise_err,
    )
    if df.empty:
        return df
    return df.iloc[:, -n_days:]
|
|
861
|
+
|
|
862
|
+
@staticmethod
|
|
863
|
+
def ReadParquet(file_path: str) -> pd.DataFrame:
    """Load the parquet file at ``file_path`` with ``pd.read_parquet``."""
    frame = pd.read_parquet(file_path)
    return frame
|
|
865
|
+
|
|
866
|
+
def update_unstack_category(
    self,
    category: str,
    sub_df: pd.DataFrame,
    duplicate_keys: Optional[Iterable[str]] = None,
    key_columns: Optional[Iterable[str]] = None,
) -> None:
    """Merge ``sub_df`` into the single parquet file of an unstack category.

    Category metadata is ensured first (type ``"unstack"``, partition mode
    ``"full"``). New rows are de-duplicated, merged with any stored file and
    written back through ``DFSaveToFile``. Nothing is written when ``sub_df``
    is ``None``/empty or equals the stored frame.

    Args:
        category: Unstack category name.
        sub_df: Rows to add; ``None`` or an empty frame skips the update.
        duplicate_keys: De-duplication key columns (takes precedence).
        key_columns: Alternative spelling used when ``duplicate_keys`` is
            ``None``. When both are ``None`` the keys come from
            ``_resolve_duplicate_keys``.
    """
    # Precedence: duplicate_keys, then key_columns, then the category default.
    requested_keys = key_columns if duplicate_keys is None else duplicate_keys
    if requested_keys is None:
        requested_keys = self._resolve_duplicate_keys(category)

    resolved_key_columns = None
    if sub_df is not None:
        resolved_key_columns = self.ResolveUnstackKeyColumns(sub_df, key_columns=requested_keys)

    self.ensure_category_meta(
        category,
        category_type="unstack",
        partition_mode="full",
        duplicate_keys=resolved_key_columns,
    )

    if sub_df is None or sub_df.empty:
        self.logger.info("%s: sub_df is empty, skip update.", category)
        return

    os.makedirs(self._get_typed_category_path(category, "unstack"), exist_ok=True)
    file_path = self._get_unstack_file_path(category)
    incoming = self.NormalizeUnstackDf(sub_df, key_columns=resolved_key_columns)

    if not os.path.exists(file_path):
        # First write for this category.
        DFSaveToFile(incoming, file_path, debug=False)
        self.logger.info("%s: 初始化 Unstack Shape=%s", file_path, incoming.shape)
        return

    stored = pd.read_parquet(file_path)
    if stored.equals(incoming):
        self.logger.info("%s: 无需更新,DataFrames identical.", file_path)
        return

    merged = self.UpdateUnstackDf(stored, incoming, duplicate_keys=resolved_key_columns)
    DFSaveToFile(merged, file_path, debug=False)
    self.logger.info("%s: 更新 Unstack,Pre=%s After=%s", file_path, stored.shape, merged.shape)
|
|
910
|
+
|
|
911
|
+
def read_unstack_category(self, category: str, raise_err: bool = True) -> pd.DataFrame:
    """Read the parquet file of an unstack category.

    Args:
        category: Unstack category name.
        raise_err: When true, any read failure propagates to the caller.
            When false, the failure is logged as a warning and an empty
            DataFrame is returned instead.

    Returns:
        The stored DataFrame, or an empty one on a tolerated failure.
    """
    file_path = self._get_unstack_file_path(category)
    try:
        return pd.read_parquet(file_path)
    except Exception:
        if raise_err:
            raise
        self.logger.warning("数据暂时不可用,请稍后重试")
        return pd.DataFrame()
|
|
921
|
+
|
|
922
|
+
@staticmethod
|
|
923
|
+
def ResolveUnstackKeyColumns(
    df: pd.DataFrame,
    key_columns: Optional[Iterable[str]] = None,
) -> List[str]:
    """Work out which columns of ``df`` act as de-duplication keys.

    Resolution order:
      1. ``key_columns`` when given (a comma-separated string or an iterable).
      2. ``df.columns.name`` when it starts with ``"keys:"``; the remainder is
         parsed as a comma-separated list.
      3. Otherwise every column of ``df``.

    Names are stripped of surrounding whitespace and blank entries dropped.

    Raises:
        LocalStorageError: When a resolved key is not a column of ``df``.
    """
    def _split(text):
        return [part.strip() for part in text.split(",") if part.strip()]

    candidates = key_columns
    if candidates is None:
        label = df.columns.name
        if label is None or not str(label).startswith("keys:"):
            return list(df.columns)
        # Drop the 5-character "keys:" prefix before splitting.
        candidates = _split(str(label)[5:])

    if isinstance(candidates, str):
        resolved = _split(candidates)
    else:
        resolved = [str(item).strip() for item in candidates if str(item).strip()]

    if any(name not in df.columns for name in resolved):
        raise LocalStorageError()

    return resolved
|
|
946
|
+
|
|
947
|
+
@staticmethod
|
|
948
|
+
def NormalizeUnstackDf(
    df: pd.DataFrame,
    key_columns: Optional[Iterable[str]] = None,
) -> pd.DataFrame:
    """Return ``df`` with duplicate rows removed, keeping the last occurrence.

    Duplicates are judged on the columns resolved by
    ``ResolveUnstackKeyColumns``; when that yields no columns, whole rows are
    compared. ``None`` and empty frames are returned unchanged.
    """
    if df is None or df.empty:
        return df

    working = df.copy()
    keys = ParquetDataManager.ResolveUnstackKeyColumns(working, key_columns=key_columns)
    # ``subset=None`` makes pandas compare entire rows, matching the no-key case.
    return working.drop_duplicates(subset=keys or None, keep="last")
|
|
962
|
+
|
|
963
|
+
@staticmethod
|
|
964
|
+
def UpdateUnstackDf(
    original_df: pd.DataFrame,
    new_data: pd.DataFrame,
    duplicate_keys: Optional[Iterable[str]] = None,
    key_columns: Optional[Iterable[str]] = None,
) -> pd.DataFrame:
    """Append ``new_data`` to ``original_df`` and drop duplicate rows.

    The two frames are concatenated row-wise and, for duplicated keys, the
    last occurrence (i.e. the newer row) is kept.

    How the de-duplication keys are chosen:
      - ``duplicate_keys`` / ``key_columns`` win when passed explicitly
        (``duplicate_keys`` first).
      - otherwise a ``columns.name`` starting with ``"keys:"`` is parsed,
        e.g. ``"keys:股票代码,计入日期"``.
      - otherwise all columns are compared.
    """
    combined = pd.concat([original_df, new_data], axis=0)
    requested = key_columns if duplicate_keys is None else duplicate_keys
    resolved = ParquetDataManager.ResolveUnstackKeyColumns(combined, key_columns=requested)
    # An empty key list falls back to whole-row comparison.
    subset = resolved if len(resolved) > 0 else None
    return combined.drop_duplicates(subset=subset, keep="last")
|
|
990
|
+
|
|
991
|
+
@staticmethod
|
|
992
|
+
def RunConcurrently(
    tasks: List[list],
    worker_function: Callable,
    max_workers: int = 5,
    task_desc: Optional[str] = "Processing tasks",
    fail_fast: bool = True,
) -> List:
    """Run ``worker_function(*task)`` for every task on a thread pool.

    Args:
        tasks: One positional-argument list per call.
        worker_function: Callable executed for each task.
        max_workers: Thread pool size.
        task_desc: Progress bar label; ``None`` disables the progress bar.
        fail_fast: When true, the first worker exception is re-raised and
            tasks that have not started yet are cancelled. When false,
            failures are tolerated and their result slot stays ``None``.

    Returns:
        Results in the same order as ``tasks`` (not completion order). An
        empty list when there are no tasks.

    Raises:
        Exception: The first worker exception observed, if ``fail_fast``.
    """
    if tasks is None or len(tasks) == 0:
        return []

    results = [None] * len(tasks)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_idx = {
            executor.submit(worker_function, *task): idx
            for idx, task in enumerate(tasks)
        }

        with tqdm(total=len(future_to_idx), desc=task_desc, disable=task_desc is None) as pbar:
            for future in as_completed(future_to_idx):
                try:
                    results[future_to_idx[future]] = future.result()
                except Exception:
                    if fail_fast:
                        # Leaving the executor context waits for every queued
                        # task; cancel the ones that have not started so the
                        # error surfaces without running the whole backlog.
                        for pending in future_to_idx:
                            pending.cancel()
                        raise
                    # Tolerant mode: leave this slot as None and carry on.
                finally:
                    pbar.update(1)

    return results
|
|
1027
|
+
|
|
1028
|
+
def GetCategoryLatestDate(self, category: str, partition_mode: Optional[str] = None):
    """Return the newest column date stored for a pivot category.

    Partitions are scanned from newest to oldest; the first partition file
    that exists and holds data decides the answer.

    Args:
        category: Pivot category name.
        partition_mode: Explicit mode; when it resolves to ``None`` the mode
            is ``"daily"`` if the ``pivot_daily`` directory exists, otherwise
            ``"monthly"``.

    Returns:
        The maximum of the partition's columns parsed as datetimes, or
        ``None`` when the category does not exist or has no usable partition.
    """
    if not self._if_exists_pivot_category(category):
        return None

    mode = self._resolve_partition_mode(category, explicit_partition_mode=partition_mode)
    if mode is None:
        # Auto-detect the layout from the presence of the daily directory.
        daily_dir = self._get_typed_category_path(category, "pivot_daily", raise_err=False)
        mode = "daily" if os.path.exists(daily_dir) else "monthly"

    if mode == "daily":
        partitions = self._get_pivot_dates(category)

        def locate(day):
            return self._get_pivot_daily_file_path(category, day)
    else:
        partitions = self._get_pivot_year_month(category)

        def locate(year_month):
            year, month = year_month
            return self._get_pivot_file_path(category, int(year), int(month))

    if len(partitions) == 0:
        return None

    for partition in reversed(partitions):
        candidate = locate(partition)
        if not os.path.exists(candidate):
            continue

        frame = pd.read_parquet(candidate)
        if frame.empty or len(frame.columns) == 0:
            continue

        return pd.to_datetime(frame.columns).max()

    return None
|
|
1075
|
+
|
|
1076
|
+
def get_download_days(self, category: str, template_date: List, iloc: Optional[int] = None) -> List:
    """Work out which template dates still have to be downloaded.

    Args:
        category: Data category name.
        template_date: Template list of dates that should be present.
        iloc: Optional number of trailing columns to drop from the local
            data before comparing (so those dates are fetched again).

    Returns:
        ``[exists_df, update_dates]``: the locally stored frame (after the
        optional trim) and the sorted template dates, as timestamps, that
        are absent from it. An empty template yields
        ``[empty DataFrame, []]``.
    """
    template = [] if template_date is None else list(template_date)
    if not template:
        # min()/max() below would raise on an empty template.
        return [pd.DataFrame(), []]

    category_path = self._get_category_path(category, raise_err=False)

    if os.path.exists(category_path):
        # Gaps are exactly what this method reports, so the read must not
        # raise on them: calendar validation is switched off here.
        exists_df = self.read_pivot_category(
            category=category,
            start_date=min(template),
            end_date=max(template),
            validate_calendar=False,
        )
    else:
        exists_df = pd.DataFrame()

    if iloc:
        exists_df = exists_df.iloc[:, :-iloc]

    existing_dates = set(pd.to_datetime(exists_df.columns)) if not exists_df.empty else set()
    template_dates = set(pd.to_datetime(template))
    update_dates = sorted(template_dates - existing_dates)

    return [exists_df, update_dates]
|
|
1107
|
+
|
|
1108
|
+
def GetCategoryDateCoverage(self, category: str, partition_mode: Optional[str] = None):
    """Summarise the date span and partition count of a pivot category.

    Only the first and the last partition files are read: the earliest
    column of the first and the latest column of the last give the span.

    Args:
        category: Pivot category name.
        partition_mode: Explicit mode; when it resolves to ``None`` the mode
            is ``"daily"`` if the ``pivot_daily`` directory exists, otherwise
            ``"monthly"``.

    Returns:
        A dict with ``"min_date"``, ``"max_date"`` and ``"partition_count"``.
        Dates are ``None`` and the count is 0 when the category is absent or
        has no partitions.
    """
    def _coverage(min_date=None, max_date=None, partition_count=0):
        return {
            "min_date": min_date,
            "max_date": max_date,
            "partition_count": partition_count,
        }

    if not self._if_exists_pivot_category(category):
        return _coverage()

    mode = self._resolve_partition_mode(category, explicit_partition_mode=partition_mode)
    if mode is None:
        # Auto-detect the layout from the presence of the daily directory.
        daily_dir = self._get_typed_category_path(category, "pivot_daily", raise_err=False)
        mode = "daily" if os.path.exists(daily_dir) else "monthly"

    if mode == "daily":
        partitions = self._get_pivot_dates(category)
        if len(partitions) == 0:
            return _coverage()
        first_fp = self._get_pivot_daily_file_path(category, partitions[0])
        last_fp = self._get_pivot_daily_file_path(category, partitions[-1])
    else:
        partitions = self._get_pivot_year_month(category)
        if len(partitions) == 0:
            return _coverage()
        first_y, first_m = partitions[0]
        last_y, last_m = partitions[-1]
        first_fp = self._get_pivot_file_path(category, int(first_y), int(first_m))
        last_fp = self._get_pivot_file_path(category, int(last_y), int(last_m))

    first_columns = pd.read_parquet(first_fp).columns
    last_columns = pd.read_parquet(last_fp).columns

    earliest = pd.to_datetime(first_columns).min() if len(first_columns) > 0 else None
    latest = pd.to_datetime(last_columns).max() if len(last_columns) > 0 else None
    return _coverage(earliest, latest, len(partitions))
|
|
1174
|
+
|
|
1175
|
+
def DeleteCategory(self, category: str) -> None:
    """Remove a category's directory tree, logging what happened.

    A category whose directory does not exist is only logged; no error is
    raised for it.
    """
    target = self._get_category_path(category, raise_err=False)
    if not os.path.exists(target):
        self.logger.info("Category not exists: %s", target)
        return

    shutil.rmtree(target)
    self.logger.info("Deleted category: %s", target)
|
|
1182
|
+
|
|
1183
|
+
def _get_local_trade_dates_until_today(self) -> List[str]:
    """Return locally stored trade dates up to and including today.

    Several calendar categories are tried in a fixed order; the first one
    whose file can be read and yields at least one usable date wins.
    Unreadable or empty files are skipped.

    Returns:
        Sorted, unique ``"%Y-%m-%d"`` strings, or an empty list when no
        calendar is available.
    """
    calendar_categories = (
        "cn_stock_unstack#trade_date",
        "trade_date",
        "us_stock_unstack#trade_date",
        "us_trade_date",
    )
    preferred_date_columns = ("trade_date", "cal_date", "date")
    open_markers = ("1", "True", "true", "交易")
    today = pd.Timestamp(datetime.datetime.now().date())

    for category in calendar_categories:
        file_path = self._get_unstack_file_path(category)
        if not os.path.exists(file_path):
            continue
        try:
            calendar = pd.read_parquet(file_path)
        except Exception:
            # Best effort: an unreadable calendar just means trying the next.
            continue
        if calendar.empty:
            continue

        # Use a well-known date column if present, else the first column.
        date_col = next(
            (name for name in preferred_date_columns if name in calendar.columns),
            calendar.columns[0],
        )

        rows = calendar.copy()
        if "is_open" in rows.columns:
            flag = rows["is_open"]
            if pd.api.types.is_numeric_dtype(flag):
                keep = flag.fillna(0).astype(int) == 1
            else:
                keep = flag.astype(str).str.strip().isin(open_markers)
            rows = rows[keep]

        parsed = pd.to_datetime(rows[date_col].astype(str), errors="coerce")
        parsed = parsed[(parsed.notna()) & (parsed <= today)]
        if parsed.empty:
            continue
        return sorted(parsed.dt.strftime("%Y-%m-%d").unique().tolist())

    return []
|
|
1219
|
+
|
|
1220
|
+
def CheckPivotCategoriesDates(
    self,
    exclude_category_keywords: Optional[Iterable[str]] = None,
) -> pd.DataFrame:
    """Build a per-category health table for local pivot data.

    Every category whose name contains ``"pivot#"`` is read for the last 30
    days (without calendar validation) and summarised.

    Args:
        exclude_category_keywords: Categories containing any of these
            substrings are skipped. A single string is treated as one
            keyword rather than being split into characters.

    Returns:
        A DataFrame indexed by category with the columns ``"Latest Date"``
        (``NaT`` when nothing was read), ``"数据量"`` (row count, 0 when
        nothing was read) and ``"储存大小"`` (size of all files under the
        category directory, sub-directories included, as ``"<n> MB"``),
        sorted by ``"数据量"``.

    Raises:
        LocalStorageError: When a category has duplicated columns or index.
    """
    pivot_categories = [x for x in self._get_all_categories() if "pivot#" in x]
    if exclude_category_keywords:
        if isinstance(exclude_category_keywords, str):
            # tuple("abc") would yield single characters as keywords.
            exclude_keywords = (exclude_category_keywords,)
        else:
            exclude_keywords = tuple(exclude_category_keywords)
        pivot_categories = [
            x for x in pivot_categories
            if not any(keyword in x for keyword in exclude_keywords)
        ]

    # Same window for every category, so compute it once.
    window_start = (datetime.datetime.now() - datetime.timedelta(days=30)).strftime("%Y-%m-%d")

    date_list = []
    data_nums = []
    file_sizes = []

    with tqdm(pivot_categories, desc="检查本地 Pivot 最新日期") as pbar:
        for category in pbar:
            pbar.set_postfix_str(f"last_process: {category}")
            df = self.read_pivot_category(
                category,
                start_date=window_start,
                show_progress=False,
                validate_calendar=False,
            )

            if df.empty:
                date_list.append(pd.NaT)
                data_nums.append(0)
            else:
                if df.columns.duplicated().any():
                    raise LocalStorageError(f"{category} columns duplicated")
                if df.index.duplicated().any():
                    raise LocalStorageError(f"{category} index duplicated")

                date_list.append(pd.to_datetime(df.columns).max())
                data_nums.append(len(df.index))

            category_dir = self._get_category_path(category)
            # Walk the whole tree: partition files may sit in sub-directories
            # of the category directory, which a flat listing would miss.
            total_size_bytes = sum(
                os.path.getsize(os.path.join(root, name))
                for root, _dirs, files in os.walk(category_dir)
                for name in files
            )
            file_sizes.append(f"{round(total_size_bytes / 1024 / 1024, 2)} MB")

    check_df = pd.DataFrame({
        "category": pivot_categories,
        "Latest Date": date_list,
        "数据量": data_nums,
        "储存大小": file_sizes,
    })

    check_df.columns.name = "本地Pivot"
    check_df.set_index("category", inplace=True)
    check_df.sort_values("数据量", inplace=True)
    return check_df
|
|
1278
|
+
|
|
1279
|
+
def ShowCategoriesDateSummary(
    self,
    exclude_pivot_category_keywords: Optional[Iterable[str]] = None,
) -> None:
    """Print a date summary of the local parquet data (pivot and unstack).

    Pivot categories are rated against the local trade calendar when one is
    available, otherwise against calendar days. Unstack categories are
    listed with row count, file size and modification time.

    Args:
        exclude_pivot_category_keywords: Forwarded to
            ``CheckPivotCategoriesDates`` to skip matching pivot categories.
    """
    date_fmt = "%Y-%m-%d"

    print("\n" + "="*100)
    print("本地 Parquet 数据日期汇总")
    print("="*100)

    now = datetime.datetime.now()
    today = now.strftime(date_fmt)
    yesterday = (now - datetime.timedelta(days=1)).strftime(date_fmt)
    trade_dates = self._get_local_trade_dates_until_today()
    # Expected dates come from the trade calendar when available and fall
    # back to plain calendar days otherwise.
    latest_expected = trade_dates[-1] if trade_dates else today
    previous_expected = trade_dates[-2] if len(trade_dates) >= 2 else yesterday
    freshness_label = "交易日" if trade_dates else "自然日"

    # ==================== Pivot data ====================
    pivot_df = self.CheckPivotCategoriesDates(
        exclude_category_keywords=exclude_pivot_category_keywords,
    )

    if not pivot_df.empty:
        print(f"\n📊 Pivot 数据 ({len(pivot_df)} 个):")
        print("-"*100)

        latest_ok = 0
        yesterday_ok = 0
        outdated = 0

        for category, row in pivot_df.iterrows():
            latest_date = row['Latest Date']
            data_count = row['数据量']
            storage_size = row['储存大小']

            if pd.isna(latest_date):
                status = "❌ 无数据"
            else:
                latest_date_str = pd.to_datetime(latest_date).strftime(date_fmt)

                # Freshness counters are evaluated independently of the
                # status text so their totals match the per-row tests.
                if latest_date_str >= latest_expected:
                    latest_ok += 1
                if latest_date_str == previous_expected:
                    yesterday_ok += 1
                if latest_date_str < previous_expected:
                    outdated += 1

                if latest_date_str >= latest_expected:
                    status = f"✅ 最新{freshness_label} ({latest_date_str})"
                elif latest_date_str == previous_expected:
                    status = f"✅ 上一{freshness_label} ({latest_date_str})"
                else:
                    try:
                        if trade_dates:
                            latest_ts = pd.to_datetime(latest_date).normalize()
                            days_behind = sum(pd.to_datetime(x) > latest_ts for x in trade_dates)
                            status = f"⚠️ 滞后 {days_behind} 个交易日 ({latest_date_str})"
                        else:
                            days_behind = (datetime.datetime.now() - pd.to_datetime(latest_date)).days
                            status = f"⚠️ 滞后 {days_behind} 天 ({latest_date_str})"
                    except Exception:
                        # The lag is informational only; fall back to the
                        # bare date if it cannot be computed.
                        status = f"⚠️ {latest_date_str}"

            print(f" {category:<50} | {status:<25} | 数据量: {data_count:<6} | {storage_size}")

        # Pivot statistics
        print("-"*100)
        print(f"Pivot 数据新鲜度({freshness_label}): ✅ 最新 {latest_ok} 个 | ✅ 上一日 {yesterday_ok} 个 | ⚠️ 滞后 {outdated} 个")
    else:
        print("\n📊 Pivot 数据: 未找到")

    # ==================== Unstack data ====================
    unstack_categories = [x for x in self._get_all_categories() if "unstack#" in x]

    if unstack_categories:
        print(f"\n📋 Unstack 数据 ({len(unstack_categories)} 个):")
        print("-"*100)

        for category in unstack_categories:
            file_path = self._get_unstack_file_path(category)
            if not os.path.exists(file_path):
                print(f" {category:<50} | ❌ 文件不存在")
                continue

            file_size_mb = round(os.path.getsize(file_path) / 1024 / 1024, 2)
            mod_time = datetime.datetime.fromtimestamp(os.path.getmtime(file_path)).strftime("%Y-%m-%d %H:%M")

            # Try to read the file to obtain the row count; -1 marks a file
            # that could not be read.
            try:
                row_count = len(pd.read_parquet(file_path))
            except Exception:
                row_count = -1

            print(f" {category:<50} | 行数: {row_count:<8} | {file_size_mb:>8} MB | 更新于: {mod_time}")
    else:
        print("\n📋 Unstack 数据: 未找到")

    print("="*100 + "\n")
|