quantvn 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of quantvn might be problematic. Click here for more details.

@@ -0,0 +1,904 @@
1
+
2
+ import time, random, requests
3
+ import pandas as pd
4
+ import numpy as np
5
+
6
+ DEFAULT_TIMEOUT = 25
7
+
8
+ if "_CACHE" not in globals():
9
+ _CACHE = {}
10
+
11
def _fetch_price_df(symbol: str, timeframe: str = "h") -> pd.DataFrame:
    """Fetch the price history of *symbol* and normalise its column names.

    The frame is obtained from ``quantvn.vn.data.stocks.get_hist`` (imported
    lazily). The columns located by ``_detect_price_columns`` are renamed to
    ``Date``, ``time``, ``Open``, ``High``, ``Low``, ``Close`` and ``Volume``.

    Raises:
        ImportError: if ``quantvn.vn.data`` cannot be imported.
        KeyError: propagated from ``_detect_price_columns`` when a required
            column is missing.
    """
    try:
        from quantvn.vn.data import stocks
    except Exception as exc:
        raise ImportError("Cần quantvn + client(apikey=...) để lấy giá.") from exc

    history = stocks.get_hist(symbol, timeframe)
    found = _detect_price_columns(history)

    # role detected in the raw frame -> canonical column name
    canonical = {
        "date": "Date",
        "time": "time",
        "open": "Open",
        "high": "High",
        "low": "Low",
        "close": "Close",
        "volume": "Volume",
    }
    renames = {found[role]: target for role, target in canonical.items()}
    return history.rename(columns=renames)
24
+
25
def _fetch_fund_df(symbol: str) -> pd.DataFrame:
    """Fetch the ``ratio_summary()`` table of *symbol*.

    A ``ticker`` column holding *symbol* is added (on a copy) when the table
    does not already have one.

    Raises:
        ImportError: if ``quantvn.vn.data`` cannot be imported.
        KeyError: if the table lacks a ``year`` or a ``quarter`` column.
    """
    try:
        from quantvn.vn.data import Company
    except Exception as exc:
        raise ImportError("Cần quantvn + client(apikey=...) để lấy ratio_summary().") from exc

    ratios = Company(symbol).ratio_summary()

    if "ticker" not in ratios.columns:
        ratios = ratios.copy()
        ratios["ticker"] = symbol

    has_period_columns = {"year", "quarter"}.issubset(ratios.columns)
    if not has_period_columns:
        raise KeyError("ratio_summary() thiếu cột 'year'/'quarter'.")
    return ratios
36
+
37
+ def _quarter_end_date(year: int, quarter: int) -> pd.Timestamp:
38
+ """Ngày cuối quý theo (year, quarter). Nếu quarter lạ -> dùng 31/12."""
39
+ if pd.isna(year) or pd.isna(quarter):
40
+ return pd.NaT
41
+ y = int(year); q = int(quarter)
42
+ if q == 1: return pd.Timestamp(y, 3, 31)
43
+ elif q == 2: return pd.Timestamp(y, 6, 30)
44
+ elif q == 3: return pd.Timestamp(y, 9, 30)
45
+ elif q == 4: return pd.Timestamp(y, 12, 31)
46
+ else: return pd.Timestamp(y, 12, 31)
47
+
48
def merge_fund_into_price(
    price_df: pd.DataFrame,
    fund_df: pd.DataFrame,
    *,
    price_date_col: str = "Date",
    price_time_col: str = "time",
    ticker_col: str = "ticker",
    quarter_col: str = "quarter",
    year_col: str = "year",
    assume_ticker: str | None = None,
    report_release_lag_days: int = 0,
    drop_all_nan_cols: bool = True
) -> pd.DataFrame:
    """Attach fundamental columns to price rows with a backward as-of merge.

    Every fundamental row becomes effective on the last day of its quarter
    plus ``report_release_lag_days``. Each price row receives the values of
    the most recent fundamental row of the same ticker whose effective date
    is not after the price timestamp. All price columns are kept.

    Args:
        price_df: Price rows; must contain ``price_date_col``.
        fund_df: Fundamental rows; must contain ``ticker_col``, ``year_col``
            and ``quarter_col``.
        price_date_col: Name of the date column in ``price_df``.
        price_time_col: Name of the time-of-day column in ``price_df``. When
            it is ``None`` or absent, only the date is used.
        ticker_col: Name of the ticker column (created on the price side when
            missing).
        quarter_col: Name of the quarter column in ``fund_df``.
        year_col: Name of the year column in ``fund_df``.
        assume_ticker: Ticker written to the price rows when they have no
            ticker column. Defaults to the first ticker found in ``fund_df``,
            or the literal ``"TICKER"`` when there is none.
        report_release_lag_days: Days added to the quarter end to obtain the
            effective date.
        drop_all_nan_cols: Skip fundamental columns that contain only NaN.

    Returns:
        A new frame ordered by (ticker, timestamp) with a fresh integer index.
        It contains the price columns, the helper timestamp column ``dt`` and
        the merged fundamental columns.

    Raises:
        KeyError: if a required column is missing.
        ValueError: propagated from ``pandas.merge_asof`` (for example when a
            price timestamp is missing).
    """
    out = price_df.copy()

    # 1) Timestamp on the price side.
    if price_time_col is not None and price_time_col in out.columns:
        stamp_text = out[price_date_col].astype(str) + " " + out[price_time_col].astype(str)
        out["dt"] = pd.to_datetime(stamp_text)
    else:
        out["dt"] = pd.to_datetime(out[price_date_col])

    # 2) Ticker on the price side (only when missing).
    if ticker_col not in out.columns:
        if assume_ticker is None:
            known = fund_df.get(ticker_col, pd.Series(dtype=object)).dropna().unique().tolist()
            assume_ticker = known[0] if known else "TICKER"
        out[ticker_col] = assume_ticker

    # 3) Prepare the fundamental table: convert value columns to numbers when
    #    the whole column parses, otherwise leave the column untouched.
    f = fund_df.copy()
    for c in f.columns:
        if c in (ticker_col, year_col, quarter_col):
            continue
        if not pd.api.types.is_numeric_dtype(f[c]):
            try:
                f[c] = pd.to_numeric(f[c])
            except (ValueError, TypeError):
                # Same outcome as the deprecated errors="ignore": keep as-is.
                pass

    # 4) Effective date of each report. pd.to_datetime guarantees a datetime
    #    column even when the fundamental table is empty.
    quarter_ends = [_quarter_end_date(y, q) for y, q in zip(f[year_col], f[quarter_col])]
    f["report_date"] = pd.to_datetime(quarter_ends)
    f["effective_date"] = f["report_date"] + pd.to_timedelta(report_release_lag_days or 0, unit="D")

    # 5) Columns to bring over.
    id_cols = {ticker_col, year_col, quarter_col, "report_date", "effective_date"}
    value_cols = [c for c in f.columns if c not in id_cols]
    if drop_all_nan_cols and value_cols:
        has_data = f[value_cols].notna().any(axis=0)
        value_cols = pd.Index(value_cols)[has_data.to_numpy()].tolist()

    # 6) merge_asof needs the *time key* sorted over the whole frame (sorting
    #    by ticker first breaks that as soon as there are several tickers) and
    #    rejects missing keys on the right side, so undated reports are dropped.
    right = (
        f[[ticker_col, "effective_date"] + value_cols]
        .dropna(subset=["effective_date"])
        .sort_values("effective_date", kind="mergesort")
    )
    left = out.sort_values("dt", kind="mergesort")

    merged = pd.merge_asof(
        left,
        right,
        left_on="dt",
        right_on="effective_date",
        by=ticker_col,
        direction="backward",
        allow_exact_matches=True,
    )

    # 7) Drop the helper key and return rows grouped by ticker, then by time.
    merged = merged.drop(columns=["effective_date"], errors="ignore")
    merged = merged.sort_values([ticker_col, "dt"], kind="mergesort").reset_index(drop=True)
    return merged
120
+
121
+ def _ua(source="vietmarket"):
122
+ return {
123
+ "User-Agent": f"{source}/1.0 (+https://example.local)",
124
+ "Accept": "application/json, text/plain, */*",
125
+ "Origin": "https://example.local",
126
+ "Referer": "https://example.local/",
127
+ }
128
+
129
def send_request(url, method="GET", headers=None, params=None, payload=None,
                 retries=2, backoff=(0.6, 1.2), timeout=DEFAULT_TIMEOUT):
    """Send an HTTP request with a small retry loop and decode the response.

    Args:
        url: Target URL.
        method: ``"GET"`` (case-insensitive) issues a GET. Any other value
            issues a POST with ``payload`` sent as the JSON body.
        headers: Extra headers merged over the defaults from ``_ua()``.
        params: Query-string parameters.
        payload: JSON body for non-GET requests.
        retries: Number of additional attempts after the first failure.
            Negative values are treated as 0, so one request is always sent.
        backoff: ``(low, high)`` bounds of the random sleep between attempts.
        timeout: Timeout forwarded to ``requests``.

    Returns:
        The parsed JSON when the response Content-Type contains
        ``application/json``, otherwise the response text.

    Raises:
        requests.RequestException, OSError, ValueError: the error of the last
            attempt, once all attempts are used up. Other exception types
            (programming errors) propagate immediately without retrying.
    """
    request_headers = _ua()
    if headers:
        request_headers.update(headers)

    use_get = method.upper() == "GET"
    # At least one attempt, even for retries <= 0 (previously a negative
    # value skipped the loop and silently returned None).
    attempts = max(0, int(retries)) + 1

    for attempt in range(attempts):
        try:
            if use_get:
                r = requests.get(url, headers=request_headers, params=params, timeout=timeout)
            else:
                r = requests.post(url, headers=request_headers, params=params,
                                  json=payload, timeout=timeout)
            r.raise_for_status()
            if "application/json" in r.headers.get("Content-Type", ""):
                return r.json()
            return r.text
        except (requests.RequestException, OSError, ValueError):
            # Network/HTTP failures and undecodable JSON bodies are retried;
            # the last failure is re-raised unchanged.
            if attempt >= attempts - 1:
                raise
            time.sleep(random.uniform(*backoff))
148
+
149
+
150
def add_all_ta_features(
    df,
    open: str = "Open",
    high: str = "High",
    low: str = "Low",
    close: str = "Close",
    volume: str = "Volume",
    fillna: bool = True,
):
    """
    Add the technical indicators of the `ta` library to a DataFrame.

    Thin wrapper around `ta.add_all_ta_features`: the input is copied, passed
    through `ta.utils.dropna`, and then handed to the upstream function with
    the arguments below forwarded unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the price/volume columns named below.
    open, high, low, close, volume : str
        Column names in `df`.
    fillna : bool
        Forwarded to `ta.add_all_ta_features`.

    Returns
    -------
    pandas.DataFrame
        Whatever `ta.add_all_ta_features` returns for the cleaned copy.

    Raises
    ------
    ImportError
        If the `ta` package cannot be imported.
    """
    try:
        from ta import add_all_ta_features as _upstream_add_all
        from ta.utils import dropna as _upstream_dropna
    except Exception as exc:
        raise ImportError(
            "Thiếu thư viện 'ta'. Hãy cài: pip install ta"
        ) from exc

    # Work on a copy so the caller's frame is never touched by the clean-up.
    cleaned = _upstream_dropna(df.copy())

    column_names = {
        "open": open,
        "high": high,
        "low": low,
        "close": close,
        "volume": volume,
    }
    return _upstream_add_all(cleaned, fillna=fillna, **column_names)
198
+
199
+ def _detect_price_columns(df: pd.DataFrame):
200
+ cmap = {c.lower(): c for c in df.columns}
201
+ def pick(*names):
202
+ for n in names:
203
+ if n.lower() in cmap:
204
+ return cmap[n.lower()]
205
+ return None
206
+ date = pick("Date","date")
207
+ timecol = pick("time","Time","datetime","Datetime")
208
+ open_ = pick("Open","open")
209
+ high = pick("High","high")
210
+ low = pick("Low","low")
211
+ close= pick("Close","close","price")
212
+ vol = pick("Volume","volume","vol")
213
+ if not all([date,timecol,open_,high,low,close,vol]):
214
+ raise KeyError(f"Thiếu cột OHLCV/time. Columns hiện có: {list(df.columns)}")
215
+ return {"date":date,"time":timecol,"open":open_,"high":high,"low":low,"close":close,"volume":vol}
216
+
217
+
218
def _add_yoy_cols(df: pd.DataFrame, group_col: str, value_col: str, lag_map: dict):
    """Return ``(yoy_pct, yoy_abs)`` for *value_col* using each group's own lag.

    ``yoy_abs`` is the current value minus the lagged value. ``yoy_pct`` is
    ``current / lagged - 1`` and NaN where the lagged value is 0.

    NOTE: this module defines ``_add_yoy_cols`` a second time further down.
    That later definition replaces this one when the module is imported.
    """
    current = df[value_col]
    previous = _group_shift(df, group_col, value_col, lag_map)

    # Silence divide-by-zero / invalid warnings; those positions become NaN.
    with np.errstate(divide="ignore", invalid="ignore"):
        ratio = (current / previous) - 1.0
        pct = np.where(previous != 0, ratio, np.nan)

    pct_series = pd.Series(pct, index=df.index, dtype="float64")
    abs_series = pd.Series(current - previous, index=df.index, dtype="float64")
    return pct_series, abs_series
225
+
226
+ def _rolling_monotonic_flag(s: pd.Series, window: int, increasing=True) -> pd.Series:
227
+ """1.0 nếu chuỗi trong 'window' kỳ gần nhất đơn điệu tăng/giảm (strict)."""
228
+ def _chk(x):
229
+ v = pd.Series(x)
230
+ if v.isna().any():
231
+ return np.nan
232
+ dif = v.diff().dropna()
233
+ return float((dif > 0).all()) if increasing else float((dif < 0).all())
234
+ return s.rolling(window=window, min_periods=window).apply(_chk, raw=False)
235
+
236
+ def _last_n_increasing(s: pd.Series, n: int) -> pd.Series:
237
+ d = s.diff()
238
+ return (d.rolling(n, min_periods=n)
239
+ .apply(lambda x: float(np.all(np.array(x) > 0)), raw=True)
240
+ .astype("float"))
241
+
242
+ def _stable_positive_series(s: pd.Series, window: int, cv_tol=0.25) -> pd.Series:
243
+ """1.0 nếu trong 'window' kỳ: tất cả >0 và hệ số biến thiên (CV) <= cv_tol."""
244
+ def _chk(x):
245
+ v = pd.Series(x)
246
+ if v.isna().any() or (v <= 0).any():
247
+ return 0.0
248
+ m = v.mean()
249
+ if m == 0:
250
+ return 0.0
251
+ std = v.std(ddof=0)
252
+ return float((std / m) <= cv_tol)
253
+ return s.rolling(window=window, min_periods=window).apply(_chk, raw=False)
254
+
255
def _add_fund_features_bank_schema(
    df: pd.DataFrame,
    *,
    ticker_col: str = "ticker",
    year_col: str = "year",
    quarter_col: str = "quarter",
    stable_div_years: int = 3,
    enable_capex_proxy_for_F36: bool = True,
) -> pd.DataFrame:
    """
    Compute year-over-year columns and ``FUND_21`` .. ``FUND_60`` flag columns.

    The function works on a copy. For every known ratio column that is
    present it adds ``<col>_yoy`` and ``<col>_yoy_abs``, then creates all
    ``FUND_21`` .. ``FUND_60`` columns (NaN by default) and fills the flags
    whose inputs are available. Missing input columns are skipped silently.

    Rows are NOT re-ordered here: lagged values are taken in the row order of
    each ticker, so the caller is expected to pass rows already in
    chronological order within every ticker.

    Args:
        df: Fundamental rows with ``ticker_col``, ``year_col`` and
            ``quarter_col``.
        ticker_col: Name of the ticker column.
        year_col: Name of the year column.
        quarter_col: Name of the quarter column.
        stable_div_years: Number of years the dividend must be stable for
            ``FUND_60``.
        enable_capex_proxy_for_F36: When there is no capex column, allow the
            investing cash flow to stand in for it in ``FUND_36``.

    Returns:
        A new DataFrame with the added columns; all ``FUND_*`` columns are
        float.
    """
    df = df.copy()

    # Year-over-year lag per ticker (e.g. 4 for quarterly rows, 1 for annual).
    lag_map = {
        key: _infer_yoy_lag(group, year_col, quarter_col)
        for key, group in df.groupby(ticker_col)
    }

    # Logical name -> column name expected in df.
    cols = {
        "eps": "eps",
        "gross_margin": "grossMargin",
        "net_margin": "netMargin",
        "roe": "roe",
        "roa": "roa",
        "debt_to_equity": "debtToEquity",
        "current_ratio": "currentRatio",
        "quick_ratio": "quickRatio",
        "equity_per_share": "bookValuePerShare",
        "asset_turnover": "assetTurnover",
        "days_receivable": "daysOfReceivables",
        "days_inventory": "daysOfInventory",
        "days_payable": "daysOfPayables",
        "charter_capital": "capitalBalance",
        "cash_ratio": "cashRatio",
        # cash-flow / profit columns, used when present
        "profit_after_tax": "profitAfterTax",
        "operating_cashflow": "netCashFromOperating",
        "investing_cashflow": "netCashFromInvesting",
        "financing_cashflow": "netCashFromFinancing",
        "dividend": "dividend",
        "capex": "capexOnFixedAsset",
    }

    # Force the known columns to numeric; unparsable cells become NaN.
    for c in cols.values():
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # Year-over-year columns.
    yoy_targets = [
        "eps", "gross_margin", "net_margin", "roe", "roa",
        "debt_to_equity", "current_ratio", "quick_ratio",
        "equity_per_share", "asset_turnover",
        "days_receivable", "days_inventory", "days_payable",
        "charter_capital", "cash_ratio",
    ]
    for key in yoy_targets:
        col = cols[key]
        if col in df.columns:
            df[f"{col}_yoy"], df[f"{col}_yoy_abs"] = _add_yoy_cols(df, ticker_col, col, lag_map)

    # Create every flag column up front so the output schema is fixed.
    for i in range(21, 61):
        df[f"FUND_{i}"] = np.nan

    # Flags that only look at the sign of a YoY column:
    # (flag column, logical input, True = flag when rising / False = when falling)
    yoy_sign_flags = [
        ("FUND_21", "eps", True),
        ("FUND_22", "gross_margin", True),
        ("FUND_23", "net_margin", True),
        ("FUND_24", "roe", True),
        ("FUND_25", "roa", True),
        ("FUND_32", "debt_to_equity", False),  # lower leverage is the good direction
        ("FUND_34", "current_ratio", True),
        ("FUND_35", "quick_ratio", True),
        ("FUND_40", "asset_turnover", True),
        ("FUND_50", "cash_ratio", True),
    ]
    for flag, key, rising in yoy_sign_flags:
        yoy_col = f"{cols[key]}_yoy"
        if cols[key] in df.columns and yoy_col in df.columns:
            change = df[yoy_col]
            hit = (change > 0) if rising else (change < 0)
            df[flag] = hit.astype("float")

    # FUND_36: capex rose versus the previous row of the SAME ticker. The
    # change is computed per ticker so the first row of a ticker is never
    # compared with the last row of another one. With the proxy enabled, a
    # more negative investing cash flow stands in for higher capex.
    if cols["capex"] in df.columns:
        capex_step = df.groupby(ticker_col)[cols["capex"]].diff()
        df["FUND_36"] = (capex_step > 0).astype("float")
    elif enable_capex_proxy_for_F36 and cols["investing_cashflow"] in df.columns:
        invest_step = df.groupby(ticker_col)[cols["investing_cashflow"]].diff()
        df["FUND_36"] = (invest_step < 0).astype("float")

    # FUND_52: (gross margin x asset turnover) rose year over year. The
    # debt-to-equity column is still required, as before, although it does
    # not enter the computation.
    if all(cols[k] in df.columns for k in ("gross_margin", "asset_turnover", "debt_to_equity")):
        gpa = df[cols["gross_margin"]] * df[cols["asset_turnover"]]
        gpa_frame = pd.DataFrame({"gpa": gpa, "ticker": df[ticker_col]})
        gpa_prev = _group_shift(gpa_frame, "ticker", "gpa", lag_map)
        gpa_yoy = pd.Series(np.nan, index=df.index, dtype="float64")
        m = gpa_prev.notna() & (gpa_prev != 0)
        # Divide by |previous| so the sign of the growth matches the sign of
        # the change even when the previous value is negative.
        gpa_yoy.loc[m] = (gpa.loc[m] - gpa_prev.loc[m]) / gpa_prev.loc[m].abs()
        df["FUND_52"] = (gpa_yoy > 0).astype("float")

    # FUND_58: charter capital increased.
    if cols["charter_capital"] in df.columns:
        cap_abs = df.get(f"{cols['charter_capital']}_yoy_abs", None)
        if cap_abs is None:
            cap_abs = df.groupby(ticker_col)[cols["charter_capital"]].diff()
        df["FUND_58"] = (cap_abs > 0).astype("float")

    # FUND_60: dividend positive and stable over `stable_div_years` years.
    if cols["dividend"] in df.columns:
        fund60 = pd.Series(np.nan, index=df.index, dtype="float64")
        for key, g in df.groupby(ticker_col):
            win = int(stable_div_years * lag_map[key])
            stable = _stable_positive_series(g[cols["dividend"]], window=win, cv_tol=0.25)
            fund60.loc[g.index] = stable.to_numpy()
        df["FUND_60"] = fund60

    # Make sure every flag column is float.
    for c in df.columns:
        if isinstance(c, str) and c.startswith("FUND_"):
            df[c] = df[c].astype("float")

    return df
403
+
404
+ def _finalize_fund_features(
405
+ df: pd.DataFrame,
406
+ *,
407
+ drop_nan_threshold: float = 1.0, # 1.0 = chỉ drop cột ALL-NaN; 0.9 = drop nếu NaN >= 90%
408
+ cast_binary_to_int: bool = True
409
+ ) -> pd.DataFrame:
410
+ """
411
+ - Drop các cột có tỷ lệ NaN >= drop_nan_threshold
412
+ - Đổi tên FUND_* sang tên ngắn gọn (có thể tuỳ biến sau)
413
+ - Tuỳ chọn ép kiểu 0/1 về Int8
414
+ """
415
+ df = df.copy()
416
+ if drop_nan_threshold < 1.0:
417
+ th = float(drop_nan_threshold)
418
+ keep = []
419
+ for c in df.columns:
420
+ if df[c].isna().mean() < th:
421
+ keep.append(c)
422
+ df = df[keep]
423
+
424
+ if cast_binary_to_int:
425
+ for c in [c for c in df.columns if c.startswith("FUND_")]:
426
+ if set(pd.unique(df[c].dropna())).issubset({0.0, 1.0}):
427
+ df[c] = df[c].astype("Int8")
428
+
429
+ return df
430
+
431
def add_all_fund_features(
    df: pd.DataFrame,
    *,
    ticker_col: str = "ticker",
    year_col: str = "year",
    quarter_col: str = "quarter",
    stable_div_years: int = 3,
    enable_capex_proxy_for_F36: bool = True,
    drop_nan_threshold: float = 1.0,
    cast_binary_to_int: bool = True
) -> pd.DataFrame:
    """
    Add the fundamental ``FUND_*`` feature columns to *df*.

    Runs ``_add_fund_features_bank_schema`` and then passes its result through
    ``_finalize_fund_features``. Returns the resulting frame.

    NOTE: this module defines ``add_all_fund_features`` a second time further
    down. That later definition replaces this one when the module is imported.
    """
    schema_options = {
        "ticker_col": ticker_col,
        "year_col": year_col,
        "quarter_col": quarter_col,
        "stable_div_years": stable_div_years,
        "enable_capex_proxy_for_F36": enable_capex_proxy_for_F36,
    }
    featured = _add_fund_features_bank_schema(df, **schema_options)
    return _finalize_fund_features(
        featured,
        drop_nan_threshold=drop_nan_threshold,
        cast_binary_to_int=cast_binary_to_int,
    )
464
+
465
+
466
+
467
+ # =====BEGIN: FUNDAMENTAL FEATURES (auto-imported from Feature.ipynb)=====
468
+ import pandas as pd
469
+ import numpy as np
470
+
471
+ # =========================
472
+ # Helpers (bền vững, không dùng DataFrameGroupBy.apply)
473
+ # =========================
474
+ def _infer_yoy_lag(g: pd.DataFrame, year_col: str, quarter_col: str) -> int:
475
+ """Suy luận số kỳ/năm cho từng ticker (1 nếu dữ liệu theo năm)."""
476
+ cnt = g.groupby(year_col)[quarter_col].nunique()
477
+ if cnt.empty:
478
+ return 1
479
+ mode = cnt.mode().iat[0]
480
+ return int(mode if mode and mode > 0 else 1)
481
+
482
+ def _group_shift(df: pd.DataFrame, group_col: str, value_col: str, lag_map: dict) -> pd.Series:
483
+ """
484
+ Trả về Series 'prev' = value_col dịch theo lag riêng của từng group.
485
+ Tránh dùng groupby.apply để né DeprecationWarning.
486
+ """
487
+ prev = pd.Series(index=df.index, dtype='float64')
488
+ # groups: {key -> index}
489
+ groups = df.groupby(group_col).groups
490
+ for key, idx in groups.items():
491
+ l = lag_map[key]
492
+ prev.loc[idx] = df.loc[idx, value_col].shift(l).to_numpy()
493
+ return prev
494
+
495
def _add_yoy_cols(df: pd.DataFrame, group_col: str, value_col: str, lag_map: dict):
    """Build the year-over-year growth and absolute-change Series of a column.

    Args:
        df: Frame holding ``group_col`` and (normally) ``value_col``.
        group_col: Column identifying the group (ticker).
        value_col: Column to compare with its lagged value.
        lag_map: Group key -> number of rows that make up one year.

    Returns:
        ``(yoy, yoy_abs)``, both float Series aligned with ``df.index``.
        ``yoy_abs`` is current minus lagged. ``yoy`` is that change divided by
        the absolute lagged value, so its sign always matches the direction
        of the change, even when the lagged value is negative. ``yoy`` is NaN
        where the lagged value is missing or 0. Two all-NaN Series are
        returned when ``value_col`` is not a column of ``df``.
    """
    if value_col not in df.columns:
        blank = pd.Series(np.nan, index=df.index, dtype='float64')
        return blank, blank.copy()

    prev = _group_shift(df, group_col, value_col, lag_map)
    yoy_abs = (df[value_col] - prev).astype('float64')

    yoy = pd.Series(np.nan, index=df.index, dtype='float64')
    usable = prev.notna() & (prev != 0)
    # (cur - prev) / |prev| equals cur / prev - 1 for a positive base, but
    # does not flip sign for a negative one (e.g. -2 -> -1 is an increase).
    yoy.loc[usable] = yoy_abs.loc[usable] / prev.loc[usable].abs()
    return yoy, yoy_abs
505
+
506
+ def _rolling_monotonic_flag(s: pd.Series, window: int, increasing=True) -> pd.Series:
507
+ """True nếu chuỗi trong 'window' kỳ gần nhất đơn điệu tăng/giảm (strict)."""
508
+ def _chk(x):
509
+ v = pd.Series(x)
510
+ if v.isna().any():
511
+ return np.nan
512
+ dif = v.diff().dropna()
513
+ return float((dif > 0).all()) if increasing else float((dif < 0).all())
514
+ return s.rolling(window=window, min_periods=window).apply(_chk, raw=False)
515
+
516
+ def _last_n_increasing(s: pd.Series, n: int) -> pd.Series:
517
+ d = s.diff()
518
+ return (d.rolling(n, min_periods=n)
519
+ .apply(lambda x: float(np.all(np.array(x) > 0)), raw=True)
520
+ .astype('float'))
521
+
522
+ def _stable_positive_series(s: pd.Series, window: int, cv_tol=0.25) -> pd.Series:
523
+ """True nếu trong 'window' kỳ: tất cả >0 và CV<=cv_tol."""
524
+ def _chk(x):
525
+ v = pd.Series(x)
526
+ if v.isna().any() or (v <= 0).any():
527
+ return 0.0
528
+ m = v.mean()
529
+ if m == 0:
530
+ return 0.0
531
+ cv = v.std(ddof=0) / m
532
+ return float(cv <= cv_tol)
533
+ return s.rolling(window=window, min_periods=window).apply(_chk, raw=False)
534
+
535
+ # =========================
536
+ # Main (schema ngân hàng)
537
+ # =========================
538
def add_fund_features_bank_schema(
    df: pd.DataFrame,
    *,
    ticker_col: str = 'ticker',
    quarter_col: str = 'quarter',
    year_col: str = 'year',
    stable_div_years: int = 3,
    enable_capex_proxy_for_F36: bool = True
) -> pd.DataFrame:
    """Compute year-over-year columns and the ``FUND_21`` .. ``FUND_60`` flags.

    The rows are sorted by (ticker, year, quarter) and re-indexed from 0, so
    the returned frame does not keep the caller's row order or index; the
    input frame itself is not modified. For every known ratio column that is
    present, ``<col>_yoy`` and ``<col>_yoy_abs`` are added. All ``FUND_21`` ..
    ``FUND_60`` columns are then created (NaN by default) and the ones whose
    inputs exist are filled. Missing input columns are skipped silently.

    Flags filled here (1.0 = condition holds):
        FUND_25 EPS up YoY; FUND_26 EPS strictly rising over 4 rows;
        FUND_27 / FUND_28 gross / net margin rising for 2 changes in a row;
        FUND_33 cash ratio up YoY; FUND_36 capex > 0;
        FUND_37 debt/equity strictly falling over 4 rows;
        FUND_38 current ratio > 1.5 and up YoY; FUND_44 same with > 1.0;
        FUND_39 quick ratio > 1.0 and up YoY; FUND_40 interest coverage > 3;
        FUND_42 book value per share up YoY;
        FUND_45 ROE > 0.15 and up YoY; FUND_46 ROA > 0.08 and up YoY;
        FUND_47 asset turnover up YoY;
        FUND_48 / FUND_49 / FUND_50 days inventory / receivable / payable
        down YoY; FUND_52 gross margin x asset turnover up YoY;
        FUND_58 charter capital up YoY; FUND_60 dividend positive and stable.

    Args:
        df: Fundamental rows with ``ticker_col``, ``year_col``, ``quarter_col``.
        ticker_col: Name of the ticker column.
        quarter_col: Name of the quarter column.
        year_col: Name of the year column.
        stable_div_years: Years of stable dividend required for ``FUND_60``.
        enable_capex_proxy_for_F36: Whether ``FUND_36`` is computed at all.

    Returns:
        A new DataFrame with the added columns; all ``FUND_*`` columns are
        float.
    """
    # sort_values returns a new frame, so the caller's data is left untouched.
    df = df.sort_values([ticker_col, year_col, quarter_col]).reset_index(drop=True)

    # Rows per year for each ticker (explicit loop, no groupby.apply).
    lag_map = {
        ticker: _infer_yoy_lag(group[[year_col, quarter_col]], year_col, quarter_col)
        for ticker, group in df.groupby(ticker_col)
    }

    # Logical name -> column name expected in df.
    cols = {
        'eps': 'earningPerShare',
        'gross_margin': 'grossProfitMargin',
        'net_margin': 'postTaxMargin',
        'roe': 'roe',
        'roa': 'roa',
        'debt_to_equity': 'debtOnEquity',
        'current_ratio': 'currentPayment',
        'quick_ratio': 'quickPayment',
        'interest_coverage': 'ebitOnInterest',
        'equity_per_share': 'bookValuePerShare',
        'asset_turnover': 'revenueOnAsset',
        'days_receivable': 'daysReceivable',
        'days_inventory': 'daysInventory',
        'days_payable': 'daysPayable',
        'charter_capital': 'capitalBalance',
        'cash_ratio': 'cashOnEquity',
        'dividend': 'dividend',
        'capex': 'capexOnFixedAsset',
    }

    def _has(key):
        return cols[key] in df.columns

    def _per_ticker(key, fn):
        # Apply a Series -> Series function to one column, ticker by ticker.
        return df.groupby(ticker_col, group_keys=False)[cols[key]].apply(fn)

    # Force the known columns to numeric; unparsable cells become NaN.
    for c in cols.values():
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce')

    # Year-over-year columns.
    yoy_targets = [
        'eps', 'gross_margin', 'net_margin', 'roe', 'roa',
        'debt_to_equity', 'current_ratio', 'quick_ratio',
        'equity_per_share', 'asset_turnover',
        'days_receivable', 'days_inventory', 'days_payable',
        'charter_capital', 'cash_ratio',
    ]
    for key in yoy_targets:
        if _has(key):
            col = cols[key]
            df[f'{col}_yoy'], df[f'{col}_yoy_abs'] = _add_yoy_cols(df, ticker_col, col, lag_map)

    # Create every flag column up front so the output schema is fixed.
    for i in range(21, 61):
        df[f'FUND_{i}'] = np.nan

    # --- value rose year over year -------------------------------------
    for flag, key in [
        ('FUND_25', 'eps'),
        ('FUND_33', 'cash_ratio'),
        ('FUND_42', 'equity_per_share'),
        ('FUND_47', 'asset_turnover'),
    ]:
        if _has(key):
            df[flag] = (df[f'{cols[key]}_yoy'] > 0).astype('float')

    # --- level above a floor AND rose year over year --------------------
    for flag, key, floor in [
        ('FUND_38', 'current_ratio', 1.5),
        ('FUND_44', 'current_ratio', 1.0),
        ('FUND_39', 'quick_ratio', 1.0),
        ('FUND_45', 'roe', 0.15),
        ('FUND_46', 'roa', 0.08),
    ]:
        if _has(key):
            col = cols[key]
            df[flag] = ((df[col] > floor) & (df[f'{col}_yoy'] > 0)).astype('float')

    # --- absolute value fell year over year ----------------------------
    for flag, key in [
        ('FUND_48', 'days_inventory'),
        ('FUND_49', 'days_receivable'),
        ('FUND_50', 'days_payable'),
    ]:
        if _has(key):
            df[flag] = (df[f'{cols[key]}_yoy_abs'] < 0).astype('float')

    # --- trends over consecutive rows ----------------------------------
    if _has('eps'):
        df['FUND_26'] = _per_ticker(
            'eps', lambda s: _rolling_monotonic_flag(s, window=4, increasing=True))
    if _has('gross_margin'):
        df['FUND_27'] = _per_ticker('gross_margin', lambda s: _last_n_increasing(s, n=2))
    if _has('net_margin'):
        df['FUND_28'] = _per_ticker('net_margin', lambda s: _last_n_increasing(s, n=2))
    if _has('debt_to_equity'):
        df['FUND_37'] = _per_ticker(
            'debt_to_equity', lambda s: _rolling_monotonic_flag(s, window=4, increasing=False))

    # --- plain level checks --------------------------------------------
    if enable_capex_proxy_for_F36 and _has('capex'):
        df['FUND_36'] = (df[cols['capex']] > 0).astype('float')
    if _has('interest_coverage'):
        df['FUND_40'] = (df[cols['interest_coverage']] > 3).astype('float')

    # FUND_52: (gross margin x asset turnover) rose year over year.
    if _has('gross_margin') and _has('asset_turnover'):
        gpa = (df[cols['gross_margin']] * df[cols['asset_turnover']]).astype('float64')
        gpa_frame = pd.DataFrame({'gpa': gpa, 'ticker': df[ticker_col]})
        gpa_prev = _group_shift(gpa_frame, 'ticker', 'gpa', lag_map)
        gpa_yoy = pd.Series(np.nan, index=df.index, dtype='float64')
        m = gpa_prev.notna() & (gpa_prev != 0)
        # Divide by |previous| so a negative base does not flip the sign.
        gpa_yoy.loc[m] = (gpa.loc[m] - gpa_prev.loc[m]) / gpa_prev.loc[m].abs()
        df['FUND_52'] = (gpa_yoy > 0).astype('float')

    # FUND_58: charter capital increased year over year.
    if _has('charter_capital'):
        cap_abs = df.get(f"{cols['charter_capital']}_yoy_abs", None)
        if cap_abs is None:
            cap_abs = df.groupby(ticker_col)[cols['charter_capital']].diff()
        df['FUND_58'] = (cap_abs > 0).astype('float')

    # FUND_60: dividend positive and stable for `stable_div_years` years.
    if _has('dividend'):
        fund60 = pd.Series(np.nan, index=df.index, dtype='float64')
        for key, g in df.groupby(ticker_col):
            # rolling() needs an integer window, even if stable_div_years is fractional
            win = int(stable_div_years * lag_map[key])
            stable = _stable_positive_series(g[cols['dividend']], window=win, cv_tol=0.25)
            fund60.loc[g.index] = stable.to_numpy()
        df['FUND_60'] = fund60

    # Make sure every flag column is float.
    for c in [c for c in df.columns if c.startswith('FUND_')]:
        df[c] = df[c].astype('float')

    return df
701
+
702
+
703
+ import pandas as pd
704
+ import numpy as np
705
+
706
def finalize_fund_features(
    df: pd.DataFrame,
    *,
    drop_nan_threshold: float = 1.0,   # 1.0 = drop only ALL-NaN columns; 0.9 = drop when NaN share >= 90%
    cast_binary_to_int: bool = True
) -> pd.DataFrame:
    """
    Clean up and rename the ``FUND_*`` columns of a feature frame.

    - Drops every column whose share of NaN values is >= ``drop_nan_threshold``.
    - Renames the remaining known ``FUND_*`` columns to short descriptive names.
    - Optionally converts the renamed flag columns to the nullable ``Int8``
      dtype (0 / 1 / NA).

    The input frame is not modified.

    Args:
        df: Frame produced by the FUND feature computation.
        drop_nan_threshold: NaN share at or above which a column is dropped.
        cast_binary_to_int: Convert the renamed flag columns to ``Int8``.

    Returns:
        A new DataFrame.
    """
    out = df.copy()

    # 1) Drop columns by NaN share.
    nan_share = out.isna().mean()
    too_sparse = nan_share.index[nan_share >= drop_nan_threshold].tolist()
    if too_sparse:
        out = out.drop(columns=too_sparse)

    # 2) Rename the FUND_* columns.
    rename_map = {
        # EPS & margins
        "FUND_25": "eps_yoy_up",            # EPS up YoY
        "FUND_26": "eps_up_4p",             # EPS rising over 4 periods
        "FUND_27": "gm_up_2p",              # gross margin rising >= 2 periods
        "FUND_28": "nm_up_2p",              # net margin rising >= 2 periods

        # Liquidity / cash / capex
        "FUND_33": "cash_on_equity_yoy_up", # cash / equity up YoY (proxy)
        "FUND_36": "capex_pos",             # capex > 0 (proxy for expansion)

        # Leverage & coverage
        "FUND_37": "de_ratio_down_4p",      # debt/equity falling over 4 periods
        "FUND_38": "curr_gt1_5_yoy_up",     # current ratio > 1.5 and up YoY
        "FUND_39": "quick_gt1_yoy_up",      # quick ratio > 1 and up YoY
        "FUND_40": "int_cov_gt3",           # interest coverage > 3

        # Equity & returns
        "FUND_42": "bvps_yoy_up",           # BVPS up YoY (proxy for equity)
        "FUND_44": "wc_pos_yoy_up",         # working capital positive and improving (proxy)
        "FUND_45": "roe_gt15_yoy_up",       # ROE > 15% and up YoY
        "FUND_46": "roa_gt8_yoy_up",        # ROA > 8% and up YoY

        # Efficiency & working-capital cycles
        "FUND_47": "asset_turnover_yoy_up", # revenue / assets up YoY
        "FUND_48": "days_inv_yoy_down",     # days inventory down YoY
        "FUND_49": "days_rec_yoy_down",     # days receivable down YoY
        "FUND_50": "days_pay_yoy_down",     # days payable down YoY (assumed improvement)

        # Profitability vs assets
        "FUND_52": "gpa_yoy_up",            # gross margin x asset turnover up YoY

        # Capital / dividend
        "FUND_58": "charter_cap_yoy_up",    # charter capital up YoY (capitalBalance)
        "FUND_60": "div_stable_geNyrs",     # dividend stable >= N years (CV <= 0.25 and > 0)
    }
    # Only rename the columns that survived step 1.
    present = {old: new for old, new in rename_map.items() if old in out.columns}
    out = out.rename(columns=present)

    # 3) Optional: store the renamed flags as nullable Int8 (0 / 1 / NA).
    if cast_binary_to_int:
        for name in present.values():
            numeric = pd.to_numeric(out[name], errors="coerce")
            try:
                out[name] = numeric.round().astype("Int8")
            except (TypeError, ValueError):
                # Best effort: values that do not fit Int8 stay numeric.
                out[name] = numeric

    return out
780
+
781
def add_all_fund_features(
    df: pd.DataFrame,
    *,
    ticker_col: str = "ticker",
    year_col: str = "year",
    quarter_col: str = "quarter",
    stable_div_years: int = 3,
    enable_capex_proxy_for_F36: bool = True,
    drop_nan_threshold: float = 1.0,
    cast_binary_to_int: bool = True
) -> pd.DataFrame:
    """
    Add all fundamental feature columns to *df* and tidy up the result.

    Runs ``add_fund_features_bank_schema(...)`` followed by
    ``finalize_fund_features(...)``.

    Args:
        df: Fundamental rows.
        ticker_col: Name of the ticker column.
        year_col: Name of the year column.
        quarter_col: Name of the quarter column.
        stable_div_years: Forwarded to ``add_fund_features_bank_schema``.
        enable_capex_proxy_for_F36: Forwarded to
            ``add_fund_features_bank_schema``.
        drop_nan_threshold: Forwarded to ``finalize_fund_features``.
        cast_binary_to_int: Forwarded to ``finalize_fund_features``.

    Returns:
        The frame returned by ``finalize_fund_features``.
    """
    # The column-name arguments must be forwarded; otherwise frames that use
    # non-default column names would fail or be grouped on the wrong columns.
    featured = add_fund_features_bank_schema(
        df,
        ticker_col=ticker_col,
        quarter_col=quarter_col,
        year_col=year_col,
        stable_div_years=stable_div_years,
        enable_capex_proxy_for_F36=enable_capex_proxy_for_F36,
    )
    return finalize_fund_features(
        featured,
        drop_nan_threshold=drop_nan_threshold,
        cast_binary_to_int=cast_binary_to_int,
    )
807
+
808
+
809
+
810
+
811
+ # ========================= Lazy FUND fetching/computation =========================
812
+ _BASE_FUND_COLS = {
813
+ "earningPerShare","bookValuePerShare","roe","roa",
814
+ "priceToEarning","priceToBook",
815
+ "interestMargin","nonInterestOnToi",
816
+ "badDebtPercentage","provisionOnBadDebt",
817
+ "costOfFinancing",
818
+ "equityOnTotalAsset","equityOnLoan",
819
+ "costToIncome","equityOnLiability",
820
+ "assetOnEquity",
821
+ "preProvisionOnToi","postTaxOnToi",
822
+ "loanOnEarnAsset","loanOnAsset","loanOnDeposit",
823
+ "depositOnEarnAsset",
824
+ "badDebtOnAsset","liquidityOnLiability","payableOnEquity",
825
+ "cancelDebt",
826
+ "creditGrowth",
827
+ }
828
+
829
+ # cache now stores: {"price": df, "fund_raw": df, "fund_full": df or None}
830
def _auto_get(symbol: str, *, timeframe: str = "h", force_refresh: bool = False):
    """Return ``(price_df, fund_raw_df, fund_full_df_or_None)`` for *symbol*.

    Results are cached in ``_CACHE`` under ``"<symbol>|<timeframe>"``. A
    cached entry holding both the price and the raw fundamental frame is
    returned as-is unless ``force_refresh`` is set. Otherwise both frames are
    fetched again and the cached derived-feature frame is reset to ``None``.

    Raises:
        ImportError: if ``quantvn.vn.data`` cannot be imported.
        KeyError: if ``ratio_summary()`` has no ``year`` or ``quarter`` column.
    """
    key = f"{symbol}|{timeframe}"

    if not force_refresh and key in _CACHE:
        entry = _CACHE[key]
        if "price" in entry and "fund_raw" in entry:
            return entry["price"], entry["fund_raw"], entry.get("fund_full")

    price = _fetch_price_df(symbol, timeframe=timeframe)

    # Raw fundamentals only; derived features are computed lazily elsewhere.
    try:
        from quantvn.vn.data import Company
    except Exception as exc:
        raise ImportError("Cần quantvn: `pip install quantvn` và khởi tạo client(apikey=...)") from exc

    raw = Company(symbol).ratio_summary()
    if "ticker" not in raw.columns:
        raw = raw.copy()
        raw["ticker"] = symbol
    if not ("year" in raw.columns and "quarter" in raw.columns):
        raise KeyError("ratio_summary() cần có cột 'year' và 'quarter'")

    _CACHE[key] = {"price": price, "fund_raw": raw, "fund_full": None}
    return price, raw, None
848
+
849
def _get_fund_frame_for_feature(symbol: str, timeframe: str, feature_name: str, *, force_refresh: bool):
    """Return ``(price_df, fund_df)`` where ``fund_df`` should hold the feature.

    - When ``feature_name`` is a column of the raw ``ratio_summary()`` frame,
      that raw frame is returned.
    - Otherwise the full feature frame is computed with
      ``add_all_fund_features`` (or taken from the cache), stored in
      ``_CACHE`` and returned. It is always recomputed when ``force_refresh``
      is set.
    """
    price, raw, full = _auto_get(symbol, timeframe=timeframe, force_refresh=force_refresh)

    if feature_name in raw.columns:
        return price, raw

    must_compute = force_refresh or full is None
    if must_compute:
        full = add_all_fund_features(
            raw,
            ticker_col="ticker",
            year_col="year",
            quarter_col="quarter",
        )
        _CACHE[f"{symbol}|{timeframe}"]["fund_full"] = full
    return price, full
864
+
865
+
866
def fund_feature(
    feature_name: str,
    symbol: str,
    *,
    timeframe: str = "h",
    report_release_lag_days: int = 0,
    force_refresh: bool = False
) -> pd.DataFrame:
    """
    Return the price history of *symbol* with one fundamental column attached.

    - For a column of the raw ``ratio_summary()`` table (e.g.
      ``earningPerShare``) only that raw table is used.
    - For a derived feature (YoY columns, FUND flags, ...) the full feature
      frame is computed on demand and cached.

    Args:
        feature_name: Name of the fundamental column to attach.
        symbol: Ticker symbol.
        timeframe: Price timeframe passed to the price fetcher.
        report_release_lag_days: Days added to each quarter end before a
            report is considered known (see ``merge_fund_into_price``).
        force_refresh: Ignore cached data and fetch/compute again.

    Returns:
        A DataFrame with exactly the columns ``Date``, ``time``, ``Open``,
        ``High``, ``Low``, ``Close``, ``Volume`` and ``<feature_name>``,
        sorted by (``Date``, ``time``) with a fresh integer index.

    Raises:
        TypeError: if ``feature_name`` is not a string.
        KeyError: if the feature is not available after the merge.
    """
    if not isinstance(feature_name, str):
        raise TypeError("feature_name phải là str")

    price_df, fund_df = _get_fund_frame_for_feature(
        symbol, timeframe, feature_name, force_refresh=force_refresh
    )

    # Only the requested column (plus the merge keys) needs to go through the
    # as-of merge. Dropping the rest up front is cheaper and keeps unrelated
    # fundamental columns from clashing with price column names.
    if feature_name in fund_df.columns:
        wanted = dict.fromkeys(["ticker", "year", "quarter", feature_name])
        fund_df = fund_df[[c for c in wanted if c in fund_df.columns]]

    merged = merge_fund_into_price(
        price_df=price_df,
        fund_df=fund_df,
        price_date_col="Date",
        price_time_col="time",
        ticker_col="ticker",
        quarter_col="quarter",
        year_col="year",
        assume_ticker=symbol,
        report_release_lag_days=report_release_lag_days,
        drop_all_nan_cols=True,
    )

    if feature_name not in merged.columns:
        raise KeyError(f"Không tìm thấy feature '{feature_name}' sau khi merge.")

    columns = ["Date", "time", "Open", "High", "Low", "Close", "Volume", feature_name]
    # Stable sort keeps the merge order for rows with equal (Date, time).
    out = merged[columns].sort_values(["Date", "time"], kind="mergesort")
    return out.reset_index(drop=True)