aria-code 4.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agents/__init__.py +32 -0
- agents/base.py +190 -0
- agents/deep/__init__.py +37 -0
- agents/deep/calibration_loop.py +144 -0
- agents/deep/critic.py +125 -0
- agents/deep/deepen.py +193 -0
- agents/deep/models.py +149 -0
- agents/deep/pipeline.py +164 -0
- agents/deep/quant_fusion.py +192 -0
- agents/deep/themes.py +95 -0
- agents/deep/tiers.py +106 -0
- agents/financial/__init__.py +10 -0
- agents/financial/catalyst.py +279 -0
- agents/financial/debate.py +145 -0
- agents/financial/earnings.py +303 -0
- agents/financial/fundamental.py +159 -0
- agents/financial/macro.py +99 -0
- agents/financial/news.py +207 -0
- agents/financial/risk.py +132 -0
- agents/financial/sector.py +279 -0
- agents/financial/synthesis.py +274 -0
- agents/financial/technical.py +258 -0
- agents/portfolio_agent.py +333 -0
- agents/realty/__init__.py +62 -0
- agents/realty/asset_diagnosis.py +150 -0
- agents/realty/business_match.py +165 -0
- agents/realty/cashflow_verify.py +208 -0
- agents/realty/contract_rules.py +209 -0
- agents/realty/energy_anomaly.py +188 -0
- agents/realty/exit_settlement.py +207 -0
- agents/realty/fulfillment_risk.py +205 -0
- agents/realty/ops_optimize.py +159 -0
- agents/realty/revenue_share.py +214 -0
- agents/registry.py +144 -0
- agents/sports/__init__.py +0 -0
- agents/sports/football_agent.py +169 -0
- agents/team.py +289 -0
- aliyun_data_client.py +660 -0
- apps/README.md +12 -0
- apps/__init__.py +2 -0
- apps/channels/README.md +15 -0
- apps/cli/README.md +13 -0
- apps/cli/__init__.py +2 -0
- apps/cli/bootstrap.py +99 -0
- apps/cli/codegen_paths.py +29 -0
- apps/cli/commands/__init__.py +16 -0
- apps/cli/commands/analysis_cmds.py +288 -0
- apps/cli/commands/backtest_cmds.py +1887 -0
- apps/cli/commands/broker_cmds.py +1154 -0
- apps/cli/commands/business_workflow_cmds.py +289 -0
- apps/cli/commands/catalog.py +84 -0
- apps/cli/commands/data_cmds.py +405 -0
- apps/cli/commands/diagnostic_cmds.py +179 -0
- apps/cli/commands/diagnostic_ops_cmds.py +696 -0
- apps/cli/commands/finance_render.py +12 -0
- apps/cli/commands/market.py +399 -0
- apps/cli/commands/market_cmds.py +1276 -0
- apps/cli/commands/market_context.py +425 -0
- apps/cli/commands/market_render.py +7 -0
- apps/cli/commands/model_cmds.py +1579 -0
- apps/cli/commands/ops_cmds.py +668 -0
- apps/cli/commands/portfolio_cmds.py +962 -0
- apps/cli/commands/report.py +377 -0
- apps/cli/commands/scaffold_templates.py +617 -0
- apps/cli/commands/session_cmds.py +179 -0
- apps/cli/commands/session_ux_cmds.py +280 -0
- apps/cli/commands/team.py +588 -0
- apps/cli/commands/team_render.py +8 -0
- apps/cli/commands/ui_cmds.py +358 -0
- apps/cli/commands/workflow_cmds.py +279 -0
- apps/cli/commands/workspace_cmds.py +1414 -0
- apps/cli/config_paths.py +70 -0
- apps/cli/config_store.py +61 -0
- apps/cli/deterministic.py +122 -0
- apps/cli/direct.py +48 -0
- apps/cli/github_app_auth.py +135 -0
- apps/cli/handlers/__init__.py +11 -0
- apps/cli/handlers/broker_handlers.py +122 -0
- apps/cli/handlers/chart_handlers.py +1309 -0
- apps/cli/handlers/market_handlers.py +2509 -0
- apps/cli/handlers/realty_handlers.py +114 -0
- apps/cli/handlers/strategy_advice.py +82 -0
- apps/cli/hooks.py +180 -0
- apps/cli/i18n.py +284 -0
- apps/cli/intent.py +136 -0
- apps/cli/intent_router.py +217 -0
- apps/cli/lifecycle_hooks.py +48 -0
- apps/cli/main.py +29 -0
- apps/cli/market_metadata.py +135 -0
- apps/cli/market_universe.py +265 -0
- apps/cli/message_processing.py +257 -0
- apps/cli/plan_mode.py +139 -0
- apps/cli/plotly_html.py +15 -0
- apps/cli/prediction_feedback.py +202 -0
- apps/cli/preflight.py +497 -0
- apps/cli/project_aria.py +60 -0
- apps/cli/prompts/__init__.py +0 -0
- apps/cli/prompts/coding.py +658 -0
- apps/cli/prompts/system_prompts.py +531 -0
- apps/cli/prompts/ui.py +434 -0
- apps/cli/providers/__init__.py +1 -0
- apps/cli/providers/base.py +271 -0
- apps/cli/providers/chat_routing.py +80 -0
- apps/cli/providers/llm/__init__.py +1 -0
- apps/cli/providers/llm/ollama_stream.py +1170 -0
- apps/cli/providers/llm/sse_stream.py +216 -0
- apps/cli/providers/runtime_bridge.py +185 -0
- apps/cli/runtime_consumer.py +489 -0
- apps/cli/session_export.py +87 -0
- apps/cli/session_jsonl.py +207 -0
- apps/cli/session_store.py +112 -0
- apps/cli/todo_tracker.py +190 -0
- apps/cli/tools/__init__.py +40 -0
- apps/cli/tools/context.py +46 -0
- apps/cli/tools/file_tools.py +112 -0
- apps/cli/tools/market_tools.py +549 -0
- apps/cli/tools/notebook_tools.py +111 -0
- apps/cli/tools/system_tools.py +669 -0
- apps/cli/tools/write_tools.py +715 -0
- apps/cli/tradingview_bridge.py +434 -0
- apps/cli/update_check.py +152 -0
- apps/cli/utils/__init__.py +0 -0
- apps/cli/utils/market_detect.py +1578 -0
- apps/daemon/README.md +14 -0
- apps/vscode/README.md +115 -0
- apps/vscode/package.json +70 -0
- aria_cli.py +11636 -0
- aria_code-4.1.3.dist-info/METADATA +952 -0
- aria_code-4.1.3.dist-info/RECORD +284 -0
- aria_code-4.1.3.dist-info/WHEEL +5 -0
- aria_code-4.1.3.dist-info/entry_points.txt +2 -0
- aria_code-4.1.3.dist-info/licenses/LICENSE +121 -0
- aria_code-4.1.3.dist-info/top_level.txt +50 -0
- aria_daemon.py +1295 -0
- aria_feishu_bot.py +1359 -0
- aria_relay_client.py +182 -0
- aria_relay_server.py +405 -0
- aria_telegram_bot.py +202 -0
- ariarc.py +328 -0
- artifacts.py +491 -0
- backtest_report.py +472 -0
- brokers/__init__.py +72 -0
- brokers/base.py +207 -0
- brokers/capabilities.py +264 -0
- brokers/cn/__init__.py +10 -0
- brokers/cn/easytrader_broker.py +193 -0
- brokers/cn/futu_broker.py +194 -0
- brokers/cn/longbridge_broker.py +190 -0
- brokers/cn/tiger_broker.py +196 -0
- brokers/cn/xtquant_broker.py +175 -0
- brokers/config.py +364 -0
- brokers/intl/__init__.py +5 -0
- brokers/intl/alpaca_broker.py +183 -0
- brokers/intl/ibkr_broker.py +215 -0
- brokers/intl/webull_broker.py +156 -0
- brokers/paper_broker.py +259 -0
- brokers/planning.py +296 -0
- brokers/registry.py +181 -0
- brokers/trading.py +237 -0
- change_store.py +127 -0
- command_safety.py +19 -0
- computer_use_tools.py +504 -0
- dashboard_generator.py +578 -0
- data_analysis_tools.py +808 -0
- data_cleaner.py +483 -0
- data_service.py +481 -0
- datasources/__init__.py +23 -0
- datasources/base.py +166 -0
- datasources/router.py +221 -0
- datasources/sources/__init__.py +15 -0
- datasources/sources/akshare_source.py +269 -0
- datasources/sources/alpha_vantage_source.py +202 -0
- datasources/sources/edgar_source.py +218 -0
- datasources/sources/finnhub_source.py +197 -0
- datasources/sources/fred_source.py +219 -0
- datasources/sources/tushare_source.py +141 -0
- datasources/sources/web_scraper_source.py +278 -0
- datasources/sources/world_bank_source.py +205 -0
- datasources/sources/yfinance_source.py +152 -0
- demo_player.py +204 -0
- doctor.py +508 -0
- file_analysis_tools.py +734 -0
- finance_formulas.py +389 -0
- football_data_client.py +1670 -0
- intent_classifier.py +358 -0
- local_finance_tools.py +3221 -0
- local_llm_provider.py +552 -0
- macro_tools.py +368 -0
- market_data_client.py +1899 -0
- mcp_client.py +506 -0
- memory_manager.py +245 -0
- model_capability.py +416 -0
- notification_tools.py +248 -0
- packages/__init__.py +23 -0
- packages/aria_agents/__init__.py +5 -0
- packages/aria_agents/manifest.py +69 -0
- packages/aria_core/__init__.py +34 -0
- packages/aria_core/architecture.py +192 -0
- packages/aria_core/export.py +124 -0
- packages/aria_core/manifest.py +65 -0
- packages/aria_infra/__init__.py +15 -0
- packages/aria_infra/arthera.py +52 -0
- packages/aria_infra/doctor.py +246 -0
- packages/aria_infra/product.py +37 -0
- packages/aria_mcp/__init__.py +25 -0
- packages/aria_mcp/bridge.py +38 -0
- packages/aria_mcp/config.py +97 -0
- packages/aria_mcp/tools.py +61 -0
- packages/aria_sdk/__init__.py +19 -0
- packages/aria_sdk/client.py +396 -0
- packages/aria_sdk/providers.py +70 -0
- packages/aria_sdk/streaming.py +73 -0
- packages/aria_sdk/types.py +86 -0
- packages/aria_services/__init__.py +55 -0
- packages/aria_services/context.py +258 -0
- packages/aria_services/data.py +11 -0
- packages/aria_services/provider_health.py +189 -0
- packages/aria_services/registry.py +213 -0
- packages/aria_services/usage.py +138 -0
- packages/aria_skills/__init__.py +5 -0
- packages/aria_skills/registry.py +59 -0
- packages/aria_tools/__init__.py +5 -0
- packages/aria_tools/registry.py +128 -0
- packages/quant_engine/__init__.py +6 -0
- packages/quant_engine/sports/__init__.py +72 -0
- packages/quant_engine/sports/calibrator.py +353 -0
- packages/quant_engine/sports/dixon_coles.py +234 -0
- packages/quant_engine/sports/elo.py +299 -0
- packages/quant_engine/sports/form.py +188 -0
- packages/quant_engine/sports/h2h.py +195 -0
- packages/quant_engine/sports/ml_model.py +354 -0
- packages/quant_engine/sports/predictor.py +311 -0
- packages/quant_engine/sports/tracker.py +664 -0
- packages/quant_engine/stochastic/__init__.py +27 -0
- packages/quant_engine/stochastic/gbm_enhanced.py +195 -0
- packages/quant_engine/stochastic/ito_calculus.py +477 -0
- packages/quant_engine/stochastic/kelly_criterion.py +181 -0
- packages/quant_engine/stochastic/monte_carlo_advanced.py +95 -0
- packages/quant_engine/stochastic/options_pricing.py +573 -0
- packages/quant_engine/stochastic/stochastic_processes.py +90 -0
- plan_utils.py +194 -0
- plugin_loader.py +328 -0
- portfolio_ledger.py +262 -0
- privacy/__init__.py +5 -0
- privacy/feedback.py +123 -0
- project_tools.py +525 -0
- providers/__init__.py +30 -0
- providers/llm/__init__.py +19 -0
- providers/llm/anthropic.py +184 -0
- providers/llm/base.py +139 -0
- providers/llm/ollama.py +128 -0
- providers/llm/openai_compat.py +282 -0
- providers/llm/registry.py +358 -0
- realty_data_tools.py +659 -0
- report_generator.py +1314 -0
- runtime/__init__.py +103 -0
- runtime/agent_loop.py +1183 -0
- runtime/approval.py +51 -0
- runtime/events.py +102 -0
- runtime/gateway.py +128 -0
- runtime/lsp.py +346 -0
- runtime/subagent.py +258 -0
- runtime/tool_executor.py +104 -0
- runtime/tool_policy.py +106 -0
- safety/__init__.py +21 -0
- safety/permissions.py +275 -0
- setup_wizard.py +653 -0
- strategy_vault.py +420 -0
- ui/__init__.py +100 -0
- ui/banner.py +310 -0
- ui/completer.py +391 -0
- ui/console.py +271 -0
- ui/image_render.py +243 -0
- ui/input_box.py +376 -0
- ui/picker.py +195 -0
- ui/render/__init__.py +11 -0
- ui/render/finance.py +1480 -0
- ui/render/market.py +225 -0
- ui/render/output.py +681 -0
- ui/render/team.py +346 -0
- ui/robot.py +235 -0
- workspace/__init__.py +6 -0
- workspace/files.py +170 -0
- workspace/verify.py +113 -0
data_cleaner.py
ADDED
|
@@ -0,0 +1,483 @@
|
|
|
1
|
+
"""
|
|
2
|
+
data_cleaner.py — Bloomberg-grade 数据清洗流水线
|
|
3
|
+
=================================================
|
|
4
|
+
提供:
|
|
5
|
+
· OHLCV 完整性验证(High≥Low, High≥O/C, Volume≥0)
|
|
6
|
+
· 滚动 Z-score 异常值检测(区分涨跌停 vs 数据错误)
|
|
7
|
+
· 交易日历感知缺口检测(区分节假日 vs 真实数据缺失)
|
|
8
|
+
· 前复权/后复权价格(yfinance auto_adjust + akshare qfq)
|
|
9
|
+
· Point-in-Time 财务摘要(使用发布日版本,防止 lookahead bias)
|
|
10
|
+
· 幸存者偏差标注(尝试检测已退市标的)
|
|
11
|
+
· 数据质量评分(0–100)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import logging
|
|
17
|
+
import re
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
from datetime import datetime, timedelta
|
|
20
|
+
from typing import Dict, List, Optional, Tuple
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
import pandas as pd
|
|
24
|
+
|
|
25
|
+
# Module-level logger; records are emitted under this module's import name.
logger = logging.getLogger(__name__)

# Bound ``match`` of a compiled pattern: exactly six digits whose first digit
# is 0, 3 or 6.  The public entry points call it to choose between the
# A-share fetchers and the default (yfinance) fetchers.
_IS_A_SHARE = re.compile(r"^[036]\d{5}$").match
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ── Data Classes ──────────────────────────────────────────────────────────────
|
|
31
|
+
|
|
32
|
+
@dataclass
class ValidationIssue:
    """A single integrity problem found on one row of a price table."""

    # Index label of the offending row, stored exactly as supplied.
    row_index: object
    # Column (or column pair such as "High/Low") the problem refers to.
    column: str
    # Category tag: "invalid_ohlcv" | "outlier" | "negative_volume".
    issue_type: str
    # The numeric value that triggered the report.
    value: float
    # Short human-readable explanation of the violation.
    description: str
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class DataGap:
    """A jump between two consecutive timestamps of a price index."""

    # Date of the last row before the jump, as a string.
    start: str
    # Date of the first row after the jump, as a string.
    end: str
    # Size of the jump in days, as computed by whoever builds the record.
    days: int
    # Classification tag: "holiday" | "data_gap" | "suspension".
    kind: str
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class CleanResult:
    """Outcome of a cleaning run: the resulting frame plus its diagnostics."""

    # The price table this result describes.
    df: pd.DataFrame
    # Integrity violations collected during validation.
    issues: List[ValidationIssue] = field(default_factory=list)
    # Timestamp jumps found in the index, each with a ``kind`` tag.
    gaps: List[DataGap] = field(default_factory=list)
    # Number of rows flagged as outliers.
    outlier_count: int = 0
    # Number of missing price cells counted before filling.
    fill_count: int = 0
    # Overall score; defaults to a perfect 100.0.
    quality_score: float = 100.0

    @property
    def real_gap_days(self) -> int:
        """Total ``days`` over the gaps whose ``kind`` is ``"data_gap"``."""
        total = 0
        for gap in self.gaps:
            if gap.kind == "data_gap":
                total += gap.days
        return total

    def summary(self) -> str:
        """Return a one-line report: score, outliers, gap days and fills."""
        text = (
            f"质量评分 {self.quality_score:.1f}/100 · "
            f"异常值 {self.outlier_count} 条 · "
            f"数据缺口 {self.real_gap_days} 天 · "
            f"填充 {self.fill_count} 行"
        )
        return text
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ── OHLCV Validation ──────────────────────────────────────────────────────────
|
|
72
|
+
|
|
73
|
+
def validate_ohlcv(df: pd.DataFrame) -> List[ValidationIssue]:
    """
    Check every row for OHLCV integrity: H≥L, H≥O, H≥C, L≤O, L≤C, V≥0.

    Columns are located case-insensitively.  If any of Open/High/Low/Close is
    absent an empty list is returned.  Comparisons tolerate floating-point
    noise through a 1e-6 epsilon.

    Rows are walked by position, so rows that share an index label are each
    validated (label-based scalar access returns a Series for duplicated
    labels, which previously made such rows drop out silently).

    Per row:
      * price checks run only when both High and Low are positive; the
        comparisons against Open / Close additionally require that value to
        be positive;
      * a row whose price cells cannot be converted to float is skipped
        entirely, including its volume check;
      * the volume check is skipped when the volume cell cannot be converted.

    Returns:
        The violations in row order; within a row the order is H<L, H<O,
        H<C, L>O, L>C, then negative volume.
    """
    issues: List[ValidationIssue] = []

    # ``str()`` keeps non-string labels (ints, tuples) from raising on .lower().
    col = {str(c).lower(): c for c in df.columns}
    h_c, l_c, o_c, c_c = (col.get(k) for k in ("high", "low", "open", "close"))
    v_c = col.get("volume")
    if any(name is None for name in (h_c, l_c, o_c, c_c)):
        return issues

    def _values(name) -> list:
        data = df[name]
        if isinstance(data, pd.DataFrame):  # duplicated column label: use the first
            data = data.iloc[:, 0]
        return data.tolist()

    def _num(raw) -> float:
        # Falsy cells (None, 0, "") count as 0; NaN is truthy and stays NaN,
        # which makes every comparison below evaluate to False.
        return float(raw or 0)

    eps = 1e-6
    volumes = _values(v_c) if v_c is not None else None
    rows = zip(df.index, _values(h_c), _values(l_c), _values(o_c), _values(c_c))

    for pos, (idx, raw_h, raw_l, raw_o, raw_c) in enumerate(rows):
        try:
            h, l, o, c = _num(raw_h), _num(raw_l), _num(raw_o), _num(raw_c)
        except (TypeError, ValueError):
            continue

        if h > 0 and l > 0:
            if h < l - eps:
                issues.append(ValidationIssue(idx, "High/Low", "invalid_ohlcv", h,
                                              f"H({h:.4f})<L({l:.4f})"))
            if o > 0 and h < o - eps:
                issues.append(ValidationIssue(idx, "High", "invalid_ohlcv", h,
                                              f"H({h:.4f})<O({o:.4f})"))
            if c > 0 and h < c - eps:
                issues.append(ValidationIssue(idx, "High", "invalid_ohlcv", h,
                                              f"H({h:.4f})<C({c:.4f})"))
            if o > 0 and l > o + eps:
                issues.append(ValidationIssue(idx, "Low", "invalid_ohlcv", l,
                                              f"L({l:.4f})>O({o:.4f})"))
            if c > 0 and l > c + eps:
                issues.append(ValidationIssue(idx, "Low", "invalid_ohlcv", l,
                                              f"L({l:.4f})>C({c:.4f})"))

        if volumes is not None:
            try:
                v = _num(volumes[pos])
            except (TypeError, ValueError):
                continue
            if v < 0:
                issues.append(ValidationIssue(idx, "Volume", "negative_volume",
                                              v, f"V({v})<0"))

    return issues
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# ── Outlier Detection ─────────────────────────────────────────────────────────
|
|
130
|
+
|
|
131
|
+
def detect_outliers_zscore(
    series: pd.Series,
    window: int = 20,
    threshold: float = 4.0,
) -> pd.Series:
    """
    Flag outliers with a rolling Z-score computed on daily returns.

    Returns are ``pct_change`` of ``series`` with missing returns dropped.
    Each return is compared with the mean / standard deviation of the rolling
    window that ends at (and includes) that return; it is flagged when the
    absolute Z-score exceeds ``threshold``.  A window whose standard deviation
    is exactly zero never flags anything.

    The original author's rationale for the 4.0σ default: A-share limit moves
    of ±10% / ±20% (ST) are normal and should not be reported as data errors.

    Args:
        series: Price series; any index is accepted, including one with
            repeated labels (alignment is positional).
        window: Rolling window length in observations.  At least
            ``min(5, window)`` returns are required before a Z-score is
            produced, so windows shorter than 5 are accepted.
        threshold: Absolute Z-score above which a point is flagged.

    Returns:
        Boolean Series on ``series.index`` (True = outlier).
    """
    # Positional labels: repeated index values cannot break the alignment of
    # the flags with the input rows.
    prices = series.reset_index(drop=True)
    returns = prices.pct_change().dropna()

    # min_periods may not exceed the window length.
    rolling = returns.rolling(window=window, min_periods=min(5, window))
    roll_mu = rolling.mean()
    roll_sig = rolling.std()
    # The 1e-10 offset keeps float-noise-sized deviations from exploding into
    # huge Z-scores; an exact zero deviation becomes NaN (never flagged).
    z = (returns - roll_mu) / (roll_sig.replace(0, np.nan) + 1e-10)

    flagged = (z.abs() > threshold).to_numpy()
    mask = np.zeros(len(series), dtype=bool)
    mask[returns.index.to_numpy()[flagged]] = True
    return pd.Series(mask, index=series.index)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# ── Data Gap Detection ────────────────────────────────────────────────────────
|
|
154
|
+
|
|
155
|
+
def detect_data_gaps(df: pd.DataFrame, market: str = "US") -> List[DataGap]:
    """
    Classify jumps between consecutive index timestamps.

    The index is converted to a ``DatetimeIndex``; missing timestamps (NaT)
    are dropped before pairs are compared.  With ``delta`` the whole-day
    difference between two neighbouring timestamps:

      * ``delta <= 1``                    -> ignored
      * Friday followed by ``delta == 3`` -> ordinary weekend, ignored
      * ``delta <= 5``                    -> "holiday"
      * ``delta <= 10``                   -> "suspension"
      * otherwise                         -> "data_gap"

    Each reported gap stores ``days = delta - 1`` (calendar days, weekends
    included).

    Args:
        df: Frame whose index holds the observation dates.
        market: Accepted for interface compatibility; the heuristic does not
            use it.

    Returns:
        The gaps in index order; empty when fewer than two rows are given.
    """
    if len(df) < 2:
        return []

    stamps = pd.DatetimeIndex(df.index)
    # NaT cannot be subtracted into a day count or turned into a date.
    stamps = stamps[stamps.notna()]

    gaps: List[DataGap] = []
    for prev, curr in zip(stamps[:-1], stamps[1:]):
        delta = (curr - prev).days

        if delta <= 1:
            continue
        if delta == 3 and prev.weekday() == 4:  # Fri → Mon
            continue

        if delta <= 5:
            kind = "holiday"
        elif delta <= 10:
            kind = "suspension"  # probable trading suspension
        else:
            kind = "data_gap"

        gaps.append(DataGap(
            start=str(prev.date()),
            end=str(curr.date()),
            days=delta - 1,
            kind=kind,
        ))

    return gaps
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
# ── Main Cleaning Pipeline ────────────────────────────────────────────────────
|
|
198
|
+
|
|
199
|
+
def clean_price_series(
    df: pd.DataFrame,
    symbol: str = "",
    outlier_threshold: float = 4.0,
) -> CleanResult:
    """
    Run the OHLCV cleaning pipeline on a copy of ``df``.

    Stages:
      1. Normalise column names
      2. Drop rows in which every price column is NaN
      3. OHLCV integrity validation
      4. Rolling Z-score outlier detection (rows are tagged in an
         ``_outlier`` column, not removed)
      5. Forward- then backward-fill price NaNs; missing Volume becomes 0
      6. Data gap classification
      7. Quality scoring

    Args:
        df: Raw price table; it is not modified.
        symbol: Not used by the current implementation.
        outlier_threshold: Z-score threshold handed to the outlier detector.

    Returns:
        A ``CleanResult`` carrying the cleaned frame and all diagnostics.
    """
    frame = df.copy()

    # Stage 1 — canonical column names.
    frame.columns = [_normalise_col(label) for label in frame.columns]
    # NOTE(review): only Close is replaced by Adj Close; Open/High/Low stay as
    # supplied — confirm the mix is intended, since validation runs on it.
    if "Adj Close" in frame.columns:
        frame["Close"] = frame["Adj Close"]

    # Stage 2 — discard rows that carry no price at all.
    price_cols = [name for name in ("Open", "High", "Low", "Close")
                  if name in frame.columns]
    frame = frame.dropna(subset=price_cols, how="all")

    # Stage 3 — integrity checks on the still-unfilled data.
    violations = validate_ohlcv(frame)

    # Stage 4 — tag outliers; without a Close column nothing is flagged.
    if "Close" in frame.columns:
        flags = detect_outliers_zscore(frame["Close"], threshold=outlier_threshold)
    else:
        flags = pd.Series(False, index=frame.index)
    frame["_outlier"] = flags
    n_outliers = int(flags.sum())

    # Stage 5 — count the missing price cells, then fill them.
    n_filled = int(frame[price_cols].isna().sum().sum())
    frame[price_cols] = frame[price_cols].ffill().bfill()
    if "Volume" in frame.columns:
        frame["Volume"] = frame["Volume"].fillna(0)

    # Stage 6 — classify jumps in the index.
    found_gaps = detect_data_gaps(frame)

    # Stage 7 — penalty-based score, scaled by the number of rows.
    row_count = max(len(frame), 1)
    missing_days = sum(g.days for g in found_gaps if g.kind == "data_gap")
    weighted = (
        len(violations) * 2.0      # OHLCV violations
        + n_outliers * 0.5         # outliers (soft)
        + n_filled * 0.3           # imputed cells
        + missing_days * 5.0       # real gaps
    )
    penalty = weighted / row_count * 10
    score = round(max(0.0, min(100.0, 100.0 - penalty)), 1)

    return CleanResult(
        df=frame,
        issues=violations,
        gaps=found_gaps,
        outlier_count=n_outliers,
        fill_count=n_filled,
        quality_score=score,
    )
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def _normalise_col(name: str) -> str:
|
|
266
|
+
mapping = {
|
|
267
|
+
"open": "Open", "high": "High", "low": "Low",
|
|
268
|
+
"close": "Close", "volume": "Volume",
|
|
269
|
+
"adj close": "Adj Close", "adj_close": "Adj Close",
|
|
270
|
+
"turnover": "Turnover",
|
|
271
|
+
}
|
|
272
|
+
return mapping.get(name.lower(), name.capitalize())
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
# ── Public Data API ───────────────────────────────────────────────────────────
|
|
276
|
+
|
|
277
|
+
def get_clean_prices(
    symbol: str,
    period: str = "1y",
    auto_adjust: bool = True,
) -> Tuple[pd.DataFrame, CleanResult]:
    """
    Download a price history and run it through the cleaning pipeline.

    Symbols accepted by ``_IS_A_SHARE`` go to the A-share fetcher; everything
    else goes to the yfinance-based fetcher.

    Returns:
        ``(clean_df, CleanResult)``.  When fetching raises, or yields an empty
        frame, the frame is empty and the result has ``quality_score`` 0.0;
        a fetch exception is logged as a warning rather than propagated.
    """
    try:
        fetcher = _fetch_a_prices if _IS_A_SHARE(symbol) else _fetch_us_prices
        raw = fetcher(symbol, period, auto_adjust)
    except Exception as e:
        logger.warning("[cleaner] fetch %s: %s", symbol, e)
        nothing = pd.DataFrame()
        return nothing, CleanResult(nothing, quality_score=0.0)

    if raw.empty:
        return raw, CleanResult(raw, quality_score=0.0)

    outcome = clean_price_series(raw, symbol)
    return outcome.df, outcome
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def get_fundamentals(symbol: str) -> Dict:
    """
    Fetch key financial metrics as a flat dict.

    Symbols accepted by ``_IS_A_SHARE`` use the A-share lookup; all others use
    the yfinance lookup.  If the lookup raises, the error is logged at debug
    level and a minimal dict is returned holding only ``company_name`` and
    ``symbol`` (both set to ``symbol``) and ``currency`` ("CNY" for A-share
    symbols, otherwise "USD").
    """
    try:
        if _IS_A_SHARE(symbol):
            return _get_a_fundamentals(symbol)
        return _get_us_fundamentals(symbol)
    except Exception as e:
        logger.debug("[cleaner] fundamentals %s: %s", symbol, e)
        fallback_currency = "CNY" if _IS_A_SHARE(symbol) else "USD"
        return {
            "company_name": symbol,
            "symbol": symbol,
            "currency": fallback_currency,
        }
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
# ── Internal Fetchers ─────────────────────────────────────────────────────────
|
|
322
|
+
|
|
323
|
+
def _fetch_us_prices(symbol: str, period: str, auto_adjust: bool) -> pd.DataFrame:
    """
    Download a price history through yfinance.

    Returns the Open/High/Low/Close/Volume columns with any timezone removed
    from the index.  An empty download is returned as received.
    """
    import yfinance as yf

    history = yf.Ticker(symbol).history(period=period, auto_adjust=auto_adjust)
    if history.empty:
        return history

    wanted = ["Open", "High", "Low", "Close", "Volume"]
    prices = history[wanted].copy()

    # Drop the timezone so the index is naive.
    if getattr(prices.index, "tz", None) is not None:
        prices.index = prices.index.tz_localize(None)
    return prices
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def _fetch_a_prices(symbol: str, period: str, auto_adjust: bool) -> pd.DataFrame:
    """
    Download daily A-share prices, preferring akshare over yfinance.

    ``period`` is translated into a look-back window in calendar days
    (unknown values fall back to 370).  With ``auto_adjust`` the akshare
    request asks for "qfq" adjusted prices, otherwise for unadjusted ones.

    The yfinance fallback (symbol plus ".SS" / ".SZ" suffix) is used when
    akshare is not installed and also when the akshare request fails, returns
    nothing, or returns data that cannot be reshaped; such a failure is
    logged as a warning instead of being raised.

    Returns:
        Frame with Open/High/Low/Close/Volume columns; on the akshare path it
        is indexed by ascending date and absent columns are filled with NaN.
    """
    import os

    lookback_days = {"1mo": 35, "3mo": 95, "6mo": 185,
                     "1y": 370, "2y": 740, "5y": 1830}
    end = datetime.now()
    start = end - timedelta(days=lookback_days.get(period, 370))

    try:
        import akshare as ak
    except ImportError:
        ak = None

    if ak is not None:
        try:
            # AKShare creates its own requests session and routes through the
            # system proxy, but numbered push2his.eastmoney.com subdomains are
            # not reachable via the local Clash VPN — clear the proxy env vars
            # for this call only.  (os.environ is process-wide.)
            proxy_keys = ("HTTP_PROXY", "HTTPS_PROXY", "http_proxy", "https_proxy")
            saved = {key: os.environ.pop(key, None) for key in proxy_keys}
            try:
                raw = ak.stock_zh_a_hist(
                    symbol=symbol, period="daily",
                    start_date=start.strftime("%Y%m%d"),
                    end_date=end.strftime("%Y%m%d"),
                    adjust="qfq" if auto_adjust else "",
                )
            finally:
                # Only variables that were actually set are put back.
                for key, value in saved.items():
                    if value is not None:
                        os.environ[key] = value

            if raw is None or raw.empty:
                raise ValueError("empty response")

            raw = raw.rename(columns={
                "日期": "Date", "开盘": "Open", "最高": "High",
                "最低": "Low", "收盘": "Close", "成交量": "Volume",
            })
            raw["Date"] = pd.to_datetime(raw["Date"])
            raw = raw.set_index("Date").sort_index()

            wanted = ["Open", "High", "Low", "Close", "Volume"]
            for name in wanted:
                if name not in raw.columns:
                    raw[name] = np.nan
            return raw[wanted]
        except Exception as exc:
            logger.warning(
                "[cleaner] akshare prices %s failed, falling back to yfinance: %s",
                symbol, exc,
            )

    # Fallback: yfinance with exchange suffix
    suffix = ".SS" if symbol[:1] in ("6", "5") else ".SZ"
    return _fetch_us_prices(symbol + suffix, period, auto_adjust)
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def _get_us_fundamentals(symbol: str) -> Dict:
    """
    Build the standardised fundamentals dict from yfinance's ``Ticker.info``.

    Text fields (sector, industry, exchange, recommendation) default to an
    empty string, currency to "USD" and the company name to ``symbol``; every
    other absent metric is ``None``.  The business description is cut to 600
    characters.
    """
    import yfinance as yf

    info = yf.Ticker(symbol).info or {}
    lookup = info.get

    # (output key, yfinance key) pairs copied verbatim with a None default.
    plain_leading = (
        ("prev_close", "previousClose"),
        ("open", "open"),
        ("volume", "volume"),
        ("avg_volume", "averageVolume"),
        ("pe_ratio", "trailingPE"),
        ("forward_pe", "forwardPE"),
        ("pb_ratio", "priceToBook"),
        ("ps_ratio", "priceToSalesTrailing12Months"),
        ("eps_ttm", "trailingEps"),
        ("eps_forward", "forwardEps"),
        ("revenue", "totalRevenue"),
        ("revenue_growth", "revenueGrowth"),
        ("earnings_growth", "earningsGrowth"),
        ("gross_margin", "grossMargins"),
        ("operating_margin", "operatingMargins"),
        ("net_margin", "profitMargins"),
        ("roe", "returnOnEquity"),
        ("roa", "returnOnAssets"),
        ("debt_equity", "debtToEquity"),
        ("current_ratio", "currentRatio"),
        ("quick_ratio", "quickRatio"),
        ("free_cashflow", "freeCashflow"),
        ("dividend_yield", "dividendYield"),
        ("payout_ratio", "payoutRatio"),
        ("beta", "beta"),
        ("52w_high", "fiftyTwoWeekHigh"),
        ("52w_low", "fiftyTwoWeekLow"),
        ("analyst_target", "targetMeanPrice"),
        ("analyst_low", "targetLowPrice"),
        ("analyst_high", "targetHighPrice"),
        ("analyst_count", "numberOfAnalystOpinions"),
    )
    plain_trailing = (
        ("short_ratio", "shortRatio"),
        ("shares_out", "sharesOutstanding"),
        ("float_shares", "floatShares"),
    )

    result = {
        "company_name": lookup("longName", symbol),
        "symbol": symbol,
        "sector": lookup("sector", ""),
        "industry": lookup("industry", ""),
        "exchange": lookup("exchange", ""),
        "currency": lookup("currency", "USD"),
        "market_cap": lookup("marketCap"),
        # Prefer the current price; otherwise the regular-market price.
        "price": lookup("currentPrice") or lookup("regularMarketPrice"),
    }
    for out_key, src_key in plain_leading:
        result[out_key] = lookup(src_key)
    result["recommendation"] = lookup("recommendationKey", "")
    for out_key, src_key in plain_trailing:
        result[out_key] = lookup(src_key)

    summary = lookup("longBusinessSummary") or ""
    result["description"] = summary[:600]
    return result
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def _get_a_fundamentals(symbol: str) -> Dict:
    """
    Build the standardised fundamentals dict for an A-share symbol.

    The primary source is akshare's ``stock_individual_info_em``, read as
    (label, value) rows.  If akshare is missing, the request fails, or the
    answer is empty, the failure is logged at debug level and the yfinance
    lookup is used with a ".SS" / ".SZ" suffix; its currency and exchange are
    then overridden with the A-share values.

    Numeric fields go through ``_safe_float`` and are ``None`` when the row
    is absent or not numeric.
    """
    try:
        import akshare as ak
        import os

        # Same proxy workaround as the price fetcher: hide the proxy variables
        # for the duration of the request, then restore those that were set.
        proxy_keys = ("HTTP_PROXY", "HTTPS_PROXY", "http_proxy", "https_proxy")
        saved = {key: os.environ.pop(key, None) for key in proxy_keys}
        try:
            df = ak.stock_individual_info_em(symbol=symbol)
        finally:
            for key, value in saved.items():
                if value is not None:
                    os.environ[key] = value

        if df is None or df.empty:
            raise ValueError("empty")

        # First column holds the label, second column the value.
        info = {str(label): value
                for label, value in zip(df.iloc[:, 0], df.iloc[:, 1])}
        return {
            "company_name": info.get("股票简称", symbol),
            "symbol": symbol,
            "sector": info.get("行业", ""),
            "industry": info.get("行业", ""),
            "exchange": "SSE" if symbol[:1] in ("6", "5") else "SZSE",
            "currency": "CNY",
            "market_cap": _safe_float(info.get("总市值")),
            # NOTE(review): the latest-price row is looked up under both
            # "最新价" and "最新" — confirm the label used by the installed
            # akshare version.
            "price": _safe_float(info.get("最新价", info.get("最新"))),
            "pe_ratio": _safe_float(info.get("市盈率(动)")),
            "pb_ratio": _safe_float(info.get("市净率")),
            "roe": _safe_float(info.get("净资产收益率")),
            "dividend_yield": _safe_float(info.get("股息率(%)")),
            "52w_high": _safe_float(info.get("52周最高")),
            "52w_low": _safe_float(info.get("52周最低")),
            "eps_ttm": _safe_float(info.get("每股收益")),
            "revenue": None,
        }
    except Exception as exc:  # ImportError is already an Exception subclass
        logger.debug("[cleaner] akshare fundamentals %s: %s", symbol, exc)
        suffix = ".SS" if symbol[:1] in ("6", "5") else ".SZ"
        result = _get_us_fundamentals(symbol + suffix)
        # yfinance may return USD and an English name for A-share symbols;
        # override to correct values
        result["currency"] = "CNY"
        result["exchange"] = "SSE" if symbol[:1] in ("6", "5") else "SZSE"
        # if yfinance returned the suffixed symbol as name, strip it back
        if result.get("company_name") in (symbol, symbol + suffix):
            result["company_name"] = symbol
        return result
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def _safe_float(val) -> Optional[float]:
|
|
477
|
+
if val is None:
|
|
478
|
+
return None
|
|
479
|
+
try:
|
|
480
|
+
s = str(val).replace(",", "").replace("%", "").strip()
|
|
481
|
+
return float(s) if s and s not in ("--", "-", "N/A", "nan") else None
|
|
482
|
+
except ValueError:
|
|
483
|
+
return None
|