aria-code 4.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (284) hide show
  1. agents/__init__.py +32 -0
  2. agents/base.py +190 -0
  3. agents/deep/__init__.py +37 -0
  4. agents/deep/calibration_loop.py +144 -0
  5. agents/deep/critic.py +125 -0
  6. agents/deep/deepen.py +193 -0
  7. agents/deep/models.py +149 -0
  8. agents/deep/pipeline.py +164 -0
  9. agents/deep/quant_fusion.py +192 -0
  10. agents/deep/themes.py +95 -0
  11. agents/deep/tiers.py +106 -0
  12. agents/financial/__init__.py +10 -0
  13. agents/financial/catalyst.py +279 -0
  14. agents/financial/debate.py +145 -0
  15. agents/financial/earnings.py +303 -0
  16. agents/financial/fundamental.py +159 -0
  17. agents/financial/macro.py +99 -0
  18. agents/financial/news.py +207 -0
  19. agents/financial/risk.py +132 -0
  20. agents/financial/sector.py +279 -0
  21. agents/financial/synthesis.py +274 -0
  22. agents/financial/technical.py +258 -0
  23. agents/portfolio_agent.py +333 -0
  24. agents/realty/__init__.py +62 -0
  25. agents/realty/asset_diagnosis.py +150 -0
  26. agents/realty/business_match.py +165 -0
  27. agents/realty/cashflow_verify.py +208 -0
  28. agents/realty/contract_rules.py +209 -0
  29. agents/realty/energy_anomaly.py +188 -0
  30. agents/realty/exit_settlement.py +207 -0
  31. agents/realty/fulfillment_risk.py +205 -0
  32. agents/realty/ops_optimize.py +159 -0
  33. agents/realty/revenue_share.py +214 -0
  34. agents/registry.py +144 -0
  35. agents/sports/__init__.py +0 -0
  36. agents/sports/football_agent.py +169 -0
  37. agents/team.py +289 -0
  38. aliyun_data_client.py +660 -0
  39. apps/README.md +12 -0
  40. apps/__init__.py +2 -0
  41. apps/channels/README.md +15 -0
  42. apps/cli/README.md +13 -0
  43. apps/cli/__init__.py +2 -0
  44. apps/cli/bootstrap.py +99 -0
  45. apps/cli/codegen_paths.py +29 -0
  46. apps/cli/commands/__init__.py +16 -0
  47. apps/cli/commands/analysis_cmds.py +288 -0
  48. apps/cli/commands/backtest_cmds.py +1887 -0
  49. apps/cli/commands/broker_cmds.py +1154 -0
  50. apps/cli/commands/business_workflow_cmds.py +289 -0
  51. apps/cli/commands/catalog.py +84 -0
  52. apps/cli/commands/data_cmds.py +405 -0
  53. apps/cli/commands/diagnostic_cmds.py +179 -0
  54. apps/cli/commands/diagnostic_ops_cmds.py +696 -0
  55. apps/cli/commands/finance_render.py +12 -0
  56. apps/cli/commands/market.py +399 -0
  57. apps/cli/commands/market_cmds.py +1276 -0
  58. apps/cli/commands/market_context.py +425 -0
  59. apps/cli/commands/market_render.py +7 -0
  60. apps/cli/commands/model_cmds.py +1579 -0
  61. apps/cli/commands/ops_cmds.py +668 -0
  62. apps/cli/commands/portfolio_cmds.py +962 -0
  63. apps/cli/commands/report.py +377 -0
  64. apps/cli/commands/scaffold_templates.py +617 -0
  65. apps/cli/commands/session_cmds.py +179 -0
  66. apps/cli/commands/session_ux_cmds.py +280 -0
  67. apps/cli/commands/team.py +588 -0
  68. apps/cli/commands/team_render.py +8 -0
  69. apps/cli/commands/ui_cmds.py +358 -0
  70. apps/cli/commands/workflow_cmds.py +279 -0
  71. apps/cli/commands/workspace_cmds.py +1414 -0
  72. apps/cli/config_paths.py +70 -0
  73. apps/cli/config_store.py +61 -0
  74. apps/cli/deterministic.py +122 -0
  75. apps/cli/direct.py +48 -0
  76. apps/cli/github_app_auth.py +135 -0
  77. apps/cli/handlers/__init__.py +11 -0
  78. apps/cli/handlers/broker_handlers.py +122 -0
  79. apps/cli/handlers/chart_handlers.py +1309 -0
  80. apps/cli/handlers/market_handlers.py +2509 -0
  81. apps/cli/handlers/realty_handlers.py +114 -0
  82. apps/cli/handlers/strategy_advice.py +82 -0
  83. apps/cli/hooks.py +180 -0
  84. apps/cli/i18n.py +284 -0
  85. apps/cli/intent.py +136 -0
  86. apps/cli/intent_router.py +217 -0
  87. apps/cli/lifecycle_hooks.py +48 -0
  88. apps/cli/main.py +29 -0
  89. apps/cli/market_metadata.py +135 -0
  90. apps/cli/market_universe.py +265 -0
  91. apps/cli/message_processing.py +257 -0
  92. apps/cli/plan_mode.py +139 -0
  93. apps/cli/plotly_html.py +15 -0
  94. apps/cli/prediction_feedback.py +202 -0
  95. apps/cli/preflight.py +497 -0
  96. apps/cli/project_aria.py +60 -0
  97. apps/cli/prompts/__init__.py +0 -0
  98. apps/cli/prompts/coding.py +658 -0
  99. apps/cli/prompts/system_prompts.py +531 -0
  100. apps/cli/prompts/ui.py +434 -0
  101. apps/cli/providers/__init__.py +1 -0
  102. apps/cli/providers/base.py +271 -0
  103. apps/cli/providers/chat_routing.py +80 -0
  104. apps/cli/providers/llm/__init__.py +1 -0
  105. apps/cli/providers/llm/ollama_stream.py +1170 -0
  106. apps/cli/providers/llm/sse_stream.py +216 -0
  107. apps/cli/providers/runtime_bridge.py +185 -0
  108. apps/cli/runtime_consumer.py +489 -0
  109. apps/cli/session_export.py +87 -0
  110. apps/cli/session_jsonl.py +207 -0
  111. apps/cli/session_store.py +112 -0
  112. apps/cli/todo_tracker.py +190 -0
  113. apps/cli/tools/__init__.py +40 -0
  114. apps/cli/tools/context.py +46 -0
  115. apps/cli/tools/file_tools.py +112 -0
  116. apps/cli/tools/market_tools.py +549 -0
  117. apps/cli/tools/notebook_tools.py +111 -0
  118. apps/cli/tools/system_tools.py +669 -0
  119. apps/cli/tools/write_tools.py +715 -0
  120. apps/cli/tradingview_bridge.py +434 -0
  121. apps/cli/update_check.py +152 -0
  122. apps/cli/utils/__init__.py +0 -0
  123. apps/cli/utils/market_detect.py +1578 -0
  124. apps/daemon/README.md +14 -0
  125. apps/vscode/README.md +115 -0
  126. apps/vscode/package.json +70 -0
  127. aria_cli.py +11636 -0
  128. aria_code-4.1.3.dist-info/METADATA +952 -0
  129. aria_code-4.1.3.dist-info/RECORD +284 -0
  130. aria_code-4.1.3.dist-info/WHEEL +5 -0
  131. aria_code-4.1.3.dist-info/entry_points.txt +2 -0
  132. aria_code-4.1.3.dist-info/licenses/LICENSE +121 -0
  133. aria_code-4.1.3.dist-info/top_level.txt +50 -0
  134. aria_daemon.py +1295 -0
  135. aria_feishu_bot.py +1359 -0
  136. aria_relay_client.py +182 -0
  137. aria_relay_server.py +405 -0
  138. aria_telegram_bot.py +202 -0
  139. ariarc.py +328 -0
  140. artifacts.py +491 -0
  141. backtest_report.py +472 -0
  142. brokers/__init__.py +72 -0
  143. brokers/base.py +207 -0
  144. brokers/capabilities.py +264 -0
  145. brokers/cn/__init__.py +10 -0
  146. brokers/cn/easytrader_broker.py +193 -0
  147. brokers/cn/futu_broker.py +194 -0
  148. brokers/cn/longbridge_broker.py +190 -0
  149. brokers/cn/tiger_broker.py +196 -0
  150. brokers/cn/xtquant_broker.py +175 -0
  151. brokers/config.py +364 -0
  152. brokers/intl/__init__.py +5 -0
  153. brokers/intl/alpaca_broker.py +183 -0
  154. brokers/intl/ibkr_broker.py +215 -0
  155. brokers/intl/webull_broker.py +156 -0
  156. brokers/paper_broker.py +259 -0
  157. brokers/planning.py +296 -0
  158. brokers/registry.py +181 -0
  159. brokers/trading.py +237 -0
  160. change_store.py +127 -0
  161. command_safety.py +19 -0
  162. computer_use_tools.py +504 -0
  163. dashboard_generator.py +578 -0
  164. data_analysis_tools.py +808 -0
  165. data_cleaner.py +483 -0
  166. data_service.py +481 -0
  167. datasources/__init__.py +23 -0
  168. datasources/base.py +166 -0
  169. datasources/router.py +221 -0
  170. datasources/sources/__init__.py +15 -0
  171. datasources/sources/akshare_source.py +269 -0
  172. datasources/sources/alpha_vantage_source.py +202 -0
  173. datasources/sources/edgar_source.py +218 -0
  174. datasources/sources/finnhub_source.py +197 -0
  175. datasources/sources/fred_source.py +219 -0
  176. datasources/sources/tushare_source.py +141 -0
  177. datasources/sources/web_scraper_source.py +278 -0
  178. datasources/sources/world_bank_source.py +205 -0
  179. datasources/sources/yfinance_source.py +152 -0
  180. demo_player.py +204 -0
  181. doctor.py +508 -0
  182. file_analysis_tools.py +734 -0
  183. finance_formulas.py +389 -0
  184. football_data_client.py +1670 -0
  185. intent_classifier.py +358 -0
  186. local_finance_tools.py +3221 -0
  187. local_llm_provider.py +552 -0
  188. macro_tools.py +368 -0
  189. market_data_client.py +1899 -0
  190. mcp_client.py +506 -0
  191. memory_manager.py +245 -0
  192. model_capability.py +416 -0
  193. notification_tools.py +248 -0
  194. packages/__init__.py +23 -0
  195. packages/aria_agents/__init__.py +5 -0
  196. packages/aria_agents/manifest.py +69 -0
  197. packages/aria_core/__init__.py +34 -0
  198. packages/aria_core/architecture.py +192 -0
  199. packages/aria_core/export.py +124 -0
  200. packages/aria_core/manifest.py +65 -0
  201. packages/aria_infra/__init__.py +15 -0
  202. packages/aria_infra/arthera.py +52 -0
  203. packages/aria_infra/doctor.py +246 -0
  204. packages/aria_infra/product.py +37 -0
  205. packages/aria_mcp/__init__.py +25 -0
  206. packages/aria_mcp/bridge.py +38 -0
  207. packages/aria_mcp/config.py +97 -0
  208. packages/aria_mcp/tools.py +61 -0
  209. packages/aria_sdk/__init__.py +19 -0
  210. packages/aria_sdk/client.py +396 -0
  211. packages/aria_sdk/providers.py +70 -0
  212. packages/aria_sdk/streaming.py +73 -0
  213. packages/aria_sdk/types.py +86 -0
  214. packages/aria_services/__init__.py +55 -0
  215. packages/aria_services/context.py +258 -0
  216. packages/aria_services/data.py +11 -0
  217. packages/aria_services/provider_health.py +189 -0
  218. packages/aria_services/registry.py +213 -0
  219. packages/aria_services/usage.py +138 -0
  220. packages/aria_skills/__init__.py +5 -0
  221. packages/aria_skills/registry.py +59 -0
  222. packages/aria_tools/__init__.py +5 -0
  223. packages/aria_tools/registry.py +128 -0
  224. packages/quant_engine/__init__.py +6 -0
  225. packages/quant_engine/sports/__init__.py +72 -0
  226. packages/quant_engine/sports/calibrator.py +353 -0
  227. packages/quant_engine/sports/dixon_coles.py +234 -0
  228. packages/quant_engine/sports/elo.py +299 -0
  229. packages/quant_engine/sports/form.py +188 -0
  230. packages/quant_engine/sports/h2h.py +195 -0
  231. packages/quant_engine/sports/ml_model.py +354 -0
  232. packages/quant_engine/sports/predictor.py +311 -0
  233. packages/quant_engine/sports/tracker.py +664 -0
  234. packages/quant_engine/stochastic/__init__.py +27 -0
  235. packages/quant_engine/stochastic/gbm_enhanced.py +195 -0
  236. packages/quant_engine/stochastic/ito_calculus.py +477 -0
  237. packages/quant_engine/stochastic/kelly_criterion.py +181 -0
  238. packages/quant_engine/stochastic/monte_carlo_advanced.py +95 -0
  239. packages/quant_engine/stochastic/options_pricing.py +573 -0
  240. packages/quant_engine/stochastic/stochastic_processes.py +90 -0
  241. plan_utils.py +194 -0
  242. plugin_loader.py +328 -0
  243. portfolio_ledger.py +262 -0
  244. privacy/__init__.py +5 -0
  245. privacy/feedback.py +123 -0
  246. project_tools.py +525 -0
  247. providers/__init__.py +30 -0
  248. providers/llm/__init__.py +19 -0
  249. providers/llm/anthropic.py +184 -0
  250. providers/llm/base.py +139 -0
  251. providers/llm/ollama.py +128 -0
  252. providers/llm/openai_compat.py +282 -0
  253. providers/llm/registry.py +358 -0
  254. realty_data_tools.py +659 -0
  255. report_generator.py +1314 -0
  256. runtime/__init__.py +103 -0
  257. runtime/agent_loop.py +1183 -0
  258. runtime/approval.py +51 -0
  259. runtime/events.py +102 -0
  260. runtime/gateway.py +128 -0
  261. runtime/lsp.py +346 -0
  262. runtime/subagent.py +258 -0
  263. runtime/tool_executor.py +104 -0
  264. runtime/tool_policy.py +106 -0
  265. safety/__init__.py +21 -0
  266. safety/permissions.py +275 -0
  267. setup_wizard.py +653 -0
  268. strategy_vault.py +420 -0
  269. ui/__init__.py +100 -0
  270. ui/banner.py +310 -0
  271. ui/completer.py +391 -0
  272. ui/console.py +271 -0
  273. ui/image_render.py +243 -0
  274. ui/input_box.py +376 -0
  275. ui/picker.py +195 -0
  276. ui/render/__init__.py +11 -0
  277. ui/render/finance.py +1480 -0
  278. ui/render/market.py +225 -0
  279. ui/render/output.py +681 -0
  280. ui/render/team.py +346 -0
  281. ui/robot.py +235 -0
  282. workspace/__init__.py +6 -0
  283. workspace/files.py +170 -0
  284. workspace/verify.py +113 -0
file_analysis_tools.py ADDED
@@ -0,0 +1,734 @@
1
+ """
2
+ file_analysis_tools.py — 多格式文件解析与内容提取层
3
+ =====================================================
4
+ 支持格式:
5
+ PDF — pdfplumber (优先) 或 pypdf
6
+ Word/DOCX — python-docx
7
+ Excel/XLSX — openpyxl + pandas
8
+ CSV/TSV — pandas
9
+ JSON/JSONL — 内置 json
10
+ Markdown/TXT — 直接读取
11
+ 图片 — PIL/Pillow 元数据 + base64 (发给视觉模型)
12
+ HTML — BeautifulSoup4 提取正文
13
+ 代码文件 — 语法感知提取 (py/js/ts/go/java/cpp 等)
14
+
15
+ 全部解析函数返回统一的 FileContent 数据类实例:
16
+ FileContent(success: bool, file_type: str, content: str, metadata: dict, ...)
17
+
18
+ 依赖安装(可选,按需安装):
19
+ pip install pdfplumber python-docx openpyxl pandas pillow beautifulsoup4
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import base64
25
+ import json
26
+ import logging
27
+ import mimetypes
28
+ import os
29
+ import re
30
+ from dataclasses import dataclass, field
31
+ from pathlib import Path
32
+ from typing import Any, Dict, List, Optional, Tuple
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+ # ── Optional imports ──────────────────────────────────────────────────────────
37
+
38
+ def _try(mod):
39
+ try:
40
+ return __import__(mod)
41
+ except ImportError:
42
+ return None
43
+
44
+ # Detect available parsers at module load time (fast, no exceptions in hot path)
45
# Probe every optional parser library once, in a fixed order; each name is
# bound to the imported top-level module, or to None when it is not installed.
(
    _pdfplumber,
    _pypdf,
    _docx_mod,
    _openpyxl,
    _pd,
    _bs4,
    _PIL,
) = [
    _try(_name)
    for _name in ("pdfplumber", "pypdf", "docx", "openpyxl", "pandas", "bs4", "PIL")
]
52
+
53
+ # ── Data structures ───────────────────────────────────────────────────────────
54
+
55
@dataclass
class FileContent:
    """Uniform result record produced by every parser in this module.

    The first six fields are required (the parsers pass them positionally);
    the remaining fields default to empty / false values so that failure
    results can be built with just an ``error`` keyword.
    """

    # --- required ---------------------------------------------------------
    success: bool    # False when parsing failed; see ``error``
    file_type: str   # "pdf" | "docx" | "xlsx" | "csv" | "json" | "image" | "code" | "text"
    path: str        # path of the parsed file, as a string
    filename: str    # final path component
    size_kb: float   # file size in KiB (bytes / 1024)
    content: str     # extracted text (may be truncated)

    # --- optional ---------------------------------------------------------
    metadata: Dict[str, Any] = field(default_factory=dict)
    # nested as [sheet[row[cell]]]
    tables: List[List[List[Any]]] = field(default_factory=list)
    # base64-encoded images
    images_b64: List[str] = field(default_factory=list)
    error: Optional[str] = None   # failure description when ``success`` is False
    truncated: bool = False       # set by parsers when the extracted text was cut
    char_count: int = 0           # length of ``content``; filled in by ``parse_file``
70
+
71
+
72
+ # ── Size / truncation limits ──────────────────────────────────────────────────
73
+
74
# Upper bound on extracted text, in characters (~20k tokens, safe for most
# context windows).
MAX_TEXT_CHARS = 80 * 1000
# Row cap applied per sheet / table.
MAX_TABLE_ROWS = 200
# Number of leading PDF pages that are read.
MAX_PAGES = 50
# Amount of content used by the quick-summary prompt.
SUMMARY_CHARS = 3 * 1000
78
+
79
+ # ── Main dispatcher ───────────────────────────────────────────────────────────
80
+
81
def parse_file(path_str: str, max_chars: int = MAX_TEXT_CHARS,
               include_images: bool = False) -> FileContent:
    """Parse a file of any supported type into a ``FileContent`` record.

    Args:
        path_str: Path to the file; ``~`` is expanded and the path resolved.
        max_chars: Forwarded to the parser as the cap on extracted text.
        include_images: Forwarded to the parser; asks for base64 image data.

    Returns:
        The parser's ``FileContent`` with ``char_count`` filled in, or a
        ``success=False`` record when the path does not exist, is not a
        regular file, is larger than 100 MB, or the parser raised.
    """
    target = Path(path_str).expanduser().resolve()
    location = str(target)

    if not target.exists():
        return FileContent(False, "unknown", location, target.name, 0,
                           "", error=f"文件不存在: {target}")
    if not target.is_file():
        return FileContent(False, "unknown", location, target.name, 0,
                           "", error=f"不是文件: {target}")

    size_kb = target.stat().st_size / 1024
    if size_kb > 100 * 1024:  # 100 MB guard
        return FileContent(False, "unknown", location, target.name, size_kb,
                           "", error="文件过大(>100MB),请使用更小的文件")

    ext = target.suffix.lstrip(".").lower()

    # Extension groups, one row per parser.
    by_parser = (
        (_parse_pdf, ("pdf",)),
        (_parse_docx, ("docx", "doc")),
        (_parse_excel, ("xlsx", "xls")),
        (_parse_csv, ("csv", "tsv")),
        (_parse_json, ("json", "jsonl")),
        (_parse_html, ("html", "htm", "xml")),
        (_parse_text, ("md", "txt", "rst", "log", "yaml", "yml",
                       "toml", "ini", "env")),
        (_parse_image, ("png", "jpg", "jpeg", "gif", "webp", "bmp")),
    )
    handlers = {e: parser for parser, exts in by_parser for e in exts}

    code_exts = {
        "py", "js", "ts", "tsx", "jsx", "go", "java", "c", "cpp", "h", "hpp",
        "rs", "rb", "php", "swift", "kt", "scala", "sh", "bash", "zsh",
        "sql", "r", "m", "cs", "vb", "lua", "perl", "ps1",
    }

    # Source-code extensions win; anything unrecognised is read as plain text.
    if ext in code_exts:
        handler = _parse_code
    else:
        handler = handlers.get(ext, _parse_text)

    try:
        parsed = handler(target, max_chars=max_chars, include_images=include_images)
        parsed.char_count = len(parsed.content)
        return parsed
    except Exception as e:
        logger.exception("parse_file failed for %s", target)
        return FileContent(False, ext, location, target.name, size_kb,
                           "", error=f"解析失败: {e}")
150
+
151
+
152
+ # ── PDF ───────────────────────────────────────────────────────────────────────
153
+
154
def _parse_pdf(path: Path, max_chars: int, include_images: bool) -> FileContent:
    """Extract text (and, with pdfplumber, tables) from a PDF.

    pdfplumber is preferred; pypdf is the fallback (text only). Only the
    first ``MAX_PAGES`` pages are read and at most 20 tables are returned.

    Args:
        path: PDF file to read.
        max_chars: Cap on the returned text length.
        include_images: Accepted for signature parity; not used here.

    Returns:
        ``FileContent`` of type ``"pdf"``. ``truncated`` is True when the text
        was cut at ``max_chars`` or when pages beyond ``MAX_PAGES`` were
        skipped. ``success=False`` when no PDF library is installed.
    """
    size_kb = path.stat().st_size / 1024
    meta: Dict[str, Any] = {"pages": 0}
    text_parts: List[str] = []
    tables: List[Any] = []

    if _pdfplumber:
        import pdfplumber
        with pdfplumber.open(str(path)) as pdf:
            meta["pages"] = len(pdf.pages)
            meta["pdf_info"] = dict(pdf.metadata or {})
            for page_no, page in enumerate(pdf.pages[:MAX_PAGES], start=1):
                page_text = (page.extract_text() or "").strip()
                if page_text:
                    text_parts.append(f"[第{page_no}页]\n{page_text}")
                # Keep only non-empty tables.
                tables.extend(tbl for tbl in (page.extract_tables() or []) if tbl)
    elif _pypdf:
        import pypdf
        reader = pypdf.PdfReader(str(path))
        meta["pages"] = len(reader.pages)
        for page_no, page in enumerate(reader.pages[:MAX_PAGES], start=1):
            page_text = (page.extract_text() or "").strip()
            if page_text:
                text_parts.append(f"[第{page_no}页]\n{page_text}")
    else:
        return FileContent(False, "pdf", str(path), path.name, size_kb, "",
                           error="未安装 PDF 解析库,请运行: pip install pdfplumber")

    full_text = "\n\n".join(text_parts)
    # Pages past MAX_PAGES were never read, so the content is incomplete even
    # when the extracted text itself fits within max_chars.
    truncated = len(full_text) > max_chars or meta["pages"] > MAX_PAGES
    return FileContent(True, "pdf", str(path), path.name, size_kb,
                       full_text[:max_chars], meta, tables[:20],
                       truncated=truncated)
190
+
191
+
192
+ # ── DOCX ──────────────────────────────────────────────────────────────────────
193
+
194
def _parse_docx(path: Path, max_chars: int, include_images: bool) -> FileContent:
    """Extract paragraphs, headings and tables from a Word document.

    Headings are rendered as Markdown ``#`` markers and every table is also
    appended to the text as a pipe-separated block. Each table is capped at
    ``MAX_TABLE_ROWS`` rows and at most 10 tables go into ``tables``.

    Args:
        path: Document to read.
        max_chars: Cap on the returned text length.
        include_images: Accepted for signature parity; not used here.

    Returns:
        ``FileContent`` of type ``"docx"``. ``truncated`` is True when the text
        was cut at ``max_chars`` or a table lost rows to the row cap.
        ``success=False`` when python-docx is not installed.
    """
    size_kb = path.stat().st_size / 1024
    if not _docx_mod:
        return FileContent(False, "docx", str(path), path.name, size_kb, "",
                           error="未安装 python-docx,请运行: pip install python-docx")
    import docx
    doc = docx.Document(str(path))

    parts: List[str] = []
    tables: List[Any] = []

    # Core properties are best-effort: a document without them still parses.
    meta: Dict[str, Any] = {}
    try:
        cp = doc.core_properties
        meta = {
            "author": cp.author,
            "created": str(cp.created)[:10] if cp.created else "",
            "modified": str(cp.modified)[:10] if cp.modified else "",
            "title": cp.title or "",
            "subject": cp.subject or "",
        }
    except Exception as exc:
        logger.debug("docx core properties unavailable for %s: %s", path, exc)

    # Paragraphs
    for para in doc.paragraphs:
        t = para.text.strip()
        if not t:
            continue
        # A paragraph may have no resolvable style (or an unnamed one); treat
        # it as body text instead of failing the whole document.
        style = para.style
        style_name = (style.name if style is not None else "") or ""
        if style_name.startswith("Heading"):
            level = style_name.replace("Heading ", "")
            marker = "#" * int(level) if level.isdigit() else "##"
            parts.append(f"\n{marker} {t}")
        else:
            parts.append(t)

    # Tables
    rows_capped = False
    for tbl in doc.tables:
        all_rows = tbl.rows
        if len(all_rows) > MAX_TABLE_ROWS:
            rows_capped = True
        rows = [[cell.text.strip() for cell in row.cells]
                for row in all_rows[:MAX_TABLE_ROWS]]
        if rows:
            tables.append(rows)
            # Inline text representation
            header = " | ".join(rows[0])
            sep = " | ".join(["---"] * len(rows[0]))
            body = "\n".join(" | ".join(r) for r in rows[1:])
            parts.append(f"\n[表格]\n{header}\n{sep}\n{body}")

    full_text = "\n".join(parts)
    truncated = len(full_text) > max_chars or rows_capped
    meta["paragraphs"] = len(doc.paragraphs)
    meta["tables"] = len(doc.tables)
    return FileContent(True, "docx", str(path), path.name, size_kb,
                       full_text[:max_chars], meta, tables[:10],
                       truncated=truncated)
251
+
252
+
253
+ # ── Excel ─────────────────────────────────────────────────────────────────────
254
+
255
def _parse_excel(path: Path, max_chars: int, include_images: bool) -> FileContent:
    """Read every sheet of an Excel workbook into text and tables.

    At most ``MAX_TABLE_ROWS`` rows are read per sheet. The text shows the
    first 50 rows of each sheet (cells clipped to 30 characters) followed by
    mean / max / min for up to five numeric columns.

    Args:
        path: Workbook to read.
        max_chars: Cap on the returned text length.
        include_images: Accepted for signature parity; not used here.

    Returns:
        ``FileContent`` of type ``"xlsx"``; ``success=False`` when pandas is
        missing or the workbook cannot be read.
    """
    size_kb = path.stat().st_size / 1024
    if not _pd:
        return FileContent(False, "xlsx", str(path), path.name, size_kb, "",
                           error="未安装 pandas,请运行: pip install pandas openpyxl")

    import pandas as pd
    # openpyxl cannot open legacy .xls workbooks; let pandas choose the engine
    # for those instead of forcing a reader that is guaranteed to fail.
    engine = None if path.suffix.lower() == ".xls" else "openpyxl"
    try:
        sheets = pd.read_excel(str(path), sheet_name=None, engine=engine,
                               nrows=MAX_TABLE_ROWS)
    except Exception as e:
        return FileContent(False, "xlsx", str(path), path.name, size_kb, "",
                           error=f"读取 Excel 失败: {e}")

    parts: List[str] = []
    tables: List[Any] = []
    meta: Dict[str, Any] = {"sheets": list(sheets.keys()), "sheet_count": len(sheets)}

    for sname, raw_df in sheets.items():
        # Select numeric columns BEFORE stringifying: after astype(str) every
        # column is object-typed and no statistics could ever be produced.
        num_cols = raw_df.select_dtypes(include="number")

        df = raw_df.fillna("").astype(str)
        rows = [list(df.columns)] + df.values.tolist()
        tables.append(rows)
        meta[f"sheet_{sname}_shape"] = f"{len(df)} 行 × {len(df.columns)} 列"

        # Text representation (first 50 rows shown)
        header = " | ".join(str(c) for c in df.columns)
        sep = " | ".join(["---"] * len(df.columns))
        body_rows = df.head(50).values.tolist()
        body = "\n".join(" | ".join(str(v)[:30] for v in r) for r in body_rows)
        parts.append(f"\n[Sheet: {sname}] ({len(df)} 行 × {len(df.columns)} 列)\n"
                     f"{header}\n{sep}\n{body}")

        # Basic stats for numeric columns
        if not num_cols.empty:
            stats_lines = []
            for col in list(num_cols.columns)[:5]:
                s = num_cols[col].describe()
                stats_lines.append(f" {col}: 均值={s.get('mean','')} 最大={s.get('max','')} 最小={s.get('min','')}")
            if stats_lines:
                parts.append("[数值统计]\n" + "\n".join(stats_lines))

    full_text = "\n".join(parts)
    truncated = len(full_text) > max_chars
    return FileContent(True, "xlsx", str(path), path.name, size_kb,
                       full_text[:max_chars], meta, tables,
                       truncated=truncated)
302
+
303
+
304
+ # ── CSV ───────────────────────────────────────────────────────────────────────
305
+
306
def _parse_csv(path: Path, max_chars: int, include_images: bool) -> FileContent:
    """Load a CSV/TSV file with pandas and render a pipe-separated preview.

    Without pandas the file is handed to ``_parse_text``. Reading is tried as
    UTF-8 first and GBK second; at most ``MAX_TABLE_ROWS`` rows are loaded and
    malformed lines are skipped. The text preview shows the first 80 rows
    with each cell clipped to 25 characters.

    Returns:
        ``FileContent`` of type ``"csv"``, or a ``success=False`` record
        carrying the second attempt's error when both reads fail.
    """
    size_kb = path.stat().st_size / 1024
    if not _pd:
        # Fallback: plain text
        return _parse_text(path, max_chars, include_images)

    import pandas as pd
    separator = "\t" if path.suffix.lower() == ".tsv" else ","

    df = None
    failure = None
    for codec in ("utf-8", "gbk"):
        try:
            df = pd.read_csv(str(path), sep=separator, nrows=MAX_TABLE_ROWS,
                             encoding=codec, on_bad_lines="skip")
        except Exception as exc:
            failure = exc
        else:
            break
    if df is None:
        return FileContent(False, "csv", str(path), path.name, size_kb, "",
                           error=f"CSV 读取失败: {failure}")

    n_rows = len(df)
    n_cols = len(df.columns)
    meta: Dict[str, Any] = {
        "rows": n_rows,
        "columns": n_cols,
        "col_names": list(df.columns)[:30],
    }
    try:
        meta["stats_preview"] = df.describe(include="all").to_string()[:1000]
    except Exception:
        pass

    table = [list(df.columns)] + df.values.tolist()

    header = " | ".join(str(label) for label in df.columns)
    divider = " | ".join(["---"] * n_cols)
    preview_lines = [
        " | ".join(str(cell)[:25] for cell in record)
        for record in df.head(80).values.tolist()
    ]
    body = "\n".join(preview_lines)
    full_text = f"[CSV: {path.name}] {n_rows} 行 × {n_cols} 列\n{header}\n{divider}\n{body}"

    return FileContent(True, "csv", str(path), path.name, size_kb,
                       full_text[:max_chars], meta, [table[:MAX_TABLE_ROWS]],
                       truncated=len(full_text) > max_chars)
346
+
347
+
348
+ # ── JSON ──────────────────────────────────────────────────────────────────────
349
+
350
def _parse_json(path: Path, max_chars: int, include_images: bool) -> FileContent:
    """Parse a JSON or JSONL file and return it pretty-printed.

    For ``.jsonl`` only the first 500 lines are considered; blank lines and
    lines that are not valid JSON are skipped. Files are decoded as
    ``utf-8-sig`` so that a leading UTF-8 byte-order mark does not make
    ``json.loads`` fail.

    Args:
        path: File to read.
        max_chars: Cap on the returned (pretty-printed) text length.
        include_images: Accepted for signature parity; not used here.

    Returns:
        ``FileContent`` of type ``"json"``. ``truncated`` is True when the text
        was cut at ``max_chars`` or JSONL lines past the 500-line cap were
        dropped. ``success=False`` on any read or parse error.
    """
    size_kb = path.stat().st_size / 1024
    suffix = path.suffix.lower()
    jsonl_line_cap = 500
    lines_dropped = False

    try:
        if suffix == ".jsonl":
            records = []
            # Stream the file: only the first `jsonl_line_cap` lines are used,
            # so there is no need to load all of it into memory.
            with path.open(encoding="utf-8-sig") as fh:
                for line_no, line in enumerate(fh):
                    if line_no >= jsonl_line_cap:
                        lines_dropped = True
                        break
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        records.append(json.loads(line))
                    except ValueError:
                        # Best effort: a malformed record is skipped, not fatal.
                        continue
            data = records
            meta = {"format": "JSONL", "records": len(records), "sample": records[:3]}
        else:
            raw = path.read_text(encoding="utf-8-sig")
            data = json.loads(raw)
            meta = {"format": "JSON", "type": type(data).__name__}
            if isinstance(data, list):
                meta["length"] = len(data)
                meta["sample"] = data[:3]
            elif isinstance(data, dict):
                meta["keys"] = list(data.keys())[:20]

        # Pretty-print (truncated)
        text = json.dumps(data, ensure_ascii=False, indent=2)
        truncated = lines_dropped or len(text) > max_chars
        return FileContent(True, "json", str(path), path.name, size_kb,
                           text[:max_chars], meta, truncated=truncated)
    except Exception as e:
        return FileContent(False, "json", str(path), path.name, size_kb, "",
                           error=f"JSON 解析失败: {e}")
383
+
384
+
385
+ # ── HTML ──────────────────────────────────────────────────────────────────────
386
+
387
def _parse_html(path: Path, max_chars: int, include_images: bool) -> FileContent:
    """Extract readable text from an HTML (or XML) file.

    With BeautifulSoup installed, script/style/nav/footer/header elements are
    removed and the text of ``<main>``, ``<article>`` or ``<body>`` (first one
    found) is returned. Without it, a regex fallback drops script/style
    blocks, strips the remaining tags, decodes character entities and
    collapses whitespace.

    Args:
        path: File to read (decoded as UTF-8, undecodable bytes replaced).
        max_chars: Cap on the returned text length.
        include_images: Accepted for signature parity; not used here.

    Returns:
        ``FileContent`` of type ``"html"`` whose metadata holds the page
        ``title`` (empty string when there is none).
    """
    size_kb = path.stat().st_size / 1024
    raw = path.read_text(encoding="utf-8", errors="replace")
    meta: Dict[str, Any] = {}

    if _bs4:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(raw, "html.parser")
        # Remove scripts/styles
        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()
        title = soup.find("title")
        meta["title"] = title.get_text().strip() if title else ""
        # Get main content
        main = soup.find("main") or soup.find("article") or soup.find("body") or soup
        text = main.get_text(separator="\n").strip()
        # Collapse blank lines
        text = re.sub(r"\n{3,}", "\n\n", text)
    else:
        from html import unescape

        title_match = re.search(r"(?is)<title[^>]*>(.*?)</title\s*>", raw)
        meta["title"] = unescape(title_match.group(1)).strip() if title_match else ""
        # Drop script/style elements together with their content first;
        # stripping only the tags would leave JS/CSS source in the text.
        text = re.sub(r"(?is)<(script|style)\b.*?</\1\s*>", " ", raw)
        # Simple tag stripping
        text = re.sub(r"<[^>]+>", " ", text)
        text = re.sub(r"\s+", " ", unescape(text)).strip()

    truncated = len(text) > max_chars
    return FileContent(True, "html", str(path), path.name, size_kb,
                       text[:max_chars], meta, truncated=truncated)
413
+
414
+
415
+ # ── Plain Text / Markdown / Code ──────────────────────────────────────────────
416
+
417
def _parse_text(path: Path, max_chars: int, include_images: bool) -> FileContent:
    """Read a plain-text file, trying UTF-8, then GBK, then Latin-1.

    Args:
        path: File to read.
        max_chars: Cap on the returned text length.
        include_images: Accepted for signature parity; not used here.

    Returns:
        ``FileContent`` of type ``"text"`` whose metadata holds the line count
        and the encoding that actually decoded the file.
    """
    size_kb = path.stat().st_size / 1024

    for candidate in ("utf-8", "gbk"):
        try:
            text = path.read_text(encoding=candidate)
        except UnicodeDecodeError:
            continue
        used_encoding = candidate
        break
    else:
        # Latin-1 maps every byte value to a code point, so this last attempt
        # cannot raise UnicodeDecodeError.
        text = path.read_text(encoding="latin-1")
        used_encoding = "latin-1"

    # Count a final line that has no trailing newline as a line too.
    lines = text.count("\n") + (1 if text and not text.endswith("\n") else 0)
    meta = {"lines": lines, "encoding": used_encoding}
    truncated = len(text) > max_chars
    return FileContent(True, "text", str(path), path.name, size_kb,
                       text[:max_chars], meta, truncated=truncated)
433
+
434
+
435
def _parse_code(path: Path, max_chars: int, include_images: bool) -> FileContent:
    """Read a source file and collect basic structure metadata.

    The file is decoded as UTF-8, then GBK, then Latin-1. Metadata records the
    extension as ``language``, total and blank line counts and, for languages
    with known patterns (py, js/jsx, ts/tsx, go, java, rs), up to 30 unique
    function/class-like names found by regex.

    Args:
        path: File to read.
        max_chars: Cap on the returned text length.
        include_images: Accepted for signature parity; not used here.

    Returns:
        ``FileContent`` of type ``"code"``.
    """
    size_kb = path.stat().st_size / 1024

    for candidate in ("utf-8", "gbk"):
        try:
            text = path.read_text(encoding=candidate)
        except UnicodeDecodeError:
            continue
        break
    else:
        # Latin-1 maps every byte value to a code point, so this last attempt
        # cannot raise UnicodeDecodeError.
        text = path.read_text(encoding="latin-1")

    suffix = path.suffix.lstrip(".").lower()
    lines = text.splitlines()
    meta: Dict[str, Any] = {
        "language": suffix,
        "lines": len(lines),
        "blank_lines": sum(1 for line in lines if not line.strip()),
    }

    # Extract function/class names
    _PATTERNS = {
        "py": (r"^(?:async\s+)?def\s+(\w+)", r"^class\s+(\w+)"),
        "js": (r"function\s+(\w+)\s*\(", r"class\s+(\w+)"),
        "ts": (r"function\s+(\w+)\s*\(", r"class\s+(\w+)", r"interface\s+(\w+)"),
        "go": (r"^func\s+\(?[^)]*\)?\s*(\w+)\s*\(", r"^type\s+(\w+)\s+struct"),
        "java": (r"(?:public|private|protected)?\s+\w+\s+(\w+)\s*\(", r"class\s+(\w+)"),
        "rs": (r"^(?:pub\s+)?fn\s+(\w+)", r"^(?:pub\s+)?struct\s+(\w+)"),
    }
    # JSX/TSX share the declaration syntax of their base languages.
    _ALIASES = {"jsx": "js", "tsx": "ts"}
    patterns = _PATTERNS.get(_ALIASES.get(suffix, suffix), ())

    symbols: List[str] = []
    for pat in patterns:
        # At most 20 hits per pattern.
        symbols.extend(re.findall(pat, text, re.MULTILINE)[:20])
    if symbols:
        # dict.fromkeys de-duplicates while keeping first-seen order.
        meta["symbols"] = list(dict.fromkeys(symbols))[:30]

    truncated = len(text) > max_chars
    return FileContent(True, "code", str(path), path.name, size_kb,
                       text[:max_chars], meta, truncated=truncated)
474
+
475
+
476
+ # ── Image ─────────────────────────────────────────────────────────────────────
477
+
478
def _parse_image(path: Path, max_chars: int, include_images: bool) -> FileContent:
    """Describe an image file and optionally attach it as a base64 data URI.

    With Pillow installed, width, height, mode and format are recorded and
    ``has_exif`` is set when the image carries EXIF data. Metadata extraction
    is best-effort: a failure is logged and the image is still returned.

    Args:
        path: Image file to read.
        max_chars: Accepted for signature parity; not used here.
        include_images: When True and the file is under 10 MB, the raw bytes
            are returned in ``images_b64`` as a ``data:<mime>;base64,...`` URI.

    Returns:
        ``FileContent`` of type ``"image"`` whose content is a short
        human-readable description.
    """
    size_kb = path.stat().st_size / 1024
    suffix = path.suffix.lstrip(".").lower()
    mime = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg",
            "gif": "image/gif", "webp": "image/webp", "bmp": "image/bmp"}.get(suffix, "image/png")
    meta: Dict[str, Any] = {"mime": mime, "size_kb": round(size_kb, 1)}

    if _PIL:
        from PIL import Image
        try:
            with Image.open(str(path)) as img:
                meta["width"] = img.width
                meta["height"] = img.height
                meta["mode"] = img.mode
                meta["format"] = img.format
                # Public accessor instead of the private `_getexif`; looked up
                # defensively in case the installed Pillow lacks it.
                read_exif = getattr(img, "getexif", None)
                if read_exif is not None and read_exif():
                    meta["has_exif"] = True
        except Exception as exc:
            logger.debug("image metadata unavailable for %s: %s", path, exc)

    # Base64 for vision models
    images_b64: List[str] = []
    if include_images and size_kb < 10 * 1024:  # < 10MB
        try:
            b64 = base64.b64encode(path.read_bytes()).decode()
            images_b64 = [f"data:{mime};base64,{b64}"]
        except Exception as exc:
            logger.warning("could not base64-encode image %s: %s", path, exc)

    text = (f"[图片文件: {path.name}]\n"
            f"尺寸: {meta.get('width','?')}×{meta.get('height','?')} px\n"
            f"格式: {meta.get('format',suffix.upper())}\n"
            f"大小: {size_kb:.1f} KB")
    return FileContent(True, "image", str(path), path.name, size_kb,
                       text, meta, images_b64=images_b64)
513
+
514
+
515
+ # ── 多层分析提示词生成器 ──────────────────────────────────────────────────────
516
+
517
def build_analysis_prompt(fc: FileContent, layer: int = 1,
                          domain: str = "auto",
                          question: str = "") -> str:
    """Build the prompt text for one analysis layer of a parsed file.

    Layers:
        1 -- quick summary; embeds the first ``SUMMARY_CHARS`` characters
        2 -- in-depth content analysis; embeds the full content
        3 -- domain-specific analysis, prefixed with the domain persona
        4 -- actionable recommendations; embeds the first half of the content
             (at least ``SUMMARY_CHARS`` characters)

    Any other ``layer`` value falls back to layer 1. Only the requested
    prompt is built, so the content is not copied into unused templates.

    Args:
        fc: Parsed file to describe.
        layer: Analysis depth, 1-4.
        domain: Persona key; ``"auto"`` asks ``_detect_domain`` and unknown
            keys fall back to ``"general"``.
        question: When non-empty, overrides ``layer`` and produces a prompt
            that asks this question about the full content.

    Returns:
        The prompt string.
    """
    type_zh = {
        "pdf": "PDF 文档",
        "docx": "Word 文档",
        "xlsx": "Excel 表格",
        "csv": "CSV 数据文件",
        "json": "JSON 数据",
        "image": "图片",
        "code": "代码文件",
        "text": "文本文件",
        "html": "网页文件",
    }.get(fc.file_type, "文件")

    trunc_note = "\n\n⚠️ 注意:文件内容已被截断(超出上下文限制),以下为前段内容。" if fc.truncated else ""
    meta_summary = _format_meta(fc)

    # Auto-detect domain from content
    if domain == "auto":
        domain = _detect_domain(fc)

    # A user question replaces the layered templates entirely.
    if question:
        return (f"关于以下{type_zh},用户提问:\n\n**{question}**\n\n"
                f"{meta_summary}{trunc_note}\n\n---文件内容---\n{fc.content}")

    if layer == 2:
        return f"""请对以下{type_zh}进行**深度内容分析**:
{meta_summary}
分析维度:
1. **结构分析** — 文件章节/字段组织,逻辑流程
2. **数据要点** — 关键数字、趋势、比较(列表格式)
3. **异常与亮点** — 与常规预期显著不同的点
4. **信息完整性** — 是否有缺失、矛盾或模糊内容
5. **数据质量** — 若为数据文件:空值率、一致性
{trunc_note}

---文件内容---
{fc.content}"""

    if layer in (3, 4):
        _DOMAIN_CONTEXT = {
            "finance": "你是资深财务分析师,专注财报分析、现金流、盈利质量、风险敞口。",
            "legal": "你是法律顾问,关注合同条款、风险条款、义务约束、免责声明。",
            "tech": "你是高级软件工程师,评估代码质量、架构、安全、性能、可维护性。",
            "research": "你是研究员,提炼论文方法论、数据、结论、局限性及引用。",
            "realty": "你是不动产分析师,关注物业数据、租金、估值、市场趋势、合规风险。",
            "medical": "你是医学顾问,总结诊断信息、治疗方案、用药风险(不构成诊断建议)。",
            "general": "你是专业文档分析师,全面理解文件内容。",
        }
        domain_ctx = _DOMAIN_CONTEXT.get(domain, _DOMAIN_CONTEXT["general"])

        if layer == 3:
            return f"""{domain_ctx}

请对以下{type_zh}进行**领域专项分析**:
{meta_summary}
重点关注:
1. **核心指标解读** — 本领域最重要的量化指标及其含义
2. **潜在风险** — 文件中反映或隐含的风险点
3. **与行业基准的偏差** — 什么是正常水平,当前数据如何?
4. **合规/规范性** — 是否符合本领域常规标准
5. **深层逻辑** — 表面数据背后的原因/驱动因素
{trunc_note}

---文件内容---
{fc.content}"""

        return f"""{domain_ctx}

基于以下{type_zh}的内容,请给出**可执行的行动建议**:
{meta_summary}
要求:
1. **立即行动** — 需要立刻处理的事项(按优先级)
2. **改进建议** — 中期可以优化的方面
3. **风险预警** — 需要关注但未必立刻行动的隐患
4. **问题清单** — 文件不清晰/需要补充的 3-5 个问题
5. **下一步** — 建议的后续分析或决策步骤
{trunc_note}

---文件内容---
{fc.content[:max(len(fc.content)//2, SUMMARY_CHARS)]}"""

    # Layer 1, and the fallback for any unrecognised layer value.
    return f"""请对以下{type_zh}进行**快速摘要分析**(300字以内):
{meta_summary}
要求:
1. 用一句话说明文件核心主题
2. 列出 3-5 个关键发现/数据点
3. 标注文件覆盖的时间范围(如有)
4. 指出最重要的结论或结果
{trunc_note}

---文件内容---
{fc.content[:SUMMARY_CHARS]}"""
620
+
621
+
622
def _format_meta(fc: FileContent) -> str:
    """Render a one-line metadata header for a prompt.

    The header always starts with filename, upper-cased type and size in KB,
    followed by whitelisted ``fc.metadata`` entries in their stored order and,
    when ``fc.tables`` is non-empty, a table count. Only the first eight
    fields are kept.
    """
    shown_keys = (
        "pages", "rows", "columns", "lines", "language", "sheets",
        "records", "length", "sheet_count", "paragraphs", "tables",
        "title", "author", "created",
    )

    fields = [
        f"文件名: {fc.filename}",
        f"类型: {fc.file_type.upper()}",
        f"大小: {fc.size_kb:.1f} KB",
    ]
    fields.extend(
        f"{key}: {value}"
        for key, value in fc.metadata.items()
        if key in shown_keys
    )
    if fc.tables:
        fields.append(f"包含表格: {len(fc.tables)} 个")

    # Cap the header at eight fields; anything past that is dropped.
    return " ".join(fields[:8])
634
+
635
+
636
def _detect_domain(fc: FileContent) -> str:
    """Guess the analysis domain from keywords in the start of the content.

    Only the first 2000 characters of ``fc.content`` (lower-cased) are
    scanned. Each domain's score is the total number of keyword hits; files
    whose ``file_type`` is ``"code"`` get a +2 bonus for ``"tech"``.

    Short ASCII abbreviations (up to three letters/digits, e.g. ``pe``,
    ``eps``, ``roe``) are counted only when they stand alone, i.e. are not
    surrounded by other ASCII letters or digits. Counting them as raw
    substrings made ordinary words such as "people", "type" or "steps" score
    as finance. All other keywords keep plain substring counting, so plurals
    like "revenues" still hit.

    Returns:
        The best-scoring domain key if its score is at least 2, otherwise
        ``"general"``. Ties go to the domain listed first.
    """
    import re  # local import keeps the block self-contained

    sample = fc.content[:2000].lower()

    def count_hits(keywords):
        total = 0
        for kw in keywords:
            if re.fullmatch(r"[a-z0-9]{1,3}", kw):
                # ASCII-only lookarounds rather than \b: CJK characters count
                # as word characters for \b, which would hide "PE" in "PE为15倍".
                pattern = r"(?<![a-z0-9])" + re.escape(kw) + r"(?![a-z0-9])"
                total += len(re.findall(pattern, sample))
            else:
                total += sample.count(kw)
        return total

    scores = {
        "finance": count_hits([
            "revenue", "profit", "ebitda", "cashflow",
            "净利润", "营收", "现金流", "资产负债", "毛利率",
            "eps", "roe", "pe", "财报",
        ]),
        "legal": count_hits([
            "合同", "甲方", "乙方", "违约", "协议",
            "条款", "liability", "indemnify", "agreement", "breach",
        ]),
        "tech": count_hits([
            "def ", "class ", "function", "import",
            "select ", "return ", "var ", "const ", "type ",
        ]) + (2 if fc.file_type == "code" else 0),
        "research": count_hits([
            "abstract", "methodology", "conclusion",
            "hypothesis", "摘要", "结论", "方法", "样本量", "显著性", "p-value",
        ]),
        "realty": count_hits([
            "租金", "房价", "物业", "不动产", "reit",
            "产权", "容积率", "建筑面积", "房地产", "地块",
        ]),
    }
    best = max(scores, key=scores.get)
    return best if scores[best] >= 2 else "general"
653
+
654
+
655
+ # ── Session file store (used by /file command) ────────────────────────────────
656
+
657
class FileSession:
    """
    In-memory store of the files loaded during the current REPL session.

    Files are keyed by ``FileContent.filename``, so loading another file with
    the same filename replaces the earlier entry. One entry at a time is the
    "active" file, whose content ``build_context_block`` exposes for
    multi-turn Q&A context injection.
    """

    def __init__(self):
        self._files: Dict[str, FileContent] = {}   # filename -> parsed file
        self._active: Optional[str] = None         # key of the active file, if any

    def load(self, path_str: str, include_images: bool = False) -> FileContent:
        """Parse ``path_str`` and, on success, store it and make it active.

        The parse result is returned either way; failed parses are not stored.
        """
        fc = parse_file(path_str, include_images=include_images)
        if fc.success:
            self._files[fc.filename] = fc
            self._active = fc.filename
        return fc

    def get_active(self) -> Optional[FileContent]:
        """Return the active file, or ``None`` when there is none."""
        if self._active and self._active in self._files:
            return self._files[self._active]
        return None

    def set_active(self, name: str) -> bool:
        """Make the file called ``name`` active.

        An exact filename match wins; otherwise the first loaded file whose
        name contains ``name`` (case-insensitively) is chosen.

        Returns:
            ``True`` if a file was selected, ``False`` otherwise. A blank
            ``name`` never selects anything: as a substring it would match
            every filename and silently activate the first loaded file.
        """
        if name in self._files:
            self._active = name
            return True
        if not name.strip():
            return False
        needle = name.lower()
        for key in self._files:
            if needle in key.lower():
                self._active = key
                return True
        return False

    def list_files(self) -> List[Dict[str, Any]]:
        """Return one summary dict per loaded file, in load order."""
        return [
            {
                "filename": name,
                "type": fc.file_type,
                "size_kb": round(fc.size_kb, 1),
                "chars": fc.char_count,
                "active": name == self._active,
                "truncated": fc.truncated,
            }
            for name, fc in self._files.items()
        ]

    def build_context_block(self, max_chars: int = 12_000) -> str:
        """Return a context block to inject into the system prompt.

        The block holds the active file's metadata line and at most
        ``max_chars`` characters of its content. An empty string is returned
        when no file is active.
        """
        fc = self.get_active()
        if not fc:
            return ""
        meta = _format_meta(fc)
        content_preview = fc.content[:max_chars]
        was_cut = fc.truncated or len(fc.content) > max_chars
        # Report the number of characters actually shown: when the parser
        # already truncated the file to fewer than max_chars, quoting
        # max_chars would be wrong.
        trunc = f"\n[内容已截断至前 {len(content_preview)} 字符]" if was_cut else ""
        return (f"\n\n---已加载文件: {fc.filename}---\n"
                f"{meta}\n\n{content_preview}{trunc}\n---文件结束---")

    def clear(self, name: Optional[str] = None):
        """Forget one file, or every file when ``name`` is empty or ``None``.

        If the removed file was active, the first remaining file (if any)
        becomes active.
        """
        if name:
            self._files.pop(name, None)
            if self._active == name:
                self._active = next(iter(self._files), None)
        else:
            self._files.clear()
            self._active = None
721
+
722
+
723
+ # ── Dependency check ──────────────────────────────────────────────────────────
724
+
725
def check_parsers() -> Dict[str, bool]:
    """Map each parser package name to whether its module-level handle is set.

    A value is ``True`` when the corresponding handle (``_pdfplumber``,
    ``_pypdf``, ...) is not ``None``.
    """
    handles = (
        ("pdfplumber", _pdfplumber),
        ("pypdf", _pypdf),
        ("python-docx", _docx_mod),
        ("pandas", _pd),
        ("openpyxl", _openpyxl),
        ("beautifulsoup4", _bs4),
        ("Pillow", _PIL),
    )
    return {package: handle is not None for package, handle in handles}