@tcos/broker-parser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/parse-statement/SKILL.md +134 -0
- package/.claude/skills/parse-statement/examples.md +257 -0
- package/.claude/skills/parse-statement/trigger-tests/cases.yaml +133 -0
- package/README.md +153 -0
- package/dist/cli/index.d.ts +17 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +150 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/core/cleaning.d.ts +78 -0
- package/dist/core/cleaning.d.ts.map +1 -0
- package/dist/core/cleaning.js +217 -0
- package/dist/core/cleaning.js.map +1 -0
- package/dist/core/pipeline.d.ts +49 -0
- package/dist/core/pipeline.d.ts.map +1 -0
- package/dist/core/pipeline.js +66 -0
- package/dist/core/pipeline.js.map +1 -0
- package/dist/core/registry.d.ts +24 -0
- package/dist/core/registry.d.ts.map +1 -0
- package/dist/core/registry.js +53 -0
- package/dist/core/registry.js.map +1 -0
- package/dist/index.d.ts +9 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +29 -0
- package/dist/index.js.map +1 -0
- package/dist/parsers/phillip/extract.py +90 -0
- package/dist/parsers/phillip/extractor.d.ts +215 -0
- package/dist/parsers/phillip/extractor.d.ts.map +1 -0
- package/dist/parsers/phillip/extractor.js +1012 -0
- package/dist/parsers/phillip/extractor.js.map +1 -0
- package/dist/parsers/phillip/formatter.d.ts +113 -0
- package/dist/parsers/phillip/formatter.d.ts.map +1 -0
- package/dist/parsers/phillip/formatter.js +760 -0
- package/dist/parsers/phillip/formatter.js.map +1 -0
- package/dist/parsers/phillip/index.d.ts +25 -0
- package/dist/parsers/phillip/index.d.ts.map +1 -0
- package/dist/parsers/phillip/index.js +59 -0
- package/dist/parsers/phillip/index.js.map +1 -0
- package/dist/types/formatter.d.ts +47 -0
- package/dist/types/formatter.d.ts.map +1 -0
- package/dist/types/formatter.js +9 -0
- package/dist/types/formatter.js.map +1 -0
- package/dist/types/plugin.d.ts +14 -0
- package/dist/types/plugin.d.ts.map +1 -0
- package/dist/types/plugin.js +5 -0
- package/dist/types/plugin.js.map +1 -0
- package/dist/types/raw.d.ts +136 -0
- package/dist/types/raw.d.ts.map +1 -0
- package/dist/types/raw.js +11 -0
- package/dist/types/raw.js.map +1 -0
- package/dist/types/statement.d.ts +55 -0
- package/dist/types/statement.d.ts.map +1 -0
- package/dist/types/statement.js +12 -0
- package/dist/types/statement.js.map +1 -0
- package/package.json +64 -0
- package/src/parsers/phillip/extract.py +90 -0
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: parse-statement
|
|
3
|
+
description: 解析券商 PDF 结单文件,提取交易记录、持仓和资产汇总为结构化 JSON
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
## 触发条件
|
|
7
|
+
|
|
8
|
+
以下情况触发此 Skill:
|
|
9
|
+
|
|
10
|
+
- 用户提供 PDF 文件,要求解析/读取/提取数据
|
|
11
|
+
- 用户提到「结单」、「月结单」、「交易结单」、「对账单」、「持仓报告」、「brokerage statement」
|
|
12
|
+
- 用户要求将 PDF 转为 JSON、提取交易记录、查看持仓
|
|
13
|
+
- 关键词:辉立证券、Phillip Securities、结单解析、parse statement
|
|
14
|
+
|
|
15
|
+
## Setup
|
|
16
|
+
|
|
17
|
+
运行以下 setup 脚本确保环境就绪:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
#!/usr/bin/env bash
|
|
21
|
+
set -e
|
|
22
|
+
|
|
23
|
+
# 1. 检查并安装 CLI
|
|
24
|
+
if ! command -v tcos-parse &>/dev/null; then
|
|
25
|
+
echo "Installing tcos-parse..."
|
|
26
|
+
npm install -g @tcos/broker-parser
|
|
27
|
+
fi
|
|
28
|
+
|
|
29
|
+
# 2. 检查 Python 3
|
|
30
|
+
if ! command -v python3 &>/dev/null; then
|
|
31
|
+
if command -v brew &>/dev/null; then
|
|
32
|
+
brew install python3
|
|
33
|
+
elif command -v apt-get &>/dev/null; then
|
|
34
|
+
sudo apt-get install -y python3 python3-pip
|
|
35
|
+
else
|
|
36
|
+
echo "ERROR: Cannot install Python automatically. Please install Python 3 manually."
|
|
37
|
+
exit 1
|
|
38
|
+
fi
|
|
39
|
+
fi
|
|
40
|
+
|
|
41
|
+
# 3. 检查并安装 pdfplumber
|
|
42
|
+
if ! python3 -c "import pdfplumber" 2>/dev/null; then
|
|
43
|
+
pip3 install pdfplumber
|
|
44
|
+
fi
|
|
45
|
+
|
|
46
|
+
# 4. 检查 poppler(提供 pdftotext 命令)
|
|
47
|
+
if ! command -v pdftotext &>/dev/null; then
|
|
48
|
+
if command -v brew &>/dev/null; then
|
|
49
|
+
brew install poppler
|
|
50
|
+
elif command -v apt-get &>/dev/null; then
|
|
51
|
+
sudo apt-get install -y poppler-utils
|
|
52
|
+
fi
|
|
53
|
+
fi
|
|
54
|
+
|
|
55
|
+
echo "tcos-parse setup complete"
|
|
56
|
+
tcos-parse --version
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## 使用方式
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# 基础解析
|
|
63
|
+
tcos-parse <pdf> # 解析并输出 JSON 到 stdout
|
|
64
|
+
tcos-parse <pdf> -o out.json # 输出到文件
|
|
65
|
+
|
|
66
|
+
# 指定券商(跳过自动检测)
|
|
67
|
+
tcos-parse -b phillip <pdf> # 指定券商为 phillip
|
|
68
|
+
|
|
69
|
+
# 阶段控制
|
|
70
|
+
tcos-parse <pdf> --raw # 只输出 Stage1 原始提取数据
|
|
71
|
+
tcos-parse <pdf> --no-clean # 跳过 Stage3 清理步骤
|
|
72
|
+
|
|
73
|
+
# 检测与查询
|
|
74
|
+
tcos-parse --detect <pdf> # 检测 PDF 所属券商
|
|
75
|
+
tcos-parse --list-parsers # 列出支持的券商解析器
|
|
76
|
+
|
|
77
|
+
# 输出控制
|
|
78
|
+
tcos-parse <pdf> -v # 显示各阶段耗时(输出到 stderr)
|
|
79
|
+
tcos-parse <pdf> -q # 静默模式,只输出 JSON(无额外提示信息)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### 选项说明
|
|
83
|
+
|
|
84
|
+
| 选项 | 说明 |
|
|
85
|
+
| ---------------- | -------------------------------------- |
|
|
86
|
+
| `-o, --output` | 输出到文件而非 stdout |
|
|
87
|
+
| `-b, --broker` | 指定券商名称,跳过自动检测 |
|
|
88
|
+
| `--raw` | 只输出 Stage1 原始表格数据,不做格式化 |
|
|
89
|
+
| `--no-clean` | 跳过 Stage3 数据清理步骤 |
|
|
90
|
+
| `--detect` | 检测 PDF 所属券商及置信度 |
|
|
91
|
+
| `--list-parsers` | 列出所有可用的券商解析器 |
|
|
92
|
+
| `-v, --verbose` | 显示各阶段耗时详情(输出到 stderr) |
|
|
93
|
+
| `-q, --quiet` | 静默模式,仅输出纯 JSON |
|
|
94
|
+
|
|
95
|
+
## 输出格式
|
|
96
|
+
|
|
97
|
+
完整解析结果(StatementData):
|
|
98
|
+
|
|
99
|
+
```json
|
|
100
|
+
{
|
|
101
|
+
"broker": "phillip",
|
|
102
|
+
"accountCode": "M000001",
|
|
103
|
+
"statementDate": "2024-01-31",
|
|
104
|
+
"transactions": [
|
|
105
|
+
{
|
|
106
|
+
"date": "2024-01-15",
|
|
107
|
+
"ticker": "00700",
|
|
108
|
+
"name": "TENCENT",
|
|
109
|
+
"type": "BUY",
|
|
110
|
+
"quantity": 100,
|
|
111
|
+
"price": 298.4,
|
|
112
|
+
"amount": 29840.0,
|
|
113
|
+
"fee": 50.0,
|
|
114
|
+
"currency": "HKD"
|
|
115
|
+
}
|
|
116
|
+
],
|
|
117
|
+
"holdings": [
|
|
118
|
+
{
|
|
119
|
+
"ticker": "00700",
|
|
120
|
+
"name": "TENCENT",
|
|
121
|
+
"quantity": 100,
|
|
122
|
+
"avgCost": 298.4,
|
|
123
|
+
"marketValue": 30000.0,
|
|
124
|
+
"currency": "HKD"
|
|
125
|
+
}
|
|
126
|
+
],
|
|
127
|
+
"assets": {
|
|
128
|
+
"totalAssets": 150000.0,
|
|
129
|
+
"cashBalance": 120000.0,
|
|
130
|
+
"marketValue": 30000.0,
|
|
131
|
+
"currency": "HKD"
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
```
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
# parse-statement 使用示例
|
|
2
|
+
|
|
3
|
+
## 示例 1:基础解析 — 查看交易记录
|
|
4
|
+
|
|
5
|
+
**用户说**:帮我解析这份辉立的结单
|
|
6
|
+
|
|
7
|
+
**操作过程**:
|
|
8
|
+
|
|
9
|
+
1. 运行 setup 安装环境(首次使用)
|
|
10
|
+
2. 执行解析命令
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
tcos-parse /path/to/statement_202401.pdf
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
**输出**:
|
|
17
|
+
|
|
18
|
+
```json
|
|
19
|
+
{
|
|
20
|
+
"broker": "phillip",
|
|
21
|
+
"accountCode": "M000001",
|
|
22
|
+
"statementDate": "2024-01-31",
|
|
23
|
+
"transactions": [
|
|
24
|
+
{
|
|
25
|
+
"date": "2024-01-10",
|
|
26
|
+
"ticker": "00700",
|
|
27
|
+
"name": "TENCENT",
|
|
28
|
+
"type": "BUY",
|
|
29
|
+
"quantity": 100,
|
|
30
|
+
"price": 298.4,
|
|
31
|
+
"amount": 29840.0,
|
|
32
|
+
"fee": 50.0,
|
|
33
|
+
"currency": "HKD"
|
|
34
|
+
},
|
|
35
|
+
{
|
|
36
|
+
"date": "2024-01-22",
|
|
37
|
+
"ticker": "09988",
|
|
38
|
+
"name": "BABA-SW",
|
|
39
|
+
"type": "SELL",
|
|
40
|
+
"quantity": 200,
|
|
41
|
+
"price": 72.5,
|
|
42
|
+
"amount": 14500.0,
|
|
43
|
+
"fee": 30.0,
|
|
44
|
+
"currency": "HKD"
|
|
45
|
+
}
|
|
46
|
+
],
|
|
47
|
+
"holdings": [
|
|
48
|
+
{
|
|
49
|
+
"ticker": "00700",
|
|
50
|
+
"name": "TENCENT",
|
|
51
|
+
"quantity": 100,
|
|
52
|
+
"avgCost": 298.4,
|
|
53
|
+
"marketValue": 30000.0,
|
|
54
|
+
"currency": "HKD"
|
|
55
|
+
}
|
|
56
|
+
],
|
|
57
|
+
"assets": {
|
|
58
|
+
"totalAssets": 150000.0,
|
|
59
|
+
"cashBalance": 120000.0,
|
|
60
|
+
"marketValue": 30000.0,
|
|
61
|
+
"currency": "HKD"
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## 示例 2:输出到文件
|
|
69
|
+
|
|
70
|
+
**用户说**:解析后保存到文件,方便我后续导入系统
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
tcos-parse /path/to/statement_202401.pdf -o result.json
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
**输出(stderr)**:
|
|
77
|
+
|
|
78
|
+
```
|
|
79
|
+
Written to result.json
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
解析结果已写入 `result.json`,不会输出到终端。
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## 示例 3:调试 — 查看原始提取数据
|
|
87
|
+
|
|
88
|
+
**用户说**:解析结果不对,我想看看 PDF 里原始提取出了什么
|
|
89
|
+
|
|
90
|
+
使用 `--raw` 选项只执行 Stage1(pdfplumber 提取),跳过格式化和清理:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
tcos-parse /path/to/statement_202401.pdf --raw
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
**输出**:
|
|
97
|
+
|
|
98
|
+
```json
|
|
99
|
+
{
|
|
100
|
+
"pages": [
|
|
101
|
+
{
|
|
102
|
+
"pageNumber": 1,
|
|
103
|
+
"tables": [
|
|
104
|
+
[
|
|
105
|
+
["Date", "Stock Code", "Description", "Buy/Sell", "Qty", "Price", "Amount"],
|
|
106
|
+
["10/01/2024", "00700", "TENCENT", "B", "100", "298.40", "29,840.00"]
|
|
107
|
+
]
|
|
108
|
+
]
|
|
109
|
+
}
|
|
110
|
+
]
|
|
111
|
+
}
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## 示例 4:调试 — 查看各阶段耗时
|
|
117
|
+
|
|
118
|
+
**用户说**:解析好慢,想看看慢在哪一步
|
|
119
|
+
|
|
120
|
+
使用 `-v` 查看各阶段耗时(耗时输出到 stderr,不污染 JSON):
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
tcos-parse /path/to/statement_202401.pdf -v
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
**stderr 输出**:
|
|
127
|
+
|
|
128
|
+
```
|
|
129
|
+
[timing] detect=120ms stage1=850ms stage2=30ms clean=15ms total=1015ms
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
**stdout 输出**:正常的 JSON 解析结果。
|
|
133
|
+
|
|
134
|
+
可以配合重定向只看耗时:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
tcos-parse /path/to/statement_202401.pdf -v > /dev/null
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
---
|
|
141
|
+
|
|
142
|
+
## 示例 5:检测 PDF 所属券商
|
|
143
|
+
|
|
144
|
+
**用户说**:这个 PDF 是哪家券商的结单?
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
tcos-parse --detect /path/to/unknown_statement.pdf
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
**输出**:
|
|
151
|
+
|
|
152
|
+
```
|
|
153
|
+
Detected broker: phillip (confidence: 0.95)
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
静默模式只输出券商名称:
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
tcos-parse --detect /path/to/unknown_statement.pdf -q
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
**输出**:
|
|
163
|
+
|
|
164
|
+
```
|
|
165
|
+
phillip
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## 示例 6:列出支持的券商
|
|
171
|
+
|
|
172
|
+
**用户说**:目前支持解析哪些券商的结单?
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
tcos-parse --list-parsers
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
**输出**:
|
|
179
|
+
|
|
180
|
+
```
|
|
181
|
+
Available parsers:
|
|
182
|
+
phillip — Phillip Securities (辉立证券)
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
## 示例 7:指定券商跳过自动检测
|
|
188
|
+
|
|
189
|
+
**用户说**:我知道这是辉立的结单,不需要自动检测
|
|
190
|
+
|
|
191
|
+
使用 `-b` 直接指定券商,省去检测步骤:
|
|
192
|
+
|
|
193
|
+
```bash
|
|
194
|
+
tcos-parse -b phillip /path/to/statement_202401.pdf
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
## 示例 8:管道处理 — 用 jq 过滤特定交易
|
|
200
|
+
|
|
201
|
+
**用户说**:我只想看买入交易
|
|
202
|
+
|
|
203
|
+
```bash
|
|
204
|
+
tcos-parse /path/to/statement_202401.pdf -q | jq '.transactions[] | select(.type == "BUY")'
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
**输出**:
|
|
208
|
+
|
|
209
|
+
```json
|
|
210
|
+
{
|
|
211
|
+
"date": "2024-01-10",
|
|
212
|
+
"ticker": "00700",
|
|
213
|
+
"name": "TENCENT",
|
|
214
|
+
"type": "BUY",
|
|
215
|
+
"quantity": 100,
|
|
216
|
+
"price": 298.4,
|
|
217
|
+
"amount": 29840.0,
|
|
218
|
+
"fee": 50.0,
|
|
219
|
+
"currency": "HKD"
|
|
220
|
+
}
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
统计交易笔数:
|
|
224
|
+
|
|
225
|
+
```bash
|
|
226
|
+
tcos-parse /path/to/statement_202401.pdf -q | jq '.transactions | length'
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
---
|
|
230
|
+
|
|
231
|
+
## 示例 9:批量解析多份结单
|
|
232
|
+
|
|
233
|
+
**用户说**:我有几个月的结单,想一起解析
|
|
234
|
+
|
|
235
|
+
用 shell 循环批量处理,每份结单输出到独立文件:
|
|
236
|
+
|
|
237
|
+
```bash
|
|
238
|
+
for pdf in /path/to/statements/*.pdf; do
|
|
239
|
+
name=$(basename "$pdf" .pdf)
|
|
240
|
+
tcos-parse "$pdf" -q -o "${name}.json"
|
|
241
|
+
echo "Done: $pdf -> ${name}.json"
|
|
242
|
+
done
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
---
|
|
246
|
+
|
|
247
|
+
## 示例 10:跳过清理步骤
|
|
248
|
+
|
|
249
|
+
**用户说**:Stage3 清理把我的某些数据删了,我想跳过清理
|
|
250
|
+
|
|
251
|
+
使用 `--no-clean` 跳过 Stage3 数据清理:
|
|
252
|
+
|
|
253
|
+
```bash
|
|
254
|
+
tcos-parse /path/to/statement_202401.pdf --no-clean
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
解析管道只执行 Stage1(提取)和 Stage2(格式化),不做去重和清洗。
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# promptfoo Skill 触发率评估用例
|
|
2
|
+
# 正例(22 个):应触发 parse-statement skill
|
|
3
|
+
# 负例(12 个):不应触发
|
|
4
|
+
|
|
5
|
+
version: 1
|
|
6
|
+
testCases:
|
|
7
|
+
# ── 正例(positive)──────────────────────────────────────────────
|
|
8
|
+
|
|
9
|
+
# 直接要求解析(中英文)
|
|
10
|
+
- description: 中文直接要求解析 PDF 结单
|
|
11
|
+
input: '帮我解析这个 PDF 结单'
|
|
12
|
+
expected: triggered
|
|
13
|
+
- description: 英文要求解析券商结单
|
|
14
|
+
input: 'Parse this brokerage statement for me'
|
|
15
|
+
expected: triggered
|
|
16
|
+
- description: 英文解析 PDF 月结单
|
|
17
|
+
input: 'Can you parse my monthly PDF statement?'
|
|
18
|
+
expected: triggered
|
|
19
|
+
|
|
20
|
+
# 各种"结单"同义词
|
|
21
|
+
- description: 月结单
|
|
22
|
+
input: '这是我上个月的月结单,帮我分析一下'
|
|
23
|
+
expected: triggered
|
|
24
|
+
- description: 交易结单
|
|
25
|
+
input: '我有一份交易结单需要解析'
|
|
26
|
+
expected: triggered
|
|
27
|
+
- description: 对账单
|
|
28
|
+
input: '我的对账单来了,帮我解析'
|
|
29
|
+
expected: triggered
|
|
30
|
+
- description: 持仓报告(PDF)
|
|
31
|
+
input: '帮我读取这份 PDF 持仓报告'
|
|
32
|
+
expected: triggered
|
|
33
|
+
|
|
34
|
+
# 不同券商关键词
|
|
35
|
+
- description: 辉立关键词
|
|
36
|
+
input: '我有一份辉立的月结单'
|
|
37
|
+
expected: triggered
|
|
38
|
+
- description: Phillip 关键词
|
|
39
|
+
input: 'I have a Phillip Securities statement PDF'
|
|
40
|
+
expected: triggered
|
|
41
|
+
- description: 辉立证券全称
|
|
42
|
+
input: '帮我解析辉立证券的这份结单 PDF'
|
|
43
|
+
expected: triggered
|
|
44
|
+
|
|
45
|
+
# 不同动词
|
|
46
|
+
- description: 动词"读取"
|
|
47
|
+
input: '帮我读取这个 PDF 里的交易数据'
|
|
48
|
+
expected: triggered
|
|
49
|
+
- description: 动词"提取"
|
|
50
|
+
input: '从这个 PDF 里提取所有交易记录'
|
|
51
|
+
expected: triggered
|
|
52
|
+
- description: 动词"转换"
|
|
53
|
+
input: '把这个 PDF 转成 JSON 格式'
|
|
54
|
+
expected: triggered
|
|
55
|
+
- description: 动词"分析"
|
|
56
|
+
input: '分析一下这份结单 PDF'
|
|
57
|
+
expected: triggered
|
|
58
|
+
- description: 关键词"结单解析"
|
|
59
|
+
input: '能帮我做结单解析吗'
|
|
60
|
+
expected: triggered
|
|
61
|
+
|
|
62
|
+
# 用户描述场景
|
|
63
|
+
- description: 收到结单的场景
|
|
64
|
+
input: '我收到了券商发来的结单,帮我看看里面的交易'
|
|
65
|
+
expected: triggered
|
|
66
|
+
- description: 用户提供 PDF 路径
|
|
67
|
+
input: '帮我看看这个 PDF /tmp/statement_202401.pdf'
|
|
68
|
+
expected: triggered
|
|
69
|
+
- description: 帮我看看这个 PDF
|
|
70
|
+
input: '帮我看看这个 PDF 文件里有什么交易'
|
|
71
|
+
expected: triggered
|
|
72
|
+
|
|
73
|
+
# 组合场景
|
|
74
|
+
- description: 查看交易记录
|
|
75
|
+
input: '帮我从结单里查看我的交易记录'
|
|
76
|
+
expected: triggered
|
|
77
|
+
- description: 查看持仓情况
|
|
78
|
+
input: '帮我看看这份结单里的持仓情况'
|
|
79
|
+
expected: triggered
|
|
80
|
+
- description: 查看资产汇总
|
|
81
|
+
input: '解析一下这份 PDF,我想看资产汇总'
|
|
82
|
+
expected: triggered
|
|
83
|
+
- description: 批量解析
|
|
84
|
+
input: '我有几份 PDF 结单,能一起解析吗'
|
|
85
|
+
expected: triggered
|
|
86
|
+
|
|
87
|
+
# ── 负例(negative)──────────────────────────────────────────────
|
|
88
|
+
|
|
89
|
+
# 一般股市问题(无结单/PDF 语境)
|
|
90
|
+
- description: 一般股市行情问题
|
|
91
|
+
input: '帮我分析一下股市行情'
|
|
92
|
+
expected: not_triggered
|
|
93
|
+
- description: 股票推荐
|
|
94
|
+
input: '你觉得腾讯的股票值得买吗'
|
|
95
|
+
expected: not_triggered
|
|
96
|
+
- description: 港股交易时间
|
|
97
|
+
input: '港股的交易时间是什么时候'
|
|
98
|
+
expected: not_triggered
|
|
99
|
+
|
|
100
|
+
# 其他文档格式
|
|
101
|
+
- description: Excel 表格
|
|
102
|
+
input: '帮我解析这个 Excel 表格'
|
|
103
|
+
expected: not_triggered
|
|
104
|
+
- description: Word 文档
|
|
105
|
+
input: '帮我读取这份 Word 文档'
|
|
106
|
+
expected: not_triggered
|
|
107
|
+
- description: CSV 文件
|
|
108
|
+
input: '帮我导入这个 CSV 交易数据'
|
|
109
|
+
expected: not_triggered
|
|
110
|
+
|
|
111
|
+
# 一般财务问题(不涉及解析)
|
|
112
|
+
- description: 税务计算问题
|
|
113
|
+
input: '我今年的股票收益要交多少税'
|
|
114
|
+
expected: not_triggered
|
|
115
|
+
- description: 开户咨询
|
|
116
|
+
input: '怎么在辉立证券开户'
|
|
117
|
+
expected: not_triggered
|
|
118
|
+
|
|
119
|
+
# PDF 但不是结单
|
|
120
|
+
- description: 合同 PDF
|
|
121
|
+
input: '帮我看看这份 PDF 合同'
|
|
122
|
+
expected: not_triggered
|
|
123
|
+
- description: 招股说明书
|
|
124
|
+
input: '帮我读取这份 PDF 招股说明书'
|
|
125
|
+
expected: not_triggered
|
|
126
|
+
|
|
127
|
+
# IPO 相关但不涉及解析
|
|
128
|
+
- description: IPO 概念问题
|
|
129
|
+
input: '什么是 IPO?'
|
|
130
|
+
expected: not_triggered
|
|
131
|
+
- description: 申购操作问题
|
|
132
|
+
input: '怎么在手机上申购新股'
|
|
133
|
+
expected: not_triggered
|
package/README.md
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# @tcos/broker-parser
|
|
2
|
+
|
|
3
|
+
> Parse brokerage PDF statements into structured JSON
|
|
4
|
+
|
|
5
|
+
[CI](https://github.com/biggersun/broker-parser/actions)
|
|
6
|
+
[npm](https://www.npmjs.com/package/@tcos/broker-parser)
|
|
7
|
+
[License: MIT](./LICENSE)
|
|
8
|
+
|
|
9
|
+
## Supported Brokers
|
|
10
|
+
|
|
11
|
+
| Broker | Status |
|
|
12
|
+
| ----------------------------- | --------- |
|
|
13
|
+
| Phillip Securities (辉立证券) | Supported |
|
|
14
|
+
|
|
15
|
+
## Quick Start
|
|
16
|
+
|
|
17
|
+
### Option 1: CLI
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
# Install
|
|
21
|
+
npm install -g @tcos/broker-parser
|
|
22
|
+
|
|
23
|
+
# Parse a PDF statement
|
|
24
|
+
tcos-parse statement.pdf
|
|
25
|
+
|
|
26
|
+
# Output to file
|
|
27
|
+
tcos-parse statement.pdf -o result.json
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
### Option 2: As an npm Package
|
|
31
|
+
|
|
32
|
+
```typescript
|
|
33
|
+
import { ParsePipeline, PluginRegistry, PhillipPlugin } from '@tcos/broker-parser';
|
|
34
|
+
|
|
35
|
+
const registry = new PluginRegistry();
|
|
36
|
+
registry.register(new PhillipPlugin());
|
|
37
|
+
const pipeline = new ParsePipeline(registry);
|
|
38
|
+
|
|
39
|
+
const result = await pipeline.parse('./statement.pdf');
|
|
40
|
+
console.log(result.data);
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### Option 3: Claude Code Skill (Recommended for Non-Technical Users)
|
|
44
|
+
|
|
45
|
+
After installing, tell Claude: "Help me parse this PDF statement"
|
|
46
|
+
|
|
47
|
+
## Prerequisites
|
|
48
|
+
|
|
49
|
+
- Node.js 18+
|
|
50
|
+
- Python 3.8+ with `pdfplumber` (`pip install pdfplumber`)
|
|
51
|
+
- poppler
|
|
52
|
+
- macOS: `brew install poppler`
|
|
53
|
+
- Ubuntu: `apt-get install poppler-utils`
|
|
54
|
+
|
|
55
|
+
## CLI Reference
|
|
56
|
+
|
|
57
|
+
| Command | Description |
|
|
58
|
+
| ------------------------------ | ------------------------------------ |
|
|
59
|
+
| `tcos-parse <pdf>` | Parse PDF, output JSON to stdout |
|
|
60
|
+
| `tcos-parse <pdf> -o out.json` | Output to file |
|
|
61
|
+
| `tcos-parse <pdf> --raw` | Output Stage1 raw data only |
|
|
62
|
+
| `tcos-parse <pdf> --no-clean` | Skip Stage3 cleaning step |
|
|
63
|
+
| `tcos-parse <pdf> -b phillip` | Specify broker (skip auto-detect) |
|
|
64
|
+
| `tcos-parse --detect <pdf>` | Detect which broker a PDF belongs to |
|
|
65
|
+
| `tcos-parse --list-parsers` | List available broker parsers |
|
|
66
|
+
| `tcos-parse <pdf> -v` | Show stage timing to stderr |
|
|
67
|
+
| `tcos-parse <pdf> -q` | Quiet mode, output JSON only |
|
|
68
|
+
|
|
69
|
+
## Pipeline Architecture
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
PDF File
|
|
73
|
+
|
|
|
74
|
+
Stage1: Extract (pdfplumber)
|
|
75
|
+
| -> RawTableData
|
|
76
|
+
|
|
|
77
|
+
Stage2: Format (rule engine)
|
|
78
|
+
| -> StatementData
|
|
79
|
+
|
|
|
80
|
+
Stage3: Clean (dedup, normalize)
|
|
81
|
+
| -> StatementData (cleaned)
|
|
82
|
+
v
|
|
83
|
+
JSON Output
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Output Format
|
|
87
|
+
|
|
88
|
+
```json
|
|
89
|
+
{
|
|
90
|
+
"accountCode": "M000001",
|
|
91
|
+
"clientName": "USER A",
|
|
92
|
+
"statementDate": "2024-01-31",
|
|
93
|
+
"transactions": [
|
|
94
|
+
{
|
|
95
|
+
"transactionDate": "2024-01-15",
|
|
96
|
+
"stockCode": "1234",
|
|
97
|
+
"stockName": "EXAMPLE CO",
|
|
98
|
+
"transactionType": "BUY",
|
|
99
|
+
"quantity": 1000,
|
|
100
|
+
"price": 12.34,
|
|
101
|
+
"amount": -12340.0,
|
|
102
|
+
"currency": "HKD"
|
|
103
|
+
}
|
|
104
|
+
],
|
|
105
|
+
"ipo": [],
|
|
106
|
+
"snapshots": [
|
|
107
|
+
{
|
|
108
|
+
"symbol": "HKD",
|
|
109
|
+
"assetCategory": "Cash",
|
|
110
|
+
"quantity": 50000.0,
|
|
111
|
+
"currency": "HKD"
|
|
112
|
+
}
|
|
113
|
+
]
|
|
114
|
+
}
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
Key type definitions:
|
|
118
|
+
|
|
119
|
+
- **`StatementData`** — Full parsed statement (account info, transactions, IPO records, holdings snapshots)
|
|
120
|
+
- **`TradeData`** — Individual trade record (BUY, SELL, DIVIDEND, FEE, etc.)
|
|
121
|
+
- **`IPOData`** — IPO subscription/allotment record
|
|
122
|
+
- **`SnapshotData`** — Holdings snapshot (cash balances, stock positions)
|
|
123
|
+
|
|
124
|
+
## Adding a New Broker
|
|
125
|
+
|
|
126
|
+
See [CONTRIBUTING.md](./CONTRIBUTING.md) for the guide on implementing a new broker plugin.
|
|
127
|
+
|
|
128
|
+
## Development
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
# Install dependencies
|
|
132
|
+
npm install
|
|
133
|
+
|
|
134
|
+
# Run CI tests (Stage2 + CLI, no PDF dependency)
|
|
135
|
+
npm test
|
|
136
|
+
|
|
137
|
+
# Run all tests including Stage1 (requires local PDFs)
|
|
138
|
+
npm run test:local
|
|
139
|
+
|
|
140
|
+
# Lint & format
|
|
141
|
+
npm run lint
|
|
142
|
+
npm run format:check
|
|
143
|
+
|
|
144
|
+
# Type check
|
|
145
|
+
npm run typecheck
|
|
146
|
+
|
|
147
|
+
# Build
|
|
148
|
+
npm run build
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## License
|
|
152
|
+
|
|
153
|
+
MIT
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* CLI 入口 — tcos-parse 命令
|
|
4
|
+
*
|
|
5
|
+
* 用法:
|
|
6
|
+
* tcos-parse <pdf> # 解析并输出 JSON 到 stdout
|
|
7
|
+
* tcos-parse <pdf> -o out.json # 输出到文件
|
|
8
|
+
* tcos-parse <pdf> --raw # 只输出 Stage1 原始数据
|
|
9
|
+
* tcos-parse <pdf> --no-clean # 跳过 Stage3 清理
|
|
10
|
+
* tcos-parse -b phillip <pdf> # 指定券商
|
|
11
|
+
* tcos-parse --detect <pdf> # 检测 PDF 所属券商
|
|
12
|
+
* tcos-parse --list-parsers # 列出支持的券商
|
|
13
|
+
* tcos-parse -v <pdf> # 显示各阶段耗时
|
|
14
|
+
* tcos-parse -q <pdf> # 静默模式,只输出 JSON
|
|
15
|
+
*/
|
|
16
|
+
export {};
|
|
17
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/cli/index.ts"],"names":[],"mappings":";AACA;;;;;;;;;;;;;GAaG"}
|