@tcos/broker-parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/.claude/skills/parse-statement/SKILL.md +134 -0
  2. package/.claude/skills/parse-statement/examples.md +257 -0
  3. package/.claude/skills/parse-statement/trigger-tests/cases.yaml +133 -0
  4. package/README.md +153 -0
  5. package/dist/cli/index.d.ts +17 -0
  6. package/dist/cli/index.d.ts.map +1 -0
  7. package/dist/cli/index.js +150 -0
  8. package/dist/cli/index.js.map +1 -0
  9. package/dist/core/cleaning.d.ts +78 -0
  10. package/dist/core/cleaning.d.ts.map +1 -0
  11. package/dist/core/cleaning.js +217 -0
  12. package/dist/core/cleaning.js.map +1 -0
  13. package/dist/core/pipeline.d.ts +49 -0
  14. package/dist/core/pipeline.d.ts.map +1 -0
  15. package/dist/core/pipeline.js +66 -0
  16. package/dist/core/pipeline.js.map +1 -0
  17. package/dist/core/registry.d.ts +24 -0
  18. package/dist/core/registry.d.ts.map +1 -0
  19. package/dist/core/registry.js +53 -0
  20. package/dist/core/registry.js.map +1 -0
  21. package/dist/index.d.ts +9 -0
  22. package/dist/index.d.ts.map +1 -0
  23. package/dist/index.js +29 -0
  24. package/dist/index.js.map +1 -0
  25. package/dist/parsers/phillip/extract.py +90 -0
  26. package/dist/parsers/phillip/extractor.d.ts +215 -0
  27. package/dist/parsers/phillip/extractor.d.ts.map +1 -0
  28. package/dist/parsers/phillip/extractor.js +1012 -0
  29. package/dist/parsers/phillip/extractor.js.map +1 -0
  30. package/dist/parsers/phillip/formatter.d.ts +113 -0
  31. package/dist/parsers/phillip/formatter.d.ts.map +1 -0
  32. package/dist/parsers/phillip/formatter.js +760 -0
  33. package/dist/parsers/phillip/formatter.js.map +1 -0
  34. package/dist/parsers/phillip/index.d.ts +25 -0
  35. package/dist/parsers/phillip/index.d.ts.map +1 -0
  36. package/dist/parsers/phillip/index.js +59 -0
  37. package/dist/parsers/phillip/index.js.map +1 -0
  38. package/dist/types/formatter.d.ts +47 -0
  39. package/dist/types/formatter.d.ts.map +1 -0
  40. package/dist/types/formatter.js +9 -0
  41. package/dist/types/formatter.js.map +1 -0
  42. package/dist/types/plugin.d.ts +14 -0
  43. package/dist/types/plugin.d.ts.map +1 -0
  44. package/dist/types/plugin.js +5 -0
  45. package/dist/types/plugin.js.map +1 -0
  46. package/dist/types/raw.d.ts +136 -0
  47. package/dist/types/raw.d.ts.map +1 -0
  48. package/dist/types/raw.js +11 -0
  49. package/dist/types/raw.js.map +1 -0
  50. package/dist/types/statement.d.ts +55 -0
  51. package/dist/types/statement.d.ts.map +1 -0
  52. package/dist/types/statement.js +12 -0
  53. package/dist/types/statement.js.map +1 -0
  54. package/package.json +64 -0
  55. package/src/parsers/phillip/extract.py +90 -0
@@ -0,0 +1,1012 @@
1
+ "use strict";
2
+ /**
3
+ * Phillip pdfplumber Stage 1 提取器
4
+ *
5
+ * 纯 PDF 表格提取,不做业务逻辑处理。
6
+ * 输出统一的 RawTableData 格式,供 Stage 2 处理。
7
+ *
8
+ * 职责:
9
+ * 1. 调用 pdfplumber 提取字符坐标
10
+ * 2. 按行/列分组
11
+ * 3. 提取原始字段值
12
+ * 4. 不做分类、不做字段合并
13
+ */
14
// Compiler-emitted CommonJS interop helpers (reused from `this` when already defined there).
// __createBinding(o, m, k, k2): exposes property `k` of `m` on `o` under the name `k2` (defaults to `k`).
// When Object.create exists it defines a property, substituting an enumerable getter that reads m[k]
// if the source has no usable descriptor; otherwise it falls back to a plain assignment.
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
15
+ if (k2 === undefined) k2 = k;
16
+ var desc = Object.getOwnPropertyDescriptor(m, k);
17
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
18
+ desc = { enumerable: true, get: function() { return m[k]; } };
19
+ }
20
+ Object.defineProperty(o, k2, desc);
21
+ }) : (function(o, m, k, k2) {
22
+ if (k2 === undefined) k2 = k;
23
+ o[k2] = m[k];
24
+ }));
// __setModuleDefault(o, v): stores `v` as the enumerable "default" property of `o`.
25
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
26
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
27
+ }) : function(o, v) {
28
+ o["default"] = v;
29
+ });
// __importStar(mod): returns `mod` unchanged when it is flagged __esModule; otherwise builds a new
// object that re-exposes every own property of `mod` except "default", and sets "default" to `mod`.
30
+ var __importStar = (this && this.__importStar) || (function () {
// ownKeys replaces itself on first call with Object.getOwnPropertyNames, or a for-in/hasOwnProperty fallback.
31
+ var ownKeys = function(o) {
32
+ ownKeys = Object.getOwnPropertyNames || function (o) {
33
+ var ar = [];
34
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
35
+ return ar;
36
+ };
37
+ return ownKeys(o);
38
+ };
39
+ return function (mod) {
40
+ if (mod && mod.__esModule) return mod;
41
+ var result = {};
42
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
43
+ __setModuleDefault(result, mod);
44
+ return result;
45
+ };
46
+ })();
47
+ Object.defineProperty(exports, "__esModule", { value: true });
48
+ exports.PhillipPdfplumberExtractor = void 0;
49
+ const child_process_1 = require("child_process");
50
+ const path = __importStar(require("path"));
51
+ // ============================================================================
52
+ // 常量定义
53
+ // ============================================================================
54
/**
 * Default extractor configuration.
 * `gapThreshold` is the horizontal gap (in the same units as the character
 * x-coordinates) above which two neighbouring characters are split into
 * separate column groups.
 * Frozen so that the shared defaults cannot be mutated through a reference.
 */
const DEFAULT_CONFIG = Object.freeze({
    gapThreshold: 5,
});
/**
 * Substrings that mark where a statement section starts or ends.
 * Several headings also appear in a "double-rendered" form in which every
 * character is emitted twice (e.g. 'TTrraannssaaccttiioonn'), so both
 * spellings are listed. Frozen: these tables are shared by every extractor.
 */
const SECTION_MARKERS = Object.freeze({
    // Transaction records section (including double-rendered variants).
    transactionStart: Object.freeze([
        '交易記錄',
        'Transaction Details',
        '交交易易記記錄錄',
        'TTrraannssaaccttiioonn',
    ]),
    transactionEnd: Object.freeze(['轉下結餘', '承下結餘', 'Balance C/F', '戶口資料', 'Account Details']),
    // Account Details section (cash balances), including double-rendered variants.
    accountDetailsStart: Object.freeze([
        '戶口資料',
        'Account Details',
        '戶戶口口資資料料',
        'AAccccoouunntt DDeettaaiillss',
    ]),
    accountDetailsEnd: Object.freeze([
        '股票投資組合',
        'Securities Portfolio',
        '股股票票投投資資組組合合',
        'SSeeccuurriittiieess PPoorrttffoolliioo',
        '詳情請參閱',
        'Please see important',
    ]),
    // Securities Portfolio section (stock / fund holdings), including double-rendered variants.
    holdingsStart: Object.freeze([
        '股票投資組合',
        'Securities Portfolio',
        '股股票票投投資資組組合合',
        'SSeeccuurriittiieess PPoorrttffoolliioo',
        '持倉',
    ]),
    holdingsEnd: Object.freeze(['E. & O. E.', 'Please see important', '詳情請參閱']),
    // Dividend & announcements section: its appearance ends holdings parsing
    // (including double-rendered variants).
    dividendStart: Object.freeze([
        '股息及公告',
        'Dividend',
        'Announcements',
        '股股息息及及公公告告',
        'DDiivviiddeenndd',
    ]),
});
/**
 * Keywords used to recognise the transaction-type column.
 * The key is the value stored in `transType`; no business classification is
 * performed at this stage.
 */
const TRANS_TYPE_KEYWORDS = Object.freeze({
    Payment: Object.freeze(['Payment', '支付']),
    Receipt: Object.freeze(['Receipt', '收入']),
    Withdraw: Object.freeze(['Withdraw', '提貨']),
    Deposit: Object.freeze(['Deposit', '存入']),
    Buy: Object.freeze(['Buy', '買入', 'Bought']),
    Sell: Object.freeze(['Sell', '賣出', 'Sold']),
});
110
+ // ============================================================================
111
+ // 提取器实现
112
+ // ============================================================================
113
/**
 * Stage 1 extractor for Phillip statements, backed by a pdfplumber Python script.
 *
 * It only reads raw field values from character coordinates: rows are formed
 * by grouping characters on their rounded `top`, columns by splitting on
 * horizontal gaps. No business classification or field merging happens here.
 */
class PhillipPdfplumberExtractor {
    /**
     * @param {object} [config] - Overrides applied on top of DEFAULT_CONFIG
     *   (for example `gapThreshold`).
     */
    constructor(config) {
        this.config = {
            type: 'pdfplumber',
            ...DEFAULT_CONFIG,
            ...config,
        };
    }
    /**
     * Extract raw table data from a PDF.
     *
     * @param {string} pdfPath - Path handed to the Python script.
     * @returns {Promise<object>} `{ accountInfo, transactions, holdings, metadata }`.
     */
    async extract(pdfPath) {
        const startTime = Date.now();
        const warnings = [];
        // 1. Run pdfplumber to obtain per-page text and character data.
        const pdfData = await this.callPdfplumber(pdfPath);
        // 2. Account header information.
        const accountInfo = this.extractAccountInfo(pdfData);
        // 3. Transaction records.
        const transactions = this.extractTransactions(pdfData, warnings);
        // 4. Holdings (Account Details + Securities Portfolio).
        const holdings = this.extractHoldings(pdfData, warnings);
        return {
            accountInfo,
            transactions,
            holdings,
            metadata: {
                totalPages: pdfData.totalPages,
                parseTimeMs: Date.now() - startTime,
                extractor: 'pdfplumber',
                warnings,
            },
        };
    }
    /**
     * Run `extract.py` (expected next to this file) with `python3` from PATH
     * and parse its stdout as JSON.
     *
     * @param {string} pdfPath - Path of the PDF to process.
     * @returns {Promise<object>} Parsed JSON produced by the script.
     * @throws {Error} (as a rejection) when the process cannot be started,
     *   exits with a non-zero code, or prints output that is not valid JSON.
     */
    async callPdfplumber(pdfPath) {
        return new Promise((resolve, reject) => {
            const pythonScript = path.join(__dirname, 'extract.py');
            const pythonBin = 'python3';
            const proc = (0, child_process_1.spawn)(pythonBin, [pythonScript, pdfPath, '--json'], {
                env: {
                    ...process.env,
                },
            });
            // Decode through the stream so that a multi-byte UTF-8 character
            // split across two chunks is not corrupted.
            proc.stdout.setEncoding('utf8');
            proc.stderr.setEncoding('utf8');
            let stdout = '';
            let stderr = '';
            proc.stdout.on('data', (chunk) => {
                stdout += chunk;
            });
            proc.stderr.on('data', (chunk) => {
                stderr += chunk;
            });
            // Without this listener a failed spawn (e.g. python3 missing)
            // surfaces as an unhandled 'error' event instead of a rejection.
            proc.on('error', (err) => {
                reject(new Error(`Failed to start Python script: ${err.message}`));
            });
            proc.on('close', (code) => {
                if (code !== 0) {
                    reject(new Error(`Python script failed: ${stderr}`));
                    return;
                }
                try {
                    resolve(JSON.parse(stdout));
                }
                catch (e) {
                    reject(new Error(`Failed to parse Python output: ${String(e)}`));
                }
            });
        });
    }
    /**
     * Extract account code, client name and statement date from the text of
     * the first page.
     *
     * Header layout example:
     *   客戶名稱 Name : SUN XIAOXU 客戶編號 A/C No : M596241
     *   日期 Issue Date : 05/11/25
     *
     * @param {object} pdfData - Script output; `pages[0].text` is read.
     * @returns {object} `{ accountCode, clientName, statementDate }`, each
     *   `undefined` when not found; `{}` when there are no pages.
     */
    extractAccountInfo(pdfData) {
        if (pdfData.pages.length === 0) {
            return {};
        }
        // A page without extracted text yields empty fields rather than a TypeError.
        const firstPageText = pdfData.pages[0].text ?? '';
        // Account code, e.g. "A/C No : M596241".
        const accountMatch = firstPageText.match(/A\/C\s*No\s*[::]\s*([A-Z]\d+)/i);
        // Client name sits between "Name :" and "客戶編號".
        const nameMatch = firstPageText.match(/Name\s*[::]\s*([A-Z\s]+?)\s*客戶編號/i);
        // Statement date, e.g. "Issue Date : 05/11/25".
        const dateMatch = firstPageText.match(/Issue\s*Date\s*[::]\s*(\d{2}\/\d{2}\/\d{2,4})/i);
        return {
            accountCode: accountMatch ? accountMatch[1] : undefined,
            clientName: nameMatch ? nameMatch[1].trim() : undefined,
            statementDate: dateMatch ? dateMatch[1] : undefined,
        };
    }
    /**
     * Extract transactions from every page.
     *
     * The "inside transaction section" flag and the last parsed transaction
     * are carried from page to page, so later pages without a section marker
     * are still parsed.
     *
     * @param {object} pdfData - Script output; `pages[].chars` is read.
     * @param {string[]} warnings - Warning sink passed down to the page parser.
     * @returns {object[]} Raw transactions after continuation merging.
     */
    extractTransactions(pdfData, warnings) {
        const allTransactions = [];
        let inTransactionSection = false;
        let lastTransaction = null;
        for (const page of pdfData.pages) {
            const result = this.extractPageTransactionsWithState(page.chars, warnings, inTransactionSection, lastTransaction);
            allTransactions.push(...result.transactions);
            inTransactionSection = result.inTransactionSection;
            lastTransaction = result.lastTransaction;
        }
        return this.mergeCrossPageContinuations(allTransactions);
    }
    /**
     * Fold continuation placeholders into the preceding record.
     * A record whose `tradeDate` is empty is a continuation; when there is no
     * preceding record it is kept as-is.
     *
     * @param {object[]} transactions - Records in document order (merged-into
     *   records are mutated).
     * @returns {object[]} Records with continuations merged.
     */
    mergeCrossPageContinuations(transactions) {
        const result = [];
        for (const tx of transactions) {
            const isContinuation = !tx.tradeDate && result.length > 0;
            if (!isContinuation) {
                result.push(tx);
                continue;
            }
            const lastTx = result[result.length - 1];
            lastTx.particulars += ' ' + tx.particulars;
            if (lastTx.sourceLines && tx.sourceLines) {
                lastTx.sourceLines.push(...tx.sourceLines);
            }
        }
        return result;
    }
    /**
     * Extract transactions from a single page, continuing from the state left
     * by the previous page.
     *
     * @param {object[]} chars - Page characters (`text`, `x0`, `x1`, `top`).
     * @param {string[]} _warnings - Warning sink (currently unused).
     * @param {boolean} initialInSection - Whether the page starts inside the
     *   transaction section.
     * @param {object|null} initialLastTx - Last transaction parsed so far;
     *   dateless rows are appended to it.
     * @returns {object} `{ transactions, inTransactionSection, lastTransaction }`.
     */
    extractPageTransactionsWithState(chars, _warnings, initialInSection, initialLastTx) {
        const transactions = [];
        const lineGroups = this.groupCharsByY(chars);
        const sortedYs = Object.keys(lineGroups)
            .map(Number)
            .sort((a, b) => a - b);
        let inTransactionSection = initialInSection;
        let lastTransaction = initialLastTx;
        for (const y of sortedYs) {
            const lineChars = lineGroups[y];
            const lineText = this.getLineText(lineChars);
            // Section boundaries (start is checked before end).
            if (this.matchesAny(lineText, SECTION_MARKERS.transactionStart)) {
                inTransactionSection = true;
                continue;
            }
            if (this.matchesAny(lineText, SECTION_MARKERS.transactionEnd)) {
                inTransactionSection = false;
                continue;
            }
            if (!inTransactionSection) {
                continue;
            }
            if (this.isHeaderOrSeparator(lineText)) {
                continue;
            }
            // A row without a DD/MM/YY date is treated as a continuation row.
            if (!/(\d{2}\/\d{2}\/\d{2})/.test(lineText)) {
                const continuationText = this.extractContinuationText(lineChars);
                if (continuationText) {
                    if (lastTransaction) {
                        // Append to the most recent transaction.
                        lastTransaction.particulars += ' ' + continuationText;
                        if (lastTransaction.sourceLines) {
                            lastTransaction.sourceLines.push(y);
                        }
                    }
                    else {
                        // No transaction seen yet: emit a placeholder with an
                        // empty date for mergeCrossPageContinuations.
                        transactions.push({
                            tradeDate: '',
                            refNo: '',
                            particulars: continuationText,
                            sourceLines: [y],
                        });
                    }
                }
                continue;
            }
            const transaction = this.parseTransactionLine(lineChars, y);
            if (transaction) {
                transactions.push(transaction);
                lastTransaction = transaction;
            }
        }
        return { transactions, inTransactionSection, lastTransaction };
    }
    /**
     * Parse one transaction row into raw field values.
     *
     * @param {object[]} chars - Characters of the row.
     * @param {number} yPosition - Rounded row position, stored in `sourceLines`.
     * @returns {object|null} Raw transaction, or `null` when the trade date or
     *   reference number is missing.
     */
    parseTransactionLine(chars, yPosition) {
        const fields = this.extractFieldsWithSmartBoundaries(chars);
        if (!fields.tradeDate || !fields.refNo) {
            return null;
        }
        return {
            tradeDate: fields.tradeDate,
            settleDate: fields.settleDate,
            product: fields.product,
            refNo: fields.refNo,
            transType: fields.transType,
            particulars: fields.particulars,
            debit: fields.debit,
            credit: fields.credit,
            sourceLines: [yPosition],
        };
    }
    /**
     * Assign the gap-separated groups of a row to fields by column position.
     *
     * Reference column boundaries (x):
     *   tradeDate 30-65, settleDate 65-110, product 100-145 (optional),
     *   refNo 130-180, type 170-230, particulars 195-490,
     *   debit 480-530, credit 530-580.
     *
     * @param {object[]} chars - Characters of the row.
     * @returns {object} `{ tradeDate, settleDate, product, refNo, transType,
     *   particulars, debit, credit }`; optional fields are `undefined` when absent.
     */
    extractFieldsWithSmartBoundaries(chars) {
        const sortedChars = [...chars].sort((a, b) => a.x0 - b.x0);
        const groups = this.groupCharsByGap(sortedChars);
        let tradeDate = '';
        let settleDate = '';
        let product = '';
        let refNo = '';
        let transType = '';
        const particularsArr = [];
        let debitStr = '';
        let creditStr = '';
        for (const group of groups) {
            const text = group.text.trim();
            const { x0, x1 } = group;
            const midX = (x0 + x1) / 2;
            // 1. Dates (DD/MM/YY): by position first, then first-come order.
            if (/^\d{2}\/\d{2}\/\d{2}$/.test(text)) {
                if (x0 < 65 && !tradeDate) {
                    tradeDate = text;
                }
                else if (x0 >= 65 && x0 < 110 && !settleDate) {
                    settleDate = text;
                }
                else if (!tradeDate) {
                    tradeDate = text;
                }
                else if (!settleDate) {
                    settleDate = text;
                }
                continue;
            }
            // 2. Product code: 2-6 letters starting in 100 <= x < 145.
            if (/^[A-Z]{2,6}$/i.test(text) && x0 >= 100 && x0 < 145) {
                product = text;
                continue;
            }
            // 3. Reference number: exactly 8 digits starting in 130 <= x < 180.
            if (/^\d{8}$/.test(text) && x0 >= 130 && x0 < 180) {
                refNo = text;
                continue;
            }
            // 4. Transaction type: group starting in 170 <= x < 230 that
            //    begins with a known keyword.
            if (x0 >= 170 && x0 < 230) {
                const typeMatch = Object.entries(TRANS_TYPE_KEYWORDS).find(([, keywords]) => keywords.some((kw) => text.startsWith(kw)));
                if (typeMatch) {
                    const [typeName, keywords] = typeMatch;
                    transType = typeName;
                    // Text glued onto the keyword belongs to the particulars.
                    const glued = keywords.find((kw) => text.startsWith(kw) && text.length > kw.length);
                    if (glued !== undefined) {
                        particularsArr.push(text.substring(glued.length));
                    }
                    continue;
                }
            }
            // 5. Amounts: numeric groups starting at x >= 478 (slightly left
            //    of 480 to tolerate coordinate noise); midpoint picks the column.
            if (x0 >= 478) {
                const cleanNum = text.replace(/,/g, '');
                if (/^[\d.]+$/.test(cleanNum)) {
                    if (midX < 530) {
                        debitStr = text;
                    }
                    else {
                        creditStr = text;
                    }
                    continue;
                }
            }
            // 6. Particulars: anything left inside 195 <= x0 and x1 < 490.
            if (x0 >= 195 && x1 < 490) {
                particularsArr.push(text);
            }
        }
        return {
            tradeDate,
            settleDate: settleDate || undefined,
            product: product || undefined,
            refNo,
            transType: transType || undefined,
            particulars: particularsArr.join(' ').trim(),
            debit: this.parseAmount(debitStr),
            credit: this.parseAmount(creditStr),
        };
    }
    /**
     * Collect the particulars text of a continuation row: groups that start
     * at x >= 100 and end before x = 490, joined with single spaces.
     *
     * @param {object[]} chars - Characters of the row.
     * @returns {string} Trimmed text, possibly empty.
     */
    extractContinuationText(chars) {
        const sortedChars = [...chars].sort((a, b) => a.x0 - b.x0);
        return this.groupCharsByGap(sortedChars)
            .filter((group) => group.x0 >= 100 && group.x1 < 490)
            .map((group) => group.text)
            .join(' ')
            .trim();
    }
    /**
     * Bucket characters into rows keyed by `Math.round(char.top)`.
     *
     * @param {object[]} chars - Page characters.
     * @returns {Object<number, object[]>} Row key -> characters, input order kept.
     */
    groupCharsByY(chars) {
        const groups = {};
        for (const c of chars) {
            const y = Math.round(c.top);
            if (!groups[y]) {
                groups[y] = [];
            }
            groups[y].push(c);
        }
        return groups;
    }
    /**
     * Split characters (already sorted by x0) into groups wherever the gap
     * between one character's `x1` and the next one's `x0` exceeds
     * `config.gapThreshold` (5 when unset).
     *
     * @param {object[]} chars - Characters sorted by `x0`.
     * @returns {object[]} Groups as `{ text, x0, x1 }`.
     */
    groupCharsByGap(chars) {
        if (chars.length === 0) {
            return [];
        }
        const gapThreshold = this.config.gapThreshold ?? 5;
        const toGroup = (members) => ({
            text: members.map((c) => c.text).join(''),
            x0: members[0].x0,
            x1: members[members.length - 1].x1,
        });
        const groups = [];
        let currentGroup = [chars[0]];
        for (let i = 1; i < chars.length; i++) {
            const gap = chars[i].x0 - chars[i - 1].x1;
            if (gap > gapThreshold) {
                groups.push(toGroup(currentGroup));
                currentGroup = [chars[i]];
            }
            else {
                currentGroup.push(chars[i]);
            }
        }
        groups.push(toGroup(currentGroup));
        return groups;
    }
    /**
     * Concatenate the characters of a row in left-to-right order.
     * Works on a copy so the caller's array keeps its order.
     *
     * @param {object[]} chars - Characters of the row.
     * @returns {string} Row text with no separators inserted.
     */
    getLineText(chars) {
        return [...chars]
            .sort((a, b) => a.x0 - b.x0)
            .map((c) => c.text)
            .join('');
    }
    /**
     * @param {string} text - Text to search.
     * @param {string[]} keywords - Candidate substrings.
     * @returns {boolean} True when `text` contains at least one keyword.
     */
    matchesAny(text, keywords) {
        return keywords.some((kw) => text.includes(kw));
    }
    /**
     * Decide whether a row inside the transaction section is a table header,
     * a separator, a balance/currency caption or a page header/footer.
     *
     * @param {string} text - Row text.
     * @returns {boolean} True when the row should be skipped.
     */
    isHeaderOrSeparator(text) {
        const headerKeywords = ['Date', 'RefNo', '日期', '參考', 'Product', '產品'];
        const separators = ['|', '─', '═'];
        if (headerKeywords.some((kw) => text.includes(kw))) {
            return true;
        }
        if (text.trim().length < 3) {
            return true;
        }
        // A separator row: contains a rule character and fewer than 5 other
        // non-whitespace characters.
        if (separators.some((s) => text.includes(s) && text.replace(new RegExp(`[${s}\\s]`, 'g'), '').length < 5)) {
            return true;
        }
        if (text.includes('Normal 普通戶口') || text.includes('Currency :')) {
            return true;
        }
        if (text.includes('承上結餘') || text.includes('Balance B/F')) {
            return true;
        }
        // Page headers/footers repeat on every page and must be filtered now
        // that section state is carried across pages.
        return this.isPageHeaderOrFooter(text);
    }
    /**
     * Detect page-level header/footer rows (as opposed to table headers).
     * Row text is built by plain concatenation and may lack spaces, so both
     * spaced and unspaced spellings are listed.
     *
     * @param {string} text - Row text.
     * @returns {boolean} True when any known header/footer fragment occurs.
     */
    isPageHeaderOrFooter(text) {
        const pageHeaderFooterPatterns = [
            // Footer: website link.
            'poems.com.hk',
            'Website',
            '網址',
            // Header: double-rendered titles (each character repeated).
            '綜綜合合',
            'CCoommbbiinneedd',
            'DDaaiillyy',
            '存存款款',
            'EEaassyyppaayy',
            // Header: account information row.
            'A/CNo',
            'A/C No',
            '客戶編號',
            'A/ECode',
            'A/E Code',
            '經紀',
            '帳戶類別',
            // Header: page number.
            'Page:',
            'Page :',
        ];
        return pageHeaderFooterPatterns.some((p) => text.includes(p));
    }
    /**
     * Parse an amount string after removing thousands separators.
     *
     * @param {string} str - Raw text such as "63,832.41".
     * @returns {number|undefined} Parsed number; `undefined` for empty input
     *   or text that `parseFloat` cannot read.
     */
    parseAmount(str) {
        if (!str) {
            return undefined;
        }
        const num = parseFloat(str.replace(/,/g, '').trim());
        return Number.isNaN(num) ? undefined : num;
    }
    /**
     * Extract holdings from every page: cash balances from Account Details
     * (handled per page) and positions from Securities Portfolio (section
     * flag, currency and last holding carried across pages).
     *
     * @param {object} pdfData - Script output; `pages[].chars` is read.
     * @param {string[]} warnings - Warning sink passed to the page parsers.
     * @returns {object[]} Cash and portfolio holdings in page order.
     */
    extractHoldings(pdfData, warnings) {
        const allHoldings = [];
        let inPortfolioSection = false;
        let currentCurrency = 'HKD';
        let lastHolding = null;
        for (const page of pdfData.pages) {
            allHoldings.push(...this.extractAccountDetailsHoldings(page.chars, warnings));
            const state = this.extractPortfolioHoldingsWithState(page.chars, warnings, inPortfolioSection, currentCurrency, lastHolding);
            allHoldings.push(...state.holdings);
            inPortfolioSection = state.inPortfolioSection;
            currentCurrency = state.currentCurrency;
            lastHolding = state.lastHolding;
        }
        return allHoldings;
    }
    /**
     * Extract cash balances from the Account Details section of one page.
     *
     * Table example:
     *   | Currency  | Balance C/F | Unsettled T+1 | ... | Available Balance |
     *   | HKD       | 63,832.41   | 0.00          | ... | 63,832.41         |
     *   | HKD(Base) | 63,830.07   | ...           | ... | 63,830.07         |  <- skipped
     *
     * @param {object[]} chars - Page characters.
     * @param {string[]} _warnings - Warning sink (currently unused).
     * @returns {object[]} Cash holdings found on the page.
     */
    extractAccountDetailsHoldings(chars, _warnings) {
        const holdings = [];
        const lineGroups = this.groupCharsByY(chars);
        const sortedYs = Object.keys(lineGroups)
            .map(Number)
            .sort((a, b) => a - b);
        let inAccountDetailsSection = false;
        for (const y of sortedYs) {
            const lineChars = lineGroups[y];
            const lineText = this.getLineText(lineChars);
            if (this.matchesAny(lineText, SECTION_MARKERS.accountDetailsStart)) {
                inAccountDetailsSection = true;
                continue;
            }
            if (this.matchesAny(lineText, SECTION_MARKERS.accountDetailsEnd)) {
                inAccountDetailsSection = false;
                continue;
            }
            if (!inAccountDetailsSection) {
                continue;
            }
            if (this.isAccountDetailsHeader(lineText)) {
                continue;
            }
            // Skip the base-currency summary row; 'Base' also covers 'HKD(Base)'.
            if (lineText.includes('Base')) {
                continue;
            }
            const cashHolding = this.parseCashBalanceLine(lineChars);
            if (cashHolding) {
                holdings.push(cashHolding);
            }
        }
        return holdings;
    }
    /**
     * @param {string} text - Row text.
     * @returns {boolean} True when the row contains an Account Details column
     *   caption.
     */
    isAccountDetailsHeader(text) {
        const headerKeywords = [
            'Currency',
            '貨幣',
            'Balance C/F',
            '轉下結餘',
            'Unsettled Balance',
            '未交收結餘',
            'Normal 普通戶口',
            'Accrued Interest',
            '累計利息',
            'Available Balance',
            '可用結餘',
        ];
        return headerKeywords.some((kw) => text.includes(kw));
    }
    /**
     * Parse one cash-balance row.
     *
     * Row layout: Currency, Balance C/F, Unsettled T+1, ..., Available Balance, ...
     * The first group must be a 3-letter upper-case currency code; the balance
     * is the first group starting in 100 <= x < 200.
     *
     * @param {object[]} chars - Characters of the row.
     * @returns {object|null} Cash holding, or `null` when the row does not
     *   match or the balance is zero.
     */
    parseCashBalanceLine(chars) {
        const sortedChars = [...chars].sort((a, b) => a.x0 - b.x0);
        const groups = this.groupCharsByGap(sortedChars);
        if (groups.length < 2) {
            return null;
        }
        const currency = groups[0].text.trim();
        if (!/^[A-Z]{3}$/.test(currency)) {
            return null;
        }
        const balanceGroup = groups.find((g) => g.x0 >= 100 && g.x0 < 200);
        if (!balanceGroup) {
            return null;
        }
        const balance = this.parseAmount(balanceGroup.text);
        // Unparseable and zero balances are both dropped.
        if (balance === undefined || balance === 0) {
            return null;
        }
        return {
            symbol: currency,
            name: 'Cash Balance',
            assetType: 'Cash',
            quantity: balance,
            marketPrice: 1,
            marketValue: balance,
            currency: currency,
        };
    }
    /**
     * Extract stock/fund positions from the Securities Portfolio section of
     * one page, continuing from the previous page's state.
     *
     * The section ends only at a transaction-section or dividend-section
     * marker; `holdingsEnd` markers are skipped without closing the section
     * because they recur at the bottom of every page.
     *
     * @param {object[]} chars - Page characters.
     * @param {string[]} _warnings - Warning sink (currently unused).
     * @param {boolean} initialInSection - Whether the page starts inside the section.
     * @param {string} initialCurrency - Currency in effect at the page start.
     * @param {object|null} initialLastHolding - Last holding parsed so far;
     *   name continuation rows are appended to it.
     * @returns {object} `{ holdings, inPortfolioSection, currentCurrency, lastHolding }`.
     */
    extractPortfolioHoldingsWithState(chars, _warnings, initialInSection, initialCurrency, initialLastHolding) {
        const holdings = [];
        const lineGroups = this.groupCharsByY(chars);
        const sortedYs = Object.keys(lineGroups)
            .map(Number)
            .sort((a, b) => a - b);
        let inPortfolioSection = initialInSection;
        let currentCurrency = initialCurrency;
        let lastHolding = initialLastHolding;
        for (const y of sortedYs) {
            const lineChars = lineGroups[y];
            const lineText = this.getLineText(lineChars);
            // Start marker takes priority over the end markers below.
            if (this.matchesAny(lineText, SECTION_MARKERS.holdingsStart)) {
                inPortfolioSection = true;
                continue;
            }
            // The transaction section closes the holdings section.
            if (this.matchesAny(lineText, SECTION_MARKERS.transactionStart)) {
                inPortfolioSection = false;
                continue;
            }
            // Dividend rows look like holding rows, so that section closes it too.
            if (this.matchesAny(lineText, SECTION_MARKERS.dividendStart)) {
                inPortfolioSection = false;
                continue;
            }
            if (!inPortfolioSection) {
                continue;
            }
            // Currency switch caption, e.g. "Currency : USD".
            const currencyMatch = lineText.match(/Currency\s*:\s*([A-Z]{3})/i);
            if (currencyMatch) {
                currentCurrency = currencyMatch[1].toUpperCase();
                continue;
            }
            // Column captions, Sub-Total / Total rows and similar.
            if (this.isPortfolioHeaderOrFooter(lineText)) {
                continue;
            }
            // Page footer markers are skipped but do not end the section.
            if (this.matchesAny(lineText, SECTION_MARKERS.holdingsEnd)) {
                continue;
            }
            // Name continuation row (e.g. "股票 ..."): append to the last holding.
            if (this.isPortfolioContinuationLine(lineText, lineChars)) {
                if (lastHolding) {
                    const chineseName = this.extractChineseNameFromLine(lineChars);
                    if (chineseName) {
                        lastHolding.name = lastHolding.name
                            ? `${lastHolding.name} ${chineseName}`
                            : chineseName;
                    }
                }
                continue;
            }
            const holding = this.parsePortfolioLine(lineChars, currentCurrency);
            if (holding) {
                holdings.push(holding);
                lastHolding = holding;
            }
        }
        return { holdings, inPortfolioSection, currentCurrency, lastHolding };
    }
    /**
     * @param {string} text - Row text.
     * @returns {boolean} True when the row contains a portfolio column caption
     *   or a total/exchange-rate caption.
     */
    isPortfolioHeaderOrFooter(text) {
        const keywords = [
            'Product',
            'Market',
            'InstrumentCd',
            'DisplayName',
            '產品',
            '市場',
            '產品代號',
            '代號名稱',
            'Sub-Total',
            'Total :',
            'Total:',
            'Exchange Rate',
            '匯率',
            'Qty B/F',
            'Qty C/F',
            'ClsPrice',
            'Market Value',
            'MgnRatio',
            'Margin Value',
        ];
        return keywords.some((kw) => text.includes(kw));
    }
    /**
     * Detect a name continuation row: text starts with "股票" or "基金" and
     * the characters right of x = 340 contain no digit or dot.
     *
     * @param {string} text - Row text.
     * @param {object[]} chars - Characters of the row.
     * @returns {boolean} True for a continuation row.
     */
    isPortfolioContinuationLine(text, chars) {
        if (!text.startsWith('股票') && !text.startsWith('基金')) {
            return false;
        }
        const rightText = chars
            .filter((c) => c.x0 > 340)
            .map((c) => c.text)
            .join('');
        // An empty right side also passes: the regex finds nothing in ''.
        return !/[\d.]/.test(rightText);
    }
    /**
     * Read the name portion of a continuation row: groups starting in
     * 160 <= x < 300, joined with single spaces.
     *
     * @param {object[]} chars - Characters of the row.
     * @returns {string} Trimmed name text, possibly empty.
     */
    extractChineseNameFromLine(chars) {
        const sortedChars = [...chars].sort((a, b) => a.x0 - b.x0);
        return this.groupCharsByGap(sortedChars)
            .filter((g) => g.x0 >= 160 && g.x0 < 300)
            .map((g) => g.text)
            .join(' ')
            .trim();
    }
    /**
     * Parse one holding row by column position.
     *
     * Reference column boundaries (x0):
     *   Product < 60, Market 80-115, InstrumentCd 115-165, DisplayName 160-270,
     *   Qty B/F 260-295, LastBoughtOn 295-340, Qty C/F 340-385,
     *   ClsPrice 385-435, Market Value 435-485, MgnRatio 490-520,
     *   Margin Value 530-570.
     * The numeric ranges used below are wider and overlap; each numeric field
     * keeps the first value it receives, so a later group falls through to the
     * next column.
     *
     * @param {object[]} chars - Characters of the row.
     * @param {string} currency - Currency in effect for this row.
     * @returns {object|null} Holding, or `null` when the row has fewer than 5
     *   groups or no instrument code.
     */
    parsePortfolioLine(chars, currency) {
        const sortedChars = [...chars].sort((a, b) => a.x0 - b.x0);
        const groups = this.groupCharsByGap(sortedChars);
        if (groups.length < 5) {
            return null;
        }
        let product = ''; // Equity / UT
        let market = ''; // XHKG / OTCU / XNGS
        let instrumentCd = ''; // 000100 / UT.480010 / PDD
        let displayName = '';
        let qtyCF;
        let clsPrice;
        let marketValue;
        for (const group of groups) {
            const text = group.text.trim();
            const x0 = group.x0;
            // Product column.
            if (x0 < 60 && (text === 'Equity' || text === 'UT')) {
                product = text;
                continue;
            }
            // Market column: four upper-case letters.
            if (x0 >= 80 && x0 < 120 && /^[A-Z]{4}$/.test(text)) {
                market = text;
                continue;
            }
            // Code and name columns overlap in 160-175. Once a code is known,
            // later text there is the name, so an all-caps short name is not
            // mistaken for another code.
            if (instrumentCd && x0 >= 160 && x0 < 275 && !/^\d/.test(text)) {
                displayName = text;
                continue;
            }
            // InstrumentCd column: 6 digits, UT.<digits>, or 2-5 capitals.
            if (x0 >= 115 && x0 < 175) {
                if (/^\d{6}$/.test(text) || /^UT\.\d+$/.test(text) || /^[A-Z]{2,5}$/.test(text)) {
                    instrumentCd = text;
                    continue;
                }
            }
            // DisplayName column.
            if (x0 >= 160 && x0 < 275 && !/^\d/.test(text)) {
                displayName = text;
                continue;
            }
            // Numeric columns: "unset" is tested with === undefined so that a
            // legitimate 0 is not overwritten by the next overlapping column.
            // Qty C/F column.
            if (x0 >= 330 && x0 < 395) {
                const qty = this.parseAmount(text);
                if (qty !== undefined && qtyCF === undefined) {
                    qtyCF = qty;
                    continue;
                }
            }
            // ClsPrice column.
            if (x0 >= 385 && x0 < 450) {
                const price = this.parseAmount(text);
                if (price !== undefined && clsPrice === undefined) {
                    clsPrice = price;
                    continue;
                }
            }
            // Market Value column.
            if (x0 >= 435 && x0 < 500) {
                const value = this.parseAmount(text);
                if (value !== undefined && marketValue === undefined) {
                    marketValue = value;
                    continue;
                }
            }
        }
        if (!instrumentCd) {
            return null;
        }
        const isFund = product === 'UT' || instrumentCd.startsWith('UT.');
        return {
            symbol: instrumentCd,
            name: displayName || undefined,
            assetType: isFund ? 'Fund' : 'Stock',
            quantity: qtyCF,
            marketPrice: clsPrice,
            marketValue: marketValue,
            currency: currency,
            extras: {
                product: product || undefined,
                market: market || undefined,
            },
        };
    }
}
1011
+ exports.PhillipPdfplumberExtractor = PhillipPdfplumberExtractor;
1012
+ //# sourceMappingURL=extractor.js.map