@krizpoon/hangseng-statement-extractor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,92 @@
1
+ # Hang Seng Statement Extractor
2
+
3
+ Extract transaction history from Hang Seng Bank statement PDFs using a small TypeScript library and a CLI. The core extractor is importable for use in other projects, and the CLI is a thin wrapper around it.
4
+
5
+ ## Features
6
+ - Extracts transactions from statement PDFs with position-aware parsing.
7
+ - Works as both a library and a CLI.
8
+ - Outputs CSV with statement date and account type included.
9
+
10
+ ## Requirements
11
+ - Node.js 18+ (ESM).
12
+ - Python 3 with `pdfminer.six` installed:
13
+ - `pip install pdfminer.six`
14
+
15
+ ## Install
16
+ ```bash
17
+ npm install @krizpoon/hangseng-statement-extractor
18
+ ```
19
+
20
+ ## CLI
21
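+ The CLI requires the input path to be provided explicitly. The bundled extraction script is located automatically, and Python is auto-detected (`python3`, then `python`) unless `--python` is given.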
22
+
23
+ ```bash
24
+ hangseng-statement-extractor \
25
+ --input /path/to/statement.pdf \
26
+ --account savings \
27
+ --output /path/to/output.csv
28
+ ```
29
+
30
+ Process a directory of PDFs:
31
+ ```bash
32
+ hangseng-statement-extractor \
33
+ --input /path/to/statements/ \
34
+ --account savings \
35
+ --output /path/to/output.csv
36
+ ```
37
+
38
+ Notes:
39
+ - `--output` is optional; if omitted, CSV is printed to stdout.
40
+ - `--account` accepts `savings` or `current` (default: `savings`).
41
+
42
+ ## Library Usage
43
+ ```ts
44
+ import { extractStatement, formatTransactionsAsCsv } from "@krizpoon/hangseng-statement-extractor"
45
+ import { fileURLToPath } from "node:url"
46
+ import { dirname, join } from "node:path"
47
+
48
+ const here = dirname(fileURLToPath(import.meta.url))
49
+ const pdfPath = "/path/to/statement.pdf"
50
+ const pythonPath = "/usr/bin/python3"
51
+ const scriptPath = join(
52
+ here,
53
+ "../node_modules/@krizpoon/hangseng-statement-extractor/scripts/pdf-extract-positions.py",
54
+ )
55
+
56
+ const result = await extractStatement({
57
+ pdfPath,
58
+ scriptPath,
59
+ pythonPath,
60
+ accountType: "savings",
61
+ })
62
+
63
+ const csv = formatTransactionsAsCsv([result])
64
+ console.log(csv)
65
+ ```
66
+
67
+ ### API
68
+ - `extractStatement(options)`
69
+ - `options.pdfPath` (string)
70
+ - `options.scriptPath` (string)
71
+ - `options.pythonPath` (string)
72
+ - `options.accountType` ("savings" | "current")
73
+ - `options.logger` (optional)
74
+ - Returns a `Promise` that resolves to `{ statementDate, accountType, transactions }`
75
+
76
+ - `formatTransactionsAsCsv(results)`
77
+ - Accepts an array of statement results and returns CSV text.
78
+
79
+ ## CSV Output
80
+ Columns:
81
+ ```
82
+ Date,Details,Deposit,Withdrawal,Balance,Account Type,Statement Date
83
+ ```
84
+
85
+ ## Development
86
+ ```bash
87
+ npm install
88
+ npm run build
89
+ ```
90
+
91
+ ## License
92
+ MIT
package/dist/cli.d.ts ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env node
2
+ export {};
3
+ //# sourceMappingURL=cli.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":""}
package/dist/cli.js ADDED
@@ -0,0 +1,96 @@
1
+ #!/usr/bin/env node
2
+ import { Option, program } from "commander";
3
+ import { execFile } from "node:child_process";
4
+ import * as fs from "node:fs";
5
+ import * as path from "node:path";
6
+ import { fileURLToPath } from "node:url";
7
+ import { promisify } from "node:util";
8
+ import { extractStatement, formatTransactionsAsCsv } from "./index.js";
9
/** Promise-returning form of `execFile`, used below to probe for an interpreter. */
const execFileAsync = promisify(execFile);
/**
 * Absolute path of `scripts/pdf-extract-positions.py`, resolved one directory
 * above the directory that contains this module.
 */
const SCRIPT_PATH = path.resolve(
    path.dirname(fileURLToPath(import.meta.url)),
    "..",
    "scripts",
    "pdf-extract-positions.py",
);
/**
 * Picks a Python command that can actually be launched.
 *
 * `python3` is tried first, then `python`; each is run with `--version` and
 * the first one that completes without error is returned.
 *
 * @returns {Promise<string>} The command name that responded.
 * @throws {Error} When neither candidate could be executed.
 */
async function findPython() {
    const candidates = ["python3", "python"];
    for (let index = 0; index < candidates.length; index += 1) {
        const command = candidates[index];
        // A failed launch is expected here; it only means "try the next name".
        const launched = await execFileAsync(command, ["--version"]).then(() => true, () => false);
        if (launched) {
            return command;
        }
    }
    throw new Error("Python not found. Install Python or specify its path with --python.");
}
23
/**
 * Expands the CLI input into the list of files to process.
 *
 * @param {string} inputPath A file path or a directory path.
 * @returns {string[]} For a directory: the joined paths of its entries whose
 *   names end in `.pdf` (case-insensitive), sorted. For anything else: a
 *   one-element array holding `inputPath` unchanged.
 */
function collectPdfFiles(inputPath) {
    if (!fs.statSync(inputPath).isDirectory()) {
        return [inputPath];
    }
    const pdfPaths = [];
    for (const entryName of fs.readdirSync(inputPath)) {
        if (entryName.toLowerCase().endsWith(".pdf")) {
            pdfPaths.push(path.join(inputPath, entryName));
        }
    }
    pdfPaths.sort();
    return pdfPaths;
}
34
// Command-line definition: one required input (file or directory), optional
// Python path, optional output file, account type and verbosity.
program
    .name("hangseng-statement-extractor")
    .description("Extract transaction history from Hang Seng Bank statement PDFs")
    .requiredOption("-i, --input <path>", "Input PDF file or directory containing PDFs")
    .option("--python <path>", "Path to python executable")
    .option("-o, --output <path>", "Output CSV file (prints to stdout if omitted)")
    .addOption(new Option("-a, --account <type>", "Account type to extract: savings or current")
        .choices(["savings", "current"])
        .default("savings"))
    .option("-v, --verbose", "Enable verbose logging")
    .action(async (options) => {
        const inputPath = path.resolve(options.input);
        const scriptPath = SCRIPT_PATH;
        // An explicit --python value is resolved against the working directory;
        // otherwise the first launchable interpreter is used.
        const pythonPath = options.python
            ? path.resolve(options.python)
            : await findPython();
        const logger = options.verbose ? console : undefined;
        const pdfFiles = collectPdfFiles(inputPath);
        if (pdfFiles.length === 0) {
            console.error("No PDF files found");
            process.exitCode = 1;
            return;
        }
        const results = [];
        let errorCount = 0;
        // Files are processed one at a time; a failure on one file is reported
        // and counted but does not stop the remaining files.
        for (const pdfFile of pdfFiles) {
            if (options.verbose) {
                console.error(`Processing: ${path.basename(pdfFile)}`);
            }
            try {
                const result = await extractStatement({
                    pdfPath: pdfFile,
                    scriptPath,
                    pythonPath,
                    accountType: options.account,
                    logger,
                });
                results.push(result);
            }
            catch (error) {
                errorCount += 1;
                console.error(`Error processing ${pdfFile}:`, error);
            }
        }
        if (results.length === 0) {
            console.error("No transactions extracted");
            process.exitCode = 1;
            return;
        }
        const csv = formatTransactionsAsCsv(results);
        if (options.output) {
            fs.writeFileSync(options.output, `${csv}\n`);
            // Status goes to stderr so stdout stays reserved for CSV data.
            console.error(`Output written to: ${options.output}`);
        }
        else {
            process.stdout.write(`${csv}\n`);
        }
        // Partial success still yields output, but is signalled via the exit code.
        if (errorCount > 0) {
            process.exitCode = 1;
        }
    });
// The action handler is async, so the program is parsed with parseAsync() and
// any rejection (missing input path, Python not found, unwritable output, ...)
// is reported here instead of surfacing as an unhandled promise rejection.
program.parseAsync().catch((error) => {
    console.error(error instanceof Error ? error.message : error);
    process.exitCode = 1;
});
96
+ //# sourceMappingURL=cli.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AAEA,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,WAAW,CAAA;AAC3C,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAA;AAC7C,OAAO,KAAK,EAAE,MAAM,SAAS,CAAA;AAC7B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAA;AACjC,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAA;AACxC,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAA;AACrC,OAAO,EAAE,gBAAgB,EAAE,uBAAuB,EAAE,MAAM,YAAY,CAAA;AAGtE,MAAM,aAAa,GAAG,SAAS,CAAC,QAAQ,CAAC,CAAA;AAEzC,MAAM,WAAW,GAAG,IAAI,CAAC,OAAO,CAC9B,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAC5C,IAAI,EACJ,SAAS,EACT,0BAA0B,CAC3B,CAAA;AAED,KAAK,UAAU,UAAU;IACvB,KAAK,MAAM,SAAS,IAAI,CAAC,SAAS,EAAE,QAAQ,CAAC,EAAE,CAAC;QAC9C,IAAI,CAAC;YACH,MAAM,aAAa,CAAC,SAAS,EAAE,CAAC,WAAW,CAAC,CAAC,CAAA;YAC7C,OAAO,SAAS,CAAA;QAClB,CAAC;QAAC,MAAM,CAAC;YACP,sBAAsB;QACxB,CAAC;IACH,CAAC;IACD,MAAM,IAAI,KAAK,CACb,qEAAqE,CACtE,CAAA;AACH,CAAC;AAED,SAAS,eAAe,CAAC,SAAiB;IACxC,MAAM,IAAI,GAAG,EAAE,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAA;IACnC,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;QACvB,OAAO,EAAE;aACN,WAAW,CAAC,SAAS,CAAC;aACtB,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;aACrD,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,CAAC;aACzC,IAAI,EAAE,CAAA;IACX,CAAC;IAED,OAAO,CAAC,SAAS,CAAC,CAAA;AACpB,CAAC;AAED,OAAO;KACJ,IAAI,CAAC,8BAA8B,CAAC;KACpC,WAAW,CAAC,gEAAgE,CAAC;KAC7E,cAAc,CACb,oBAAoB,EACpB,6CAA6C,CAC9C;KACA,MAAM,CAAC,iBAAiB,EAAE,2BAA2B,CAAC;KACtD,MAAM,CACL,qBAAqB,EACrB,+CAA+C,CAChD;KACA,SAAS,CACR,IAAI,MAAM,CACR,sBAAsB,EACtB,6CAA6C,CAC9C;KACE,OAAO,CAAC,CAAC,SAAS,EAAE,SAAS,CAAC,CAAC;KAC/B,OAAO,CAAC,SAAS,CAAC,CACtB;KACA,MAAM,CAAC,eAAe,EAAE,wBAAwB,CAAC;KACjD,MAAM,CACL,KAAK,EAAE,OAMN,EAAE,EAAE;IACH,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,CAAA;IAC7C,MAAM,UAAU,GAAG,WAAW,CAAA;IAC9B,MAAM,UAAU,GAAG,OAAO,CAAC,MAAM;QAC/B,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC;QAC9B,CAAC,CAAC,MAAM,UAAU,EAAE,CAAA;IACtB,MAAM,MAAM,GAAuB,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS,CAAA
;IAExE,MAAM,QAAQ,GAAG,eAAe,CAAC,SAAS,CAAC,CAAA;IAC3C,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,OAAO,CAAC,KAAK,CAAC,oBAAoB,CAAC,CAAA;QACnC,OAAO,CAAC,QAAQ,GAAG,CAAC,CAAA;QACpB,OAAM;IACR,CAAC;IAED,MAAM,OAAO,GAAsB,EAAE,CAAA;IACrC,IAAI,UAAU,GAAG,CAAC,CAAA;IAElB,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;YACpB,OAAO,CAAC,KAAK,CAAC,eAAe,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC,CAAA;QACxD,CAAC;QAED,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,gBAAgB,CAAC;gBACpC,OAAO,EAAE,OAAO;gBAChB,UAAU;gBACV,UAAU;gBACV,WAAW,EAAE,OAAO,CAAC,OAAO;gBAC5B,MAAM;aACP,CAAC,CAAA;YACF,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QACtB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,UAAU,IAAI,CAAC,CAAA;YACf,OAAO,CAAC,KAAK,CAAC,oBAAoB,OAAO,GAAG,EAAE,KAAK,CAAC,CAAA;QACtD,CAAC;IACH,CAAC;IAED,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO,CAAC,KAAK,CAAC,2BAA2B,CAAC,CAAA;QAC1C,OAAO,CAAC,QAAQ,GAAG,CAAC,CAAA;QACpB,OAAM;IACR,CAAC;IAED,MAAM,GAAG,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAA;IAC5C,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;QACnB,EAAE,CAAC,aAAa,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,GAAG,IAAI,CAAC,CAAA;QAC5C,OAAO,CAAC,KAAK,CAAC,sBAAsB,OAAO,CAAC,MAAM,EAAE,CAAC,CAAA;IACvD,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,IAAI,CAAC,CAAA;IAClC,CAAC;IAED,IAAI,UAAU,GAAG,CAAC,EAAE,CAAC;QACnB,OAAO,CAAC,QAAQ,GAAG,CAAC,CAAA;IACtB,CAAC;AACH,CAAC,CACF,CAAA;AAEH,OAAO,CAAC,KAAK,EAAE,CAAA"}
@@ -0,0 +1,6 @@
1
+ export { extractStatement } from "./lib/extractor.js";
2
+ export { formatTransactionsAsCsv } from "./lib/csv.js";
3
+ export type { AccountType, Logger, StatementResult, TextElement, Transaction, } from "./lib/types.js";
4
+ export type { ExtractStatementOptions } from "./lib/extractor.js";
5
+ export type { PdfMinerOptions } from "./lib/pdfminer.js";
6
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAA;AACrD,OAAO,EAAE,uBAAuB,EAAE,MAAM,cAAc,CAAA;AACtD,YAAY,EACV,WAAW,EACX,MAAM,EACN,eAAe,EACf,WAAW,EACX,WAAW,GACZ,MAAM,gBAAgB,CAAA;AACvB,YAAY,EAAE,uBAAuB,EAAE,MAAM,oBAAoB,CAAA;AACjE,YAAY,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAA"}
package/dist/index.js ADDED
@@ -0,0 +1,3 @@
1
+ export { extractStatement } from "./lib/extractor.js";
2
+ export { formatTransactionsAsCsv } from "./lib/csv.js";
3
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAA;AACrD,OAAO,EAAE,uBAAuB,EAAE,MAAM,cAAc,CAAA"}
@@ -0,0 +1,3 @@
1
+ import { StatementResult } from "./types.js";
2
+ export declare function formatTransactionsAsCsv(results: StatementResult[]): string;
3
+ //# sourceMappingURL=csv.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"csv.d.ts","sourceRoot":"","sources":["../../src/lib/csv.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,MAAM,YAAY,CAAA;AAS5C,wBAAgB,uBAAuB,CAAC,OAAO,EAAE,eAAe,EAAE,GAAG,MAAM,CAuB1E"}
@@ -0,0 +1,23 @@
1
/**
 * Quotes a single CSV field when its content requires it.
 *
 * A field is wrapped in double quotes (with embedded quotes doubled) when it
 * contains a double quote, a comma, a line feed or a carriage return. A bare
 * carriage return is included because CSV readers treat it as a line break,
 * so leaving it unquoted would split the record.
 *
 * @param {string} value Raw field text.
 * @returns {string} The field, quoted only when necessary.
 */
function escapeCsvField(value) {
    if (!/[",\r\n]/.test(value)) {
        return value;
    }
    return `"${value.replace(/"/g, '""')}"`;
}
7
/**
 * Renders statement results as CSV text.
 *
 * The first line is a fixed header; each transaction of each result follows
 * as one line, in input order. Deposit and withdrawal are blank when `null`,
 * otherwise formatted with two decimals; the balance is always formatted with
 * two decimals. Details and statement date pass through `escapeCsvField`.
 * Lines are joined with `\n` and no trailing newline is added.
 *
 * @param {Array} results Statement results, each with `statementDate`,
 *   `accountType` and `transactions`.
 * @returns {string} CSV text.
 */
export function formatTransactionsAsCsv(results) {
    const formatOptionalAmount = (amount) => (amount === null ? "" : amount.toFixed(2));
    const lines = ["Date,Details,Deposit,Withdrawal,Balance,Account Type,Statement Date"];
    results.forEach((statement) => {
        statement.transactions.forEach((entry) => {
            const depositText = formatOptionalAmount(entry.deposit);
            const withdrawalText = formatOptionalAmount(entry.withdrawal);
            const balanceText = entry.balance.toFixed(2);
            const detailsText = escapeCsvField(entry.details);
            const statementDateText = escapeCsvField(statement.statementDate);
            lines.push(`${entry.date},${detailsText},${depositText},${withdrawalText},${balanceText},${statement.accountType},${statementDateText}`);
        });
    });
    return lines.join("\n");
}
23
+ //# sourceMappingURL=csv.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"csv.js","sourceRoot":"","sources":["../../src/lib/csv.ts"],"names":[],"mappings":"AAEA,SAAS,cAAc,CAAC,KAAa;IACnC,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QACvE,OAAO,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,GAAG,CAAA;IAC1C,CAAC;IACD,OAAO,KAAK,CAAA;AACd,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,OAA0B;IAChE,MAAM,MAAM,GACV,qEAAqE,CAAA;IACvE,MAAM,IAAI,GAAa,EAAE,CAAA;IAEzB,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,KAAK,MAAM,WAAW,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;YAC9C,MAAM,OAAO,GACX,WAAW,CAAC,OAAO,KAAK,IAAI,CAAC,CAAC,CAAC,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAA;YACpE,MAAM,UAAU,GACd,WAAW,CAAC,UAAU,KAAK,IAAI,CAAC,CAAC,CAAC,WAAW,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAA;YAC1E,MAAM,OAAO,GAAG,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA;YAC9C,MAAM,OAAO,GAAG,cAAc,CAAC,WAAW,CAAC,OAAO,CAAC,CAAA;YACnD,MAAM,WAAW,GAAG,MAAM,CAAC,WAAW,CAAA;YACtC,MAAM,aAAa,GAAG,cAAc,CAAC,MAAM,CAAC,aAAa,CAAC,CAAA;YAE1D,IAAI,CAAC,IAAI,CACP,GAAG,WAAW,CAAC,IAAI,IAAI,OAAO,IAAI,OAAO,IAAI,UAAU,IAAI,OAAO,IAAI,WAAW,IAAI,aAAa,EAAE,CACrG,CAAA;QACH,CAAC;IACH,CAAC;IAED,OAAO,CAAC,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;AACrC,CAAC"}
@@ -0,0 +1,8 @@
1
+ import { PdfMinerOptions } from "./pdfminer.js";
2
+ import { AccountType, Logger, StatementResult } from "./types.js";
3
+ export interface ExtractStatementOptions extends PdfMinerOptions {
4
+ accountType: AccountType;
5
+ logger?: Logger;
6
+ }
7
+ export declare function extractStatement(options: ExtractStatementOptions): Promise<StatementResult>;
8
+ //# sourceMappingURL=extractor.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extractor.d.ts","sourceRoot":"","sources":["../../src/lib/extractor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAuB,eAAe,EAAE,MAAM,eAAe,CAAA;AACpE,OAAO,EAAE,WAAW,EAAE,MAAM,EAAE,eAAe,EAAE,MAAM,YAAY,CAAA;AAGjE,MAAM,WAAW,uBAAwB,SAAQ,eAAe;IAC9D,WAAW,EAAE,WAAW,CAAA;IACxB,MAAM,CAAC,EAAE,MAAM,CAAA;CAChB;AAED,wBAAsB,gBAAgB,CACpC,OAAO,EAAE,uBAAuB,GAC/B,OAAO,CAAC,eAAe,CAAC,CAgB1B"}
@@ -0,0 +1,14 @@
1
+ import { extractTextElements } from "./pdfminer.js";
2
+ import { findStatementDate, formatDateIso, parseTransactions } from "./parser.js";
3
/**
 * Extracts one statement: reads positioned text elements, locates the
 * statement date, and parses the transactions for the requested account type.
 *
 * @param {object} options Forwarded as-is to `extractTextElements`; this
 *   function additionally reads `options.accountType` and `options.logger`.
 * @returns {Promise<object>} `{ statementDate, accountType, transactions }`,
 *   where `statementDate` is the result of `formatDateIso` on the located date.
 * @throws Whatever `extractTextElements`, `formatDateIso` or
 *   `parseTransactions` throw.
 */
export async function extractStatement(options) {
    const textElements = await extractTextElements(options);
    const { accountType, logger } = options;
    const rawStatementDate = findStatementDate(textElements);
    // Properties are evaluated in order, so the date is validated before the
    // transactions are parsed.
    return {
        statementDate: formatDateIso(rawStatementDate),
        accountType,
        transactions: parseTransactions(textElements, accountType, rawStatementDate, logger),
    };
}
14
+ //# sourceMappingURL=extractor.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extractor.js","sourceRoot":"","sources":["../../src/lib/extractor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAmB,MAAM,eAAe,CAAA;AAEpE,OAAO,EAAE,iBAAiB,EAAE,aAAa,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAA;AAOjF,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,OAAgC;IAEhC,MAAM,QAAQ,GAAG,MAAM,mBAAmB,CAAC,OAAO,CAAC,CAAA;IACnD,MAAM,gBAAgB,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAA;IACpD,MAAM,aAAa,GAAG,aAAa,CAAC,gBAAgB,CAAC,CAAA;IACrD,MAAM,YAAY,GAAG,iBAAiB,CACpC,QAAQ,EACR,OAAO,CAAC,WAAW,EACnB,gBAAgB,EAChB,OAAO,CAAC,MAAM,CACf,CAAA;IAED,OAAO;QACL,aAAa;QACb,WAAW,EAAE,OAAO,CAAC,WAAW;QAChC,YAAY;KACb,CAAA;AACH,CAAC"}
@@ -0,0 +1,5 @@
1
+ import { AccountType, Logger, TextElement, Transaction } from "./types.js";
2
+ export declare function formatDateIso(dateStr: string): string;
3
+ export declare function findStatementDate(elements: TextElement[]): string;
4
+ export declare function parseTransactions(elements: TextElement[], accountType: AccountType, statementDate: string, logger?: Logger): Transaction[];
5
+ //# sourceMappingURL=parser.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"parser.d.ts","sourceRoot":"","sources":["../../src/lib/parser.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,YAAY,CAAA;AAmD1E,wBAAgB,aAAa,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAKrD;AAsHD,wBAAgB,iBAAiB,CAAC,QAAQ,EAAE,WAAW,EAAE,GAAG,MAAM,CAgBjE;AAuDD,wBAAgB,iBAAiB,CAC/B,QAAQ,EAAE,WAAW,EAAE,EACvB,WAAW,EAAE,WAAW,EACxB,aAAa,EAAE,MAAM,EACrB,MAAM,CAAC,EAAE,MAAM,GACd,WAAW,EAAE,CA4Ff"}
@@ -0,0 +1,248 @@
1
/** Maps three-letter English month abbreviations to month numbers 1-12. */
const MONTHS = Object.fromEntries(
    ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
        .map((abbreviation, index) => [abbreviation, index + 1]),
);
/**
 * Parses a statement date written as `D Mon YYYY` (for example `5 Jan 2024`).
 *
 * @param {string} statementDate Date text; must match the format exactly.
 * @returns {{year: number, month: number, day: number}} Numeric date parts.
 * @throws {Error} When the text does not match the format or the month
 *   abbreviation is not a key of `MONTHS`.
 */
function parseStatementDate(statementDate) {
    const parts = statementDate.match(/^(\d{1,2})\s+(\w{3})\s+(\d{4})$/);
    if (parts === null) {
        throw new Error(`Invalid statement date format: "${statementDate}"`);
    }
    const [, dayText, monthName, yearText] = parts;
    const month = MONTHS[monthName];
    if (!month) {
        throw new Error(`Unknown month in statement date: "${monthName}"`);
    }
    return {
        year: Number.parseInt(yearText, 10),
        month,
        day: Number.parseInt(dayText, 10),
    };
}
18
/**
 * Turns a year-less transaction date (`D Mon`) into an ISO `YYYY-MM-DD` date.
 *
 * The statement's year is assumed first. If that would place the transaction
 * after the statement date, the transaction is taken to belong to the
 * previous year (a statement that spans a year boundary).
 *
 * @param {string} shortDate Transaction date such as `28 Dec`.
 * @param {number} stmtYear Statement year.
 * @param {number} stmtMonth Statement month (1-12).
 * @param {number} stmtDay Statement day of month.
 * @returns {string} The resolved date as `YYYY-MM-DD`.
 * @throws {Error} When the text does not match `D Mon` or the month
 *   abbreviation is not a key of `MONTHS`.
 */
function resolveTransactionDate(shortDate, stmtYear, stmtMonth, stmtDay) {
    const parts = shortDate.match(/^(\d{1,2})\s+(\w{3})$/);
    if (!parts) {
        throw new Error(`Invalid transaction date format: "${shortDate}"`);
    }
    const day = Number.parseInt(parts[1], 10);
    const month = MONTHS[parts[2]];
    if (!month) {
        throw new Error(`Unknown month in transaction date: "${parts[2]}"`);
    }
    // Encode dates as YYYYMMDD numbers so that they can be compared directly.
    const asOrdinal = (y, m, d) => y * 10000 + m * 100 + d;
    const fallsAfterStatement = asOrdinal(stmtYear, month, day) > asOrdinal(stmtYear, stmtMonth, stmtDay);
    const year = fallsAfterStatement ? stmtYear - 1 : stmtYear;
    const twoDigits = (n) => String(n).padStart(2, "0");
    return `${year}-${twoDigits(month)}-${twoDigits(day)}`;
}
40
/**
 * Converts a statement date written as `D Mon YYYY` to ISO `YYYY-MM-DD`.
 *
 * @param {string} dateStr Date text accepted by `parseStatementDate`.
 * @returns {string} The same date with zero-padded month and day.
 * @throws {Error} When `parseStatementDate` rejects the text.
 */
export function formatDateIso(dateStr) {
    const parsed = parseStatementDate(dateStr);
    const twoDigits = (n) => String(n).padStart(2, "0");
    return [parsed.year, twoDigits(parsed.month), twoDigits(parsed.day)].join("-");
}
46
/** Builds one horizontal band; `detectColumn` treats it as `min <= x0 < max`. */
const columnBand = (min, max) => ({ min, max });
/**
 * Horizontal bands used to assign a text element to a table column by its
 * `x0` coordinate. Adjacent bands share a boundary value.
 * NOTE(review): the values are presumably tuned to the statement layout as
 * reported by the extraction script — confirm before changing.
 */
const COLUMNS = {
    DATE: columnBand(55, 105),
    DETAILS: columnBand(105, 345),
    DEPOSIT: columnBand(345, 420),
    WITHDRAWAL: columnBand(420, 490),
    BALANCE: columnBand(490, 550),
};
53
/**
 * Parses an amount that may contain thousands separators.
 *
 * @param {string} str Amount text such as `1,234.56`.
 * @returns {number|null} The parsed number, or `null` for empty, blank or
 *   non-numeric text.
 */
function parseNumber(str) {
    const isBlank = !str || str.trim() === "";
    if (isBlank) {
        return null;
    }
    const withoutSeparators = str.split(",").join("").trim();
    // NOTE(review): parseFloat stops at the first non-numeric character, so a
    // trailing marker (e.g. "12.50DR") is dropped silently — confirm whether
    // such suffixes can occur in the amount columns.
    const value = Number.parseFloat(withoutSeparators);
    if (Number.isNaN(value)) {
        return null;
    }
    return value;
}
60
/**
 * Maps a horizontal position to the table column whose band contains it.
 *
 * @param {number} x0 Left coordinate of a text element.
 * @returns {"date"|"details"|"deposit"|"withdrawal"|"balance"|null} The
 *   column name, or `null` when `x0` lies outside every band in `COLUMNS`.
 */
function detectColumn(x0) {
    // Checked in left-to-right order; each band is inclusive of min and
    // exclusive of max.
    const bands = [
        ["date", COLUMNS.DATE],
        ["details", COLUMNS.DETAILS],
        ["deposit", COLUMNS.DEPOSIT],
        ["withdrawal", COLUMNS.WITHDRAWAL],
        ["balance", COLUMNS.BALANCE],
    ];
    for (const [columnName, band] of bands) {
        if (x0 >= band.min && x0 < band.max) {
            return columnName;
        }
    }
    return null;
}
73
/**
 * Groups text elements into visual rows by vertical position.
 *
 * An element joins the first existing row whose key is within 2 units of the
 * element's `y0`; otherwise it starts a new row keyed by its own `y0`.
 * NOTE(review): only `y0` is compared — `page` is not consulted, so elements
 * from different pages with similar `y0` land in the same row. Confirm how the
 * extraction script reports `y0` for multi-page sections.
 *
 * @param {Array} elements Elements exposing a numeric `y0`.
 * @returns {Map<number, Array>} Rows keyed by the `y0` of their first element,
 *   in order of first appearance.
 */
function groupIntoRows(elements) {
    const tolerance = 2;
    const rowsByY = new Map();
    for (const element of elements) {
        const anchorY = Array.from(rowsByY.keys())
            .find((y) => Math.abs(y - element.y0) <= tolerance);
        if (anchorY === undefined) {
            rowsByY.set(element.y0, [element]);
        }
        else {
            rowsByY.get(anchorY).push(element);
        }
    }
    return rowsByY;
}
93
/**
 * Groups elements into rows (see `groupIntoRows`) and returns the rows
 * ordered by ascending row key.
 *
 * @param {Array} elements Elements exposing a numeric `y0`.
 * @returns {Array<Array>} Rows, each an array of elements.
 */
function groupIntoRowsArray(elements) {
    return Array.from(groupIntoRows(elements).entries())
        .sort(([firstY], [secondY]) => firstY - secondY)
        .map(([, rowElements]) => rowElements);
}
98
/**
 * Converts the text elements of one visual row into a column record.
 *
 * Each element is assigned to a column by its `x0` (see `detectColumn`).
 * Elements are visited left to right, so when several elements fall into the
 * same column the right-most one wins. Elements outside every column band are
 * ignored. Amount columns are parsed with `parseNumber`.
 *
 * The input array is not modified: sorting happens on a copy, so callers can
 * keep using their row in its original order.
 *
 * @param {Array} elements Elements of a single row, exposing `x0` and `text`.
 * @returns {{date: string|null, details: string|null, deposit: number|null,
 *   withdrawal: number|null, balance: number|null}} The parsed row.
 */
function parseRow(elements) {
    const row = {
        date: null,
        details: null,
        deposit: null,
        withdrawal: null,
        balance: null,
    };
    // Array.prototype.sort works in place, hence the copy.
    const leftToRight = [...elements].sort((a, b) => a.x0 - b.x0);
    for (const el of leftToRight) {
        const col = detectColumn(el.x0);
        const text = el.text.trim();
        switch (col) {
            case "date":
                row.date = text;
                break;
            case "details":
                row.details = text;
                break;
            case "deposit":
                row.deposit = parseNumber(text);
                break;
            case "withdrawal":
                row.withdrawal = parseNumber(text);
                break;
            case "balance":
                row.balance = parseNumber(text);
                break;
        }
    }
    return row;
}
130
/**
 * Returns the first element, in array order, that contains `text` and lies at
 * or after a starting position.
 *
 * An element qualifies when it is on a page after `startPage`, or on
 * `startPage` itself with `y0 >= startY`.
 *
 * @param {Array} elements Elements exposing `page`, `y0` and `text`.
 * @param {string} text Substring to look for.
 * @param {number} [startPage=0] Page from which to search.
 * @param {number} [startY=0] Minimum `y0` on `startPage`.
 * @returns {object|null} The matching element, or `null` when none qualifies.
 */
function findElementWithText(elements, text, startPage = 0, startY = 0) {
    for (const element of elements) {
        const onLaterPage = element.page > startPage;
        const atOrPastStart = element.page === startPage && element.y0 >= startY;
        if ((onLaterPage || atOrPastStart) && element.text.includes(text)) {
            return element;
        }
    }
    return null;
}
134
/**
 * Locates the statement date in the extracted text.
 *
 * The first element whose whole text looks like `D Mon YYYY` is accepted when
 * some element on the same page, within 3 units of its `y0`, contains the
 * label "Statement Date".
 *
 * @param {Array} elements Elements exposing `page`, `y0` and `text`.
 * @returns {string} The date text as found, or the literal "Unknown" when no
 *   labelled date exists.
 */
export function findStatementDate(elements) {
    const datePattern = /^\d{1,2}\s+\w{3}\s+\d{4}$/;
    const hasLabelBeside = (dateEl) => elements.some((other) => other.page === dateEl.page &&
        Math.abs(other.y0 - dateEl.y0) < 3 &&
        other.text.includes("Statement Date"));
    const labelledDate = elements.find((el) => el.text.match(datePattern) && hasLabelBeside(el));
    return labelledDate ? labelledDate.text : "Unknown";
}
147
/**
 * Finds the bounds of the transaction table for one account type.
 *
 * Three markers are located in sequence, each searched from the previous
 * marker's page and `y1`: the account heading, the "DR=Debit" line, and
 * "Transaction Summary". The section starts at the `y1` of the "DR=Debit"
 * element and ends at the `y0` of the "Transaction Summary" element.
 *
 * @param {Array} elements Elements exposing `page`, `y0`, `y1` and `text`.
 * @param {"savings"|"current"} accountType Which account heading to look for.
 * @returns {{start: {page: number, y0: number}|null,
 *   end: {page: number, y0: number}|null}} Both bounds, or both `null` when
 *   any marker is missing.
 */
function findTransactionsSection(elements, accountType) {
    const notFound = () => ({ start: null, end: null });
    let headingText = "Integrated Account Current";
    if (accountType === "savings") {
        headingText = "Integrated Account Statement Savings";
    }
    const heading = findElementWithText(elements, headingText);
    if (!heading) {
        return notFound();
    }
    const legend = findElementWithText(elements, "DR=Debit", heading.page, heading.y1);
    if (!legend) {
        return notFound();
    }
    const summary = findElementWithText(elements, "Transaction Summary", legend.page, legend.y1);
    if (!summary) {
        return notFound();
    }
    const start = { page: legend.page, y0: legend.y1 };
    const end = { page: summary.page, y0: summary.y0 };
    return { start, end };
}
170
/**
 * Selects the elements that lie inside a section.
 *
 * An element is kept when its page is within `[start.page, end.page]`, it is
 * not below `start.y0` on the start page, and it is below `end.y0` on the end
 * page (the end bound is exclusive).
 *
 * @param {{page: number, y0: number}} start Inclusive start bound.
 * @param {{page: number, y0: number}} end Exclusive end bound.
 * @param {Array} elements Elements exposing `page` and `y0`.
 * @returns {Array} The kept elements, in their original order.
 */
function getElementsInSection(start, end, elements) {
    const kept = [];
    for (const el of elements) {
        const outsidePages = el.page < start.page || el.page > end.page;
        const beforeStart = el.page === start.page && el.y0 < start.y0;
        const atOrAfterEnd = el.page === end.page && el.y0 >= end.y0;
        if (!(outsidePages || beforeStart || atOrAfterEnd)) {
            kept.push(el);
        }
    }
    return kept;
}
181
/**
 * Parses the transactions of one account from positioned text elements.
 *
 * The transaction section is located, its elements are grouped into rows, and
 * the rows are walked in order:
 * - a row with a deposit or withdrawal becomes a transaction; it inherits the
 *   most recent date when it has none of its own, and its details are
 *   prefixed with any pending description lines;
 * - a "B/F BALANCE" row sets the opening balance and must precede every
 *   transaction;
 * - any other row contributes its details to the next transaction.
 *
 * The running balance is computed from the previous balance for every
 * transaction. When the row also prints a balance, the two are compared in
 * whole cents and a mismatch is reported; the printed balance is the one
 * stored. When the row prints no balance, the computed one is stored.
 *
 * @param {Array} elements All text elements of the statement.
 * @param {"savings"|"current"} accountType Account whose section is parsed.
 * @param {string} statementDate Statement date as `D Mon YYYY`, used to
 *   resolve the year of each transaction date.
 * @param {object} [logger] Optional logger; only `debug` is called.
 * @returns {Array<{date: string, details: string, deposit: number|null,
 *   withdrawal: number|null, balance: number}>} Transactions in row order.
 * @throws {Error} When the statement date is invalid, the section cannot be
 *   found, a transaction has no date or precedes the opening balance, a
 *   printed balance disagrees with the computed one, a non-transaction row
 *   has no details, or the opening-balance row is malformed or misplaced.
 */
export function parseTransactions(elements, accountType, statementDate, logger) {
    const stmt = parseStatementDate(statementDate);
    const { start, end } = findTransactionsSection(elements, accountType);
    if (!start || !end) {
        throw new Error("Could not find transactions section in PDF");
    }
    logger?.debug?.(`Transaction section: start page ${start.page} y ${start.y0}, end page ${end.page} y ${end.y0}`);
    const sectionElements = getElementsInSection(start, end, elements);
    const rows = groupIntoRowsArray(sectionElements);
    // Amounts are compared in whole cents so that binary floating-point noise
    // in the running sum cannot trigger a false mismatch.
    const toCents = (amount) => Math.round(amount * 100);
    let openingBalance = null;
    let lastDate = "";
    // Description lines seen since the last transaction; they belong to the next one.
    const desc = [];
    const transactions = [];
    for (let i = 0; i < rows.length; i++) {
        const parsedRow = parseRow(rows[i]);
        const date = parsedRow.date ?? lastDate;
        if (parsedRow.date) {
            lastDate = parsedRow.date;
        }
        if (parsedRow.deposit !== null || parsedRow.withdrawal !== null) {
            if (!date) {
                throw new Error(`Could not determine date for transaction at row ${i + 1}`);
            }
            if (openingBalance === null) {
                throw new Error(`Transaction found before opening balance at row ${i + 1}`);
            }
            const previousTransaction = transactions[transactions.length - 1];
            const previousBalance = previousTransaction
                ? previousTransaction.balance
                : openingBalance;
            // Always derive the balance independently, so that a printed
            // balance is actually checked rather than compared with itself.
            const calculatedBalance = previousBalance + (parsedRow.deposit ?? 0) - (parsedRow.withdrawal ?? 0);
            if (parsedRow.balance !== null &&
                toCents(parsedRow.balance) !== toCents(calculatedBalance)) {
                throw new Error(`Balance mismatch at row ${i + 1}: calculated ${calculatedBalance.toFixed(2)}, found ${parsedRow.balance}`);
            }
            const balance = parsedRow.balance ?? calculatedBalance;
            const details = [...desc, parsedRow.details ?? ""].join(" ").trim();
            desc.length = 0;
            transactions.push({
                date: resolveTransactionDate(date, stmt.year, stmt.month, stmt.day),
                details,
                deposit: parsedRow.deposit,
                withdrawal: parsedRow.withdrawal,
                balance,
            });
        }
        else {
            if (!parsedRow.details) {
                throw new Error(`Found non-transaction row without details at row ${i + 1}`);
            }
            if (parsedRow.details === "B/F BALANCE") {
                if (parsedRow.balance === null) {
                    throw new Error(`Opening balance row missing balance amount at row ${i + 1}`);
                }
                if (transactions.length > 0) {
                    throw new Error(`Opening balance row found after transactions at row ${i + 1}`);
                }
                logger?.debug?.(`Opening balance detected: ${parsedRow.balance}`);
                openingBalance = parsedRow.balance;
            }
            else {
                desc.push(parsedRow.details.trim());
            }
        }
    }
    return transactions;
}
248
+ //# sourceMappingURL=parser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"parser.js","sourceRoot":"","sources":["../../src/lib/parser.ts"],"names":[],"mappings":"AAEA,MAAM,MAAM,GAA2B;IACrC,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC;IAC9C,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,EAAE,EAAE,GAAG,EAAE,EAAE,EAAE,GAAG,EAAE,EAAE;CAClD,CAAA;AAED,SAAS,kBAAkB,CAAC,aAAqB;IAC/C,MAAM,KAAK,GAAG,aAAa,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAA;IACpE,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,IAAI,KAAK,CAAC,mCAAmC,aAAa,GAAG,CAAC,CAAA;IACtE,CAAC;IACD,MAAM,GAAG,GAAG,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAE,EAAE,EAAE,CAAC,CAAA;IAC1C,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,CAAA;IAC/B,MAAM,IAAI,GAAG,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAE,EAAE,EAAE,CAAC,CAAA;IAC3C,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,IAAI,KAAK,CAAC,qCAAqC,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,CAAA;IACnE,CAAC;IACD,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,CAAA;AAC7B,CAAC;AAED,SAAS,sBAAsB,CAC7B,SAAiB,EACjB,QAAgB,EAChB,SAAiB,EACjB,OAAe;IAEf,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAA;IACtD,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,IAAI,KAAK,CAAC,qCAAqC,SAAS,GAAG,CAAC,CAAA;IACpE,CAAC;IACD,MAAM,KAAK,GAAG,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAE,EAAE,EAAE,CAAC,CAAA;IAC5C,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,CAAA;IACjC,IAAI,CAAC,OAAO,EAAE,CAAC;QACb,MAAM,IAAI,KAAK,CAAC,uCAAuC,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,CAAA;IACrE,CAAC;IAED,sFAAsF;IACtF,oFAAoF;IACpF,IAAI,MAAM,GAAG,QAAQ,CAAA;IACrB,MAAM,SAAS,GAAG,MAAM,GAAG,KAAK,GAAG,OAAO,GAAG,GAAG,GAAG,KAAK,CAAA;IACxD,MAAM,WAAW,GAAG,QAAQ,GAAG,KAAK,GAAG,SAAS,GAAG,GAAG,GAAG,OAAO,CAAA;IAChE,IAAI,SAAS,GAAG,WAAW,EAAE,CAAC;QAC5B,MAAM,IAAI,CAAC,CAAA;IACb,CAAC;IAED,MAAM,EAAE,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,CAAA;IAC3C,MAAM,EAAE,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,CAAA;IACzC,OAAO,GAAG,MAAM,IAAI,EAAE,IAAI,EAAE,EAAE,CAAA;AAChC,CAAC;AAED,MAAM,UAAU,aAAa,CAAC,OAAe;IAC3C,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,kBAA
kB,CAAC,OAAO,CAAC,CAAA;IACxD,MAAM,EAAE,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,CAAA;IACzC,MAAM,EAAE,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,CAAA;IACvC,OAAO,GAAG,IAAI,IAAI,EAAE,IAAI,EAAE,EAAE,CAAA;AAC9B,CAAC;AAED,MAAM,OAAO,GAAG;IACd,IAAI,EAAE,EAAE,GAAG,EAAE,EAAE,EAAE,GAAG,EAAE,GAAG,EAAE;IAC3B,OAAO,EAAE,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE;IAC/B,OAAO,EAAE,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE;IAC/B,UAAU,EAAE,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE;IAClC,OAAO,EAAE,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE;CAChC,CAAA;AAUD,SAAS,WAAW,CAAC,GAAW;IAC9B,IAAI,CAAC,GAAG,IAAI,GAAG,CAAC,IAAI,EAAE,KAAK,EAAE;QAAE,OAAO,IAAI,CAAA;IAC1C,MAAM,OAAO,GAAG,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;IAC5C,MAAM,GAAG,GAAG,MAAM,CAAC,UAAU,CAAC,OAAO,CAAC,CAAA;IACtC,OAAO,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAA;AACvC,CAAC;AAED,SAAS,YAAY,CACnB,EAAU;IAEV,IAAI,EAAE,IAAI,OAAO,CAAC,IAAI,CAAC,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC,GAAG;QAAE,OAAO,MAAM,CAAA;IAClE,IAAI,EAAE,IAAI,OAAO,CAAC,OAAO,CAAC,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG;QAAE,OAAO,SAAS,CAAA;IAC3E,IAAI,EAAE,IAAI,OAAO,CAAC,OAAO,CAAC,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG;QAAE,OAAO,SAAS,CAAA;IAC3E,IAAI,EAAE,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,UAAU,CAAC,GAAG;QAC7D,OAAO,YAAY,CAAA;IACrB,IAAI,EAAE,IAAI,OAAO,CAAC,OAAO,CAAC,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG;QAAE,OAAO,SAAS,CAAA;IAC3E,OAAO,IAAI,CAAA;AACb,CAAC;AAED,SAAS,aAAa,CAAC,QAAuB;IAC5C,MAAM,IAAI,GAAG,IAAI,GAAG,EAAyB,CAAA;IAC7C,MAAM,SAAS,GAAG,CAAC,CAAA;IAEnB,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,IAAI,IAAI,GAAkB,IAAI,CAAA;QAC9B,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;YAC5B,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,IAAI,SAAS,EAAE,CAAC;gBACrC,IAAI,GAAG,CAAC,CAAA;gBACR,MAAK;YACP,CAAC;QACH,CAAC;QAED,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;YAClB,IAAI,CAAC,GAAG,CAAC,IAAI,CAAE,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;QAC1B,CAAC;aAAM,CAAC;YACN,IA
AI,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC,CAAA;QACvB,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAA;AACb,CAAC;AAED,SAAS,kBAAkB,CAAC,QAAuB;IACjD,MAAM,MAAM,GAAG,aAAa,CAAC,QAAQ,CAAC,CAAA;IACtC,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;IAChE,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAE,CAAC,CAAA;AAC5C,CAAC;AAED,SAAS,QAAQ,CAAC,QAAuB;IACvC,MAAM,GAAG,GAAY;QACnB,IAAI,EAAE,IAAI;QACV,OAAO,EAAE,IAAI;QACb,OAAO,EAAE,IAAI;QACb,UAAU,EAAE,IAAI;QAChB,OAAO,EAAE,IAAI;KACd,CAAA;IAED,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,EAAE,CAAC,CAAA;IAEpC,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,MAAM,GAAG,GAAG,YAAY,CAAC,EAAE,CAAC,EAAE,CAAC,CAAA;QAC/B,MAAM,IAAI,GAAG,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAA;QAE3B,QAAQ,GAAG,EAAE,CAAC;YACZ,KAAK,MAAM;gBACT,GAAG,CAAC,IAAI,GAAG,IAAI,CAAA;gBACf,MAAK;YACP,KAAK,SAAS;gBACZ,GAAG,CAAC,OAAO,GAAG,IAAI,CAAA;gBAClB,MAAK;YACP,KAAK,SAAS;gBACZ,GAAG,CAAC,OAAO,GAAG,WAAW,CAAC,IAAI,CAAC,CAAA;gBAC/B,MAAK;YACP,KAAK,YAAY;gBACf,GAAG,CAAC,UAAU,GAAG,WAAW,CAAC,IAAI,CAAC,CAAA;gBAClC,MAAK;YACP,KAAK,SAAS;gBACZ,GAAG,CAAC,OAAO,GAAG,WAAW,CAAC,IAAI,CAAC,CAAA;gBAC/B,MAAK;QACT,CAAC;IACH,CAAC;IAED,OAAO,GAAG,CAAA;AACZ,CAAC;AAED,SAAS,mBAAmB,CAC1B,QAAuB,EACvB,IAAY,EACZ,SAAS,GAAG,CAAC,EACb,MAAM,GAAG,CAAC;IAEV,OAAO,CACL,QAAQ,CAAC,IAAI,CACX,CAAC,EAAE,EAAE,EAAE,CACL,CAAC,EAAE,CAAC,IAAI,GAAG,SAAS,IAAI,CAAC,EAAE,CAAC,IAAI,KAAK,SAAS,IAAI,EAAE,CAAC,EAAE,IAAI,MAAM,CAAC,CAAC;QACnE,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,CACzB,IAAI,IAAI,CACV,CAAA;AACH,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAC,QAAuB;IACvD,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,IAAI,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,2BAA2B,CAAC,EAAE,CAAC;YAC/C,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAC1B,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,IAAI,KAAK,EAAE,CAAC,IAAI;gBAClB,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC;gBAC1B,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CACpC,CAAA;YACD,IAAI
,MAAM,EAAE,CAAC;gBACX,OAAO,EAAE,CAAC,IAAI,CAAA;YAChB,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAA;AAClB,CAAC;AAED,SAAS,uBAAuB,CAC9B,QAAuB,EACvB,WAAwB;IAExB,MAAM,KAAK,GAAG,mBAAmB,CAC/B,QAAQ,EACR,WAAW,KAAK,SAAS;QACvB,CAAC,CAAC,sCAAsC;QACxC,CAAC,CAAC,4BAA4B,CACjC,CAAA;IACD,IAAI,CAAC,KAAK;QAAE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,GAAG,EAAE,IAAI,EAAE,CAAA;IAE7C,MAAM,QAAQ,GAAG,mBAAmB,CAClC,QAAQ,EACR,UAAU,EACV,KAAK,CAAC,IAAI,EACV,KAAK,CAAC,EAAE,CACT,CAAA;IACD,IAAI,CAAC,QAAQ;QAAE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,GAAG,EAAE,IAAI,EAAE,CAAA;IAEhD,MAAM,GAAG,GAAG,mBAAmB,CAC7B,QAAQ,EACR,qBAAqB,EACrB,QAAQ,CAAC,IAAI,EACb,QAAQ,CAAC,EAAE,CACZ,CAAA;IACD,IAAI,CAAC,GAAG;QAAE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,GAAG,EAAE,IAAI,EAAE,CAAA;IAE3C,OAAO;QACL,KAAK,EAAE;YACL,IAAI,EAAE,QAAQ,CAAC,IAAI;YACnB,EAAE,EAAE,QAAQ,CAAC,EAAE;SAChB;QACD,GAAG,EAAE;YACH,IAAI,EAAE,GAAG,CAAC,IAAI;YACd,EAAE,EAAE,GAAG,CAAC,EAAE;SACX;KACF,CAAA;AACH,CAAC;AAED,SAAS,oBAAoB,CAC3B,KAAmC,EACnC,GAAiC,EACjC,QAAuB;IAEvB,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE;QAC5B,IAAI,EAAE,CAAC,IAAI,GAAG,KAAK,CAAC,IAAI,IAAI,EAAE,CAAC,IAAI,GAAG,GAAG,CAAC,IAAI;YAAE,OAAO,KAAK,CAAA;QAC5D,IAAI,EAAE,CAAC,IAAI,KAAK,KAAK,CAAC,IAAI,IAAI,EAAE,CAAC,EAAE,GAAG,KAAK,CAAC,EAAE;YAAE,OAAO,KAAK,CAAA;QAC5D,IAAI,EAAE,CAAC,IAAI,KAAK,GAAG,CAAC,IAAI,IAAI,EAAE,CAAC,EAAE,IAAI,GAAG,CAAC,EAAE;YAAE,OAAO,KAAK,CAAA;QACzD,OAAO,IAAI,CAAA;IACb,CAAC,CAAC,CAAA;AACJ,CAAC;AAED,MAAM,UAAU,iBAAiB,CAC/B,QAAuB,EACvB,WAAwB,EACxB,aAAqB,EACrB,MAAe;IAEf,MAAM,IAAI,GAAG,kBAAkB,CAAC,aAAa,CAAC,CAAA;IAC9C,MAAM,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,uBAAuB,CAAC,QAAQ,EAAE,WAAW,CAAC,CAAA;IACrE,IAAI,CAAC,KAAK,IAAI,CAAC,GAAG,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CAAC,4CAA4C,CAAC,CAAA;IAC/D,CAAC;IAED,MAAM,EAAE,KAAK,EAAE,CACb,mCAAmC,KAAK,CAAC,IAAI,MAAM,KAAK,CAAC,EAAE,cAAc,GAAG,CAAC,IAAI,MAAM,GAAG,CAAC,EAAE,EAAE,CAChG,CAAA;IAED,MAAM,eAAe,GAAG,oBAAoB,CAAC,KAAK,EAAE,GAAG,EAAE,QAAQ,CAAC,CAAA;IAClE,MAAM,IAAI,GAAG,kBAAkB,CAAC,eAAe,CAAC,CAAA;IAEhD,IAAI,cAAc,GAAkB,IAAI,CAAA;IACxC,IAAI,QAAQ,GAAG,EAAE,CAAA;IACjB,MAAM,IAAI,GAAa,EAAE,C
AAA;IACzB,MAAM,YAAY,GAAkB,EAAE,CAAA;IAEtC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAE,CAAA;QACpB,MAAM,SAAS,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAA;QAE/B,MAAM,IAAI,GAAG,SAAS,CAAC,IAAI,IAAI,QAAQ,CAAA;QACvC,IAAI,SAAS,CAAC,IAAI,EAAE,CAAC;YACnB,QAAQ,GAAG,SAAS,CAAC,IAAI,CAAA;QAC3B,CAAC;QAED,IAAI,SAAS,CAAC,OAAO,KAAK,IAAI,IAAI,SAAS,CAAC,UAAU,KAAK,IAAI,EAAE,CAAC;YAChE,IAAI,CAAC,IAAI,EAAE,CAAC;gBACV,MAAM,IAAI,KAAK,CACb,mDAAmD,CAAC,GAAG,CAAC,EAAE,CAC3D,CAAA;YACH,CAAC;YACD,IAAI,cAAc,KAAK,IAAI,EAAE,CAAC;gBAC5B,MAAM,IAAI,KAAK,CACb,mDAAmD,CAAC,GAAG,CAAC,EAAE,CAC3D,CAAA;YACH,CAAC;YAED,MAAM,mBAAmB,GAAG,YAAY,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;YACjE,MAAM,eAAe,GAAG,mBAAmB;gBACzC,CAAC,CAAC,mBAAmB,CAAC,OAAO;gBAC7B,CAAC,CAAC,cAAc,CAAA;YAElB,MAAM,OAAO,GACX,SAAS,CAAC,OAAO;gBACjB,eAAe,GAAG,CAAC,SAAS,CAAC,OAAO,IAAI,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,UAAU,IAAI,CAAC,CAAC,CAAA;YAE1E,IAAI,SAAS,CAAC,OAAO,KAAK,IAAI,IAAI,OAAO,KAAK,SAAS,CAAC,OAAO,EAAE,CAAC;gBAChE,MAAM,IAAI,KAAK,CACb,2BAA2B,CAAC,GAAG,CAAC,gBAAgB,OAAO,WAAW,SAAS,CAAC,OAAO,EAAE,CACtF,CAAA;YACH,CAAC;YAED,MAAM,OAAO,GAAG,CAAC,GAAG,IAAI,EAAE,SAAS,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;YACnE,IAAI,CAAC,MAAM,GAAG,CAAC,CAAA;YAEf,YAAY,CAAC,IAAI,CAAC;gBAChB,IAAI,EAAE,sBAAsB,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC;gBACnE,OAAO;gBACP,OAAO,EAAE,SAAS,CAAC,OAAO;gBAC1B,UAAU,EAAE,SAAS,CAAC,UAAU;gBAChC,OAAO;aACR,CAAC,CAAA;QACJ,CAAC;aAAM,CAAC;YACN,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,CAAC;gBACvB,MAAM,IAAI,KAAK,CACb,oDAAoD,CAAC,GAAG,CAAC,EAAE,CAC5D,CAAA;YACH,CAAC;YAED,IAAI,SAAS,CAAC,OAAO,KAAK,aAAa,EAAE,CAAC;gBACxC,IAAI,SAAS,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;oBAC/B,MAAM,IAAI,KAAK,CACb,qDAAqD,CAAC,GAAG,CAAC,EAAE,CAC7D,CAAA;gBACH,CAAC;gBACD,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC5B,MAAM,IAAI,KAAK,CACb,uDAAuD,CAAC,GAAG,CAAC,EAAE,CAC/D,CAAA;gBACH,CAAC;gBACD,MAAM,EAAE,KAAK,EAAE,CAAC,6BAA6B,SAAS,CAAC,OAAO,EAAE,CAAC,CAAA;gBACjE,cAAc,GAAG,SAA
S,CAAC,OAAO,CAAA;YACpC,CAAC;iBAAM,CAAC;gBACN,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAA;YACrC,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,YAAY,CAAA;AACrB,CAAC"}
@@ -0,0 +1,8 @@
1
+ import { TextElement } from "./types.js";
2
/**
 * Inputs needed to run the external text-extraction script.
 *
 * `extractTextElements` launches `pythonPath` with `scriptPath` and `pdfPath`
 * as its two command-line arguments, in that order.
 */
export interface PdfMinerOptions {
    /** Path of the PDF to read; passed to the script as its argument. */
    pdfPath: string;
    /** Path of the extraction script handed to the interpreter. */
    scriptPath: string;
    /** Executable used to run the script. */
    pythonPath: string;
}
7
/**
 * Runs the extraction script described by `options` and resolves with the
 * value parsed from the JSON it prints on stdout.
 */
export declare function extractTextElements(options: PdfMinerOptions): Promise<TextElement[]>;
8
+ //# sourceMappingURL=pdfminer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdfminer.d.ts","sourceRoot":"","sources":["../../src/lib/pdfminer.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,WAAW,EAAE,MAAM,YAAY,CAAA;AAIxC,MAAM,WAAW,eAAe;IAC9B,OAAO,EAAE,MAAM,CAAA;IACf,UAAU,EAAE,MAAM,CAAA;IAClB,UAAU,EAAE,MAAM,CAAA;CACnB;AAED,wBAAsB,mBAAmB,CAAC,OAAO,EAAE,eAAe,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC,CAO1F"}
@@ -0,0 +1,11 @@
1
+ import { execFile } from "node:child_process";
2
+ import { promisify } from "node:util";
3
// Promise-returning form of child_process.execFile, so it can be awaited.
const execFileAsync = promisify(execFile);
4
/**
 * Run the extraction script on a PDF and return the text elements it reports.
 *
 * Launches `options.pythonPath` with `options.scriptPath` and
 * `options.pdfPath` as arguments, captures stdout as UTF-8 and parses it as
 * JSON.
 *
 * @param options - Interpreter path, script path and PDF path.
 * @returns The array parsed from the script's stdout.
 * @throws The rejection from `execFile` when the process cannot be started,
 *   exits with a failure, or prints more than `maxBuffer` bytes.
 * @throws {SyntaxError} When stdout is not valid JSON.
 * @throws {TypeError} When stdout is valid JSON but not an array.
 */
export async function extractTextElements(options) {
    const { stdout } = await execFileAsync(options.pythonPath, [options.scriptPath, options.pdfPath], {
        encoding: "utf-8",
        // Cap captured stdout at 10 MiB.
        maxBuffer: 10 * 1024 * 1024,
    });
    const parsed = JSON.parse(stdout);
    // The script is an external process: fail here with a clear message
    // instead of letting a non-array value break callers that iterate it.
    if (!Array.isArray(parsed)) {
        throw new TypeError(`Expected a JSON array of text elements from ${options.scriptPath}`);
    }
    return parsed;
}
11
+ //# sourceMappingURL=pdfminer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdfminer.js","sourceRoot":"","sources":["../../src/lib/pdfminer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAA;AAC7C,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAA;AAGrC,MAAM,aAAa,GAAG,SAAS,CAAC,QAAQ,CAAC,CAAA;AAQzC,MAAM,CAAC,KAAK,UAAU,mBAAmB,CAAC,OAAwB;IAChE,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,aAAa,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC,OAAO,CAAC,UAAU,EAAE,OAAO,CAAC,OAAO,CAAC,EAAE;QAChG,QAAQ,EAAE,OAAO;QACjB,SAAS,EAAE,EAAE,GAAG,IAAI,GAAG,IAAI;KAC5B,CAAC,CAAA;IAEF,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,CAAkB,CAAA;AAC5C,CAAC"}
@@ -0,0 +1,28 @@
1
/** One transaction extracted from a statement. */
export interface Transaction {
    /** Transaction date as a string; the format is not fixed by this declaration. */
    date: string;
    /** Descriptive text for the transaction. */
    details: string;
    /** Deposit amount, or `null` (presumably when the row has no deposit; confirm against the extractor). */
    deposit: number | null;
    /** Withdrawal amount, or `null` (presumably when the row has no withdrawal; confirm against the extractor). */
    withdrawal: number | null;
    /** Balance recorded for this transaction. */
    balance: number;
}
8
/**
 * A piece of text together with the page it is on and its bounding box.
 *
 * The bundled pdfminer script emits one element per non-empty text line, with
 * 1-based page numbers and coordinates rounded to two decimals, measured from
 * the top-left corner of the page. Other producers may differ; verify before
 * relying on this.
 */
export interface TextElement {
    /** Page number of the element. */
    page: number;
    /** Text content of the element. */
    text: string;
    /** Left edge of the bounding box. */
    x0: number;
    /** Top edge of the bounding box (in the top-left-origin convention above). */
    y0: number;
    /** Right edge of the bounding box. */
    x1: number;
    /** Bottom edge of the bounding box (in the top-left-origin convention above). */
    y1: number;
}
16
/** The account kinds this library distinguishes. */
export type AccountType = "savings" | "current";
17
/** Result of processing one statement: its date, its account type and its transactions. */
export interface StatementResult {
    /** Statement date as a string; the format is not fixed by this declaration. */
    statementDate: string;
    /** Account type associated with the statement. */
    accountType: AccountType;
    /** Transactions extracted from the statement. */
    transactions: Transaction[];
}
22
/**
 * Optional logging hooks, one per severity level.
 *
 * Every method is optional, so an object implementing any subset (including
 * none) satisfies this interface.
 */
export interface Logger {
    /** Receives debug-level messages. */
    debug?: (message: string) => void;
    /** Receives info-level messages. */
    info?: (message: string) => void;
    /** Receives warning-level messages. */
    warn?: (message: string) => void;
    /** Receives error-level messages. */
    error?: (message: string) => void;
}
28
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,CAAA;IACZ,OAAO,EAAE,MAAM,CAAA;IACf,OAAO,EAAE,MAAM,GAAG,IAAI,CAAA;IACtB,UAAU,EAAE,MAAM,GAAG,IAAI,CAAA;IACzB,OAAO,EAAE,MAAM,CAAA;CAChB;AAED,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,CAAA;IACZ,IAAI,EAAE,MAAM,CAAA;IACZ,EAAE,EAAE,MAAM,CAAA;IACV,EAAE,EAAE,MAAM,CAAA;IACV,EAAE,EAAE,MAAM,CAAA;IACV,EAAE,EAAE,MAAM,CAAA;CACX;AAED,MAAM,MAAM,WAAW,GAAG,SAAS,GAAG,SAAS,CAAA;AAE/C,MAAM,WAAW,eAAe;IAC9B,aAAa,EAAE,MAAM,CAAA;IACrB,WAAW,EAAE,WAAW,CAAA;IACxB,YAAY,EAAE,WAAW,EAAE,CAAA;CAC5B;AAED,MAAM,WAAW,MAAM;IACrB,KAAK,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAA;IACjC,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAA;IAChC,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAA;IAChC,KAAK,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAA;CAClC"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":""}
package/package.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "name": "@krizpoon/hangseng-statement-extractor",
3
+ "version": "0.1.0",
4
+ "description": "Extract transaction history from Hang Seng Bank statement PDFs",
5
+ "type": "module",
6
+ "main": "./dist/index.js",
7
+ "types": "./dist/index.d.ts",
8
+ "exports": {
9
+ ".": {
10
+ "types": "./dist/index.d.ts",
11
+ "import": "./dist/index.js"
12
+ }
13
+ },
14
+ "bin": {
15
+ "hangseng-statement-extractor": "dist/cli.js"
16
+ },
17
+ "files": [
18
+ "dist",
19
+ "scripts"
20
+ ],
21
+ "scripts": {
22
+ "build": "tsc",
23
+ "lint": "tsc -p tsconfig.json --noEmit"
24
+ },
25
+ "keywords": [
26
+ "hang-seng",
27
+ "pdf",
28
+ "statement",
29
+ "banking",
30
+ "parser"
31
+ ],
32
+ "license": "MIT",
33
+ "dependencies": {
34
+ "commander": "^14.0.2"
35
+ },
36
+ "devDependencies": {
37
+ "@types/node": "^25.0.9",
38
+ "typescript": "^5.9.3"
39
+ }
40
+ }
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Extract text with positions from a PDF file using pdfminer.
4
+ Outputs JSON with text elements and their bounding boxes.
5
+ """
6
+
7
+ import sys
8
+ import json
9
+ from pdfminer.high_level import extract_pages
10
+ from pdfminer.layout import LTTextBoxHorizontal, LTTextLineHorizontal, LTChar, LAParams
11
+
12
+
13
def extract_text_with_positions(pdf_path: str) -> list[dict]:
    """Return every non-empty horizontal text line of a PDF with its bounding box.

    Each result is a dict with keys ``page`` (1-based), ``text`` (stripped),
    and ``x0``/``y0``/``x1``/``y1`` rounded to two decimals. Vertical
    coordinates are flipped so they are measured from the top of the page.
    The list is ordered by page, then top edge, then left edge.
    """
    layout_params = LAParams(
        line_margin=0.3,  # Smaller margin to keep lines separate
        word_margin=0.1,
        char_margin=2.0,
        boxes_flow=0.5,
    )

    def horizontal_lines(layout):
        # Yield the horizontal text lines nested inside horizontal text boxes.
        for box in layout:
            if not isinstance(box, LTTextBoxHorizontal):
                continue
            for candidate in box:
                if isinstance(candidate, LTTextLineHorizontal):
                    yield candidate

    records = []
    pages = extract_pages(pdf_path, laparams=layout_params)
    for number, layout in enumerate(pages, start=1):
        height = layout.height
        for text_line in horizontal_lines(layout):
            content = text_line.get_text().strip()
            if not content:
                continue
            # pdfminer measures y upward from the bottom of the page; subtracting
            # from the page height gives top-left-origin coordinates instead.
            records.append({
                "page": number,
                "text": content,
                "x0": round(text_line.x0, 2),
                "y0": round(height - text_line.y1, 2),
                "x1": round(text_line.x1, 2),
                "y1": round(height - text_line.y0, 2),
            })

    # Reading order: page first, then top to bottom, then left to right.
    records.sort(key=lambda item: (item["page"], item["y0"], item["x0"]))
    return records
48
+
49
+
50
def main():
    """Command-line entry point.

    Expects the PDF path as the first argument. Prints the extracted elements
    to stdout as indented JSON. On a missing argument or any extraction error,
    writes a message to stderr and exits with status 1.
    """
    arguments = sys.argv[1:]
    if not arguments:
        print("Usage: pdf-extract-positions.py <pdf_path>", file=sys.stderr)
        sys.exit(1)

    target = arguments[0]

    try:
        extracted = extract_text_with_positions(target)
        serialized = json.dumps(extracted, ensure_ascii=False, indent=2)
        print(serialized)
    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)
63
+
64
+
65
# Run the command-line entry point only when executed as a script,
# not when this file is imported as a module.
if __name__ == "__main__":
    main()