@krizpoon/hangseng-statement-extractor 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +92 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +96 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/csv.d.ts +3 -0
- package/dist/lib/csv.d.ts.map +1 -0
- package/dist/lib/csv.js +23 -0
- package/dist/lib/csv.js.map +1 -0
- package/dist/lib/extractor.d.ts +8 -0
- package/dist/lib/extractor.d.ts.map +1 -0
- package/dist/lib/extractor.js +14 -0
- package/dist/lib/extractor.js.map +1 -0
- package/dist/lib/parser.d.ts +5 -0
- package/dist/lib/parser.d.ts.map +1 -0
- package/dist/lib/parser.js +248 -0
- package/dist/lib/parser.js.map +1 -0
- package/dist/lib/pdfminer.d.ts +8 -0
- package/dist/lib/pdfminer.d.ts.map +1 -0
- package/dist/lib/pdfminer.js +11 -0
- package/dist/lib/pdfminer.js.map +1 -0
- package/dist/lib/types.d.ts +28 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +2 -0
- package/dist/lib/types.js.map +1 -0
- package/package.json +40 -0
- package/scripts/pdf-extract-positions.py +66 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# Hang Seng Statement Extractor
|
|
2
|
+
|
|
3
|
+
Extract transaction history from Hang Seng Bank statement PDFs using a small TypeScript library and a CLI. The core extractor is importable for use in other projects, and the CLI is a thin wrapper around it.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
- Extracts transactions from statement PDFs with position-aware parsing.
|
|
7
|
+
- Works as both a library and a CLI.
|
|
8
|
+
- Outputs CSV with statement date and account type included.
|
|
9
|
+
|
|
10
|
+
## Requirements
|
|
11
|
+
- Node.js 18+ (ESM).
|
|
12
|
+
- Python 3 with `pdfminer.six` installed:
|
|
13
|
+
- `pip install pdfminer.six`
|
|
14
|
+
|
|
15
|
+
## Install
|
|
16
|
+
```bash
|
|
17
|
+
npm install @krizpoon/hangseng-statement-extractor
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## CLI
|
|
21
|
+
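Only `--input` is required. No default locations are assumed for the input or output; if `--python` is omitted, the CLI looks for `python3` and then `python` on the `PATH`.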
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
hangseng-statement-extractor \
|
|
25
|
+
--input /path/to/statement.pdf \
|
|
26
|
+
--account savings \
|
|
27
|
+
--output /path/to/output.csv
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Process a directory of PDFs:
|
|
31
|
+
```bash
|
|
32
|
+
hangseng-statement-extractor \
|
|
33
|
+
--input /path/to/statements/ \
|
|
34
|
+
--account savings \
|
|
35
|
+
--output /path/to/output.csv
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Notes:
|
|
39
|
+
- `--output` is optional; if omitted, CSV is printed to stdout.
|
|
40
|
+
- `--account` accepts `savings` or `current` (default: `savings`).
|
|
41
|
+
|
|
42
|
+
## Library Usage
|
|
43
|
+
```ts
|
|
44
|
+
import { extractStatement, formatTransactionsAsCsv } from "@krizpoon/hangseng-statement-extractor"
|
|
45
|
+
import { fileURLToPath } from "node:url"
|
|
46
|
+
import { dirname, join } from "node:path"
|
|
47
|
+
|
|
48
|
+
const here = dirname(fileURLToPath(import.meta.url))
|
|
49
|
+
const pdfPath = "/path/to/statement.pdf"
|
|
50
|
+
const pythonPath = "/usr/bin/python3"
|
|
51
|
+
const scriptPath = join(
|
|
52
|
+
here,
|
|
53
|
+
"../node_modules/@krizpoon/hangseng-statement-extractor/scripts/pdf-extract-positions.py",
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
const result = await extractStatement({
|
|
57
|
+
pdfPath,
|
|
58
|
+
scriptPath,
|
|
59
|
+
pythonPath,
|
|
60
|
+
accountType: "savings",
|
|
61
|
+
})
|
|
62
|
+
|
|
63
|
+
const csv = formatTransactionsAsCsv([result])
|
|
64
|
+
console.log(csv)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### API
|
|
68
|
+
- `extractStatement(options)`
|
|
69
|
+
- `options.pdfPath` (string)
|
|
70
|
+
- `options.scriptPath` (string)
|
|
71
|
+
- `options.pythonPath` (string)
|
|
72
|
+
- `options.accountType` ("savings" | "current")
|
|
73
|
+
- `options.logger` (optional)
|
|
74
|
+
- Returns a Promise that resolves to `{ statementDate, accountType, transactions }`
|
|
75
|
+
|
|
76
|
+
- `formatTransactionsAsCsv(results)`
|
|
77
|
+
- Accepts an array of statement results and returns CSV text.
|
|
78
|
+
|
|
79
|
+
## CSV Output
|
|
80
|
+
Columns:
|
|
81
|
+
```
|
|
82
|
+
Date,Details,Deposit,Withdrawal,Balance,Account Type,Statement Date
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Development
|
|
86
|
+
```bash
|
|
87
|
+
npm install
|
|
88
|
+
npm run build
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## License
|
|
92
|
+
MIT
|
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":""}
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { Option, program } from "commander";
|
|
3
|
+
import { execFile } from "node:child_process";
|
|
4
|
+
import * as fs from "node:fs";
|
|
5
|
+
import * as path from "node:path";
|
|
6
|
+
import { fileURLToPath } from "node:url";
|
|
7
|
+
import { promisify } from "node:util";
|
|
8
|
+
import { extractStatement, formatTransactionsAsCsv } from "./index.js";
|
|
9
|
+
const execFileAsync = promisify(execFile);
|
|
10
|
+
const SCRIPT_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "..", "scripts", "pdf-extract-positions.py");
|
|
11
|
+
/**
 * Locate a Python interpreter by command name.
 *
 * Each candidate (`python3`, then `python`) is probed by running it with
 * `--version`; the first one that runs successfully wins.
 *
 * @returns {Promise<string>} The command name of the first working interpreter.
 * @throws {Error} When none of the candidates can be executed.
 */
async function findPython() {
    const commands = ["python3", "python"];
    let index = 0;
    while (index < commands.length) {
        const command = commands[index];
        // Any failure (missing binary, non-zero exit) just means "try the next one".
        const usable = await execFileAsync(command, ["--version"]).then(() => true, () => false);
        if (usable) {
            return command;
        }
        index += 1;
    }
    throw new Error("Python not found. Install Python or specify its path with --python.");
}
|
|
23
|
+
/**
 * Expand the CLI input into the list of PDF files to process.
 *
 * @param {string} inputPath Path to a single file or to a directory.
 * @returns {string[]} For a directory: the sorted paths of its direct entries
 *   whose name ends in ".pdf" (case-insensitive). Otherwise: the input path
 *   itself, unchanged and unchecked.
 * @throws Whatever `fs.statSync` throws when the path does not exist.
 */
function collectPdfFiles(inputPath) {
    if (!fs.statSync(inputPath).isDirectory()) {
        return [inputPath];
    }
    const pdfPaths = [];
    for (const entry of fs.readdirSync(inputPath)) {
        if (entry.toLowerCase().endsWith(".pdf")) {
            pdfPaths.push(path.join(inputPath, entry));
        }
    }
    pdfPaths.sort();
    return pdfPaths;
}
|
|
34
|
+
// CLI definition: parse the options, run the extractor over every input PDF,
// and emit one combined CSV to a file or to stdout.
program
    .name("hangseng-statement-extractor")
    .description("Extract transaction history from Hang Seng Bank statement PDFs")
    .requiredOption("-i, --input <path>", "Input PDF file or directory containing PDFs")
    .option("--python <path>", "Path to python executable")
    .option("-o, --output <path>", "Output CSV file (prints to stdout if omitted)")
    .addOption(new Option("-a, --account <type>", "Account type to extract: savings or current")
        .choices(["savings", "current"])
        .default("savings"))
    .option("-v, --verbose", "Enable verbose logging")
    .action(async (options) => {
        const inputPath = path.resolve(options.input);
        const scriptPath = SCRIPT_PATH;
        let pythonPath;
        if (options.python) {
            // Resolve real paths, but leave a bare command name (e.g. "python3.11")
            // alone so it is still looked up on PATH; resolving it against the
            // working directory would point at a file that does not exist.
            const resolved = path.resolve(options.python);
            const looksLikePath = options.python.includes("/") || options.python.includes(path.sep);
            pythonPath = looksLikePath || fs.existsSync(resolved) ? resolved : options.python;
        }
        else {
            pythonPath = await findPython();
        }
        const logger = options.verbose ? console : undefined;
        const pdfFiles = collectPdfFiles(inputPath);
        if (pdfFiles.length === 0) {
            console.error("No PDF files found");
            process.exitCode = 1;
            return;
        }
        const results = [];
        let errorCount = 0;
        for (const pdfFile of pdfFiles) {
            if (options.verbose) {
                console.error(`Processing: ${path.basename(pdfFile)}`);
            }
            try {
                const result = await extractStatement({
                    pdfPath: pdfFile,
                    scriptPath,
                    pythonPath,
                    accountType: options.account,
                    logger,
                });
                results.push(result);
            }
            catch (error) {
                // One bad statement must not abort the batch; remember the failure
                // so the exit code still reports it.
                errorCount += 1;
                console.error(`Error processing ${pdfFile}:`, error);
            }
        }
        if (results.length === 0) {
            console.error("No transactions extracted");
            process.exitCode = 1;
            return;
        }
        const csv = formatTransactionsAsCsv(results);
        if (options.output) {
            fs.writeFileSync(options.output, `${csv}\n`);
            // Status goes to stderr so stdout stays reserved for CSV data.
            console.error(`Output written to: ${options.output}`);
        }
        else {
            process.stdout.write(`${csv}\n`);
        }
        if (errorCount > 0) {
            process.exitCode = 1;
        }
    });
// The action handler is async, so use parseAsync and report failures that happen
// outside the per-file try/catch (missing input path, Python not found, ...).
program.parseAsync().catch((error) => {
    console.error(error instanceof Error ? error.message : error);
    process.exitCode = 1;
});
|
|
96
|
+
//# sourceMappingURL=cli.js.map
|
package/dist/cli.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AAEA,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,WAAW,CAAA;AAC3C,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAA;AAC7C,OAAO,KAAK,EAAE,MAAM,SAAS,CAAA;AAC7B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAA;AACjC,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAA;AACxC,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAA;AACrC,OAAO,EAAE,gBAAgB,EAAE,uBAAuB,EAAE,MAAM,YAAY,CAAA;AAGtE,MAAM,aAAa,GAAG,SAAS,CAAC,QAAQ,CAAC,CAAA;AAEzC,MAAM,WAAW,GAAG,IAAI,CAAC,OAAO,CAC9B,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAC5C,IAAI,EACJ,SAAS,EACT,0BAA0B,CAC3B,CAAA;AAED,KAAK,UAAU,UAAU;IACvB,KAAK,MAAM,SAAS,IAAI,CAAC,SAAS,EAAE,QAAQ,CAAC,EAAE,CAAC;QAC9C,IAAI,CAAC;YACH,MAAM,aAAa,CAAC,SAAS,EAAE,CAAC,WAAW,CAAC,CAAC,CAAA;YAC7C,OAAO,SAAS,CAAA;QAClB,CAAC;QAAC,MAAM,CAAC;YACP,sBAAsB;QACxB,CAAC;IACH,CAAC;IACD,MAAM,IAAI,KAAK,CACb,qEAAqE,CACtE,CAAA;AACH,CAAC;AAED,SAAS,eAAe,CAAC,SAAiB;IACxC,MAAM,IAAI,GAAG,EAAE,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAA;IACnC,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;QACvB,OAAO,EAAE;aACN,WAAW,CAAC,SAAS,CAAC;aACtB,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;aACrD,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,CAAC;aACzC,IAAI,EAAE,CAAA;IACX,CAAC;IAED,OAAO,CAAC,SAAS,CAAC,CAAA;AACpB,CAAC;AAED,OAAO;KACJ,IAAI,CAAC,8BAA8B,CAAC;KACpC,WAAW,CAAC,gEAAgE,CAAC;KAC7E,cAAc,CACb,oBAAoB,EACpB,6CAA6C,CAC9C;KACA,MAAM,CAAC,iBAAiB,EAAE,2BAA2B,CAAC;KACtD,MAAM,CACL,qBAAqB,EACrB,+CAA+C,CAChD;KACA,SAAS,CACR,IAAI,MAAM,CACR,sBAAsB,EACtB,6CAA6C,CAC9C;KACE,OAAO,CAAC,CAAC,SAAS,EAAE,SAAS,CAAC,CAAC;KAC/B,OAAO,CAAC,SAAS,CAAC,CACtB;KACA,MAAM,CAAC,eAAe,EAAE,wBAAwB,CAAC;KACjD,MAAM,CACL,KAAK,EAAE,OAMN,EAAE,EAAE;IACH,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,CAAA;IAC7C,MAAM,UAAU,GAAG,WAAW,CAAA;IAC9B,MAAM,UAAU,GAAG,OAAO,CAAC,MAAM;QAC/B,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC;QAC9B,CAAC,CAAC,MAAM,UAAU,EAAE,CAAA;IACtB,MAAM,MAAM,GAAuB,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS,CAAA;I
AExE,MAAM,QAAQ,GAAG,eAAe,CAAC,SAAS,CAAC,CAAA;IAC3C,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,OAAO,CAAC,KAAK,CAAC,oBAAoB,CAAC,CAAA;QACnC,OAAO,CAAC,QAAQ,GAAG,CAAC,CAAA;QACpB,OAAM;IACR,CAAC;IAED,MAAM,OAAO,GAAsB,EAAE,CAAA;IACrC,IAAI,UAAU,GAAG,CAAC,CAAA;IAElB,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;YACpB,OAAO,CAAC,KAAK,CAAC,eAAe,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC,CAAA;QACxD,CAAC;QAED,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,gBAAgB,CAAC;gBACpC,OAAO,EAAE,OAAO;gBAChB,UAAU;gBACV,UAAU;gBACV,WAAW,EAAE,OAAO,CAAC,OAAO;gBAC5B,MAAM;aACP,CAAC,CAAA;YACF,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QACtB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,UAAU,IAAI,CAAC,CAAA;YACf,OAAO,CAAC,KAAK,CAAC,oBAAoB,OAAO,GAAG,EAAE,KAAK,CAAC,CAAA;QACtD,CAAC;IACH,CAAC;IAED,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO,CAAC,KAAK,CAAC,2BAA2B,CAAC,CAAA;QAC1C,OAAO,CAAC,QAAQ,GAAG,CAAC,CAAA;QACpB,OAAM;IACR,CAAC;IAED,MAAM,GAAG,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAA;IAC5C,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;QACnB,EAAE,CAAC,aAAa,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,GAAG,IAAI,CAAC,CAAA;QAC5C,OAAO,CAAC,KAAK,CAAC,sBAAsB,OAAO,CAAC,MAAM,EAAE,CAAC,CAAA;IACvD,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,IAAI,CAAC,CAAA;IAClC,CAAC;IAED,IAAI,UAAU,GAAG,CAAC,EAAE,CAAC;QACnB,OAAO,CAAC,QAAQ,GAAG,CAAC,CAAA;IACtB,CAAC;AACH,CAAC,CACF,CAAA;AAEH,OAAO,CAAC,KAAK,EAAE,CAAA"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export { extractStatement } from "./lib/extractor.js";
|
|
2
|
+
export { formatTransactionsAsCsv } from "./lib/csv.js";
|
|
3
|
+
export type { AccountType, Logger, StatementResult, TextElement, Transaction, } from "./lib/types.js";
|
|
4
|
+
export type { ExtractStatementOptions } from "./lib/extractor.js";
|
|
5
|
+
export type { PdfMinerOptions } from "./lib/pdfminer.js";
|
|
6
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAA;AACrD,OAAO,EAAE,uBAAuB,EAAE,MAAM,cAAc,CAAA;AACtD,YAAY,EACV,WAAW,EACX,MAAM,EACN,eAAe,EACf,WAAW,EACX,WAAW,GACZ,MAAM,gBAAgB,CAAA;AACvB,YAAY,EAAE,uBAAuB,EAAE,MAAM,oBAAoB,CAAA;AACjE,YAAY,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAA"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAA;AACrD,OAAO,EAAE,uBAAuB,EAAE,MAAM,cAAc,CAAA"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"csv.d.ts","sourceRoot":"","sources":["../../src/lib/csv.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,MAAM,YAAY,CAAA;AAS5C,wBAAgB,uBAAuB,CAAC,OAAO,EAAE,eAAe,EAAE,GAAG,MAAM,CAuB1E"}
|
package/dist/lib/csv.js
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
 * Quote a value for use as a single CSV field.
 *
 * Fields containing a double quote, comma, carriage return or line feed are
 * wrapped in double quotes, with embedded quotes doubled (RFC 4180). Any other
 * value is returned unchanged.
 *
 * @param {string} value Raw field text.
 * @returns {string} The field, quoted only when necessary.
 */
function escapeCsvField(value) {
    // "\r" is included so a bare carriage return cannot split a record.
    if (/[",\r\n]/.test(value)) {
        return `"${value.replace(/"/g, '""')}"`;
    }
    return value;
}
|
|
7
|
+
/**
 * Render statement results as CSV text.
 *
 * Produces a header line followed by one line per transaction, in input
 * order. Amounts are printed with two decimals; a null deposit or withdrawal
 * becomes an empty cell. Only the details and statement date cells are
 * CSV-escaped. Lines are joined with "\n" and there is no trailing newline.
 *
 * @param results Array of `{ statementDate, accountType, transactions }`.
 * @returns {string} The CSV document.
 */
export function formatTransactionsAsCsv(results) {
    const optionalAmount = (amount) => (amount !== null ? amount.toFixed(2) : "");
    const lines = ["Date,Details,Deposit,Withdrawal,Balance,Account Type,Statement Date"];
    results.forEach((statement) => {
        statement.transactions.forEach((tx) => {
            const depositCell = optionalAmount(tx.deposit);
            const withdrawalCell = optionalAmount(tx.withdrawal);
            const balanceCell = tx.balance.toFixed(2);
            const detailsCell = escapeCsvField(tx.details);
            const statementDateCell = escapeCsvField(statement.statementDate);
            lines.push(`${tx.date},${detailsCell},${depositCell},${withdrawalCell},${balanceCell},${statement.accountType},${statementDateCell}`);
        });
    });
    return lines.join("\n");
}
|
|
23
|
+
//# sourceMappingURL=csv.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"csv.js","sourceRoot":"","sources":["../../src/lib/csv.ts"],"names":[],"mappings":"AAEA,SAAS,cAAc,CAAC,KAAa;IACnC,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QACvE,OAAO,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,GAAG,CAAA;IAC1C,CAAC;IACD,OAAO,KAAK,CAAA;AACd,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,OAA0B;IAChE,MAAM,MAAM,GACV,qEAAqE,CAAA;IACvE,MAAM,IAAI,GAAa,EAAE,CAAA;IAEzB,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,KAAK,MAAM,WAAW,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;YAC9C,MAAM,OAAO,GACX,WAAW,CAAC,OAAO,KAAK,IAAI,CAAC,CAAC,CAAC,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAA;YACpE,MAAM,UAAU,GACd,WAAW,CAAC,UAAU,KAAK,IAAI,CAAC,CAAC,CAAC,WAAW,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAA;YAC1E,MAAM,OAAO,GAAG,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA;YAC9C,MAAM,OAAO,GAAG,cAAc,CAAC,WAAW,CAAC,OAAO,CAAC,CAAA;YACnD,MAAM,WAAW,GAAG,MAAM,CAAC,WAAW,CAAA;YACtC,MAAM,aAAa,GAAG,cAAc,CAAC,MAAM,CAAC,aAAa,CAAC,CAAA;YAE1D,IAAI,CAAC,IAAI,CACP,GAAG,WAAW,CAAC,IAAI,IAAI,OAAO,IAAI,OAAO,IAAI,UAAU,IAAI,OAAO,IAAI,WAAW,IAAI,aAAa,EAAE,CACrG,CAAA;QACH,CAAC;IACH,CAAC;IAED,OAAO,CAAC,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;AACrC,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import { PdfMinerOptions } from "./pdfminer.js";
|
|
2
|
+
import { AccountType, Logger, StatementResult } from "./types.js";
|
|
3
|
+
export interface ExtractStatementOptions extends PdfMinerOptions {
|
|
4
|
+
accountType: AccountType;
|
|
5
|
+
logger?: Logger;
|
|
6
|
+
}
|
|
7
|
+
export declare function extractStatement(options: ExtractStatementOptions): Promise<StatementResult>;
|
|
8
|
+
//# sourceMappingURL=extractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extractor.d.ts","sourceRoot":"","sources":["../../src/lib/extractor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAuB,eAAe,EAAE,MAAM,eAAe,CAAA;AACpE,OAAO,EAAE,WAAW,EAAE,MAAM,EAAE,eAAe,EAAE,MAAM,YAAY,CAAA;AAGjE,MAAM,WAAW,uBAAwB,SAAQ,eAAe;IAC9D,WAAW,EAAE,WAAW,CAAA;IACxB,MAAM,CAAC,EAAE,MAAM,CAAA;CAChB;AAED,wBAAsB,gBAAgB,CACpC,OAAO,EAAE,uBAAuB,GAC/B,OAAO,CAAC,eAAe,CAAC,CAgB1B"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { extractTextElements } from "./pdfminer.js";
|
|
2
|
+
import { findStatementDate, formatDateIso, parseTransactions } from "./parser.js";
|
|
3
|
+
/**
 * Extract one statement: read the PDF's text elements, find the statement
 * date, and parse the transactions for the requested account type.
 *
 * @param options Passed whole to `extractTextElements`; `accountType` and the
 *   optional `logger` are also forwarded to `parseTransactions`.
 * @returns {Promise<{statementDate: string, accountType: string, transactions: Array}>}
 *   `statementDate` is the ISO form produced by `formatDateIso`.
 */
export async function extractStatement(options) {
    const textElements = await extractTextElements(options);
    const { accountType, logger } = options;
    // The parser needs the date as printed on the statement; callers get ISO.
    const printedDate = findStatementDate(textElements);
    return {
        statementDate: formatDateIso(printedDate),
        accountType,
        transactions: parseTransactions(textElements, accountType, printedDate, logger),
    };
}
|
|
14
|
+
//# sourceMappingURL=extractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extractor.js","sourceRoot":"","sources":["../../src/lib/extractor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAmB,MAAM,eAAe,CAAA;AAEpE,OAAO,EAAE,iBAAiB,EAAE,aAAa,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAA;AAOjF,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,OAAgC;IAEhC,MAAM,QAAQ,GAAG,MAAM,mBAAmB,CAAC,OAAO,CAAC,CAAA;IACnD,MAAM,gBAAgB,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAA;IACpD,MAAM,aAAa,GAAG,aAAa,CAAC,gBAAgB,CAAC,CAAA;IACrD,MAAM,YAAY,GAAG,iBAAiB,CACpC,QAAQ,EACR,OAAO,CAAC,WAAW,EACnB,gBAAgB,EAChB,OAAO,CAAC,MAAM,CACf,CAAA;IAED,OAAO;QACL,aAAa;QACb,WAAW,EAAE,OAAO,CAAC,WAAW;QAChC,YAAY;KACb,CAAA;AACH,CAAC"}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import { AccountType, Logger, TextElement, Transaction } from "./types.js";
|
|
2
|
+
export declare function formatDateIso(dateStr: string): string;
|
|
3
|
+
export declare function findStatementDate(elements: TextElement[]): string;
|
|
4
|
+
export declare function parseTransactions(elements: TextElement[], accountType: AccountType, statementDate: string, logger?: Logger): Transaction[];
|
|
5
|
+
//# sourceMappingURL=parser.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"parser.d.ts","sourceRoot":"","sources":["../../src/lib/parser.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,YAAY,CAAA;AAmD1E,wBAAgB,aAAa,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAKrD;AAsHD,wBAAgB,iBAAiB,CAAC,QAAQ,EAAE,WAAW,EAAE,GAAG,MAAM,CAgBjE;AAuDD,wBAAgB,iBAAiB,CAC/B,QAAQ,EAAE,WAAW,EAAE,EACvB,WAAW,EAAE,WAAW,EACxB,aAAa,EAAE,MAAM,EACrB,MAAM,CAAC,EAAE,MAAM,GACd,WAAW,EAAE,CA4Ff"}
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
// Three-letter English month abbreviation -> month number (1-12).
const MONTHS = Object.fromEntries(
    ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
        .map((abbreviation, index) => [abbreviation, index + 1]),
);
|
|
5
|
+
/**
 * Parse a statement date written as "D Mon YYYY" (e.g. "31 Dec 2024").
 *
 * @param {string} statementDate Date text; must match the pattern exactly.
 * @returns {{year: number, month: number, day: number}} Numeric date parts.
 * @throws {Error} If the text does not match the pattern, or the month
 *   abbreviation is not a key of MONTHS.
 */
function parseStatementDate(statementDate) {
    const found = statementDate.match(/^(\d{1,2})\s+(\w{3})\s+(\d{4})$/);
    if (found === null) {
        throw new Error(`Invalid statement date format: "${statementDate}"`);
    }
    const [, dayText, monthName, yearText] = found;
    const month = MONTHS[monthName];
    if (!month) {
        throw new Error(`Unknown month in statement date: "${monthName}"`);
    }
    return {
        year: Number.parseInt(yearText, 10),
        month,
        day: Number.parseInt(dayText, 10),
    };
}
|
|
18
|
+
/**
 * Turn a year-less transaction date ("D Mon") into an ISO "YYYY-MM-DD" string.
 *
 * The transaction is first assumed to fall in the statement's year. If that
 * would place it after the statement date, it is moved to the previous year
 * (a statement that crosses a year boundary).
 *
 * @param {string} shortDate Transaction date such as "28 Dec".
 * @param {number} stmtYear Statement year.
 * @param {number} stmtMonth Statement month (1-12).
 * @param {number} stmtDay Statement day of month.
 * @returns {string} The resolved date as "YYYY-MM-DD".
 * @throws {Error} If the text does not match "D Mon" or the month is unknown.
 */
function resolveTransactionDate(shortDate, stmtYear, stmtMonth, stmtDay) {
    const found = shortDate.match(/^(\d{1,2})\s+(\w{3})$/);
    if (found === null) {
        throw new Error(`Invalid transaction date format: "${shortDate}"`);
    }
    const [, dayText, monthName] = found;
    const txDay = Number.parseInt(dayText, 10);
    const txMonth = MONTHS[monthName];
    if (!txMonth) {
        throw new Error(`Unknown month in transaction date: "${monthName}"`);
    }
    // Encode dates as YYYYMMDD integers so they compare chronologically.
    const asOrdinal = (year, month, day) => year * 10000 + month * 100 + day;
    const fallsAfterStatement = asOrdinal(stmtYear, txMonth, txDay) > asOrdinal(stmtYear, stmtMonth, stmtDay);
    const txYear = fallsAfterStatement ? stmtYear - 1 : stmtYear;
    const twoDigits = (value) => String(value).padStart(2, "0");
    return `${txYear}-${twoDigits(txMonth)}-${twoDigits(txDay)}`;
}
|
|
40
|
+
/**
 * Convert a "D Mon YYYY" date into ISO "YYYY-MM-DD" form.
 *
 * @param {string} dateStr Date text accepted by `parseStatementDate`.
 * @returns {string} The same date with zero-padded month and day.
 * @throws {Error} Propagated from `parseStatementDate` for unparseable input.
 */
export function formatDateIso(dateStr) {
    const parsed = parseStatementDate(dateStr);
    const twoDigits = (value) => String(value).padStart(2, "0");
    return [parsed.year, twoDigits(parsed.month), twoDigits(parsed.day)].join("-");
}
|
|
46
|
+
// Horizontal band of each table column, matched against an element's x0.
// `detectColumn` treats each band as half-open: min inclusive, max exclusive.
const COLUMNS = Object.fromEntries(
    [
        ["DATE", 55, 105],
        ["DETAILS", 105, 345],
        ["DEPOSIT", 345, 420],
        ["WITHDRAWAL", 420, 490],
        ["BALANCE", 490, 550],
    ].map(([name, min, max]) => [name, { min, max }]),
);
|
|
53
|
+
/**
 * Parse an amount such as "1,234.56" into a number.
 *
 * Commas are removed and surrounding whitespace is trimmed before parsing
 * with `Number.parseFloat`.
 *
 * @param {string} str Cell text.
 * @returns {number|null} The parsed value, or null for empty, blank or
 *   non-numeric input.
 */
function parseNumber(str) {
    const isBlank = !str || str.trim() === "";
    if (isBlank) {
        return null;
    }
    const withoutSeparators = str.split(",").join("").trim();
    const value = Number.parseFloat(withoutSeparators);
    if (Number.isNaN(value)) {
        return null;
    }
    return value;
}
|
|
60
|
+
/**
 * Map an element's left edge to the table column it belongs to.
 *
 * @param {number} x0 Left x coordinate of the element.
 * @returns {"date"|"details"|"deposit"|"withdrawal"|"balance"|null} The column
 *   whose [min, max) band contains x0, or null when it lies outside all bands.
 */
function detectColumn(x0) {
    const bands = [
        ["date", COLUMNS.DATE],
        ["details", COLUMNS.DETAILS],
        ["deposit", COLUMNS.DEPOSIT],
        ["withdrawal", COLUMNS.WITHDRAWAL],
        ["balance", COLUMNS.BALANCE],
    ];
    for (const [name, band] of bands) {
        if (x0 >= band.min && x0 < band.max) {
            return name;
        }
    }
    return null;
}
|
|
73
|
+
/**
 * Cluster elements into visual rows by their y0 coordinate.
 *
 * An element joins the first existing row whose key is within 2 units of its
 * y0; otherwise it starts a new row keyed by its own y0. Page numbers are not
 * considered here, so callers should pass elements from a single page.
 *
 * @param elements Elements with a numeric `y0`.
 * @returns {Map<number, Array>} Row key (y0 of the row's first element) to the
 *   row's elements, in insertion order.
 */
function groupIntoRows(elements) {
    const rows = new Map();
    const tolerance = 2;
    for (const el of elements) {
        let rowY = null;
        for (const y of rows.keys()) {
            if (Math.abs(y - el.y0) <= tolerance) {
                rowY = y;
                break;
            }
        }
        if (rowY !== null) {
            rows.get(rowY).push(el);
        }
        else {
            rows.set(el.y0, [el]);
        }
    }
    return rows;
}
/**
 * Group elements into rows ordered by page, then by ascending y within a page.
 *
 * Elements are grouped per page: the section filter and the text lookups in
 * this module compare y0 only between elements on the same page, so two
 * elements at the same height on different pages belong to different rows.
 * Elements without a `page` are treated as page 0.
 *
 * @param elements Elements with `page` and `y0`.
 * @returns {Array<Array>} One array of elements per row.
 */
function groupIntoRowsArray(elements) {
    const byPage = new Map();
    for (const el of elements) {
        const page = el.page ?? 0;
        const bucket = byPage.get(page);
        if (bucket) {
            bucket.push(el);
        }
        else {
            byPage.set(page, [el]);
        }
    }
    const pages = Array.from(byPage.keys()).sort((a, b) => a - b);
    const ordered = [];
    for (const page of pages) {
        const rowMap = groupIntoRows(byPage.get(page));
        const sortedYs = Array.from(rowMap.keys()).sort((a, b) => a - b);
        for (const y of sortedYs) {
            ordered.push(rowMap.get(y));
        }
    }
    return ordered;
}
|
|
98
|
+
/**
 * Assign the elements of one visual row to table columns.
 *
 * The input array is sorted in place by x0. Each element is then placed in the
 * column returned by `detectColumn`: date and details keep the trimmed text,
 * while deposit, withdrawal and balance go through `parseNumber`. A later
 * element in the same column overwrites an earlier one; elements outside every
 * column are ignored.
 *
 * @param elements Elements of a single row (reordered by this call).
 * @returns {{date: ?string, details: ?string, deposit: ?number, withdrawal: ?number, balance: ?number}}
 *   Columns that received no element stay null.
 */
function parseRow(elements) {
    const parsed = {
        date: null,
        details: null,
        deposit: null,
        withdrawal: null,
        balance: null,
    };
    elements.sort((left, right) => left.x0 - right.x0);
    elements.forEach((element) => {
        const column = detectColumn(element.x0);
        const content = element.text.trim();
        if (column === "date" || column === "details") {
            parsed[column] = content;
        }
        else if (column === "deposit" || column === "withdrawal" || column === "balance") {
            parsed[column] = parseNumber(content);
        }
    });
    return parsed;
}
|
|
130
|
+
/**
 * Find the first element, in array order, that contains `text` and sits at or
 * after a starting position.
 *
 * An element qualifies when it is on a later page than `startPage`, or on
 * `startPage` itself with y0 >= `startY`.
 *
 * @param elements Elements with `page`, `y0` and `text`.
 * @param {string} text Substring to look for.
 * @param {number} [startPage=0] Page to start from.
 * @param {number} [startY=0] Minimum y0 on the start page.
 * @returns The matching element, or null when there is none.
 */
function findElementWithText(elements, text, startPage = 0, startY = 0) {
    const match = elements.find((candidate) => {
        const onLaterPage = candidate.page > startPage;
        const onStartPageAtOrBelow = candidate.page === startPage && candidate.y0 >= startY;
        if (!(onLaterPage || onStartPageAtOrBelow)) {
            return false;
        }
        return candidate.text.includes(text);
    });
    return match || null;
}
|
|
134
|
+
/**
 * Find the statement date printed next to the "Statement Date" label.
 *
 * Returns the text of the first element that looks like "D Mon YYYY" and has,
 * on the same page and within 3 units of its y0, an element whose text
 * contains "Statement Date".
 *
 * @param elements Elements with `page`, `y0` and `text`.
 * @returns {string} The date text as printed, or "Unknown" if none qualifies.
 */
export function findStatementDate(elements) {
    const datePattern = /^\d{1,2}\s+\w{3}\s+\d{4}$/;
    const hasLabelOnSameLine = (dateElement) => elements.some((other) => other.page === dateElement.page &&
        Math.abs(other.y0 - dateElement.y0) < 3 &&
        other.text.includes("Statement Date"));
    const labelled = elements.find((candidate) => candidate.text.match(datePattern) && hasLabelOnSameLine(candidate));
    return labelled ? labelled.text : "Unknown";
}
|
|
147
|
+
/**
 * Locate the transaction table for the requested account type.
 *
 * Three markers are searched in sequence, each starting from the bottom (y1)
 * of the previous one: the account heading, then the "DR=Debit" text, then
 * "Transaction Summary". The section runs from the bottom of the "DR=Debit"
 * element to the top (y0) of the "Transaction Summary" element.
 *
 * @param elements Elements with `page`, `y0`, `y1` and `text`.
 * @param {string} accountType "savings" selects the savings heading; any other
 *   value selects the current-account heading.
 * @returns {{start: ?{page: number, y0: number}, end: ?{page: number, y0: number}}}
 *   Both fields are null when any of the three markers is missing.
 */
function findTransactionsSection(elements, accountType) {
    const notFound = () => ({ start: null, end: null });
    const headingText = accountType === "savings"
        ? "Integrated Account Statement Savings"
        : "Integrated Account Current";
    const heading = findElementWithText(elements, headingText);
    if (!heading) {
        return notFound();
    }
    const legend = findElementWithText(elements, "DR=Debit", heading.page, heading.y1);
    if (!legend) {
        return notFound();
    }
    const summary = findElementWithText(elements, "Transaction Summary", legend.page, legend.y1);
    if (!summary) {
        return notFound();
    }
    const start = { page: legend.page, y0: legend.y1 };
    const end = { page: summary.page, y0: summary.y0 };
    return { start, end };
}
|
|
170
|
+
/**
 * Select the elements that lie inside a section, which may span pages.
 *
 * An element is kept when its page is within [start.page, end.page], it is
 * not above `start.y0` on the start page, and it is strictly above `end.y0`
 * on the end page (start inclusive, end exclusive).
 *
 * @param {{page: number, y0: number}} start Section start.
 * @param {{page: number, y0: number}} end Section end.
 * @param elements Elements with `page` and `y0`.
 * @returns {Array} The elements inside the section, in their original order.
 */
function getElementsInSection(start, end, elements) {
    return elements.filter((el) => {
        const outsidePageRange = el.page < start.page || el.page > end.page;
        const beforeStart = el.page === start.page && el.y0 < start.y0;
        const atOrPastEnd = el.page === end.page && el.y0 >= end.y0;
        return !(outsidePageRange || beforeStart || atOrPastEnd);
    });
}
|
|
181
|
+
/**
 * Parse the transaction table of one account from a statement's text elements.
 *
 * Rows inside the transactions section are read in order:
 * - A "B/F BALANCE" row supplies the opening balance and must come first.
 * - A row with a deposit or withdrawal becomes a transaction. Its date falls
 *   back to the most recent dated row, and its details are prefixed with any
 *   detail-only rows that preceded it.
 * - Any other row must carry details, which are held for the next transaction.
 *
 * Every transaction's balance is recomputed from the previous balance. When
 * the row also prints a balance, the two must agree to the cent; otherwise the
 * statement was misread and an error is thrown. The printed balance is used
 * when present, the computed one otherwise.
 *
 * @param elements All text elements of the PDF.
 * @param {string} accountType "savings" or "current".
 * @param {string} statementDate Statement date as printed ("D Mon YYYY").
 * @param [logger] Optional logger; only its `debug` method is used.
 * @returns {Array<{date: string, details: string, deposit: ?number, withdrawal: ?number, balance: number}>}
 *   Transactions in statement order, with ISO dates.
 * @throws {Error} If the section cannot be found, a transaction has no date or
 *   precedes the opening balance, the opening balance row is malformed or
 *   misplaced, a non-transaction row has no details, or a printed balance
 *   disagrees with the computed one.
 */
export function parseTransactions(elements, accountType, statementDate, logger) {
    const stmt = parseStatementDate(statementDate);
    const { start, end } = findTransactionsSection(elements, accountType);
    if (!start || !end) {
        throw new Error("Could not find transactions section in PDF");
    }
    logger?.debug?.(`Transaction section: start page ${start.page} y ${start.y0}, end page ${end.page} y ${end.y0}`);
    const sectionElements = getElementsInSection(start, end, elements);
    const rows = groupIntoRowsArray(sectionElements);
    // Amounts are compared and accumulated in whole cents so floating-point
    // noise (0.1 + 0.2) neither triggers a false mismatch nor drifts over time.
    const toCents = (amount) => Math.round(amount * 100);
    let openingBalance = null;
    let lastDate = "";
    const desc = [];
    const transactions = [];
    for (let i = 0; i < rows.length; i++) {
        const row = rows[i];
        const parsedRow = parseRow(row);
        const date = parsedRow.date ?? lastDate;
        if (parsedRow.date) {
            lastDate = parsedRow.date;
        }
        if (parsedRow.deposit !== null || parsedRow.withdrawal !== null) {
            if (!date) {
                throw new Error(`Could not determine date for transaction at row ${i + 1}`);
            }
            if (openingBalance === null) {
                throw new Error(`Transaction found before opening balance at row ${i + 1}`);
            }
            const previousTransaction = transactions[transactions.length - 1];
            const previousBalance = previousTransaction
                ? previousTransaction.balance
                : openingBalance;
            const calculatedBalance = toCents(previousBalance + (parsedRow.deposit ?? 0) - (parsedRow.withdrawal ?? 0)) / 100;
            // Verify the printed balance against the running balance. The check
            // must use the independently calculated value, not the printed one.
            if (parsedRow.balance !== null &&
                toCents(parsedRow.balance) !== toCents(calculatedBalance)) {
                throw new Error(`Balance mismatch at row ${i + 1}: calculated ${calculatedBalance}, found ${parsedRow.balance}`);
            }
            const balance = parsedRow.balance ?? calculatedBalance;
            const details = [...desc, parsedRow.details ?? ""].join(" ").trim();
            // Pending description lines belong to this transaction only.
            desc.length = 0;
            transactions.push({
                date: resolveTransactionDate(date, stmt.year, stmt.month, stmt.day),
                details,
                deposit: parsedRow.deposit,
                withdrawal: parsedRow.withdrawal,
                balance,
            });
        }
        else {
            if (!parsedRow.details) {
                throw new Error(`Found non-transaction row without details at row ${i + 1}`);
            }
            if (parsedRow.details === "B/F BALANCE") {
                if (parsedRow.balance === null) {
                    throw new Error(`Opening balance row missing balance amount at row ${i + 1}`);
                }
                if (transactions.length > 0) {
                    throw new Error(`Opening balance row found after transactions at row ${i + 1}`);
                }
                logger?.debug?.(`Opening balance detected: ${parsedRow.balance}`);
                openingBalance = parsedRow.balance;
            }
            else {
                // A detail-only line: keep it for the next transaction row.
                desc.push(parsedRow.details.trim());
            }
        }
    }
    return transactions;
}
|
|
248
|
+
//# sourceMappingURL=parser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"parser.js","sourceRoot":"","sources":["../../src/lib/parser.ts"],"names":[],"mappings":"AAEA,MAAM,MAAM,GAA2B;IACrC,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC;IAC9C,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,EAAE,EAAE,GAAG,EAAE,EAAE,EAAE,GAAG,EAAE,EAAE;CAClD,CAAA;AAED,SAAS,kBAAkB,CAAC,aAAqB;IAC/C,MAAM,KAAK,GAAG,aAAa,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAA;IACpE,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,IAAI,KAAK,CAAC,mCAAmC,aAAa,GAAG,CAAC,CAAA;IACtE,CAAC;IACD,MAAM,GAAG,GAAG,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAE,EAAE,EAAE,CAAC,CAAA;IAC1C,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,CAAA;IAC/B,MAAM,IAAI,GAAG,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAE,EAAE,EAAE,CAAC,CAAA;IAC3C,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,IAAI,KAAK,CAAC,qCAAqC,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,CAAA;IACnE,CAAC;IACD,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,CAAA;AAC7B,CAAC;AAED,SAAS,sBAAsB,CAC7B,SAAiB,EACjB,QAAgB,EAChB,SAAiB,EACjB,OAAe;IAEf,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAA;IACtD,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,IAAI,KAAK,CAAC,qCAAqC,SAAS,GAAG,CAAC,CAAA;IACpE,CAAC;IACD,MAAM,KAAK,GAAG,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAE,EAAE,EAAE,CAAC,CAAA;IAC5C,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,CAAA;IACjC,IAAI,CAAC,OAAO,EAAE,CAAC;QACb,MAAM,IAAI,KAAK,CAAC,uCAAuC,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,CAAA;IACrE,CAAC;IAED,sFAAsF;IACtF,oFAAoF;IACpF,IAAI,MAAM,GAAG,QAAQ,CAAA;IACrB,MAAM,SAAS,GAAG,MAAM,GAAG,KAAK,GAAG,OAAO,GAAG,GAAG,GAAG,KAAK,CAAA;IACxD,MAAM,WAAW,GAAG,QAAQ,GAAG,KAAK,GAAG,SAAS,GAAG,GAAG,GAAG,OAAO,CAAA;IAChE,IAAI,SAAS,GAAG,WAAW,EAAE,CAAC;QAC5B,MAAM,IAAI,CAAC,CAAA;IACb,CAAC;IAED,MAAM,EAAE,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,CAAA;IAC3C,MAAM,EAAE,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,CAAA;IACzC,OAAO,GAAG,MAAM,IAAI,EAAE,IAAI,EAAE,EAAE,CAAA;AAChC,CAAC;AAED,MAAM,UAAU,aAAa,CAAC,OAAe;IAC3C,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,kBAAkB
,CAAC,OAAO,CAAC,CAAA;IACxD,MAAM,EAAE,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,CAAA;IACzC,MAAM,EAAE,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,CAAA;IACvC,OAAO,GAAG,IAAI,IAAI,EAAE,IAAI,EAAE,EAAE,CAAA;AAC9B,CAAC;AAED,MAAM,OAAO,GAAG;IACd,IAAI,EAAE,EAAE,GAAG,EAAE,EAAE,EAAE,GAAG,EAAE,GAAG,EAAE;IAC3B,OAAO,EAAE,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE;IAC/B,OAAO,EAAE,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE;IAC/B,UAAU,EAAE,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE;IAClC,OAAO,EAAE,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE;CAChC,CAAA;AAUD,SAAS,WAAW,CAAC,GAAW;IAC9B,IAAI,CAAC,GAAG,IAAI,GAAG,CAAC,IAAI,EAAE,KAAK,EAAE;QAAE,OAAO,IAAI,CAAA;IAC1C,MAAM,OAAO,GAAG,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;IAC5C,MAAM,GAAG,GAAG,MAAM,CAAC,UAAU,CAAC,OAAO,CAAC,CAAA;IACtC,OAAO,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAA;AACvC,CAAC;AAED,SAAS,YAAY,CACnB,EAAU;IAEV,IAAI,EAAE,IAAI,OAAO,CAAC,IAAI,CAAC,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC,GAAG;QAAE,OAAO,MAAM,CAAA;IAClE,IAAI,EAAE,IAAI,OAAO,CAAC,OAAO,CAAC,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG;QAAE,OAAO,SAAS,CAAA;IAC3E,IAAI,EAAE,IAAI,OAAO,CAAC,OAAO,CAAC,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG;QAAE,OAAO,SAAS,CAAA;IAC3E,IAAI,EAAE,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,UAAU,CAAC,GAAG;QAC7D,OAAO,YAAY,CAAA;IACrB,IAAI,EAAE,IAAI,OAAO,CAAC,OAAO,CAAC,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG;QAAE,OAAO,SAAS,CAAA;IAC3E,OAAO,IAAI,CAAA;AACb,CAAC;AAED,SAAS,aAAa,CAAC,QAAuB;IAC5C,MAAM,IAAI,GAAG,IAAI,GAAG,EAAyB,CAAA;IAC7C,MAAM,SAAS,GAAG,CAAC,CAAA;IAEnB,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,IAAI,IAAI,GAAkB,IAAI,CAAA;QAC9B,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;YAC5B,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,IAAI,SAAS,EAAE,CAAC;gBACrC,IAAI,GAAG,CAAC,CAAA;gBACR,MAAK;YACP,CAAC;QACH,CAAC;QAED,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;YAClB,IAAI,CAAC,GAAG,CAAC,IAAI,CAAE,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;QAC1B,CAAC;aAAM,CAAC;YACN,IAAI
,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC,CAAA;QACvB,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAA;AACb,CAAC;AAED,SAAS,kBAAkB,CAAC,QAAuB;IACjD,MAAM,MAAM,GAAG,aAAa,CAAC,QAAQ,CAAC,CAAA;IACtC,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;IAChE,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAE,CAAC,CAAA;AAC5C,CAAC;AAED,SAAS,QAAQ,CAAC,QAAuB;IACvC,MAAM,GAAG,GAAY;QACnB,IAAI,EAAE,IAAI;QACV,OAAO,EAAE,IAAI;QACb,OAAO,EAAE,IAAI;QACb,UAAU,EAAE,IAAI;QAChB,OAAO,EAAE,IAAI;KACd,CAAA;IAED,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,EAAE,CAAC,CAAA;IAEpC,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,MAAM,GAAG,GAAG,YAAY,CAAC,EAAE,CAAC,EAAE,CAAC,CAAA;QAC/B,MAAM,IAAI,GAAG,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAA;QAE3B,QAAQ,GAAG,EAAE,CAAC;YACZ,KAAK,MAAM;gBACT,GAAG,CAAC,IAAI,GAAG,IAAI,CAAA;gBACf,MAAK;YACP,KAAK,SAAS;gBACZ,GAAG,CAAC,OAAO,GAAG,IAAI,CAAA;gBAClB,MAAK;YACP,KAAK,SAAS;gBACZ,GAAG,CAAC,OAAO,GAAG,WAAW,CAAC,IAAI,CAAC,CAAA;gBAC/B,MAAK;YACP,KAAK,YAAY;gBACf,GAAG,CAAC,UAAU,GAAG,WAAW,CAAC,IAAI,CAAC,CAAA;gBAClC,MAAK;YACP,KAAK,SAAS;gBACZ,GAAG,CAAC,OAAO,GAAG,WAAW,CAAC,IAAI,CAAC,CAAA;gBAC/B,MAAK;QACT,CAAC;IACH,CAAC;IAED,OAAO,GAAG,CAAA;AACZ,CAAC;AAED,SAAS,mBAAmB,CAC1B,QAAuB,EACvB,IAAY,EACZ,SAAS,GAAG,CAAC,EACb,MAAM,GAAG,CAAC;IAEV,OAAO,CACL,QAAQ,CAAC,IAAI,CACX,CAAC,EAAE,EAAE,EAAE,CACL,CAAC,EAAE,CAAC,IAAI,GAAG,SAAS,IAAI,CAAC,EAAE,CAAC,IAAI,KAAK,SAAS,IAAI,EAAE,CAAC,EAAE,IAAI,MAAM,CAAC,CAAC;QACnE,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,CACzB,IAAI,IAAI,CACV,CAAA;AACH,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAC,QAAuB;IACvD,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,IAAI,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,2BAA2B,CAAC,EAAE,CAAC;YAC/C,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAC1B,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,IAAI,KAAK,EAAE,CAAC,IAAI;gBAClB,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC;gBAC1B,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CACpC,CAAA;YACD,IAAI,M
AAM,EAAE,CAAC;gBACX,OAAO,EAAE,CAAC,IAAI,CAAA;YAChB,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAA;AAClB,CAAC;AAED,SAAS,uBAAuB,CAC9B,QAAuB,EACvB,WAAwB;IAExB,MAAM,KAAK,GAAG,mBAAmB,CAC/B,QAAQ,EACR,WAAW,KAAK,SAAS;QACvB,CAAC,CAAC,sCAAsC;QACxC,CAAC,CAAC,4BAA4B,CACjC,CAAA;IACD,IAAI,CAAC,KAAK;QAAE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,GAAG,EAAE,IAAI,EAAE,CAAA;IAE7C,MAAM,QAAQ,GAAG,mBAAmB,CAClC,QAAQ,EACR,UAAU,EACV,KAAK,CAAC,IAAI,EACV,KAAK,CAAC,EAAE,CACT,CAAA;IACD,IAAI,CAAC,QAAQ;QAAE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,GAAG,EAAE,IAAI,EAAE,CAAA;IAEhD,MAAM,GAAG,GAAG,mBAAmB,CAC7B,QAAQ,EACR,qBAAqB,EACrB,QAAQ,CAAC,IAAI,EACb,QAAQ,CAAC,EAAE,CACZ,CAAA;IACD,IAAI,CAAC,GAAG;QAAE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,GAAG,EAAE,IAAI,EAAE,CAAA;IAE3C,OAAO;QACL,KAAK,EAAE;YACL,IAAI,EAAE,QAAQ,CAAC,IAAI;YACnB,EAAE,EAAE,QAAQ,CAAC,EAAE;SAChB;QACD,GAAG,EAAE;YACH,IAAI,EAAE,GAAG,CAAC,IAAI;YACd,EAAE,EAAE,GAAG,CAAC,EAAE;SACX;KACF,CAAA;AACH,CAAC;AAED,SAAS,oBAAoB,CAC3B,KAAmC,EACnC,GAAiC,EACjC,QAAuB;IAEvB,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE;QAC5B,IAAI,EAAE,CAAC,IAAI,GAAG,KAAK,CAAC,IAAI,IAAI,EAAE,CAAC,IAAI,GAAG,GAAG,CAAC,IAAI;YAAE,OAAO,KAAK,CAAA;QAC5D,IAAI,EAAE,CAAC,IAAI,KAAK,KAAK,CAAC,IAAI,IAAI,EAAE,CAAC,EAAE,GAAG,KAAK,CAAC,EAAE;YAAE,OAAO,KAAK,CAAA;QAC5D,IAAI,EAAE,CAAC,IAAI,KAAK,GAAG,CAAC,IAAI,IAAI,EAAE,CAAC,EAAE,IAAI,GAAG,CAAC,EAAE;YAAE,OAAO,KAAK,CAAA;QACzD,OAAO,IAAI,CAAA;IACb,CAAC,CAAC,CAAA;AACJ,CAAC;AAED,MAAM,UAAU,iBAAiB,CAC/B,QAAuB,EACvB,WAAwB,EACxB,aAAqB,EACrB,MAAe;IAEf,MAAM,IAAI,GAAG,kBAAkB,CAAC,aAAa,CAAC,CAAA;IAC9C,MAAM,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,uBAAuB,CAAC,QAAQ,EAAE,WAAW,CAAC,CAAA;IACrE,IAAI,CAAC,KAAK,IAAI,CAAC,GAAG,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CAAC,4CAA4C,CAAC,CAAA;IAC/D,CAAC;IAED,MAAM,EAAE,KAAK,EAAE,CACb,mCAAmC,KAAK,CAAC,IAAI,MAAM,KAAK,CAAC,EAAE,cAAc,GAAG,CAAC,IAAI,MAAM,GAAG,CAAC,EAAE,EAAE,CAChG,CAAA;IAED,MAAM,eAAe,GAAG,oBAAoB,CAAC,KAAK,EAAE,GAAG,EAAE,QAAQ,CAAC,CAAA;IAClE,MAAM,IAAI,GAAG,kBAAkB,CAAC,eAAe,CAAC,CAAA;IAEhD,IAAI,cAAc,GAAkB,IAAI,CAAA;IACxC,IAAI,QAAQ,GAAG,EAAE,CAAA;IACjB,MAAM,IAAI,GAAa,EAAE,CAA
A;IACzB,MAAM,YAAY,GAAkB,EAAE,CAAA;IAEtC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAE,CAAA;QACpB,MAAM,SAAS,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAA;QAE/B,MAAM,IAAI,GAAG,SAAS,CAAC,IAAI,IAAI,QAAQ,CAAA;QACvC,IAAI,SAAS,CAAC,IAAI,EAAE,CAAC;YACnB,QAAQ,GAAG,SAAS,CAAC,IAAI,CAAA;QAC3B,CAAC;QAED,IAAI,SAAS,CAAC,OAAO,KAAK,IAAI,IAAI,SAAS,CAAC,UAAU,KAAK,IAAI,EAAE,CAAC;YAChE,IAAI,CAAC,IAAI,EAAE,CAAC;gBACV,MAAM,IAAI,KAAK,CACb,mDAAmD,CAAC,GAAG,CAAC,EAAE,CAC3D,CAAA;YACH,CAAC;YACD,IAAI,cAAc,KAAK,IAAI,EAAE,CAAC;gBAC5B,MAAM,IAAI,KAAK,CACb,mDAAmD,CAAC,GAAG,CAAC,EAAE,CAC3D,CAAA;YACH,CAAC;YAED,MAAM,mBAAmB,GAAG,YAAY,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;YACjE,MAAM,eAAe,GAAG,mBAAmB;gBACzC,CAAC,CAAC,mBAAmB,CAAC,OAAO;gBAC7B,CAAC,CAAC,cAAc,CAAA;YAElB,MAAM,OAAO,GACX,SAAS,CAAC,OAAO;gBACjB,eAAe,GAAG,CAAC,SAAS,CAAC,OAAO,IAAI,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,UAAU,IAAI,CAAC,CAAC,CAAA;YAE1E,IAAI,SAAS,CAAC,OAAO,KAAK,IAAI,IAAI,OAAO,KAAK,SAAS,CAAC,OAAO,EAAE,CAAC;gBAChE,MAAM,IAAI,KAAK,CACb,2BAA2B,CAAC,GAAG,CAAC,gBAAgB,OAAO,WAAW,SAAS,CAAC,OAAO,EAAE,CACtF,CAAA;YACH,CAAC;YAED,MAAM,OAAO,GAAG,CAAC,GAAG,IAAI,EAAE,SAAS,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;YACnE,IAAI,CAAC,MAAM,GAAG,CAAC,CAAA;YAEf,YAAY,CAAC,IAAI,CAAC;gBAChB,IAAI,EAAE,sBAAsB,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC;gBACnE,OAAO;gBACP,OAAO,EAAE,SAAS,CAAC,OAAO;gBAC1B,UAAU,EAAE,SAAS,CAAC,UAAU;gBAChC,OAAO;aACR,CAAC,CAAA;QACJ,CAAC;aAAM,CAAC;YACN,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,CAAC;gBACvB,MAAM,IAAI,KAAK,CACb,oDAAoD,CAAC,GAAG,CAAC,EAAE,CAC5D,CAAA;YACH,CAAC;YAED,IAAI,SAAS,CAAC,OAAO,KAAK,aAAa,EAAE,CAAC;gBACxC,IAAI,SAAS,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;oBAC/B,MAAM,IAAI,KAAK,CACb,qDAAqD,CAAC,GAAG,CAAC,EAAE,CAC7D,CAAA;gBACH,CAAC;gBACD,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC5B,MAAM,IAAI,KAAK,CACb,uDAAuD,CAAC,GAAG,CAAC,EAAE,CAC/D,CAAA;gBACH,CAAC;gBACD,MAAM,EAAE,KAAK,EAAE,CAAC,6BAA6B,SAAS,CAAC,OAAO,EAAE,CAAC,CAAA;gBACjE,cAAc,GAAG,SAAS,
CAAC,OAAO,CAAA;YACpC,CAAC;iBAAM,CAAC;gBACN,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAA;YACrC,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,YAAY,CAAA;AACrB,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import { TextElement } from "./types.js";
|
|
2
|
+
/** Paths needed by extractTextElements to run the Python extraction script on a PDF. */
export interface PdfMinerOptions {
|
|
3
|
+
pdfPath: string; // PDF file path, passed to the script as its argument
|
|
4
|
+
scriptPath: string; // Python script path, passed to the interpreter as its first argument
|
|
5
|
+
pythonPath: string; // executable that is spawned to run the script
|
|
6
|
+
}
|
|
7
|
+
/** Runs the extraction script described by `options` and resolves with the text elements it reports. */
export declare function extractTextElements(options: PdfMinerOptions): Promise<TextElement[]>;
|
|
8
|
+
//# sourceMappingURL=pdfminer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdfminer.d.ts","sourceRoot":"","sources":["../../src/lib/pdfminer.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,WAAW,EAAE,MAAM,YAAY,CAAA;AAIxC,MAAM,WAAW,eAAe;IAC9B,OAAO,EAAE,MAAM,CAAA;IACf,UAAU,EAAE,MAAM,CAAA;IAClB,UAAU,EAAE,MAAM,CAAA;CACnB;AAED,wBAAsB,mBAAmB,CAAC,OAAO,EAAE,eAAe,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC,CAO1F"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { execFile } from "node:child_process";
|
|
2
|
+
import { promisify } from "node:util";
|
|
3
|
+
// Promise-returning form of child_process.execFile so the call below can be awaited.
const execFileAsync = promisify(execFile);
|
|
4
|
+
/**
 * Run the Python extraction script against a PDF and return its parsed output.
 *
 * The executable at `options.pythonPath` is spawned with `options.scriptPath`
 * and `options.pdfPath` as its two arguments; whatever it prints to stdout is
 * decoded as UTF-8 and parsed as JSON.
 *
 * @param options executable, script and PDF paths
 * @returns the value parsed from the script's stdout (not validated here)
 */
export async function extractTextElements(options) {
    const { pythonPath, scriptPath, pdfPath } = options;
    // Cap on captured output: 10 MiB.
    const outputLimit = 10 * 1024 * 1024;
    const result = await execFileAsync(pythonPath, [scriptPath, pdfPath], {
        encoding: "utf-8",
        maxBuffer: outputLimit,
    });
    return JSON.parse(result.stdout);
}
|
|
11
|
+
//# sourceMappingURL=pdfminer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdfminer.js","sourceRoot":"","sources":["../../src/lib/pdfminer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAA;AAC7C,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAA;AAGrC,MAAM,aAAa,GAAG,SAAS,CAAC,QAAQ,CAAC,CAAA;AAQzC,MAAM,CAAC,KAAK,UAAU,mBAAmB,CAAC,OAAwB;IAChE,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,aAAa,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC,OAAO,CAAC,UAAU,EAAE,OAAO,CAAC,OAAO,CAAC,EAAE;QAChG,QAAQ,EAAE,OAAO;QACjB,SAAS,EAAE,EAAE,GAAG,IAAI,GAAG,IAAI;KAC5B,CAAC,CAAA;IAEF,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,CAAkB,CAAA;AAC5C,CAAC"}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/** One transaction row of a statement. */
export interface Transaction {
|
|
2
|
+
date: string; // transaction date as a string (format is not specified by this declaration)
|
|
3
|
+
details: string; // free-text description of the transaction
|
|
4
|
+
deposit: number | null; // deposit amount; presumably null when the row has none (confirm in parser)
|
|
5
|
+
withdrawal: number | null; // withdrawal amount; presumably null when the row has none (confirm in parser)
|
|
6
|
+
balance: number; // balance value for this row; always present
|
|
7
|
+
}
|
|
8
|
+
/** A piece of text with its bounding box, matching the objects printed by scripts/pdf-extract-positions.py. */
export interface TextElement {
|
|
9
|
+
page: number; // page number; the extraction script emits 1-based values
|
|
10
|
+
text: string; // text content; the extraction script strips surrounding whitespace
|
|
11
|
+
x0: number; // left edge of the bounding box
|
|
12
|
+
y0: number; // top edge; the extraction script measures it down from the top of the page
|
|
13
|
+
x1: number; // right edge of the bounding box
|
|
14
|
+
y1: number; // bottom edge; the extraction script measures it down from the top of the page
|
|
15
|
+
}
|
|
16
|
+
/** The two account kinds this package distinguishes. */
export type AccountType = "savings" | "current";
|
|
17
|
+
/** Result of processing one statement: its date, the account type and the transactions found. */
export interface StatementResult {
|
|
18
|
+
statementDate: string; // statement date as a string (format is not specified by this declaration)
|
|
19
|
+
accountType: AccountType; // which account kind the transactions belong to
|
|
20
|
+
transactions: Transaction[]; // extracted transaction rows
|
|
21
|
+
}
|
|
22
|
+
/** Optional logging callbacks; every level may be omitted, and each receives a single message string. */
export interface Logger {
|
|
23
|
+
debug?: (message: string) => void; // debug-level messages
|
|
24
|
+
info?: (message: string) => void; // info-level messages
|
|
25
|
+
warn?: (message: string) => void; // warning-level messages
|
|
26
|
+
error?: (message: string) => void; // error-level messages
|
|
27
|
+
}
|
|
28
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,CAAA;IACZ,OAAO,EAAE,MAAM,CAAA;IACf,OAAO,EAAE,MAAM,GAAG,IAAI,CAAA;IACtB,UAAU,EAAE,MAAM,GAAG,IAAI,CAAA;IACzB,OAAO,EAAE,MAAM,CAAA;CAChB;AAED,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,CAAA;IACZ,IAAI,EAAE,MAAM,CAAA;IACZ,EAAE,EAAE,MAAM,CAAA;IACV,EAAE,EAAE,MAAM,CAAA;IACV,EAAE,EAAE,MAAM,CAAA;IACV,EAAE,EAAE,MAAM,CAAA;CACX;AAED,MAAM,MAAM,WAAW,GAAG,SAAS,GAAG,SAAS,CAAA;AAE/C,MAAM,WAAW,eAAe;IAC9B,aAAa,EAAE,MAAM,CAAA;IACrB,WAAW,EAAE,WAAW,CAAA;IACxB,YAAY,EAAE,WAAW,EAAE,CAAA;CAC5B;AAED,MAAM,WAAW,MAAM;IACrB,KAAK,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAA;IACjC,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAA;IAChC,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAA;IAChC,KAAK,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAA;CAClC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":""}
|
package/package.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@krizpoon/hangseng-statement-extractor",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Extract transaction history from Hang Seng Bank statement PDFs",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/index.js",
|
|
7
|
+
"types": "./dist/index.d.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": {
|
|
10
|
+
"types": "./dist/index.d.ts",
|
|
11
|
+
"import": "./dist/index.js"
|
|
12
|
+
}
|
|
13
|
+
},
|
|
14
|
+
"bin": {
|
|
15
|
+
"hangseng-statement-extractor": "dist/cli.js"
|
|
16
|
+
},
|
|
17
|
+
"files": [
|
|
18
|
+
"dist",
|
|
19
|
+
"scripts"
|
|
20
|
+
],
|
|
21
|
+
"scripts": {
|
|
22
|
+
"build": "tsc",
|
|
23
|
+
"lint": "tsc -p tsconfig.json --noEmit"
|
|
24
|
+
},
|
|
25
|
+
"keywords": [
|
|
26
|
+
"hang-seng",
|
|
27
|
+
"pdf",
|
|
28
|
+
"statement",
|
|
29
|
+
"banking",
|
|
30
|
+
"parser"
|
|
31
|
+
],
|
|
32
|
+
"license": "MIT",
|
|
33
|
+
"dependencies": {
|
|
34
|
+
"commander": "^14.0.2"
|
|
35
|
+
},
|
|
36
|
+
"devDependencies": {
|
|
37
|
+
"@types/node": "^25.0.9",
|
|
38
|
+
"typescript": "^5.9.3"
|
|
39
|
+
}
|
|
40
|
+
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Extract text with positions from a PDF file using pdfminer.
|
|
4
|
+
Outputs JSON with text elements and their bounding boxes.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import sys
|
|
8
|
+
import json
|
|
9
|
+
from pdfminer.high_level import extract_pages
|
|
10
|
+
from pdfminer.layout import LTTextBoxHorizontal, LTTextLineHorizontal, LTChar, LAParams
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def extract_text_with_positions(pdf_path: str) -> list[dict]:
    """Return every non-empty horizontal text line of a PDF with its bounding box.

    Each entry is a dict with the keys ``page`` (1-based), ``text`` (stripped),
    and ``x0``/``y0``/``x1``/``y1`` rounded to two decimals. Vertical
    coordinates are flipped so that they are measured from the top of the page.
    Entries are ordered by page, then top edge, then left edge.
    """
    layout_params = LAParams(
        line_margin=0.3,  # Smaller margin to keep lines separate
        word_margin=0.1,
        char_margin=2.0,
        boxes_flow=0.5,
    )

    records = []
    pages = extract_pages(pdf_path, laparams=layout_params)
    for page_number, layout in enumerate(pages, start=1):
        height = layout.height

        for box in layout:
            if not isinstance(box, LTTextBoxHorizontal):
                continue
            for text_line in box:
                if not isinstance(text_line, LTTextLineHorizontal):
                    continue
                content = text_line.get_text().strip()
                if not content:
                    continue
                # pdfminer reports y from the bottom-left corner; subtracting
                # from the page height gives a top-left origin, so the line's
                # upper edge (y1) becomes the smaller value y0.
                records.append({
                    "page": page_number,
                    "text": content,
                    "x0": round(text_line.x0, 2),
                    "y0": round(height - text_line.y1, 2),
                    "x1": round(text_line.x1, 2),
                    "y1": round(height - text_line.y0, 2),
                })

    # Reading order: page first, then top to bottom, then left to right.
    return sorted(records, key=lambda rec: (rec["page"], rec["y0"], rec["x0"]))
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def main():
    """Command-line entry point: print the text elements of a PDF as JSON.

    Reads the PDF path from ``sys.argv[1]``, extracts its text elements and
    writes them to stdout as an indented JSON array. On a missing argument or
    any extraction/output failure, a message is written to stderr and the
    process exits with status 1.
    """
    if len(sys.argv) < 2:
        print("Usage: pdf-extract-positions.py <pdf_path>", file=sys.stderr)
        sys.exit(1)

    pdf_path = sys.argv[1]

    # The JSON is emitted with ensure_ascii=False and the Node caller decodes
    # stdout as UTF-8, so force UTF-8 here instead of relying on the locale
    # encoding (which would fail or garble non-ASCII text, e.g. on Windows).
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(encoding="utf-8")

    try:
        elements = extract_text_with_positions(pdf_path)
        print(json.dumps(elements, ensure_ascii=False, indent=2))
    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit status.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
|
|
66
|
+
main()
|