einvoice-cli 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +139 -0
- package/bin/cli.js +52 -0
- package/index.js +23 -0
- package/lib/BaseInvoiceService.js +158 -0
- package/lib/ErrorHandler.js +98 -0
- package/lib/Invoice.js +108 -0
- package/lib/InvoiceValidator.js +422 -0
- package/lib/OfdInvoiceExtractor.js +170 -0
- package/lib/PDFTextPositionAnalyzer.js +366 -0
- package/lib/PdfFinancialInvoiceService.js +134 -0
- package/lib/PdfFullElectronicInvoiceService.js +325 -0
- package/lib/PdfInvoiceExtractor.js +124 -0
- package/lib/PdfRegularInvoiceService.js +786 -0
- package/lib/RegexPatterns.js +202 -0
- package/lib/StringUtils.js +70 -0
- package/lib/extractor.js +24 -0
- package/package.json +31 -0
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
/**
 * Regular-expression library used to pull individual fields out of invoice
 * text (modelled on the original Java implementation).
 *
 * Invoice text mixes half-width (ASCII) and full-width punctuation, so every
 * separator class below accepts both forms. Full-width characters are written
 * as \uXXXX escapes so the patterns survive editors or pipelines that
 * normalise Unicode:
 *   \uFF1A full-width colon      \uFF08 / \uFF09 full-width parentheses
 *   \uFF0C full-width comma      \uFFE5 full-width yen sign (\u00A5 is half-width)
 *   \uFF05 full-width percent
 */
class RegexPatterns {
  /**
   * Basic header fields.
   */
  static BASIC_FIELDS = {
    // Machine number: 12 digits after "机器编号:" or "发票代码机器编号:".
    machineNumber: /(?:机器编号[:\uFF1A]|发票代码机器编号[:\uFF1A])(\d{12})/,

    // Invoice code: 12 digits.
    code: /发票代码[:\uFF1A](\d{12})/,

    // Invoice number: 8 to 10 digits.
    number: /发票号码[:\uFF1A](\d{8,10})/,

    // Date in "YYYY年MM月DD日" form.
    date: /(\d{4}年\d{1,2}月\d{1,2}日)/,

    // Alternative date layouts. Note that dateCompact matches ANY run of
    // eight digits, so it should only be tried as a last resort.
    dateDash: /(\d{4}[-/]\d{1,2}[-/]\d{1,2})/,
    dateCompact: /(\d{4}\d{2}\d{2})/,

    // Check code: 20 digits, or a run of at least 10 non-space characters.
    checksum: /校验码[:\uFF1A](\d{20}|\S{10,})/,
  };

  /**
   * Monetary amounts.
   */
  static AMOUNT_FIELDS = {
    // Total (or subtotal) amount excluding tax.
    amount: /(?:合计|小计)[:\uFF1A\s]*[\u00A5\uFFE5]?(\d+\.?\d*)/,

    // Tax amount.
    taxAmount: /税额[:\uFF1A\s]*[\u00A5\uFFE5]?(\d+\.?\d*)/,

    // Amount including tax: group 1 is the text between the optional
    // "(大写)" and "(小写)" markers, group 2 is the numeric value.
    totalAmount: /价税合计(?:\(大写\))?[:\uFF1A\s]*([^\(\)]*?)(?:\(小写\))?[:\uFF1A\s]*[\u00A5\uFFE5]?(\d+\.?\d*)/,

    // Amount followed by tax amount (fully-electronic invoices or compact layouts).
    amountWithTax: /[\u00A5\uFFE5]?(\d+\.?\d*)\s*[\u00A5\uFFE5]?(\d+\.?\d*)/,

    // Same as above, but the two numbers must be separated by whitespace.
    amountWithSpace: /[\u00A5\uFFE5]?(\d+\.?\d*)\s+[\u00A5\uFFE5]?(\d+\.?\d*)/,
  };

  /**
   * People named on the invoice.
   */
  static PERSON_FIELDS = {
    // Payee / reviewer / issuer, each captured lazily as 2-4 non-space
    // characters; the issuer must be followed by "销售方" or end of input.
    people: /收款人[:\uFF1A](\S{2,4}?)复核[:\uFF1A](\S{2,4}?)开票人[:\uFF1A](\S{2,4}?)(?=销售方|$)/,
  };

  /**
   * Buyer / seller details. Labels tolerate whitespace between characters.
   */
  static PARTY_FIELDS = {
    // Name: everything up to a line break or the character "密"
    // (start of the password-area marker).
    name: /名[\s]*称[:\uFF1A\s]*([^密\n\r]+)/,
    // Taxpayer identification number: 15-20 upper-case letters or digits.
    code: /(?:纳[\s]*税[\s]*人[\s]*识[\s]*别[\s]*号|识别号)[:\uFF1A\s]*([A-Z0-9]{15,20})/,
    // "Address, phone" label with several possible separators; captures the
    // address part (stops at "电" or a line break).
    address: /地[\s]*址(?:[\s]*[、,\uFF0C\s]*电[\s]*话)?[:\uFF1A\s]*([^电\n\r]*)/,
    // Phone: rest of the line after the label.
    phone: /电[\s]*话[:\uFF1A\s]*([^\n\r]+)/,
    // Bank name and account number.
    account: /开[\s]*户[\s]*行[\s]*及[\s]*账[\s]*号[:\uFF1A\s]*([^\n\r]+)/,
    // Electronic payment identifier (may be empty).
    electronicAccount: /电[\s]*子[\s]*支[\s]*付[\s]*标[\s]*识[:\uFF1A\s]*([^\n\r]*)/,
  };

  /**
   * Column headers of the line-item table.
   */
  static DETAIL_HEADERS = {
    name: /货物或应[\s]*税[\s]*劳[\s]*务[\s]*、[\s]*服[\s]*务[\s]*名[\s]*称/,
    model: /规[\s]*格[\s]*型[\s]*号/,
    unit: /单[\s]*位/,
    quantity: /数[\s]*量/,
    price: /单[\s]*价/,
    amount: /金[\s]*额/,
    taxRate: /税[\s]*率/,
    taxAmount: /税[\s]*额/,
  };

  /**
   * Invoice type detection.
   *
   * The *Cleanup patterns carry the `g` flag and are therefore stateful when
   * used with `test`/`exec`; they are safe with `String.prototype.replace`.
   */
  static INVOICE_TYPE = {
    // Ordinary invoice.
    regular: /(\S*)通发票/,
    regularCleanup: /[国统一发票监制]/g,

    // Special invoice.
    special: /(\S*)用发票/,
    specialCleanup: /[国统一发票监制]/g,

    // Toll fee invoice and its licence-plate marker.
    tollFee: /通行费/,
    tollFeeCheck: /车牌号/,
  };

  /**
   * Line-item row detection.
   */
  static DETAIL_LINE = {
    // A token containing a percentage or one of the tax-exemption keywords.
    taxRateLine: /\S+\d*(%|免税|不征税|出口零税率|普通零税率)\S*/,
    // A whole string that is a (possibly negative) integer or decimal.
    number: /^(-?\d+)(\.\d+)?$/,
  };

  /**
   * Patterns specific to fully-electronic invoices.
   */
  static ELECTRONIC_INVOICE = {
    // Invoice type; group 1 is "专用" or "普通".
    type: /电子发票\s*[(\uFF08]增值税(专用|普通)发票[)\uFF09]/,

    // Total line: amount followed by tax amount.
    amountWithTax: /合\s*计\s*[\u00A5\uFFE5]?\s*(\d+\.\d+)\s+[\u00A5\uFFE5]?\s*(\d+\.\d+)/,

    // Amount including tax, numeric form.
    totalAmount: /价税合计.*?[\u00A5\uFFE5]?\s*(\d+\.\d+)/,

    // Amount including tax, Chinese capital-numeral form.
    totalAmountChinese: /价税合计.*?([壹贰叁肆伍陆柒捌玖拾佰仟万亿圆整]+)/,
  };

  /**
   * Miscellaneous patterns.
   */
  static UTILITY_PATTERNS = {
    // Electronic invoice number: 8 to 12 digits.
    electronicsNumber: /电子发票号码[:\uFF1A](\d{8,12})/,
  };

  /**
   * Collect every match of a pattern in a text.
   *
   * The pattern is copied with the `g` flag added when missing, so the
   * caller's RegExp (and its `lastIndex`) is never touched.
   *
   * @param {string} text - Text to search.
   * @param {RegExp} pattern - Pattern to apply.
   * @returns {Array} One match array (with capture groups, `index`, `input`) per match.
   */
  static getAllMatches(text, pattern) {
    const baseFlags = pattern.flags || '';
    const flags = baseFlags.includes('g') ? baseFlags : `${baseFlags}g`;
    const globalPattern = new RegExp(pattern.source, flags);
    // matchAll advances past zero-length matches on its own; a manual
    // exec() loop would spin forever on a pattern such as /\d*/.
    return Array.from(String(text).matchAll(globalPattern));
  }

  /**
   * Try several patterns in order and return the first one that matches.
   *
   * @param {string} text - Text to search.
   * @param {Array<RegExp>} patterns - Patterns, in priority order.
   * @returns {object|null} `{ match, pattern }` for the first hit, or null when nothing matches.
   */
  static tryPatterns(text, patterns) {
    for (const pattern of patterns) {
      const match = text.match(pattern);
      if (match) {
        return { match, pattern };
      }
    }
    return null;
  }

  /**
   * Extract the first number found in a text.
   *
   * NOTE: a leading minus sign is not captured, so "-12.5" yields 12.5.
   *
   * @param {string} text - Text to search.
   * @returns {number|null} The parsed number, or null for empty input / no digits.
   */
  static extractNumber(text) {
    if (!text) return null;
    const match = text.match(/\d+(\.\d+)?/);
    return match ? parseFloat(match[0]) : null;
  }

  /**
   * Extract a tax rate as a fraction (e.g. "13%" -> 0.13, "1.5%" -> 0.015).
   *
   * @param {string} text - Text containing the rate.
   * @returns {number} The rate as a fraction; 0 for empty input, for
   *   "免税" / "不征税" (tax-exempt / not taxed), or when no percentage is found.
   */
  static extractTaxRate(text) {
    if (!text || text.includes('免税') || text.includes('不征税')) {
      return 0;
    }
    // Accept decimal rates; matching only integer digits would read "1.5%" as 5%.
    const match = text.match(/(\d+(?:\.\d+)?)[%\uFF05]/);
    return match ? parseFloat(match[1]) / 100 : 0;
  }
}
|
|
201
|
+
|
|
202
|
+
module.exports = RegexPatterns;
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
 * String helpers used when cleaning text extracted from invoices.
 *
 * Full-width characters are written as \uXXXX escapes so they cannot be
 * silently folded into their ASCII look-alikes by an editor or pipeline
 * (an unescaped "(" inside a regex literal is a syntax error).
 */
class StringUtils {
  /**
   * Normalise a string: strip all whitespace and convert full-width
   * punctuation to its half-width equivalent.
   *
   * @param {string} str - Input text.
   * @returns {string} Normalised text; '' for empty / null / undefined input.
   */
  static normalize(str) {
    if (!str) return '';
    return str
      // \s already covers the ideographic space U+3000, so one pass is enough.
      .replace(/\s+/g, '')
      .replace(/\uFF1A/g, ':') // full-width colon
      .replace(/\uFF08/g, '(') // full-width left parenthesis
      .replace(/\uFF09/g, ')') // full-width right parenthesis
      .replace(/\uFFE5/g, '\u00A5'); // full-width yen sign -> half-width yen sign
  }

  /**
   * Collapse every run of whitespace into a single space and trim both ends.
   *
   * @param {string} str - Input text.
   * @returns {string} Cleaned text; '' for empty / null / undefined input.
   */
  static replace(str) {
    if (!str) return '';
    // After this replacement no consecutive spaces can remain, so no second
    // collapsing pass is needed.
    return str.replace(/\s+/g, ' ').trim();
  }

  /**
   * Trim leading and trailing whitespace.
   *
   * @param {string} str - Input text.
   * @returns {string} Trimmed text; '' for empty / null / undefined input.
   */
  static trim(str) {
    if (!str) return '';
    return str.trim();
  }

  /**
   * Tell whether a value is empty or contains only whitespace.
   *
   * @param {string} str - Input text.
   * @returns {boolean} True for falsy input or whitespace-only strings.
   */
  static isBlank(str) {
    return !str || str.trim().length === 0;
  }

  /**
   * Negation of {@link StringUtils.isBlank}.
   *
   * @param {string} str - Input text.
   * @returns {boolean} True when the string has at least one non-whitespace character.
   */
  static isNotBlank(str) {
    return !StringUtils.isBlank(str);
  }

  /**
   * Split a string and drop empty segments.
   *
   * @param {string} str - Input text.
   * @param {string|RegExp} [separator=' '] - Separator passed to String.prototype.split.
   * @returns {string[]} Non-empty segments; [] for empty / null / undefined input.
   */
  static split(str, separator = ' ') {
    if (!str) return [];
    return str.split(separator).filter((s) => s.length > 0);
  }

  /**
   * Extract the first (optionally negative) number found in a string.
   *
   * @param {string} str - Input text.
   * @returns {string|null} The matched number as text (not parsed), or null
   *   for empty input / no digits.
   */
  static extractNumber(str) {
    if (!str) return null;
    const match = str.match(/-?\d+(\.\d+)?/);
    return match ? match[0] : null;
  }
}
|
|
69
|
+
|
|
70
|
+
module.exports = StringUtils;
|
package/lib/extractor.js
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 统一的发票提取器入口
|
|
3
|
+
*/
|
|
4
|
+
const PdfInvoiceExtractor = require('./PdfInvoiceExtractor');
|
|
5
|
+
const OfdInvoiceExtractor = require('./OfdInvoiceExtractor');
|
|
6
|
+
const ErrorHandler = require('./ErrorHandler');
|
|
7
|
+
|
|
8
|
+
/**
 * Pick the extractor that matches the file extension (case-insensitive)
 * and run it through ErrorHandler.safeExtract.
 *
 * @param {string} filePath - Path of a `.pdf` or `.ofd` file.
 * @returns {Promise<*>} Whatever ErrorHandler.safeExtract resolves to.
 *   NOTE(review): the `[]` and 'extractor' arguments look like a fallback
 *   value and a context label - confirm against ErrorHandler.
 */
async function extract(filePath) {
  const dispatchByExtension = async () => {
    const loweredPath = filePath.toLowerCase();

    if (loweredPath.endsWith('.ofd')) {
      return OfdInvoiceExtractor.extract(filePath);
    }
    if (loweredPath.endsWith('.pdf')) {
      return PdfInvoiceExtractor.extract(filePath);
    }

    // Any other extension is rejected inside the wrapped callback.
    throw new Error('Unsupported file format. Only PDF and OFD are supported.');
  };

  return ErrorHandler.safeExtract(dispatchByExtension, [], 'extractor');
}
|
|
19
|
+
|
|
20
|
+
module.exports = {
|
|
21
|
+
extract,
|
|
22
|
+
extractPdf: PdfInvoiceExtractor.extract,
|
|
23
|
+
extractOfd: OfdInvoiceExtractor.extract,
|
|
24
|
+
};
|
package/package.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
  "name": "einvoice-cli",
  "version": "1.0.0",
  "description": "电子发票识别 CLI 工具 (PDF/OFD)",
  "main": "index.js",
  "bin": {
    "einvoice": "bin/cli.js"
  },
  "scripts": {
    "start": "node bin/cli.js",
    "test": "node test/test.js"
  },
  "keywords": [
    "invoice",
    "pdf",
    "ofd",
    "recognition"
  ],
  "author": "jhoncy",
  "license": "MIT",
  "dependencies": {
    "commander": "^12.1.0",
    "iconv-lite": "^0.7.2",
    "jszip": "^3.10.1",
    "pdfjs-dist": "^3.11.174",
    "xml2js": "^0.6.2"
  },
  "engines": {
    "node": ">=16.0.0"
  }
}
|