einvoice-cli 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +139 -0
- package/bin/cli.js +52 -0
- package/index.js +23 -0
- package/lib/BaseInvoiceService.js +158 -0
- package/lib/ErrorHandler.js +98 -0
- package/lib/Invoice.js +108 -0
- package/lib/InvoiceValidator.js +422 -0
- package/lib/OfdInvoiceExtractor.js +170 -0
- package/lib/PDFTextPositionAnalyzer.js +366 -0
- package/lib/PdfFinancialInvoiceService.js +134 -0
- package/lib/PdfFullElectronicInvoiceService.js +325 -0
- package/lib/PdfInvoiceExtractor.js +124 -0
- package/lib/PdfRegularInvoiceService.js +786 -0
- package/lib/RegexPatterns.js +202 -0
- package/lib/StringUtils.js +70 -0
- package/lib/extractor.js +24 -0
- package/package.json +31 -0
|
@@ -0,0 +1,786 @@
|
|
|
1
|
+
const { Invoice, Detail } = require('./Invoice');
|
|
2
|
+
const StringUtils = require('./StringUtils');
|
|
3
|
+
const RegexPatterns = require('./RegexPatterns');
|
|
4
|
+
const PDFTextPositionAnalyzer = require('./PDFTextPositionAnalyzer');
|
|
5
|
+
const InvoiceValidator = require('./InvoiceValidator');
|
|
6
|
+
const BaseInvoiceService = require('./BaseInvoiceService');
|
|
7
|
+
|
|
8
|
+
/**
 * Characters that can appear in an upper-case (financial) Chinese amount,
 * e.g. "壹佰零叁圆伍角整". Includes 零/元/角/分 so amounts containing a zero
 * placeholder or fractional units are not cut short.
 */
const CN_AMOUNT_CHARS = '零壹贰叁肆伍陆柒捌玖拾佰仟万亿圆元角分整';
const CN_AMOUNT_CLASS = `[${CN_AMOUNT_CHARS}]`;
const CN_AMOUNT_RUN_MIN2 = new RegExp(`(${CN_AMOUNT_CLASS}{2,})`);
const CN_AMOUNT_RUN_MIN3 = new RegExp(`${CN_AMOUNT_CLASS}{3,}`);

// NOTE: full-width punctuation is written with \u escapes on purpose so the
// patterns survive tools that normalise text to ASCII:
//   \uFF1A = full-width colon, \uFF08 / \uFF09 = full-width parentheses,
//   \uFFE5 = full-width yen sign, \u00A5 = half-width yen sign.

/** "amount  tax" pairs, tried from the most to the least specific pattern. */
const AMOUNT_TAX_PATTERNS = [
  /合计[:\uFF1A\s]*[\u00A5\uFFE5]?(\d+\.\d+)\s+[\u00A5\uFFE5]?(\d+\.\d+)/,
  /小计[:\uFF1A\s]*[\u00A5\uFFE5]?(\d+\.\d+)\s+[\u00A5\uFFE5]?(\d+\.\d+)/,
  /[\u00A5\uFFE5]?(\d+\.\d+)\s+[\u00A5\uFFE5]?(\d+\.\d+)\s*$/m,
  /[\u00A5\uFFE5]?(\d+\.\d+)\s+([0-9.]+)\s*$/m, // lenient mode
];

/** "价税合计 (大写) <words> (小写) ¥<number>"; group 1 = words, group 2 = number. */
const TOTAL_WITH_LABEL_PATTERN =
  /价税合计(?:[(\uFF08]大写[)\uFF09])?[:\uFF1A\s]*([^()\uFF08\uFF09]*?)(?:[(\uFF08]小写[)\uFF09])?[:\uFF1A\s]*[\u00A5\uFFE5]?(\d+\.?\d*)/;

/**
 * Patterns locating the upper-case total in the whole document text.
 * `.*?` is lazy so the capture starts at the first numeral character instead
 * of only keeping the tail of the amount.
 */
const TOTAL_STRING_PATTERNS = [
  new RegExp(`大写[)\\uFF09\\s]*(${CN_AMOUNT_CLASS}+)`),
  new RegExp(`价税合计.*?[(\\uFF08].*?[)\\uFF09](${CN_AMOUNT_CLASS}+)`),
  new RegExp(`价税合计.*?(${CN_AMOUNT_CLASS}+)`),
];

/** Patterns locating the upper-case total inside the seller region text. */
const SELLER_TOTAL_STRING_PATTERNS = [
  new RegExp(`价税合计\\s*[(\\uFF08]?\\s*大写\\s*[)\\uFF09]?\\s*(${CN_AMOUNT_CLASS}+)`),
  new RegExp(`价税合计.*?(${CN_AMOUNT_CLASS}+[圆元]整)`),
  new RegExp(`[(\\uFF08]大写[)\\uFF09](${CN_AMOUNT_CLASS}+)`),
];

/** Characters typical of the password (cipher) area; "-" is excluded because phone numbers contain it. */
const PASSWORD_AREA_CHARS = /[<>*\/+]/;

const TAX_FREE_MARKERS = /免税|不征税|出口零税率|普通零税率/;
const DETAIL_HEADER_KEYWORDS = ['规格型号', '单位', '数量', '单价', '金额', '税率', '税额'];
const JUNK_WORDS = ['买', '码', '注', '方区', '密码区', '购方区', '销售方区', '代码', '售', '购', '销', '密'];
const FIELD_LABELS = ['名称', '纳税人识别号', '识别号', '地址', '电话', '开户行及账号'];
const NAME_JUNK_SUFFIXES = ['备', '注', '密', '码', '区'];
const CLEANABLE_PARTY_FIELDS = [
  'buyerName', 'buyerCode', 'buyerAddress', 'buyerAccount',
  'sellerName', 'sellerCode', 'sellerAddress', 'sellerAccount',
];
const COMMON_TAX_RATES = [0.03, 0.06, 0.09, 0.13, 0.01];

/** True when the tax amount is missing or holds a value the extractor treats as a misread. */
function isTaxAmountSuspect(invoice) {
  const tax = invoice.taxAmount;
  return !tax || tax === '0' || tax === '1' || tax === '0.00';
}

/** Takes amount/tax from the first "合计"/"小计" line (not "价税合计") holding two decimals. */
function recoverAmountAndTaxFromSummaryLine(invoice, fullText) {
  for (const line of fullText.split('\n')) {
    if (!line.includes('合计') && !line.includes('小计')) continue;
    // The "价税合计" line normally carries the grand total, not amount/tax.
    if (line.includes('价税合计')) continue;

    const numbers = line.match(/\d+\.\d+/g);
    if (!numbers || numbers.length < 2) continue;

    // Accept when there is no amount yet or the first number agrees with it.
    if (!invoice.amount || Math.abs(parseFloat(invoice.amount) - parseFloat(numbers[0])) < 0.01) {
      invoice.amount = numbers[0];
      invoice.taxAmount = numbers[1];
      console.log(`DEBUG: Found amount/tax in summary line: ${line} -> ${numbers[0]}, ${numbers[1]}`);
      return;
    }
  }
}

/**
 * Last-resort recovery: searches every decimal in the text for a pair that
 * satisfies amount + tax = total (or amount * rate = tax when no total is known).
 * Returning from here only ends the recovery step, never the whole extraction.
 */
function recoverAmountAndTaxFromNumbers(invoice, fullText) {
  const allNumbers = fullText.match(/\d+\.\d+/g);
  if (!allNumbers || allNumbers.length < 2) return;

  const values = allNumbers.map((text) => parseFloat(text));
  const hasTotal = Boolean(invoice.totalAmount);
  const total = hasTotal ? parseFloat(invoice.totalAmount) : NaN;

  // Known total: find its occurrence, then two earlier numbers adding up to it.
  if (hasTotal) {
    for (let i = values.length - 1; i >= 0; i--) {
      if (!(Math.abs(values[i] - total) < 0.01)) continue;
      for (let j = i - 1; j >= 1; j--) {
        for (let k = j - 1; k >= 0; k--) {
          if (Math.abs(values[k] + values[j] - total) < 0.01) {
            invoice.amount = allNumbers[k];
            invoice.taxAmount = allNumbers[j];
            console.log(`DEBUG: Found valid amount/tax combination by total ${total}: ${values[k]} + ${values[j]}`);
            return;
          }
        }
      }
    }
  }

  // Otherwise guess from any ordered pair (amount appears before tax).
  for (let i = values.length - 1; i >= 1; i--) {
    for (let j = i - 1; j >= 0; j--) {
      const a = values[j];
      const t = values[i];

      if (hasTotal) {
        if (Math.abs(a + t - total) < 0.01) {
          invoice.amount = allNumbers[j];
          invoice.taxAmount = allNumbers[i];
          console.log(`DEBUG: Found valid amount/tax combination: ${a} + ${t} = ${total}`);
          return;
        }
      } else {
        // No total: look for a common tax-rate relationship (3%, 6%, 9%, 13%, 1%).
        for (const r of COMMON_TAX_RATES) {
          if (Math.abs(a * r - t) < 0.05) {
            invoice.amount = allNumbers[j];
            invoice.taxAmount = allNumbers[i];
            invoice.totalAmount = (a + t).toFixed(2);
            console.log(`DEBUG: Found amount/tax by rate ${r}: ${a}, ${t}, guessed total=${invoice.totalAmount}`);
            return;
          }
        }
      }
    }
  }

  // Final attempt: the last three numbers form "amount, tax, total".
  if (allNumbers.length >= 3) {
    const last = values[values.length - 1];
    const mid = values[values.length - 2];
    const first = values[values.length - 3];

    if (Math.abs(first + mid - last) < 0.01) {
      invoice.amount = allNumbers[allNumbers.length - 3];
      invoice.taxAmount = allNumbers[allNumbers.length - 2];
      invoice.totalAmount = allNumbers[allNumbers.length - 1];
      console.log(`DEBUG: Found triplet: ${first} + ${mid} = ${last}`);
    }
  }
}

/** Extracts the grand total (digits and, when adjacent, the upper-case words). */
function extractTotalAmount(invoice, allText, fullText) {
  let totalMatch = allText.match(RegexPatterns.AMOUNT_FIELDS.totalAmount);
  if (!totalMatch) {
    // Retry on fullText, which may separate label and value with whitespace.
    totalMatch = fullText.match(TOTAL_WITH_LABEL_PATTERN);
  }

  if (totalMatch) {
    invoice.totalAmountString = totalMatch[1].trim();
    invoice.totalAmount = totalMatch[2];
    return;
  }
  if (!fullText.includes('价税合计')) return;

  // Label and values are fully detached: inspect what follows the last label.
  const parts = fullText.split('价税合计');
  const afterTotal = parts[parts.length - 1];

  const chineseMatch = afterTotal.match(CN_AMOUNT_RUN_MIN2);
  if (chineseMatch) {
    invoice.totalAmountString = chineseMatch[1];
  }

  // First decimal that is neither the amount nor the tax is taken as the total.
  const numbers = afterTotal.match(/\d+\.\d+/g) || [];
  for (const n of numbers) {
    if (n !== invoice.amount && n !== invoice.taxAmount) {
      invoice.totalAmount = n;
      console.log(`DEBUG: Found totalAmount by proximity (filtering): ${invoice.totalAmount}`);
      break;
    }
  }

  // Zero-tax invoices: the total equals the amount.
  if (!invoice.totalAmount && invoice.amount && (!invoice.taxAmount || parseFloat(invoice.taxAmount) === 0)) {
    if (afterTotal.includes(invoice.amount)) {
      invoice.totalAmount = invoice.amount;
      console.log(`DEBUG: TotalAmount equals Amount (Tax is 0): ${invoice.totalAmount}`);
    }
  }

  // Still nothing: compute it from amount + tax.
  if (!invoice.totalAmount && invoice.amount && invoice.taxAmount) {
    invoice.totalAmount = (parseFloat(invoice.amount) + parseFloat(invoice.taxAmount)).toFixed(2);
    console.log(`DEBUG: Calculated totalAmount: ${invoice.totalAmount}`);
  }
}

/** Fallback search for the upper-case total when the labelled extraction left it empty. */
function extractTotalAmountStringFallback(invoice, allText, fullText) {
  if (!invoice.totalAmountString && (fullText.includes('大写') || allText.includes('大写'))) {
    const combinedText = fullText + allText;
    console.log(`DEBUG: 查找totalAmountString的文本: ${combinedText.substring(0, 300)}...`);

    for (const pattern of TOTAL_STRING_PATTERNS) {
      const match = combinedText.match(pattern);
      if (match) {
        invoice.totalAmountString = match[1];
        console.log(`DEBUG: 通过模式 ${pattern} 找到totalAmountString: ${invoice.totalAmountString}`);
        break;
      }
      console.log(`DEBUG: 模式 ${pattern} 没有匹配到`);
    }
  }

  // Last resort: any whitespace/parenthesis-delimited word made only of amount characters.
  if (!invoice.totalAmountString) {
    const words = (fullText + allText).split(/[\s()\uFF08\uFF09]/);
    const candidate = words.find(
      (word) => word.length >= 3 && [...word].every((char) => CN_AMOUNT_CHARS.includes(char))
    );
    if (candidate) {
      invoice.totalAmountString = candidate;
    }
  }
}

/** Applies the type-specific cleanup and strips a leading "...代码:123" fragment from a title match. */
function cleanInvoiceTitle(rawTitle, cleanupPattern) {
  return rawTitle
    .replace(cleanupPattern, '')
    .replace(/.*代码[:\uFF1A]?\d+/, '')
    .trim();
}

/** Repeatedly strips junk words from both ends of a value until nothing changes. */
function stripJunkAffixes(initial) {
  let value = initial;
  let changed = true;
  while (changed) {
    changed = false;
    for (const word of JUNK_WORDS) {
      if (value.endsWith(word)) {
        value = value.substring(0, value.length - word.length).trim();
        changed = true;
      }
      if (value.startsWith(word)) {
        value = value.substring(word.length).trim();
        changed = true;
      }
    }
  }
  return value;
}

/**
 * PDF regular (VAT) invoice service - optimised version.
 * Combines the shared regular-expression library, coordinate-based text
 * location and field validation to fill an Invoice from extracted PDF text.
 */
class PdfRegularInvoiceService {
  /**
   * Builds an Invoice from the text extracted from a PDF.
   *
   * @param {string} fullText - Text that keeps line breaks / relative layout.
   * @param {string} allText - Text used for label-based regex matching
   *   (presumably whitespace-compacted; confirm against the caller).
   * @param {number} pageWidth - Accepted for interface compatibility; not used here.
   * @param {Array} [items] - Positioned text items; when non-empty the
   *   coordinate-based extraction is used, otherwise the text fallback.
   * @returns {Invoice} The populated invoice.
   */
  static extract(fullText, allText, pageWidth, items) {
    const invoice = new Invoice();

    // 1. Basic fields
    this.extractBasicFields(invoice, allText);

    // 2. Amounts
    this.extractAmountInfo(invoice, allText, fullText);

    // 3. Person names (signers)
    BaseInvoiceService.extractPersonInfo(invoice, allText);

    // 4. Buyer/seller info: by coordinates when available, else by text
    if (items && items.length > 0) {
      const analyzer = new PDFTextPositionAnalyzer(items);
      this.extractPartyInfoByPosition(invoice, analyzer);
      this.extractDetailsByPosition(invoice, analyzer, allText);
    } else {
      this.extractPartyInfoByText(invoice, fullText, allText);
    }

    // 5. Sanitise buyer/seller fields
    this.validateAndFixPartyInfo(invoice);

    // 6. Validate and correct the invoice as a whole
    BaseInvoiceService.validateInvoice(invoice);

    return invoice;
  }

  /**
   * Fills every field listed in RegexPatterns.BASIC_FIELDS, then detects the invoice type.
   *
   * @param {Invoice} invoice - Invoice to mutate.
   * @param {string} allText - Text to match against.
   */
  static extractBasicFields(invoice, allText) {
    for (const [key, pattern] of Object.entries(RegexPatterns.BASIC_FIELDS)) {
      const result = RegexPatterns.tryPatterns(allText, [pattern]);
      if (result) {
        // Prefer the first capture group; fall back to the whole match.
        invoice[key] = result.match[1] || result.match[0];
      }
    }

    this.detectInvoiceType(invoice, allText);
  }

  /**
   * Sets invoice.type (and invoice.title for regular/special invoices).
   *
   * @param {Invoice} invoice - Invoice to mutate.
   * @param {string} allText - Text to inspect.
   */
  static detectInvoiceType(invoice, allText) {
    // Toll invoices are recognised by two keywords and get no title.
    if (allText.includes('通行费') && allText.includes('车牌号')) {
      invoice.type = '通行费';
      return;
    }

    const { INVOICE_TYPE } = RegexPatterns;

    // Regular invoice
    const regularMatch = allText.match(INVOICE_TYPE.regular);
    if (regularMatch) {
      invoice.title = cleanInvoiceTitle(regularMatch[1], INVOICE_TYPE.regularCleanup) + '通发票';
      invoice.type = '普通发票';
      return;
    }

    // Special invoice
    const specialMatch = allText.match(INVOICE_TYPE.special);
    if (specialMatch) {
      invoice.title = cleanInvoiceTitle(specialMatch[1], INVOICE_TYPE.specialCleanup) + '用发票';
      invoice.type = '专用发票';
    }
  }

  /**
   * Extracts amount, tax amount, grand total and the upper-case total using
   * several strategies in decreasing order of reliability.
   *
   * The grand-total extraction always runs, even when amount/tax had to be
   * recovered by the number-combination heuristics.
   *
   * @param {Invoice} invoice - Invoice to mutate (amount, taxAmount,
   *   totalAmount, totalAmountString).
   * @param {string} allText - Text used for the RegexPatterns.AMOUNT_FIELDS patterns.
   * @param {string} fullText - Text that keeps line breaks / relative positions.
   */
  static extractAmountInfo(invoice, allText, fullText) {
    // 1. Combined "amount  tax" patterns on fullText (layout is preserved there).
    for (const pattern of AMOUNT_TAX_PATTERNS) {
      const match = fullText.match(pattern);
      if (match) {
        invoice.amount = match[1];
        invoice.taxAmount = match[2];
        console.log(`DEBUG: Using combined pattern ${pattern} found: amount=${invoice.amount}, tax=${invoice.taxAmount}`);
        break;
      }
    }

    // 2. Explicit labelled patterns when tax/amount is missing or looks misread.
    if (isTaxAmountSuspect(invoice)) {
      const taxMatch = allText.match(RegexPatterns.AMOUNT_FIELDS.taxAmount);
      if (taxMatch) {
        invoice.taxAmount = taxMatch[1];
      }
    }
    if (!invoice.amount) {
      const amountMatch = allText.match(RegexPatterns.AMOUNT_FIELDS.amount);
      if (amountMatch) {
        invoice.amount = amountMatch[1];
      }
    }

    // 3. Numbers on a "合计"/"小计" summary line.
    if (isTaxAmountSuspect(invoice) || invoice.taxAmount === invoice.totalAmount) {
      recoverAmountAndTaxFromSummaryLine(invoice, fullText);
    }

    // 4. Any number combination satisfying amount + tax = total.
    if (isTaxAmountSuspect(invoice) || invoice.taxAmount === invoice.totalAmount) {
      recoverAmountAndTaxFromNumbers(invoice, fullText);
    }

    // 5. Grand total ("价税合计").
    extractTotalAmount(invoice, allText, fullText);

    // 6. Upper-case total fallbacks.
    extractTotalAmountStringFallback(invoice, allText, fullText);
  }

  /**
   * Extracts the upper-case grand total from the seller region text.
   *
   * @param {Invoice} invoice - Invoice to mutate (totalAmountString).
   * @param {string} sellerText - Text of the seller region.
   */
  static extractTotalAmountStringFromSellerText(invoice, sellerText) {
    console.log(`DEBUG: 尝试从销售方文本提取totalAmountString: ${sellerText.substring(0, 100)}...`);

    for (const pattern of SELLER_TOTAL_STRING_PATTERNS) {
      const match = sellerText.match(pattern);
      if (match) {
        invoice.totalAmountString = match[1];
        console.log(`DEBUG: 使用模式 ${pattern} 找到totalAmountString: ${invoice.totalAmountString}`);
        return;
      }
    }

    // No pattern matched: take the first run of at least three amount
    // characters on a line mentioning the total or "大写".
    for (const line of sellerText.split('\n')) {
      if (!line.includes('价税合计') && !line.includes('大写')) continue;

      console.log(`DEBUG: 检查包含价税合计或大写的行: ${line}`);
      const run = line.match(CN_AMOUNT_RUN_MIN3);
      if (run) {
        invoice.totalAmountString = run[0];
        console.log(`DEBUG: 从行中提取totalAmountString: ${invoice.totalAmountString}`);
        return;
      }
    }
  }

  /**
   * Extracts buyer, seller and password-area data from the regions reported by the analyzer.
   *
   * @param {Invoice} invoice - Invoice to mutate.
   * @param {PDFTextPositionAnalyzer} analyzer - Analyzer built from the positioned text items.
   */
  static extractPartyInfoByPosition(invoice, analyzer) {
    const regions = analyzer.detectPartyRegions();

    // Buyer
    if (regions.buyer) {
      const buyerText = analyzer.getTextInRegion(regions.buyer);
      console.log('DEBUG: 购买方文本:', buyerText);
      this.parsePartyInfo(invoice, 'buyer', buyerText);
    }

    // Seller
    if (regions.seller) {
      const sellerText = analyzer.getTextInRegion(regions.seller);
      console.log('DEBUG: 销售方文本:', sellerText);
      this.parsePartyInfo(invoice, 'seller', sellerText);

      // The seller region sometimes also contains the grand-total row.
      if (!invoice.totalAmountString && sellerText.includes('价税合计')) {
        this.extractTotalAmountStringFromSellerText(invoice, sellerText);
      }
    }

    // Password area
    const passwordRegion = analyzer.findPasswordRegion();
    if (passwordRegion) {
      invoice.password = StringUtils.trim(analyzer.getTextInRegion(passwordRegion));
    }
  }

  /**
   * Applies every RegexPatterns.PARTY_FIELDS pattern to a party's region text
   * and stores the results as `<prefix><Key>` (e.g. buyerName).
   *
   * @param {Invoice} invoice - Invoice to mutate.
   * @param {string} type - 'buyer'; any other value is treated as seller.
   * @param {string} text - Region text; nothing happens when it is empty.
   */
  static parsePartyInfo(invoice, type, text) {
    if (!text) return;

    const prefix = type === 'buyer' ? 'buyer' : 'seller';

    for (const [key, pattern] of Object.entries(RegexPatterns.PARTY_FIELDS)) {
      const match = text.match(pattern);
      if (!match) continue;

      const value = StringUtils.trim(match[1]);
      const fieldName = `${prefix}${key.charAt(0).toUpperCase()}${key.slice(1)}`;

      // A "name" that looks like a store number (or is purely numeric) is
      // replaced by a company-like name found elsewhere in the text, if any.
      const looksLikeStoreNumber =
        key === 'name' && (value.includes('门店号') || value.includes('店号') || /^\d+$/.test(value));
      if (looksLikeStoreNumber) {
        const companyNameMatch = text.match(
          /([\u4e00-\u9fa5()\uFF08\uFF09]{5,}有限公司|[\u4e00-\u9fa5()\uFF08\uFF09]{5,}公司|[\u4e00-\u9fa5()\uFF08\uFF09]{5,}集团)/
        );
        if (companyNameMatch) {
          invoice[fieldName] = StringUtils.trim(companyNameMatch[1]);
          console.log(`DEBUG: 修正${fieldName}: 从 "${value}" 改为 "${invoice[fieldName]}"`);
          continue;
        }
      }

      invoice[fieldName] = value;
    }
  }

  /**
   * Fallback used when no positioned items are available: cuts the buyer and
   * seller sections out of fullText and parses them by label.
   *
   * @param {Invoice} invoice - Invoice to mutate.
   * @param {string} fullText - Text that keeps line breaks.
   * @param {string} allText - Not used by this method.
   */
  static extractPartyInfoByText(invoice, fullText, allText) {
    // Buyer section: from the first "购" up to the detail table header.
    const buyerSectionMatch = fullText.match(/购[\s\S]*?(?=货物或应税劳务|规格型号|单位|数量)/);
    if (buyerSectionMatch) {
      this.parsePartyInfoFromText(invoice, 'buyer', buyerSectionMatch[0]);
    }

    // Seller section: from the first "销" up to the signer labels or end of text.
    const sellerSectionMatch = fullText.match(/销[\s\S]*?(?=收款人|复核|开票人|$)/);
    if (sellerSectionMatch) {
      this.parsePartyInfoFromText(invoice, 'seller', sellerSectionMatch[0]);
    }
  }

  /**
   * Parses name, taxpayer code, address, phone and bank account from a text
   * section using the printed labels (half- or full-width colon accepted).
   *
   * @param {Invoice} invoice - Invoice to mutate.
   * @param {string} type - 'buyer'; any other value is treated as seller.
   * @param {string} text - Section text.
   */
  static parsePartyInfoFromText(invoice, type, text) {
    const prefix = type === 'buyer' ? 'buyer' : 'seller';

    // Name
    const nameMatch = text.match(/名[\s]*称[:\uFF1A\s]*([^密\n\r]*)/);
    if (nameMatch) {
      const name = StringUtils.trim(nameMatch[1]);
      if (name && !name.includes('税') && !name.includes('合计')) {
        invoice[`${prefix}Name`] = name;
      }
    }

    // Taxpayer identification number (18 characters)
    const codeMatch = text.match(/纳[\s]*税[\s]*人[\s]*识[\s]*别[\s]*号[:\uFF1A\s]*([A-Z0-9]{18})/);
    if (codeMatch) {
      invoice[`${prefix}Code`] = StringUtils.trim(codeMatch[1]);
    }

    // Address (stops before phone / taxpayer / password labels)
    const addressMatch = text.match(/地[\s]*址[:\uFF1A\s]*([^电纳密\n\r]*)/);
    if (addressMatch) {
      const address = StringUtils.trim(addressMatch[1]);
      if (address && address !== '、' && !address.match(/^\d{10,}$/)) {
        invoice[`${prefix}Address`] = address;
      }
    }

    // Phone (must contain at least one digit)
    const phoneMatch = text.match(/电[\s]*话[:\uFF1A\s]*([^\n\r]*)/);
    if (phoneMatch) {
      const phone = StringUtils.trim(phoneMatch[1]);
      if (phone && phone.match(/\d/)) {
        invoice[`${prefix}Phone`] = phone;
      }
    }

    // Bank and account
    const accountMatch = text.match(/开[\s]*户[\s]*行[\s]*及[\s]*账[\s]*号[:\uFF1A\s]*([^\n\r]*)/);
    if (accountMatch) {
      const account = StringUtils.trim(accountMatch[1]);
      if (account && account.length > 5) {
        invoice[`${prefix}Account`] = account;
      }
    }
  }

  /**
   * Extracts the detail rows from the detail region reported by the analyzer.
   * Leaves invoice.details untouched when the region cannot be located.
   *
   * @param {Invoice} invoice - Invoice to mutate (details).
   * @param {PDFTextPositionAnalyzer} analyzer - Analyzer built from the positioned text items.
   * @param {string} allText - Not used by this method.
   */
  static extractDetailsByPosition(invoice, analyzer, allText) {
    const detailsRegion = analyzer.detectDetailsRegion();
    if (!detailsRegion) {
      return; // detail area could not be located
    }

    const details = [];
    for (const line of analyzer.getTextLinesInRegion(detailsRegion)) {
      if (!line || line.length < 5) continue;
      if (!this.isDetailLine(line)) continue;

      const detail = this.parseDetailLine(line);
      if (detail) {
        details.push(detail);
      }
    }

    invoice.details = details;
  }

  /**
   * Tells whether a line looks like a detail row: it has a percentage tax
   * rate, a tax-exemption marker, or a decimal number.
   *
   * @param {string} text - Candidate line.
   * @returns {boolean} True when the line looks like a detail row.
   */
  static isDetailLine(text) {
    return /\d+%/.test(text) || TAX_FREE_MARKERS.test(text) || /\d+\.\d+/.test(text);
  }

  /**
   * Parses one detail row into a Detail.
   *
   * @param {string} line - Raw row text.
   * @returns {Detail|null} The parsed detail, or null when the row does not fit
   *   either supported shape.
   */
  static parseDetailLine(line) {
    const detail = new Detail();
    detail.name = '';

    // Normalise, then split into space-separated columns.
    const items = StringUtils.split(StringUtils.replace(line), ' ');
    if (items.length < 2) {
      return null;
    }

    // Simple row: just amount and tax amount.
    if (items.length === 2 && /^\d+/.test(items[0]) && /^\d+/.test(items[1])) {
      detail.amount = items[0];
      detail.taxAmount = items[1];
      return detail;
    }

    if (items.length <= 2) {
      return null;
    }

    // Full row: the last three columns are amount, tax rate, tax amount.
    const lastAmount = items[items.length - 3];
    const taxRate = items[items.length - 2];
    const taxAmount = items[items.length - 1];
    if (!/^\d+/.test(lastAmount)) {
      return null;
    }

    detail.amount = lastAmount;
    if (TAX_FREE_MARKERS.test(taxRate)) {
      detail.taxRate = 0;
      detail.taxAmount = 0;
    } else {
      detail.taxRate = RegexPatterns.extractTaxRate(taxRate);
      detail.taxAmount = taxAmount;
    }

    // Leading columns: first number is the quantity, later numbers the unit
    // price; two consecutive non-numeric columns are model and unit.
    let quantity = null;
    let price = null;
    for (let j = 0; j < items.length - 3; j++) {
      if (RegexPatterns.DETAIL_LINE.number.test(items[j])) {
        if (!quantity) {
          quantity = items[j];
        } else {
          price = items[j];
        }
      } else if (items[j].length > 1) {
        if (j + 1 < items.length && !RegexPatterns.DETAIL_LINE.number.test(items[j + 1])) {
          detail.model = items[j];
          detail.unit = items[j + 1];
          j++; // skip the unit column
        }
      }
    }

    if (quantity) detail.count = quantity;
    if (price) detail.price = price;

    return detail;
  }

  /**
   * Cleans up buyer/seller fields after extraction: drops values that are
   * really table headers or password-area text, strips stray label fragments,
   * and repairs a few known mis-extractions.
   *
   * @param {Invoice} invoice - Invoice to mutate in place.
   */
  static validateAndFixPartyInfo(invoice) {
    // 1. A name containing detail-table header words is not a name.
    for (const field of ['buyerName', 'sellerName']) {
      if (invoice[field] && DETAIL_HEADER_KEYWORDS.some((keyword) => invoice[field].includes(keyword))) {
        console.log(`DEBUG: 清除无效的${field}:`, invoice[field]);
        invoice[field] = null;
      }
    }

    // 2. Strip labels and junk fragments from every party field.
    for (const field of CLEANABLE_PARTY_FIELDS) {
      if (!invoice[field] || typeof invoice[field] !== 'string') continue;

      let value = invoice[field].trim();

      // Remove the field label itself when it leaked into the value.
      for (const label of FIELD_LABELS) {
        if (value.startsWith(label)) {
          value = value.replace(new RegExp(`^${label}[:\\uFF1A\\s]*`), '').trim();
        }
      }

      value = stripJunkAffixes(value);

      // A single leftover character in buyerAccount (e.g. "区") is noise.
      if (field === 'buyerAccount' && value.length === 1) {
        value = null;
      }

      // A single leftover junk character is noise in any field.
      if (value && value.length <= 1 && JUNK_WORDS.some((word) => value.includes(word))) {
        value = null;
      }

      invoice[field] = value;
    }

    // 3. Specific mis-extractions.
    if (invoice.buyerAddress && (invoice.buyerAddress.includes('纳税人识别号') || invoice.buyerAddress.includes('识别号'))) {
      invoice.buyerAddress = null;
    }

    // Buyer tax code captured as the address.
    if (!invoice.buyerCode && invoice.buyerAddress) {
      const codeMatch = invoice.buyerAddress.match(/[A-Z0-9]{15,20}/);
      if (codeMatch) {
        invoice.buyerCode = codeMatch[0];
        invoice.buyerAddress = null;
      }
    }

    // 4. Same tax code on both sides with no buyer name: the buyer data is a
    //    copy of the seller's, so discard it.
    if (invoice.buyerCode && invoice.sellerCode && invoice.buyerCode === invoice.sellerCode) {
      if (!invoice.buyerName) {
        invoice.buyerCode = null;
        invoice.buyerAddress = null;
        invoice.buyerAccount = null;
      }
    }

    // 5. Individual buyers ("个人"): tidy the name and drop fields polluted by
    //    password-area characters.
    if (invoice.buyerName && invoice.buyerName.includes('个人')) {
      console.log(`DEBUG: 检测到购买方为个人: ${invoice.buyerName}`);

      // Keep only "个人", dropping trailing digits and special characters.
      const cleanName = invoice.buyerName.replace(/个人\s*[\d\s<>*\/+\-]*$/, '个人').trim();
      if (cleanName !== invoice.buyerName) {
        console.log(`DEBUG: 清理buyerName: 从 "${invoice.buyerName}" 改为 "${cleanName}"`);
        invoice.buyerName = cleanName;
      }

      if (invoice.buyerAddress && PASSWORD_AREA_CHARS.test(invoice.buyerAddress)) {
        console.log(`DEBUG: 清空包含特殊字符的buyerAddress: ${invoice.buyerAddress}`);
        invoice.buyerAddress = null;
      }

      if (invoice.buyerAccount && PASSWORD_AREA_CHARS.test(invoice.buyerAccount)) {
        console.log(`DEBUG: 清空包含特殊字符的buyerAccount: ${invoice.buyerAccount}`);
        invoice.buyerAccount = null;
      }

      if (invoice.buyerCode && PASSWORD_AREA_CHARS.test(invoice.buyerCode)) {
        console.log(`DEBUG: 清空包含特殊字符的buyerCode: ${invoice.buyerCode}`);
        invoice.buyerCode = null;
      }
    }

    // 6. Strip common junk characters from the end of names.
    for (const field of ['buyerName', 'sellerName']) {
      if (!invoice[field]) continue;

      let name = invoice[field];
      let changed = true;
      while (changed) {
        changed = false;
        for (const suffix of NAME_JUNK_SUFFIXES) {
          if (name.endsWith(suffix)) {
            name = name.substring(0, name.length - suffix.length).trim();
            changed = true;
            console.log(`DEBUG: 清理${field}末尾的"${suffix}": 从 "${invoice[field]}" 改为 "${name}"`);
          }
        }
      }
      invoice[field] = name;
    }

    // 7. Any field containing password-area characters is discarded.
    const passwordFields = ['buyerAddress', 'buyerAccount', 'buyerCode', 'sellerAddress', 'sellerAccount', 'sellerCode'];
    for (const field of passwordFields) {
      if (invoice[field] && PASSWORD_AREA_CHARS.test(invoice[field])) {
        console.log(`DEBUG: 清空包含密码区字符的${field}: ${invoice[field]}`);
        invoice[field] = null;
      }
    }

    // 8. An address consisting only of the separator "、" is empty.
    if (invoice.buyerAddress === '、') invoice.buyerAddress = null;
    if (invoice.sellerAddress === '、') invoice.sellerAddress = null;
  }
}
|
|
785
|
+
|
|
786
|
+
module.exports = PdfRegularInvoiceService;
|