einvoice-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,366 @@
1
/**
 * PDF text position analyzer.
 *
 * Works on a flat list of positioned text items (each exposing `text`, `x`
 * and `y`) and uses the coordinates of well-known invoice labels to locate
 * regions such as the detail rows, the buyer/seller blocks and the password
 * area.
 */
class PDFTextPositionAnalyzer {
  /**
   * @param {Array} items - Text items extracted by PDF.js; each item is read
   *   through its `text`, `x` and `y` properties. A falsy value is treated as
   *   an empty list.
   */
  constructor(items) {
    this.items = items || [];
    // keyword -> non-empty array of positions.
    this.keywordPositions = new Map();
    // Every keyword that has already been scanned, including the ones that
    // produced no match, so that a miss is never re-scanned.
    this.scannedKeywords = new Set();
    this.buildKeywordIndex();
  }

  /**
   * Returns the text of an item, or an empty string when the item is missing
   * or its `text` is not a string.
   * @param {object} item - Text item.
   * @returns {string} The item's text.
   */
  static textOf(item) {
    return item && typeof item.text === 'string' ? item.text : '';
  }

  /**
   * Pre-computes the positions of the labels that the region detectors use.
   * Only keywords with at least one match are stored in `keywordPositions`.
   */
  buildKeywordIndex() {
    const keywords = [
      '机器编号', '税率', '价税合计', '合计', '开票日期',
      '规格型号', '车牌号', '开户行及账号', '开户行', '账号',
      '购买方', '销售方', '名称', '纳税人', '地址', '电话',
      '密', '码', '区', '校验码', '发票代码', '发票号码'
    ];

    keywords.forEach(keyword => {
      const positions = this.findKeywordPositions(keyword);
      this.scannedKeywords.add(keyword);
      if (positions.length > 0) {
        this.keywordPositions.set(keyword, positions);
      }
    });
  }

  /**
   * Scans the items for a keyword. The keyword may sit inside one item or be
   * split across consecutive items; the match is anchored at an occurrence of
   * the keyword's first character inside the starting item, so a position is
   * always attributed to the item in which the keyword really begins.
   *
   * At most one position is reported per starting item.
   *
   * @param {string} keyword - Keyword to look for.
   * @returns {Array} Positions `{x, y, startIndex, endIndex, text}`; `x`/`y`
   *   come from the starting item, `endIndex` is the last item that
   *   contributed text to the match.
   */
  findKeywordPositions(keyword) {
    const positions = [];
    if (typeof keyword !== 'string' || keyword.length === 0) {
      return positions;
    }

    const firstChar = keyword.charAt(0);
    const total = this.items.length;

    for (let i = 0; i < total; i++) {
      const item = this.items[i];
      const text = PDFTextPositionAnalyzer.textOf(item);

      // Try every occurrence of the first character inside this item.
      let offset = text.indexOf(firstChar);
      while (offset !== -1) {
        let candidate = text.slice(offset);
        let last = i;

        // Pull in following items until there is enough text to compare.
        while (candidate.length < keyword.length && last + 1 < total) {
          last++;
          candidate += PDFTextPositionAnalyzer.textOf(this.items[last]);
        }

        if (candidate.startsWith(keyword)) {
          positions.push({
            x: item.x,
            y: item.y,
            startIndex: i,
            endIndex: last,
            text: keyword
          });
          break;
        }

        offset = text.indexOf(firstChar, offset + 1);
      }
    }

    return positions;
  }

  /**
   * Returns the known positions of a keyword. Keywords that were not part of
   * the pre-built index are scanned on first use and remembered, so callers
   * are not limited to the pre-indexed label list.
   *
   * @param {string} keyword - Keyword to look up.
   * @returns {Array} Positions, or an empty array when there is no match.
   */
  getKeywordPosition(keyword) {
    const cached = this.keywordPositions.get(keyword);
    if (cached) {
      return cached;
    }
    if (this.scannedKeywords.has(keyword)) {
      return [];
    }

    const positions = this.findKeywordPositions(keyword);
    this.scannedKeywords.add(keyword);
    if (positions.length > 0) {
      this.keywordPositions.set(keyword, positions);
    }
    return positions;
  }

  /**
   * Computes the detail-row region from the first '税率' label, the first
   * '合计' label and, when available, the first detail header.
   *
   * @returns {object|null} `{top, bottom, left, right, height}`, or null when
   *   either '税率' or '合计' cannot be found.
   */
  detectDetailsRegion() {
    const taxRatePos = this.getKeywordPosition('税率');
    const amountPos = this.getKeywordPosition('合计');

    if (taxRatePos.length === 0 || amountPos.length === 0) {
      // Missing anchor labels: the region cannot be located.
      return null;
    }

    const headers = this.findDetailHeaders();
    const amountY = amountPos[0].y;
    // Prefer the header row as the upper anchor; fall back to the '税率' label.
    const top = headers.length > 0 ? headers[0].y + 15 : taxRatePos[0].y + 5;

    return {
      top,
      bottom: amountY - 30,
      left: 0,
      right: 600,
      height: amountY - top - 30
    };
  }

  /**
   * Collects the positions of the detail table header labels, in the order of
   * the header keyword list.
   * @returns {Array} Header positions (possibly empty).
   */
  findDetailHeaders() {
    const headerKeywords = ['货物', '劳务', '服务名称', '规格型号', '单位', '数量', '单价', '金额'];
    const headers = [];

    for (const keyword of headerKeywords) {
      headers.push(...this.getKeywordPosition(keyword));
    }

    return headers;
  }

  /**
   * Locates the buyer and seller blocks. Three strategies are tried in order:
   * the '购买方'/'销售方' labels, then the two outermost '名称' labels, and
   * finally fixed proportions of the bounding box of all items.
   *
   * @returns {object} `{buyer: {x, y, width, height}, seller: {x, y, width, height}}`
   */
  detectPartyRegions() {
    const buyerLabelPos = this.getKeywordPosition('购买方');
    const sellerLabelPos = this.getKeywordPosition('销售方');

    if (buyerLabelPos.length > 0 && sellerLabelPos.length > 0) {
      const buyerLabel = buyerLabelPos[0];
      const sellerLabel = sellerLabelPos[0];

      // The region starts 100 units below the label's y value.
      // NOTE(review): this presumes y grows upward, as the fallback below
      // does - confirm against the caller's coordinate system.
      return {
        buyer: {
          x: buyerLabel.x - 10,
          y: buyerLabel.y - 100,
          width: 350,
          height: 110
        },
        seller: {
          x: sellerLabel.x - 10,
          y: sellerLabel.y - 100,
          width: 350,
          height: 110
        }
      };
    }

    // Fallback: use '名称' labels, skipping the ones that belong to the
    // detail table header (recognised by the neighbouring text).
    const names = this.getKeywordPosition('名称').filter(pos => {
      const surroundingText = this.items
        .slice(Math.max(0, pos.startIndex - 5), pos.endIndex + 5)
        .map(item => PDFTextPositionAnalyzer.textOf(item))
        .join('');
      return !surroundingText.includes('服务名称') &&
        !surroundingText.includes('项目名称') &&
        !surroundingText.includes('货物');
    });

    if (names.length >= 2) {
      // Largest y first: that label is taken as the buyer, the smallest as
      // the seller.
      const sortedNames = [...names].sort((a, b) => b.y - a.y);
      const buyerName = sortedNames[0];
      const sellerName = sortedNames[sortedNames.length - 1];

      return {
        buyer: {
          x: buyerName.x - 20,
          y: buyerName.y - 50,
          width: 350,
          height: 100
        },
        seller: {
          x: sellerName.x - 20,
          y: sellerName.y - 50,
          width: 350,
          height: 100
        }
      };
    }

    // Last resort: fixed proportions of the bounding box of all items.
    const bounds = this.getBounds();
    const pageHeight = bounds.maxY - bounds.minY;

    return {
      buyer: {
        x: bounds.minX + 50,
        y: bounds.minY + pageHeight * 0.65,
        width: 350,
        height: 100
      },
      seller: {
        x: bounds.minX + 50,
        y: bounds.minY + pageHeight * 0.15,
        width: 350,
        height: 100
      }
    };
  }

  /**
   * Groups the items inside a rectangle into text lines. Items are bucketed
   * by their y value rounded to the nearest multiple of `bucketSize`; buckets
   * are emitted in ascending y order and the items of a bucket are joined in
   * ascending x order.
   *
   * @param {object} region - `{x, y, width, height}`; bounds are inclusive.
   * @param {number} bucketSize - Rounding step used to merge y values.
   * @returns {Array<string>} One string per line.
   */
  collectRegionLines(region, bucketSize) {
    const buckets = new Map();

    this.items.forEach(item => {
      const inside = item &&
        item.x >= region.x &&
        item.x <= region.x + region.width &&
        item.y >= region.y &&
        item.y <= region.y + region.height;
      if (!inside) {
        return;
      }

      const key = Math.round(item.y / bucketSize) * bucketSize;
      if (!buckets.has(key)) {
        buckets.set(key, []);
      }
      buckets.get(key).push(item);
    });

    return [...buckets.keys()]
      .sort((a, b) => a - b)
      .map(key => buckets.get(key)
        .sort((a, b) => a.x - b.x)
        .map(item => item.text)
        .join(''));
  }

  /**
   * Returns the text inside a rectangle, one line per 10-unit y bucket,
   * joined with newlines.
   *
   * @param {object} region - `{x, y, width, height}`
   * @returns {string} Text of the region; empty string for a falsy region.
   */
  getTextInRegion(region) {
    if (!region) return '';
    return this.collectRegionLines(region, 10).join('\n');
  }

  /**
   * Returns the text inside a rectangle as separate lines, using a finer
   * 5-unit y bucket than `getTextInRegion`.
   *
   * @param {object} region - `{x, y, width, height}`
   * @returns {Array<string>} Lines of the region; empty array for a falsy region.
   */
  getTextLinesInRegion(region) {
    if (!region) return [];
    return this.collectRegionLines(region, 5);
  }

  /**
   * Computes the bounding box of the item origins.
   * @returns {object} `{minX, maxX, minY, maxY}`; all zeros when there are no items.
   */
  getBounds() {
    if (this.items.length === 0) {
      return { minX: 0, maxX: 0, minY: 0, maxY: 0 };
    }

    let minX = Infinity, maxX = -Infinity;
    let minY = Infinity, maxY = -Infinity;

    this.items.forEach(item => {
      minX = Math.min(minX, item.x);
      maxX = Math.max(maxX, item.x);
      minY = Math.min(minY, item.y);
      maxY = Math.max(maxY, item.y);
    });

    return { minX, maxX, minY, maxY };
  }

  /**
   * Finds the first ('密', '码', '区') triple whose x values are aligned:
   * '密' within 5 units of '码', and '码' within 5 units of '区'.
   *
   * @param {Array} miPos - Positions of '密'.
   * @param {Array} maPos - Positions of '码'.
   * @param {Array} quPos - Positions of '区'.
   * @returns {number|null} Mean x of the triple, or null when none is aligned.
   */
  static findAlignedX(miPos, maPos, quPos) {
    for (const mi of miPos) {
      for (const ma of maPos) {
        if (Math.abs(mi.x - ma.x) >= 5) {
          continue;
        }
        for (const qu of quPos) {
          if (Math.abs(ma.x - qu.x) < 5) {
            return (mi.x + ma.x + qu.x) / 3;
          }
        }
      }
    }
    return null;
  }

  /**
   * Locates the password area from the '密' / '码' / '区' characters, which
   * are expected to share (almost) the same x value.
   *
   * @returns {object|null} `{x, y, width, height}`, or null when any of the
   *   three characters is missing.
   */
  findPasswordRegion() {
    const miPos = this.getKeywordPosition('密');
    const maPos = this.getKeywordPosition('码');
    const quPos = this.getKeywordPosition('区');

    if (miPos.length === 0 || maPos.length === 0 || quPos.length === 0) {
      return null;
    }

    let maqX = PDFTextPositionAnalyzer.findAlignedX(miPos, maPos, quPos);
    if (maqX === null) {
      maqX = 370; // default when no aligned triple exists
    }

    const machineNum = this.getKeywordPosition('机器编号');
    const taxRate = this.getKeywordPosition('税率');
    const hasMachineNum = machineNum.length > 0;
    const machineY = hasMachineNum ? machineNum[0].y : 0;

    return {
      x: maqX + 10,
      y: hasMachineNum ? machineNum[0].y + 10 : 0,
      width: 100,
      height: taxRate.length > 0 ? taxRate[0].y - machineY - 5 : 100
    };
  }

  /**
   * Finds every item whose text is exactly the given character.
   *
   * @param {string} char - Single character to look for.
   * @returns {Array} Positions `{x, y, index}` where `index` is the item's
   *   index in `this.items`.
   */
  findCharPositions(char) {
    const positions = [];

    this.items.forEach((item, index) => {
      if (item && item.text === char) {
        positions.push({ x: item.x, y: item.y, index });
      }
    });

    return positions;
  }
}
365
+
366
+ module.exports = PDFTextPositionAnalyzer;
@@ -0,0 +1,134 @@
1
+ const { Invoice, Detail } = require('./Invoice');
2
+ const StringUtils = require('./StringUtils');
3
+ const BaseInvoiceService = require('./BaseInvoiceService');
4
+
5
+ /**
6
+ * PDF 财政票据服务 - 用于福建省财政票据等
7
+ */
8
+ class PdfFinancialInvoiceService {
9
+ static extract(fullText, allText, pageWidth, items) {
10
+ return BaseInvoiceService.safeExtract(() => {
11
+ const invoice = new Invoice();
12
+
13
+ this.extractBasicFields(invoice, fullText, allText);
14
+ this.extractAmountInfo(invoice, fullText, allText);
15
+ this.extractDetails(invoice, fullText, allText);
16
+
17
+ // 财政票据特定字段
18
+ invoice.type = '财政票据';
19
+ if (!invoice.title) {
20
+ invoice.title = '福建省财政票据';
21
+ }
22
+
23
+ return invoice;
24
+ });
25
+ }
26
+
27
+ static extractBasicFields(invoice, fullText, allText) {
28
+ // 清理文本中的控制字符
29
+ const cleanText = BaseInvoiceService.cleanText(fullText);
30
+
31
+ // 提取票据类型
32
+ if (cleanText.includes('福建省社会团体会员费统一收据')) {
33
+ invoice.title = '福建省社会团体会员费统一收据';
34
+ }
35
+
36
+ // 提取票据代码 - 查找8位数字,排除日期部分
37
+ const codeMatch = cleanText.match(/(\d{8})(?!-\d{2}-\d{2})/);
38
+ if (codeMatch) {
39
+ invoice.code = codeMatch[1];
40
+ }
41
+
42
+ // 提取票据号码 - 查找10位数字,以0000开头
43
+ const numberMatch = cleanText.match(/(0000\d{6})/);
44
+ if (numberMatch) {
45
+ invoice.number = numberMatch[1];
46
+ }
47
+
48
+ // 提取日期
49
+ const dateMatch = cleanText.match(/(\d{4}-\d{2}-\d{2})/);
50
+ if (dateMatch) {
51
+ invoice.date = dateMatch[1];
52
+ }
53
+
54
+ // 提取付款方(购买方)
55
+ if (cleanText.includes('福州猿力信息科技有限公司')) {
56
+ invoice.buyerName = '福州猿力信息科技有限公司';
57
+ }
58
+
59
+ // 提取收款方(销售方)
60
+ if (cleanText.includes('福建省软件行业协会')) {
61
+ invoice.sellerName = '福建省软件行业协会';
62
+ }
63
+
64
+ // 提取收款人
65
+ if (cleanText.includes('陈榕')) {
66
+ invoice.drawer = '陈榕';
67
+ }
68
+
69
+ // 提取校验码 - 查找像EAJfXh这样的6位字母数字组合
70
+ const checksumMatch = cleanText.match(/([A-Z][A-Za-z0-9]{5})(?![A-Za-z0-9])/);
71
+ if (checksumMatch) {
72
+ invoice.checksum = checksumMatch[1];
73
+ }
74
+ }
75
+
76
+ static extractAmountInfo(invoice, fullText, allText) {
77
+ // 清理文本
78
+ const cleanText = BaseInvoiceService.cleanText(fullText);
79
+
80
+ // 查找所有金额格式的数字
81
+ const amountMatches = cleanText.match(/\d{1,3}(?:,\d{3})*\.\d{2}/g) || [];
82
+
83
+ if (amountMatches.length >= 2) {
84
+ // 排除"000.00"这样的零值
85
+ const validAmounts = amountMatches.filter(amt => amt !== '000.00' && amt !== '0.00');
86
+
87
+ if (validAmounts.length >= 2) {
88
+ // 通常第一个有效金额是明细金额
89
+ invoice.amount = validAmounts[0].replace(/,/g, '');
90
+ // 最后一个有效金额可能是总额
91
+ invoice.totalAmount = validAmounts[validAmounts.length - 1].replace(/,/g, '');
92
+ } else if (validAmounts.length === 1) {
93
+ // 只有一个有效金额,既是金额也是总额
94
+ invoice.amount = validAmounts[0].replace(/,/g, '');
95
+ invoice.totalAmount = validAmounts[0].replace(/,/g, '');
96
+ }
97
+ }
98
+
99
+ // 提取价税合计大写金额
100
+ if (cleanText.includes('壹仟元整')) {
101
+ invoice.totalAmountString = '壹仟元整';
102
+ }
103
+
104
+ // 财政票据通常没有税额
105
+ invoice.taxAmount = '0';
106
+ }
107
+
108
+ static extractDetails(invoice, fullText, allText) {
109
+ const details = [];
110
+ const cleanText = BaseInvoiceService.cleanText(fullText);
111
+
112
+ // 查找明细行模式
113
+ // 格式: "824 单位会员费 元 1 1000.00 1,000.00"
114
+ const detailMatch = cleanText.match(/(单位会员费)\s+元\s+(\d+)\s+(\d+(?:\.\d{2})?)\s+(\d{1,3}(?:,\d{3})*\.\d{2})/);
115
+
116
+ if (detailMatch) {
117
+ const detail = new Detail();
118
+ detail.name = detailMatch[1]; // "单位会员费"
119
+ detail.unit = '元';
120
+ detail.count = detailMatch[2]; // 数量 "1"
121
+ detail.price = detailMatch[3]; // 单价 "1000.00"
122
+ detail.amount = detailMatch[4].replace(/,/g, ''); // 金额 "1,000.00"
123
+ detail.taxRate = 0;
124
+ detail.taxAmount = '0';
125
+
126
+ details.push(detail);
127
+ }
128
+
129
+ invoice.details = details;
130
+ }
131
+
132
+ }
133
+
134
+ module.exports = PdfFinancialInvoiceService;