@muyichengshayu/promptx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,383 @@
1
+ import fs from 'node:fs'
2
+ import path from 'node:path'
3
+ import { createRequire } from 'node:module'
4
+ import { fileURLToPath } from 'node:url'
5
+ import { createCanvas, DOMMatrix, ImageData, Path2D } from '@napi-rs/canvas'
6
+ import { nanoid } from 'nanoid'
7
+
8
// CommonJS-style loader bound to this module's URL.
const require = createRequire(import.meta.url)

// Provide process.getBuiltinModule when the runtime does not define it,
// delegating to the require() created above.
if (!process.getBuiltinModule) {
  process.getBuiltinModule = (name) => require(name)
}

// Install the @napi-rs/canvas implementations as globals, but only for the
// names the runtime has not already defined.
for (const [globalName, implementation] of Object.entries({ DOMMatrix, ImageData, Path2D })) {
  if (!globalThis[globalName]) {
    globalThis[globalName] = implementation
  }
}
22
+
23
// Loaded dynamically so the global fallbacks installed above are in place
// before the pdf.js module is evaluated.
const pdfjs = await import('pdfjs-dist/legacy/build/pdf.mjs')
const currentDir = path.dirname(fileURLToPath(import.meta.url))

// Locate pdfjs-dist through Node's module resolution instead of assuming it is
// installed in a node_modules directory right next to this package: when this
// package is installed as a dependency, pdfjs-dist is usually hoisted elsewhere.
let pdfjsPackageDir
try {
  pdfjsPackageDir = path.dirname(require.resolve('pdfjs-dist/package.json'))
} catch {
  // Resolution failed (for example the package does not expose package.json);
  // keep the previous fixed location relative to this file.
  pdfjsPackageDir = path.resolve(currentDir, '../node_modules/pdfjs-dist')
}

// Directory holding the standard font data, passed to pdfjs.getDocument().
// The trailing slash is part of the value.
const standardFontDataUrl = `${path.join(pdfjsPackageDir, 'standard_fonts')}/`
26
+
27
/**
 * Restricts a number to the range [min, max].
 *
 * The upper bound is applied first and the lower bound last, so `min` wins if
 * the bounds are inverted.
 *
 * @param {number} value - Number to restrict.
 * @param {number} min - Lower bound.
 * @param {number} max - Upper bound.
 * @returns {number} The restricted value.
 */
function clamp(value, min, max) {
  const cappedAbove = Math.min(max, value)
  return Math.max(min, cappedAbove)
}
30
+
31
/**
 * Produces a single-line, trimmed version of a text value.
 *
 * Falsy input becomes an empty string. NUL characters are removed, and every
 * run of whitespace (including line breaks) collapses to one space.
 *
 * @param {*} [text=''] - Value to normalize; it is converted with String().
 * @returns {string} The normalized text.
 */
function normalizeText(text = '') {
  const raw = String(text || '')
  const withoutNulls = raw.replace(/\u0000/g, '')
  const collapsed = withoutNulls.replace(/\s+/g, ' ')
  return collapsed.trim()
}
37
+
38
/**
 * Builds an image block record.
 *
 * @param {*} content - Stored as the block's `content` (the caller in this file passes an image URL).
 * @param {object} [meta={}] - Metadata stored on the block as-is.
 * @returns {{type: 'image', content: *, meta: object}} The block record.
 */
function createImageBlock(content, meta = {}) {
  const block = { type: 'image', content, meta }
  return block
}
45
+
46
/**
 * Builds a text block record.
 *
 * @param {*} content - Stored as the block's `content` (the caller in this file passes paragraph text).
 * @param {object} [meta={}] - Metadata stored on the block as-is.
 * @returns {{type: 'text', content: *, meta: object}} The block record.
 */
function createTextBlock(content, meta = {}) {
  const block = { type: 'text', content, meta }
  return block
}
53
+
54
/**
 * Converts the bounding box recorded for one operator into an integer pixel
 * rectangle on a surface of the given size.
 *
 * The reader's coordinates are multiplied by `width` / `height`, so they are
 * presumably fractions of the page (confirm against the pdf.js reader).
 *
 * @param {object|null|undefined} bboxReader - Object exposing isEmpty/minX/minY/maxX/maxY by index.
 * @param {number} index - Operator index to look up.
 * @param {number} width - Target surface width in pixels.
 * @param {number} height - Target surface height in pixels.
 * @returns {{left: number, top: number, right: number, bottom: number, width: number, height: number}|null}
 *   The pixel rectangle, or null when there is no reader, the entry is empty,
 *   or the rectangle has no area.
 */
function toPixelBox(bboxReader, index, width, height) {
  if (!bboxReader) {
    return null
  }
  if (bboxReader.isEmpty(index)) {
    return null
  }

  // Round outward (floor the minimum edges, ceil the maximum edges) and keep
  // every edge inside the surface.
  const left = clamp(Math.floor(bboxReader.minX(index) * width), 0, width)
  const top = clamp(Math.floor(bboxReader.minY(index) * height), 0, height)
  const right = clamp(Math.ceil(bboxReader.maxX(index) * width), 0, width)
  const bottom = clamp(Math.ceil(bboxReader.maxY(index) * height), 0, height)

  if (right <= left || bottom <= top) {
    return null
  }

  return { left, top, right, bottom, width: right - left, height: bottom - top }
}
77
+
78
/**
 * Groups text items into visual lines and returns each line's text together
 * with its vertical extent measured from the top of the page.
 *
 * Items are consumed in the order given. An item joins the current line when
 * its top edge is within 65% of the larger glyph height (at least 10 units) of
 * the line's top edge; otherwise it starts a new line. An item flagged
 * `hasEOL` closes the current line. Within a line, items are ordered by their
 * x position before their text is concatenated.
 *
 * Whitespace-only items (pdf.js uses these to mark a gap between separately
 * positioned runs) are kept as a single space after the preceding item, so
 * words on either side of the gap are not glued together.
 *
 * @param {Array<{str?: string, height?: number, hasEOL?: boolean, transform: number[]}>} [items=[]]
 *   Text items; `transform[3]`, `transform[4]` and `transform[5]` are read as
 *   vertical scale, x and baseline y. Every non-blank item must have a transform.
 * @param {number} pageHeight - Page height, used to flip y so that 0 is the top edge.
 * @returns {Array<{text: string, top: number, bottom: number, height: number}>}
 *   Non-empty lines in the order they were closed.
 */
function mergeLines(items = [], pageHeight) {
  const lines = []
  let current = null

  const startLine = (item) => ({
    items: [item],
    top: item.top,
    bottom: item.bottom,
    height: item.height,
  })

  const closeLine = () => {
    if (current) {
      lines.push(current)
      current = null
    }
  }

  for (const raw of items) {
    const value = raw.str ?? ''

    if (!value.trim()) {
      // Blank item: it never starts a line and never affects line metrics.
      if (current && value) {
        // Attach the gap to the item that precedes it in reading order, so the
        // later sort by x cannot move the space away from that item.
        const previous = current.items[current.items.length - 1]
        previous.text += ' '
      }
      if (raw.hasEOL) {
        closeLine()
      }
      continue
    }

    const { transform } = raw
    const itemHeight = Math.max(Math.abs(raw.height || 0), Math.abs(transform[3] || 0), 1)
    // Flip the y axis: transform[5] is measured upward, top/bottom downward.
    const top = clamp(pageHeight - transform[5] - itemHeight, 0, pageHeight)
    const bottom = clamp(pageHeight - transform[5], 0, pageHeight)
    const item = {
      text: value,
      x: transform[4],
      top,
      bottom,
      height: Math.max(bottom - top, itemHeight),
    }

    if (!current) {
      current = startLine(item)
    } else {
      const threshold = Math.max(current.height, item.height, 10) * 0.65
      if (Math.abs(item.top - current.top) <= threshold) {
        current.items.push(item)
        current.top = Math.min(current.top, item.top)
        current.bottom = Math.max(current.bottom, item.bottom)
        current.height = Math.max(current.height, item.height)
      } else {
        lines.push(current)
        current = startLine(item)
      }
    }

    if (raw.hasEOL) {
      closeLine()
    }
  }

  closeLine()

  return lines
    .map((line) => {
      const text = line.items
        .sort((left, right) => left.x - right.x)
        .map((item) => item.text)
        .join('')
        .replace(/[ \t]+/g, ' ')
        .trim()

      if (!text) {
        return null
      }

      return {
        text,
        top: line.top,
        bottom: line.bottom,
        height: line.height,
      }
    })
    .filter(Boolean)
}
160
+
161
/**
 * Joins consecutive lines into paragraphs based on the vertical gap between them.
 *
 * A line is appended to the open paragraph when the distance from the
 * paragraph's bottom to the line's top is at most 90% of the larger height
 * (at least 12 units); otherwise it opens a new paragraph. Paragraph text is
 * passed through normalizeText, and paragraphs whose text ends up empty are dropped.
 *
 * @param {Array<{text: string, top: number, bottom: number, height: number}>} [lines=[]]
 *   Lines in reading order.
 * @returns {Array<{text: string, top: number, bottom: number, height: number}>} Paragraphs in order.
 */
function mergeParagraphs(lines = []) {
  const groups = []

  for (const { text, top, bottom, height } of lines) {
    const open = groups[groups.length - 1]

    if (open) {
      const gap = top - open.bottom
      const limit = Math.max(open.height, height, 12) * 0.9
      if (gap <= limit) {
        open.text = `${open.text}\n${text}`
        open.bottom = Math.max(open.bottom, bottom)
        open.height = Math.max(open.height, height)
        continue
      }
    }

    // Either the first line or a line too far below the open paragraph.
    groups.push({ text, top, bottom, height })
  }

  const normalized = groups.map((group) => {
    const squeezed = group.text.replace(/\n{3,}/g, '\n\n')
    return { ...group, text: normalizeText(squeezed) }
  })

  return normalized.filter((group) => group.text)
}
205
+
206
/**
 * Collects the image-painting operators of a page together with their
 * bounding boxes.
 *
 * Only paintImageXObject, paintInlineImageXObject and paintImageXObjectRepeat
 * operators are considered. Operators with no usable pixel box, or whose box
 * is under 32 pixels on either side or under 4096 square pixels, are skipped.
 *
 * @param {{fnArray: number[]}} operatorList - Operator list of the page.
 * @param {object|null|undefined} bboxReader - Bounding-box reader indexed like `fnArray`.
 * @param {number} canvasWidth - Width of the rendered canvas in pixels.
 * @param {number} canvasHeight - Height of the rendered canvas in pixels.
 * @param {number} pageWidth - Not used by this function.
 * @param {number} pageHeight - Page height; scales the reader's minY/maxY into `top`/`bottom`.
 * @returns {Array<{top: number, bottom: number, box: object}>} Image operations in operator order.
 */
function extractImageOperations(operatorList, bboxReader, canvasWidth, canvasHeight, pageWidth, pageHeight) {
  const paintsImage = (fn) =>
    [pdfjs.OPS.paintImageXObject, pdfjs.OPS.paintInlineImageXObject, pdfjs.OPS.paintImageXObjectRepeat].includes(fn)

  const found = []

  for (const [index, fn] of operatorList.fnArray.entries()) {
    if (!paintsImage(fn)) {
      continue
    }

    const box = toPixelBox(bboxReader, index, canvasWidth, canvasHeight)
    if (!box) {
      continue
    }

    const tooSmall = box.width < 32 || box.height < 32 || box.width * box.height < 4096
    if (tooSmall) {
      continue
    }

    const top = bboxReader.minY(index) * pageHeight
    const bottom = bboxReader.maxY(index) * pageHeight
    found.push({ top, bottom, box })
  }

  return found
}
232
+
233
/**
 * Drops image operations whose pixel box nearly coincides with one already kept.
 *
 * Two boxes count as the same when left, top, width and height each differ by
 * at most 4. The first occurrence is kept and order is preserved.
 *
 * @param {Array<{box: {left: number, top: number, width: number, height: number}}>} [images=[]]
 *   Image operations to filter.
 * @returns {Array<object>} The kept image operations (same object references).
 */
function uniqueImageOps(images = []) {
  const within = (a, b) => Math.abs(a - b) <= 4
  const sameBox = (first, second) =>
    within(first.left, second.left) &&
    within(first.top, second.top) &&
    within(first.width, second.width) &&
    within(first.height, second.height)

  const kept = []

  for (const candidate of images) {
    const alreadyKept = kept.some((existing) => sameBox(existing.box, candidate.box))
    if (alreadyKept) {
      continue
    }
    kept.push(candidate)
  }

  return kept
}
252
+
253
/**
 * Copies one rectangle of a rendered page into its own canvas and writes it
 * to disk as a JPEG with a random 16-character file name.
 *
 * @param {object} pageCanvas - Rendered page canvas used as the drawImage source.
 * @param {{box: {left: number, top: number, width: number, height: number}}} image
 *   Image operation whose `box` is the source rectangle in canvas pixels.
 * @param {string} uploadsDir - Directory the file is written into.
 * @returns {string} The asset URL in the form `/uploads/<name>.jpg`.
 */
function saveCroppedImage(pageCanvas, image, uploadsDir) {
  const { left, top, width, height } = image.box

  // Target canvas has exactly the size of the crop; the region is copied 1:1.
  const cropCanvas = createCanvas(width, height)
  cropCanvas.getContext('2d').drawImage(pageCanvas, left, top, width, height, 0, 0, width, height)

  const fileName = `${nanoid(16)}.jpg`
  const jpegBytes = cropCanvas.toBuffer('image/jpeg')
  fs.writeFileSync(path.join(uploadsDir, fileName), jpegBytes)

  return `/uploads/${fileName}`
}
273
+
274
/**
 * Renders a page onto a newly created canvas at the given scale.
 *
 * The render call is made with `recordOperations: true`.
 *
 * @param {object} page - Page object exposing getViewport() and render().
 * @param {number} scale - Scale passed to getViewport().
 * @returns {Promise<{viewport: object, canvas: object}>} The viewport used and the rendered canvas.
 */
async function renderPage(page, scale) {
  const viewport = page.getViewport({ scale })

  // Canvas dimensions must be whole pixels; round up so nothing is cut off.
  const pixelWidth = Math.ceil(viewport.width)
  const pixelHeight = Math.ceil(viewport.height)
  const canvas = createCanvas(pixelWidth, pixelHeight)

  const renderTask = page.render({
    canvasContext: canvas.getContext('2d'),
    viewport,
    recordOperations: true,
  })
  await renderTask.promise

  return { viewport, canvas }
}
289
+
290
/**
 * Extracts text and image blocks from a PDF, ordered by page and then by
 * vertical position.
 *
 * Every page is rendered at scale 2; each detected image region is cropped
 * from the rendering and written into `options.uploadsDir`.
 *
 * @param {ArrayBuffer|ArrayLike<number>} buffer - Raw PDF bytes (copied into a Uint8Array).
 * @param {{uploadsDir?: string}} [options={}] - `uploadsDir` is where cropped images are written.
 * @returns {Promise<{blocks: object[], pageCount: number, createdAssets: string[]}>}
 *   The ordered blocks, the document's page count and the URLs of all written images.
 * @throws Any error raised while processing pages or destroying the document
 *   is rethrown with a `createdAssets` array attached, so the caller can
 *   remove files written before the failure. Errors raised while loading the
 *   document are rethrown unchanged.
 */
export async function importPdfBlocks(buffer, options = {}) {
  const { uploadsDir } = options
  const loadingTask = pdfjs.getDocument({
    data: new Uint8Array(buffer),
    useWorkerFetch: false,
    isEvalSupported: false,
    standardFontDataUrl,
  })
  const pdf = await loadingTask.promise
  // Read the page count up front so nothing touches the document after destroy().
  const pageCount = pdf.numPages
  const pageEntries = []
  const createdAssets = []

  try {
    for (let pageNumber = 1; pageNumber <= pageCount; pageNumber += 1) {
      const page = await pdf.getPage(pageNumber)
      const entries = await collectPageEntries(page, pageNumber, uploadsDir, createdAssets)
      pageEntries.push(...entries)
      page.cleanup()
    }
  } catch (error) {
    try {
      await pdf.destroy()
    } catch {
      // Deliberately ignored: the page-processing error below is the one the
      // caller needs, and it must not be replaced by a teardown failure.
    }
    throw Object.assign(error, { createdAssets })
  }

  try {
    await pdf.destroy()
  } catch (error) {
    // Images are already on disk at this point; report them with the failure.
    throw Object.assign(error, { createdAssets })
  }

  const blocks = pageEntries.sort(comparePageEntries).map((entry) => entry.block)

  return {
    blocks,
    pageCount,
    createdAssets,
  }
}

/**
 * Builds the positioned text and image entries of a single page. The URL of
 * every image written to disk is appended to `createdAssets` immediately, so
 * the list stays accurate even if a later step throws.
 */
async function collectPageEntries(page, pageNumber, uploadsDir, createdAssets) {
  const pageViewport = page.getViewport({ scale: 1 })
  const { canvas } = await renderPage(page, 2)

  const textContent = await page.getTextContent()
  const lines = mergeLines(textContent.items || [], pageViewport.height)
  const entries = mergeParagraphs(lines).map((paragraph) => ({
    type: 'text',
    page: pageNumber,
    top: paragraph.top,
    bottom: paragraph.bottom,
    block: createTextBlock(paragraph.text, {
      source: 'pdf',
      page: pageNumber,
    }),
  }))

  const operatorList = await page.getOperatorList()
  const imageOps = uniqueImageOps(
    extractImageOperations(
      operatorList,
      page.recordedBBoxes,
      canvas.width,
      canvas.height,
      pageViewport.width,
      pageViewport.height
    )
  )

  for (const image of imageOps) {
    const assetUrl = saveCroppedImage(canvas, image, uploadsDir)
    createdAssets.push(assetUrl)
    entries.push({
      type: 'image',
      page: pageNumber,
      top: image.top,
      bottom: image.bottom,
      block: createImageBlock(assetUrl, {
        source: 'pdf',
        page: pageNumber,
      }),
    })
  }

  return entries
}

/**
 * Orders entries by page, then top edge, then text before image, then bottom edge.
 */
function comparePageEntries(left, right) {
  if (left.page !== right.page) {
    return left.page - right.page
  }
  if (left.top !== right.top) {
    return left.top - right.top
  }
  if (left.type !== right.type) {
    return left.type === 'text' ? -1 : 1
  }
  return left.bottom - right.bottom
}