@editneo/pdf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/worker.d.ts +3 -0
- package/dist/worker.d.ts.map +1 -0
- package/dist/worker.js +79 -0
- package/dist/worker.js.map +1 -0
- package/package.json +24 -0
package/dist/worker.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"worker.d.ts","sourceRoot":"","sources":["../src/worker.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,QAAQ,EAAa,MAAM,eAAe,CAAC;AAOpD,wBAAsB,oBAAoB,CAAC,IAAI,EAAE,WAAW,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC,CAkEjF"}
|
package/dist/worker.js
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import * as pdfjs from 'pdfjs-dist';
|
|
2
|
+
import { v4 as uuid } from 'uuid';
|
|
3
|
+
// Configure worker - in a real app, this needs careful path handling or a bundler plugin
|
|
4
|
+
// For now, we assume the worker is loaded by the main thread or handled by the build system
|
|
5
|
+
// pdfjs.GlobalWorkerOptions.workerSrc = ...;
|
|
6
|
+
/**
 * Returns the most frequent text height on a page, floored to a whole number.
 *
 * Only items that carry visible (non-whitespace) text and floor to a positive
 * height are counted, so whitespace-only items cannot drag the mode down to 0.
 * When several sizes are equally frequent the smallest one wins. Falls back to
 * 12 when no item qualifies.
 *
 * @param {Array<object>} items - The `items` array of a page's text content.
 * @returns {number} The mode height, or 12 if the page has no usable text.
 */
function computeModeFontSize(items) {
  const counts = new Map();
  for (const item of items) {
    if (!('str' in item) || item.str.trim() === '') {
      continue;
    }
    const size = Math.floor(item.height);
    // Also rejects NaN (missing height) and sub-unit heights that floor to 0.
    if (!(size > 0)) {
      continue;
    }
    counts.set(size, (counts.get(size) || 0) + 1);
  }

  let modeFontSize = 12;
  let maxCount = 0;
  for (const [size, count] of counts) {
    if (count > maxCount || (count === maxCount && size < modeFontSize)) {
      maxCount = count;
      modeFontSize = size;
    }
  }
  return modeFontSize;
}

/**
 * Maps a text height to a block type relative to the page's mode height.
 *
 * @param {number} height - Height of the text item that opens the block.
 * @param {number} modeFontSize - The page's most common text height.
 * @returns {string} 'heading-1' above 2x the mode, 'heading-2' above 1.5x,
 *   otherwise 'paragraph'.
 */
function classifyBlockType(height, modeFontSize) {
  if (height > modeFontSize * 2) {
    return 'heading-1';
  }
  if (height > modeFontSize * 1.5) {
    return 'heading-2';
  }
  return 'paragraph';
}

/**
 * Converts a PDF into a flat list of blocks built with `createBlock`.
 *
 * For every page, text items are grouped into one block per line (a new line
 * starts when the baseline `transform[5]` moves by more than 5 units). Each
 * block is typed as a heading or paragraph from the height of its first
 * visible text item compared with the page's mode height. A single placeholder
 * image block is appended for a page whose operator list contains
 * `paintImageXObject`; real image extraction is not implemented.
 *
 * The pdf.js loading task is destroyed before returning, whether extraction
 * succeeds or fails.
 *
 * @param {*} data - PDF source, passed unchanged to `pdfjs.getDocument`.
 * @returns {Promise<Array<object>>} Blocks in page order, then reading order.
 * @throws Rejects with whatever pdf.js raises while loading or reading pages.
 */
export async function extractBlocksFromPdf(data) {
  const loadingTask = pdfjs.getDocument(data);
  const blocks = [];

  try {
    const pdf = await loadingTask.promise;

    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const textContent = await page.getTextContent();
      const opList = await page.getOperatorList();
      const modeFontSize = computeModeFontSize(textContent.items);

      // Heuristic 1: text extraction, one block per detected line.
      let currentBlockText = '';
      let currentBlockType = 'paragraph';
      let hasVisibleText = false;
      // null (not -1) marks "no previous item" so a baseline of -1 is valid.
      let lastY = null;

      for (const item of textContent.items) {
        if (!('str' in item)) {
          continue;
        }
        const y = item.transform[5];

        if (lastY !== null && Math.abs(y - lastY) > 5) {
          if (currentBlockText.trim()) {
            blocks.push(createBlock(currentBlockType, currentBlockText));
          }
          currentBlockText = '';
          currentBlockType = 'paragraph';
          hasVisibleText = false;
        }

        // Type the block from its first visible item. This also covers the
        // first line of the page, which has no preceding line break.
        if (!hasVisibleText && item.str.trim() !== '') {
          currentBlockType = classifyBlockType(item.height, modeFontSize);
          hasVisibleText = true;
        }

        currentBlockText += item.str + ' ';
        lastY = y;
      }

      if (currentBlockText.trim()) {
        blocks.push(createBlock(currentBlockType, currentBlockText));
      }

      // Heuristic 2: images. Real extraction from opList would need to walk
      // opList.fnArray / argsArray; for now only a placeholder is emitted.
      if (opList.fnArray.includes(pdfjs.OPS.paintImageXObject)) {
        blocks.push(createBlock('image', '', { src: 'placeholder-image-url' }));
      }

      // Release per-page resources early instead of holding every page.
      page.cleanup();
    }
  } finally {
    // Tear down the document (and any worker pdf.js created for it).
    await loadingTask.destroy();
  }

  return blocks;
}
|
|
67
|
+
/**
 * Builds a new parentless block record with a fresh id.
 *
 * @param {string} type - Block type, e.g. 'paragraph', 'heading-1', 'image'.
 * @param {string} text - Raw text; it is trimmed and stored as the single
 *   entry of `content`.
 * @param {object} [props={}] - Extra properties stored on the block as-is.
 * @returns {object} A block with `id`, `type`, `content`, `props`, an empty
 *   `children` list, `parentId` of null, and equal `createdAt` / `updatedAt`
 *   millisecond timestamps.
 */
function createBlock(type, text, props = {}) {
  // Read the clock once so a brand-new block never reports
  // updatedAt !== createdAt when the millisecond ticks between two calls.
  const now = Date.now();
  return {
    id: uuid(),
    type,
    content: [{ text: text.trim() }],
    props,
    children: [],
    parentId: null,
    createdAt: now,
    updatedAt: now,
  };
}
|
|
79
|
+
//# sourceMappingURL=worker.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"worker.js","sourceRoot":"","sources":["../src/worker.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,KAAK,MAAM,YAAY,CAAC;AAEpC,OAAO,EAAE,EAAE,IAAI,IAAI,EAAE,MAAM,MAAM,CAAC;AAElC,yFAAyF;AACzF,4FAA4F;AAC5F,8CAA8C;AAE9C,MAAM,CAAC,KAAK,UAAU,oBAAoB,CAAC,IAAiB;IAC1D,MAAM,WAAW,GAAG,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;IAC5C,MAAM,GAAG,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC;IACtC,MAAM,MAAM,GAAe,EAAE,CAAC;IAE9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QAClC,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,EAAE,CAAC;QAChD,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,eAAe,EAAE,CAAC;QAE5C,qDAAqD;QACrD,MAAM,SAAS,GAA2B,EAAE,CAAC;QAC7C,KAAK,MAAM,IAAI,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;YACrC,IAAI,QAAQ,IAAI,IAAI,EAAE,CAAC;gBACrB,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBACvC,SAAS,CAAC,MAAM,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;YACnD,CAAC;QACH,CAAC;QAED,IAAI,YAAY,GAAG,EAAE,CAAC;QACtB,IAAI,QAAQ,GAAG,CAAC,CAAC;QACjB,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;YAC7B,IAAI,SAAS,CAAC,IAAI,CAAC,GAAG,QAAQ,EAAE,CAAC;gBAC/B,QAAQ,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;gBAC3B,YAAY,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC;YAC9B,CAAC;QACH,CAAC;QAED,+BAA+B;QAC/B,4FAA4F;QAC5F,IAAI,gBAAgB,GAAG,EAAE,CAAC;QAC1B,IAAI,gBAAgB,GAAc,WAAW,CAAC;QAC9C,IAAI,KAAK,GAAG,CAAC,CAAC,CAAC;QAEf,KAAK,MAAM,IAAI,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;YACnC,IAAI,CAAC,CAAC,KAAK,IAAI,IAAI,CAAC;gBAAE,SAAS;YAE/B,4BAA4B;YAC5B,IAAI,KAAK,KAAK,CAAC,CAAC,IAAI,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,GAAG,KAAK,CAAC,GAAG,CAAC,EAAE,CAAC;gBACzD,IAAI,gBAAgB,CAAC,IAAI,EAAE,EAAE,CAAC;oBAC1B,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,gBAAgB,EAAE,gBAAgB,CAAC,CAAC,CAAC;gBACjE,CAAC;gBACD,gBAAgB,GAAG,EAAE,CAAC;gBACtB,sCAAsC;gBACtC,IAAI,IAAI,CAAC,MAAM,GAAG,YAAY,GAAG,CAAC;oBAAE,gBAAgB,GAAG,WAAW,CAAC;qBAC9D,IAAI,IAAI,CAAC,MAAM,GAAG,YAAY,GAAG,GAAG;oBAAE,gBAAgB,GAAG,WAAW,CAAC;;oBACrE,gBAAgB,GAAG,WAAW,CAAC;YACzC,CAAC;YAED,gBAAgB,IAAI,IAAI,CAAC,GAAG,GAAG,GAAG,CAAC;YACnC,KAAK,G
AAG,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC;QAC9B,CAAC;QAED,IAAI,gBAAgB,CAAC,IAAI,EAAE,EAAE,CAAC;YAC1B,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,gBAAgB,EAAE,gBAAgB,CAAC,CAAC,CAAC;QACjE,CAAC;QAED,wFAAwF;QACxF,6FAA6F;QAC7F,uEAAuE;QACvE,IAAI,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,iBAAiB,CAAC,EAAE,CAAC;YACtD,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,EAAE,EAAE,EAAE,GAAG,EAAE,uBAAuB,EAAE,CAAC,CAAC,CAAC;QAC7E,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,WAAW,CAAC,IAAe,EAAE,IAAY,EAAE,QAA6B,EAAE;IAC/E,OAAO;QACH,EAAE,EAAE,IAAI,EAAE;QACV,IAAI;QACJ,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;QAChC,KAAK;QACL,QAAQ,EAAE,EAAE;QACZ,QAAQ,EAAE,IAAI;QACd,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE;QACrB,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE;KACxB,CAAC;AACN,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@editneo/pdf",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "PDF transmutation engine for EditNeo — converts PDFs into editable blocks",
|
|
5
|
+
"main": "./dist/worker.js",
|
|
6
|
+
"types": "./dist/worker.d.ts",
|
|
7
|
+
"files": ["dist"],
|
|
8
|
+
"scripts": {
|
|
9
|
+
"build": "tsc",
|
|
10
|
+
"check-types": "tsc --noEmit"
|
|
11
|
+
},
|
|
12
|
+
"publishConfig": {
|
|
13
|
+
"access": "public"
|
|
14
|
+
},
|
|
15
|
+
"dependencies": {
|
|
16
|
+
"pdfjs-dist": "3.11.174",
|
|
17
|
+
"@editneo/core": "*"
|
|
18
|
+
},
|
|
19
|
+
"devDependencies": {
|
|
20
|
+
"typescript": "^5.3.3",
|
|
21
|
+
"@types/pdfjs-dist": "^2.10.378"
|
|
22
|
+
},
|
|
23
|
+
"license": "MIT"
|
|
24
|
+
}
|