@editneo/pdf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ import { NeoBlock } from '@editneo/core';
2
/**
 * Extracts editable blocks from the PDF document held in `data`.
 *
 * Pages are processed in order. Each detected line of text becomes one block
 * typed `paragraph`, `heading-1` or `heading-2`, chosen by comparing the text
 * height with the most common text height on that page. A page that paints an
 * image XObject additionally contributes one placeholder `image` block.
 *
 * @param data - Raw bytes of the PDF file.
 * @returns A promise resolving to the extracted blocks, in page order.
 */
+ export declare function extractBlocksFromPdf(data: ArrayBuffer): Promise<NeoBlock[]>;
3
+ //# sourceMappingURL=worker.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"worker.d.ts","sourceRoot":"","sources":["../src/worker.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,QAAQ,EAAa,MAAM,eAAe,CAAC;AAOpD,wBAAsB,oBAAoB,CAAC,IAAI,EAAE,WAAW,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC,CAkEjF"}
package/dist/worker.js ADDED
@@ -0,0 +1,79 @@
1
+ import * as pdfjs from 'pdfjs-dist';
2
+ import { v4 as uuid } from 'uuid';
3
+ // Configure worker - in a real app, this needs careful path handling or a bundler plugin
4
+ // For now, we assume the worker is loaded by the main thread or handled by the build system
5
+ // pdfjs.GlobalWorkerOptions.workerSrc = ...;
6
/** Font size assumed for a page that has no measurable text. */
const DEFAULT_MODE_FONT_SIZE = 12;
/** Minimum baseline (y) distance, in PDF units, that starts a new line. */
const LINE_BREAK_THRESHOLD = 5;
/** A line taller than mode * this factor is a level-1 heading. */
const HEADING_1_FACTOR = 2;
/** A line taller than mode * this factor is a level-2 heading. */
const HEADING_2_FACTOR = 1.5;

/**
 * Extracts editable blocks from a PDF document.
 *
 * Pages are processed in order. Every detected line of text becomes one block
 * whose type (`paragraph`, `heading-1`, `heading-2`) is derived from the
 * line's text height relative to the most common text height of its page.
 * A page that paints at least one image XObject additionally contributes a
 * single placeholder `image` block (real image extraction is not implemented).
 *
 * This module does not configure the PDF.js worker; the caller (or the build
 * system) is expected to have set it up before calling this function.
 *
 * @param {ArrayBuffer} data Raw bytes of the PDF file.
 * @returns {Promise<Array>} The extracted blocks, in page order.
 * @throws Rejects with the PDF.js error if the document or a page cannot be
 *     loaded or parsed.
 */
export async function extractBlocksFromPdf(data) {
    const loadingTask = pdfjs.getDocument(data);
    try {
        const pdf = await loadingTask.promise;
        const blocks = [];
        for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
            const page = await pdf.getPage(pageNumber);
            try {
                const textContent = await page.getTextContent();
                const opList = await page.getOperatorList();

                // Heuristic 1: text extraction, one block per visual line.
                const modeFontSize = computeModeFontSize(textContent.items);
                for (const line of groupTextItemsIntoLines(textContent.items)) {
                    const type = classifyLineHeight(line.height, modeFontSize);
                    blocks.push(createBlock(type, line.text));
                }

                // Heuristic 2: images. Real extraction from opList requires
                // walking fnArray/argsArray for paintImageXObject; for now a
                // single placeholder is emitted when the page paints any image.
                if (opList.fnArray.includes(pdfjs.OPS.paintImageXObject)) {
                    blocks.push(createBlock('image', '', { src: 'placeholder-image-url' }));
                }
            }
            finally {
                // Release per-page resources even if extraction failed.
                page.cleanup();
            }
        }
        return blocks;
    }
    finally {
        // Always tear down the document/worker so repeated calls do not leak.
        await loadingTask.destroy();
    }
}

/** Returns true for text items that carry visible text with a real height. */
function isMeasurableTextItem(item) {
    return Boolean(item)
        && typeof item.str === 'string'
        && item.str.trim() !== ''
        && Number.isFinite(item.height)
        && item.height > 0;
}

/**
 * Returns the most common (floored) text height among the visible text items,
 * or the default size when there are none. Ties go to the smaller size.
 */
function computeModeFontSize(items) {
    const counts = new Map();
    for (const item of items) {
        // Blank / zero-height items (e.g. synthetic end-of-line markers) would
        // otherwise drag the mode towards 0 and turn every line into a heading.
        if (!isMeasurableTextItem(item)) {
            continue;
        }
        const size = Math.floor(item.height);
        counts.set(size, (counts.get(size) || 0) + 1);
    }
    let modeFontSize = DEFAULT_MODE_FONT_SIZE;
    let maxCount = 0;
    for (const [size, count] of counts) {
        if (count > maxCount || (count === maxCount && size < modeFontSize)) {
            maxCount = count;
            modeFontSize = size;
        }
    }
    return modeFontSize;
}

/**
 * Groups text items into lines by baseline position. Each returned entry has
 * the joined `text` and the `height` of the line's first non-blank item.
 * Lines that contain only whitespace are dropped.
 */
function groupTextItemsIntoLines(items) {
    const lines = [];
    let parts = [];
    let lineHeight = 0;
    let hasLineHeight = false;
    let lastY = null; // null = no item seen yet (any number is a valid y)

    const flush = () => {
        const text = parts.join(' ');
        if (text.trim()) {
            lines.push({ text, height: lineHeight });
        }
        parts = [];
        lineHeight = 0;
        hasLineHeight = false;
    };

    for (const item of items) {
        // Marked-content entries have no `str`; skip them.
        if (!item || typeof item.str !== 'string') {
            continue;
        }
        const y = item.transform[5];
        if (lastY !== null && Math.abs(y - lastY) > LINE_BREAK_THRESHOLD) {
            flush();
        }
        parts.push(item.str);
        // The first item with visible text decides the line's height, so the
        // very first line of a page is classified like every other line.
        if (!hasLineHeight && item.str.trim() !== '') {
            lineHeight = item.height;
            hasLineHeight = true;
        }
        lastY = y;
    }
    flush();
    return lines;
}

/** Maps a line height to a block type relative to the page's mode font size. */
function classifyLineHeight(height, modeFontSize) {
    if (height > modeFontSize * HEADING_1_FACTOR) {
        return 'heading-1';
    }
    if (height > modeFontSize * HEADING_2_FACTOR) {
        return 'heading-2';
    }
    return 'paragraph';
}
67
/**
 * Builds a new top-level block with a fresh id and a single text span.
 *
 * @param {string} type Block type, e.g. `paragraph`, `heading-1`, `image`.
 * @param {string} text Raw text of the block; surrounding whitespace is trimmed.
 * @param {object} [props={}] Type-specific properties (e.g. `{ src }` for images).
 * @returns {object} The block, with no children and no parent.
 */
function createBlock(type, text, props = {}) {
    // Read the clock once so a brand-new block never reports
    // updatedAt !== createdAt when the two reads straddle a millisecond tick.
    const now = Date.now();
    return {
        id: uuid(),
        type,
        content: [{ text: text.trim() }],
        props,
        children: [],
        parentId: null,
        createdAt: now,
        updatedAt: now,
    };
}
79
+ //# sourceMappingURL=worker.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"worker.js","sourceRoot":"","sources":["../src/worker.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,KAAK,MAAM,YAAY,CAAC;AAEpC,OAAO,EAAE,EAAE,IAAI,IAAI,EAAE,MAAM,MAAM,CAAC;AAElC,yFAAyF;AACzF,4FAA4F;AAC5F,8CAA8C;AAE9C,MAAM,CAAC,KAAK,UAAU,oBAAoB,CAAC,IAAiB;IAC1D,MAAM,WAAW,GAAG,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;IAC5C,MAAM,GAAG,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC;IACtC,MAAM,MAAM,GAAe,EAAE,CAAC;IAE9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QAClC,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,EAAE,CAAC;QAChD,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,eAAe,EAAE,CAAC;QAE5C,qDAAqD;QACrD,MAAM,SAAS,GAA2B,EAAE,CAAC;QAC7C,KAAK,MAAM,IAAI,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;YACrC,IAAI,QAAQ,IAAI,IAAI,EAAE,CAAC;gBACrB,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBACvC,SAAS,CAAC,MAAM,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;YACnD,CAAC;QACH,CAAC;QAED,IAAI,YAAY,GAAG,EAAE,CAAC;QACtB,IAAI,QAAQ,GAAG,CAAC,CAAC;QACjB,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;YAC7B,IAAI,SAAS,CAAC,IAAI,CAAC,GAAG,QAAQ,EAAE,CAAC;gBAC/B,QAAQ,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;gBAC3B,YAAY,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC;YAC9B,CAAC;QACH,CAAC;QAED,+BAA+B;QAC/B,4FAA4F;QAC5F,IAAI,gBAAgB,GAAG,EAAE,CAAC;QAC1B,IAAI,gBAAgB,GAAc,WAAW,CAAC;QAC9C,IAAI,KAAK,GAAG,CAAC,CAAC,CAAC;QAEf,KAAK,MAAM,IAAI,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;YACnC,IAAI,CAAC,CAAC,KAAK,IAAI,IAAI,CAAC;gBAAE,SAAS;YAE/B,4BAA4B;YAC5B,IAAI,KAAK,KAAK,CAAC,CAAC,IAAI,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,GAAG,KAAK,CAAC,GAAG,CAAC,EAAE,CAAC;gBACzD,IAAI,gBAAgB,CAAC,IAAI,EAAE,EAAE,CAAC;oBAC1B,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,gBAAgB,EAAE,gBAAgB,CAAC,CAAC,CAAC;gBACjE,CAAC;gBACD,gBAAgB,GAAG,EAAE,CAAC;gBACtB,sCAAsC;gBACtC,IAAI,IAAI,CAAC,MAAM,GAAG,YAAY,GAAG,CAAC;oBAAE,gBAAgB,GAAG,WAAW,CAAC;qBAC9D,IAAI,IAAI,CAAC,MAAM,GAAG,YAAY,GAAG,GAAG;oBAAE,gBAAgB,GAAG,WAAW,CAAC;;oBACrE,gBAAgB,GAAG,WAAW,CAAC;YACzC,CAAC;YAED,gBAAgB,IAAI,IAAI,CAAC,GAAG,GAAG,GAAG,CAAC;YACnC,KAAK
,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC;QAC9B,CAAC;QAED,IAAI,gBAAgB,CAAC,IAAI,EAAE,EAAE,CAAC;YAC1B,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,gBAAgB,EAAE,gBAAgB,CAAC,CAAC,CAAC;QACjE,CAAC;QAED,wFAAwF;QACxF,6FAA6F;QAC7F,uEAAuE;QACvE,IAAI,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,iBAAiB,CAAC,EAAE,CAAC;YACtD,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,EAAE,EAAE,EAAE,GAAG,EAAE,uBAAuB,EAAE,CAAC,CAAC,CAAC;QAC7E,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,WAAW,CAAC,IAAe,EAAE,IAAY,EAAE,QAA6B,EAAE;IAC/E,OAAO;QACH,EAAE,EAAE,IAAI,EAAE;QACV,IAAI;QACJ,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;QAChC,KAAK;QACL,QAAQ,EAAE,EAAE;QACZ,QAAQ,EAAE,IAAI;QACd,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE;QACrB,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE;KACxB,CAAC;AACN,CAAC"}
package/package.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "name": "@editneo/pdf",
3
+ "version": "0.1.0",
4
+ "description": "PDF transmutation engine for EditNeo — converts PDFs into editable blocks",
5
+ "main": "./dist/worker.js",
6
+ "types": "./dist/worker.d.ts",
7
+ "files": ["dist"],
8
+ "scripts": {
9
+ "build": "tsc",
10
+ "check-types": "tsc --noEmit"
11
+ },
12
+ "publishConfig": {
13
+ "access": "public"
14
+ },
15
+ "dependencies": {
16
+ "pdfjs-dist": "3.11.174",
17
+ "@editneo/core": "*"
18
+ },
19
+ "devDependencies": {
20
+ "typescript": "^5.3.3",
21
+ "@types/pdfjs-dist": "^2.10.378"
22
+ },
23
+ "license": "MIT"
24
+ }