@uploadista/flow-documents-unpdf 0.0.16-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +16 -0
- package/LICENSE +21 -0
- package/README.md +50 -0
- package/dist/index.d.mts +9 -0
- package/dist/index.d.mts.map +1 -0
- package/dist/index.mjs +45 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +32 -0
- package/src/document-plugin.ts +91 -0
- package/src/index.ts +1 -0
- package/tsconfig.json +14 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
> @uploadista/flow-documents-unpdf@0.0.1 build /Users/denislaboureyras/Documents/uploadista/dev/uploadista-workspace/uploadista-sdk/packages/flow/documents/unpdf
|
|
4
|
+
> tsdown
|
|
5
|
+
|
|
6
|
+
[34mℹ[39m tsdown [2mv0.16.5[22m powered by rolldown [2mv1.0.0-beta.50[22m
|
|
7
|
+
[34mℹ[39m entry: [34msrc/index.ts[39m
|
|
8
|
+
[34mℹ[39m tsconfig: [34mtsconfig.json[39m
|
|
9
|
+
[34mℹ[39m Build start
|
|
10
|
+
[34mℹ[39m Cleaning 4 files
|
|
11
|
+
[34mℹ[39m [2mdist/[22m[1mindex.mjs[22m [2m2.25 kB[22m [2m│ gzip: 0.76 kB[22m
|
|
12
|
+
[34mℹ[39m [2mdist/[22mindex.mjs.map [2m4.30 kB[22m [2m│ gzip: 1.27 kB[22m
|
|
13
|
+
[34mℹ[39m [2mdist/[22mindex.d.mts.map [2m0.18 kB[22m [2m│ gzip: 0.15 kB[22m
|
|
14
|
+
[34mℹ[39m [2mdist/[22m[32m[1mindex.d.mts[22m[39m [2m0.39 kB[22m [2m│ gzip: 0.21 kB[22m
|
|
15
|
+
[34mℹ[39m 4 files, total: 7.12 kB
|
|
16
|
+
[32m✔[39m Build complete in [32m1311ms[39m
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 uploadista
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# @uploadista/flow-documents-unpdf
|
|
2
|
+
|
|
3
|
+
unpdf-based text extraction plugin for Uploadista Flow.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Fast text extraction from searchable PDFs
|
|
8
|
+
- Modern TypeScript-first API
|
|
9
|
+
- Async/await support
|
|
10
|
+
- Multi-page text extraction (text from all pages is merged into a single result)
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pnpm add @uploadista/flow-documents-unpdf
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
```typescript
|
|
21
|
+
import { UnpdfDocumentPluginLive } from "@uploadista/flow-documents-unpdf";
|
|
22
|
+
import { Effect } from "effect";
|
|
23
|
+
|
|
24
|
+
// Provide the plugin for text extraction
|
|
25
|
+
const program = Effect.gen(function* () {
|
|
26
|
+
// Your flow logic here
|
|
27
|
+
}).pipe(Effect.provide(UnpdfDocumentPluginLive));
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Why unpdf?
|
|
31
|
+
|
|
32
|
+
- **Modern**: TypeScript-first library maintained as an alternative to pdf-parse
|
|
33
|
+
- **Reliable**: Uses pdfjs-dist under the hood (Mozilla's PDF.js)
|
|
34
|
+
- **Fast**: Optimized for text extraction performance
|
|
35
|
+
- **Universal**: Works across all JavaScript runtimes
|
|
36
|
+
|
|
37
|
+
## Use Cases
|
|
38
|
+
|
|
39
|
+
- Extract text from searchable PDFs for indexing
|
|
40
|
+
- Parse PDF documents for content analysis
|
|
41
|
+
- Extract structured text with paragraph/line preservation
|
|
42
|
+
|
|
43
|
+
## When NOT to use
|
|
44
|
+
|
|
45
|
+
- For scanned documents or image-based PDFs (use OCR instead)
|
|
46
|
+
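- For PDF metadata extraction, splitting, or merging — this plugin fails those operations with a `DOCUMENT_PROCESSING_FAILED` error (use `@uploadista/flow-documents-pdflib` instead)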
|
|
47
|
+
|
|
48
|
+
## License
|
|
49
|
+
|
|
50
|
+
MIT
|
package/dist/index.d.mts
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { DocumentPlugin } from "@uploadista/core/flow";
|
|
2
|
+
import { Layer } from "effect";
|
|
3
|
+
|
|
4
|
+
//#region src/document-plugin.d.ts
|
|
5
|
+
declare const unpdfDocumentPlugin: Layer.Layer<DocumentPlugin, never, never>;
|
|
6
|
+
declare const UnpdfDocumentPluginLive: Layer.Layer<DocumentPlugin, never, never>;
|
|
7
|
+
//#endregion
|
|
8
|
+
export { UnpdfDocumentPluginLive, unpdfDocumentPlugin };
|
|
9
|
+
//# sourceMappingURL=index.d.mts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/document-plugin.ts"],"sourcesContent":[],"mappings":";;;;cAKa,qBAAmB,KAAA,CAAA,MAAA;cAqFnB,yBAAuB,KAAA,CAAA,MAAA"}
|
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { UploadistaError } from "@uploadista/core/errors";
|
|
2
|
+
import { DocumentPlugin } from "@uploadista/core/flow";
|
|
3
|
+
import { Effect, Layer } from "effect";
|
|
4
|
+
import { extractText } from "unpdf";
|
|
5
|
+
|
|
6
|
+
//#region src/document-plugin.ts
|
|
7
|
+
const unpdfDocumentPlugin = Layer.succeed(DocumentPlugin, DocumentPlugin.of({
|
|
8
|
+
extractText: (input) => {
|
|
9
|
+
return Effect.gen(function* () {
|
|
10
|
+
const text = yield* Effect.tryPromise({
|
|
11
|
+
try: async () => {
|
|
12
|
+
return (await extractText(input, { mergePages: true })).text;
|
|
13
|
+
},
|
|
14
|
+
catch: (error) => {
|
|
15
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
16
|
+
if (errorMessage.toLowerCase().includes("encrypt") || errorMessage.toLowerCase().includes("password")) return UploadistaError.fromCode("PDF_ENCRYPTED", { cause: errorMessage });
|
|
17
|
+
if (errorMessage.toLowerCase().includes("corrupt") || errorMessage.toLowerCase().includes("invalid") || errorMessage.toLowerCase().includes("malformed")) return UploadistaError.fromCode("PDF_CORRUPTED", { cause: errorMessage });
|
|
18
|
+
return UploadistaError.fromCode("DOCUMENT_PROCESSING_FAILED", { cause: errorMessage });
|
|
19
|
+
}
|
|
20
|
+
});
|
|
21
|
+
if (!text || text.trim().length === 0) yield* Effect.logWarning("No text extracted from PDF. This might be a scanned document or image-based PDF. Consider using OCR instead.");
|
|
22
|
+
return text;
|
|
23
|
+
});
|
|
24
|
+
},
|
|
25
|
+
getMetadata: () => {
|
|
26
|
+
return Effect.gen(function* () {
|
|
27
|
+
return yield* UploadistaError.fromCode("DOCUMENT_PROCESSING_FAILED", { cause: "unpdf does not support metadata extraction. Use @uploadista/flow-documents-pdflib instead." }).toEffect();
|
|
28
|
+
});
|
|
29
|
+
},
|
|
30
|
+
splitPdf: () => {
|
|
31
|
+
return Effect.gen(function* () {
|
|
32
|
+
return yield* UploadistaError.fromCode("DOCUMENT_PROCESSING_FAILED", { cause: "unpdf does not support PDF splitting. Use @uploadista/flow-documents-pdflib instead." }).toEffect();
|
|
33
|
+
});
|
|
34
|
+
},
|
|
35
|
+
mergePdfs: () => {
|
|
36
|
+
return Effect.gen(function* () {
|
|
37
|
+
return yield* UploadistaError.fromCode("DOCUMENT_PROCESSING_FAILED", { cause: "unpdf does not support PDF merging. Use @uploadista/flow-documents-pdflib instead." }).toEffect();
|
|
38
|
+
});
|
|
39
|
+
}
|
|
40
|
+
}));
|
|
41
|
+
const UnpdfDocumentPluginLive = unpdfDocumentPlugin;
|
|
42
|
+
|
|
43
|
+
//#endregion
|
|
44
|
+
export { UnpdfDocumentPluginLive, unpdfDocumentPlugin };
|
|
45
|
+
//# sourceMappingURL=index.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.mjs","names":[],"sources":["../src/document-plugin.ts"],"sourcesContent":["import { UploadistaError } from \"@uploadista/core/errors\";\nimport { DocumentPlugin } from \"@uploadista/core/flow\";\nimport { Effect, Layer } from \"effect\";\nimport { extractText } from \"unpdf\";\n\nexport const unpdfDocumentPlugin = Layer.succeed(\n DocumentPlugin,\n DocumentPlugin.of({\n extractText: (input) => {\n return Effect.gen(function* () {\n const text = yield* Effect.tryPromise({\n try: async () => {\n const result = await extractText(input, {\n mergePages: true,\n });\n return result.text;\n },\n catch: (error) => {\n const errorMessage =\n error instanceof Error ? error.message : String(error);\n\n if (\n errorMessage.toLowerCase().includes(\"encrypt\") ||\n errorMessage.toLowerCase().includes(\"password\")\n ) {\n return UploadistaError.fromCode(\"PDF_ENCRYPTED\", {\n cause: errorMessage,\n });\n }\n\n if (\n errorMessage.toLowerCase().includes(\"corrupt\") ||\n errorMessage.toLowerCase().includes(\"invalid\") ||\n errorMessage.toLowerCase().includes(\"malformed\")\n ) {\n return UploadistaError.fromCode(\"PDF_CORRUPTED\", {\n cause: errorMessage,\n });\n }\n\n return UploadistaError.fromCode(\"DOCUMENT_PROCESSING_FAILED\", {\n cause: errorMessage,\n });\n },\n });\n\n // If no text was extracted, log a warning\n if (!text || text.trim().length === 0) {\n yield* Effect.logWarning(\n \"No text extracted from PDF. This might be a scanned document or image-based PDF. Consider using OCR instead.\",\n );\n }\n\n return text;\n });\n },\n\n getMetadata: () => {\n return Effect.gen(function* () {\n // unpdf doesn't support metadata extraction\n // Return an error indicating that pdf-lib should be used instead\n return yield* UploadistaError.fromCode(\"DOCUMENT_PROCESSING_FAILED\", {\n cause:\n \"unpdf does not support metadata extraction. 
Use @uploadista/flow-documents-pdflib instead.\",\n }).toEffect();\n });\n },\n\n splitPdf: () => {\n return Effect.gen(function* () {\n // unpdf doesn't support PDF splitting\n return yield* UploadistaError.fromCode(\"DOCUMENT_PROCESSING_FAILED\", {\n cause:\n \"unpdf does not support PDF splitting. Use @uploadista/flow-documents-pdflib instead.\",\n }).toEffect();\n });\n },\n\n mergePdfs: () => {\n return Effect.gen(function* () {\n // unpdf doesn't support PDF merging\n return yield* UploadistaError.fromCode(\"DOCUMENT_PROCESSING_FAILED\", {\n cause:\n \"unpdf does not support PDF merging. Use @uploadista/flow-documents-pdflib instead.\",\n }).toEffect();\n });\n },\n }),\n);\n\nexport const UnpdfDocumentPluginLive = unpdfDocumentPlugin;\n"],"mappings":";;;;;;AAKA,MAAa,sBAAsB,MAAM,QACvC,gBACA,eAAe,GAAG;CAChB,cAAc,UAAU;AACtB,SAAO,OAAO,IAAI,aAAa;GAC7B,MAAM,OAAO,OAAO,OAAO,WAAW;IACpC,KAAK,YAAY;AAIf,aAHe,MAAM,YAAY,OAAO,EACtC,YAAY,MACb,CAAC,EACY;;IAEhB,QAAQ,UAAU;KAChB,MAAM,eACJ,iBAAiB,QAAQ,MAAM,UAAU,OAAO,MAAM;AAExD,SACE,aAAa,aAAa,CAAC,SAAS,UAAU,IAC9C,aAAa,aAAa,CAAC,SAAS,WAAW,CAE/C,QAAO,gBAAgB,SAAS,iBAAiB,EAC/C,OAAO,cACR,CAAC;AAGJ,SACE,aAAa,aAAa,CAAC,SAAS,UAAU,IAC9C,aAAa,aAAa,CAAC,SAAS,UAAU,IAC9C,aAAa,aAAa,CAAC,SAAS,YAAY,CAEhD,QAAO,gBAAgB,SAAS,iBAAiB,EAC/C,OAAO,cACR,CAAC;AAGJ,YAAO,gBAAgB,SAAS,8BAA8B,EAC5D,OAAO,cACR,CAAC;;IAEL,CAAC;AAGF,OAAI,CAAC,QAAQ,KAAK,MAAM,CAAC,WAAW,EAClC,QAAO,OAAO,WACZ,+GACD;AAGH,UAAO;IACP;;CAGJ,mBAAmB;AACjB,SAAO,OAAO,IAAI,aAAa;AAG7B,UAAO,OAAO,gBAAgB,SAAS,8BAA8B,EACnE,OACE,8FACH,CAAC,CAAC,UAAU;IACb;;CAGJ,gBAAgB;AACd,SAAO,OAAO,IAAI,aAAa;AAE7B,UAAO,OAAO,gBAAgB,SAAS,8BAA8B,EACnE,OACE,wFACH,CAAC,CAAC,UAAU;IACb;;CAGJ,iBAAiB;AACf,SAAO,OAAO,IAAI,aAAa;AAE7B,UAAO,OAAO,gBAAgB,SAAS,8BAA8B,EACnE,OACE,sFACH,CAAC,CAAC,UAAU;IACb;;CAEL,CAAC,CACH;AAED,MAAa,0BAA0B"}
|
package/package.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@uploadista/flow-documents-unpdf",
|
|
3
|
+
"type": "module",
|
|
4
|
+
"version": "0.0.16-beta.2",
|
|
5
|
+
"description": "unpdf plugin for Uploadista document text extraction",
|
|
6
|
+
"license": "MIT",
|
|
7
|
+
"author": "Uploadista",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": {
|
|
10
|
+
"types": "./dist/index.d.mts",
|
|
11
|
+
"import": "./dist/index.mjs",
|
|
12
|
+
"require": "./dist/index.cjs",
|
|
13
|
+
"default": "./dist/index.mjs"
|
|
14
|
+
}
|
|
15
|
+
},
|
|
16
|
+
"dependencies": {
|
|
17
|
+
"effect": "3.19.4",
|
|
18
|
+
"unpdf": "^1.4.0",
|
|
19
|
+
"@uploadista/core": "0.0.16-beta.2"
|
|
20
|
+
},
|
|
21
|
+
"devDependencies": {
|
|
22
|
+
"@types/node": "24.10.1",
|
|
23
|
+
"tsdown": "0.16.5",
|
|
24
|
+
"@uploadista/typescript-config": "0.0.16-beta.2"
|
|
25
|
+
},
|
|
26
|
+
"scripts": {
|
|
27
|
+
"build": "tsdown",
|
|
28
|
+
"format": "biome format --write ./src",
|
|
29
|
+
"lint": "biome lint --write ./src",
|
|
30
|
+
"check": "biome check --write ./src"
|
|
31
|
+
}
|
|
32
|
+
}
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import { UploadistaError } from "@uploadista/core/errors";
|
|
2
|
+
import { DocumentPlugin } from "@uploadista/core/flow";
|
|
3
|
+
import { Effect, Layer } from "effect";
|
|
4
|
+
import { extractText } from "unpdf";
|
|
5
|
+
|
|
6
|
+
/** Substrings (matched case-insensitively) that mark a failure as an encrypted PDF. */
const ENCRYPTION_HINTS: readonly string[] = ["encrypt", "password"];

/** Substrings (matched case-insensitively) that mark a failure as a corrupted PDF. */
const CORRUPTION_HINTS: readonly string[] = ["corrupt", "invalid", "malformed"];

/**
 * Converts whatever `extractText` rejected with into an `UploadistaError`.
 *
 * The failure's message (or its string form when it is not an `Error`) is
 * inspected case-insensitively: encryption hints are checked first, then
 * corruption hints, and anything else becomes a generic processing failure.
 * The message text is forwarded as the error's `cause`.
 */
function toExtractionError(error: unknown) {
  const message = error instanceof Error ? error.message : String(error);
  const lowered = message.toLowerCase();
  const mentionsAny = (hints: readonly string[]) =>
    hints.some((hint) => lowered.includes(hint));

  if (mentionsAny(ENCRYPTION_HINTS)) {
    return UploadistaError.fromCode("PDF_ENCRYPTED", { cause: message });
  }

  if (mentionsAny(CORRUPTION_HINTS)) {
    return UploadistaError.fromCode("PDF_CORRUPTED", { cause: message });
  }

  return UploadistaError.fromCode("DOCUMENT_PROCESSING_FAILED", {
    cause: message,
  });
}

/**
 * `DocumentPlugin` layer backed by unpdf.
 *
 * Only `extractText` is implemented. `getMetadata`, `splitPdf` and
 * `mergePdfs` always fail with `DOCUMENT_PROCESSING_FAILED`, and their
 * messages point to `@uploadista/flow-documents-pdflib`.
 */
export const unpdfDocumentPlugin = Layer.succeed(
  DocumentPlugin,
  DocumentPlugin.of({
    // Extracts text with all pages merged; failures are classified by
    // `toExtractionError`.
    extractText: (input) =>
      Effect.gen(function* () {
        const extracted = yield* Effect.tryPromise({
          try: async () => {
            const { text } = await extractText(input, {
              mergePages: true,
            });
            return text;
          },
          catch: (error) => toExtractionError(error),
        });

        // An empty or whitespace-only result is not an error: warn and
        // still return it to the caller.
        const isBlank = !extracted || extracted.trim().length === 0;
        if (isBlank) {
          yield* Effect.logWarning(
            "No text extracted from PDF. This might be a scanned document or image-based PDF. Consider using OCR instead.",
          );
        }

        return extracted;
      }),

    // Metadata extraction is not provided by this plugin.
    getMetadata: () =>
      Effect.gen(function* () {
        const unsupported = UploadistaError.fromCode(
          "DOCUMENT_PROCESSING_FAILED",
          {
            cause:
              "unpdf does not support metadata extraction. Use @uploadista/flow-documents-pdflib instead.",
          },
        );
        return yield* unsupported.toEffect();
      }),

    // PDF splitting is not provided by this plugin.
    splitPdf: () =>
      Effect.gen(function* () {
        const unsupported = UploadistaError.fromCode(
          "DOCUMENT_PROCESSING_FAILED",
          {
            cause:
              "unpdf does not support PDF splitting. Use @uploadista/flow-documents-pdflib instead.",
          },
        );
        return yield* unsupported.toEffect();
      }),

    // PDF merging is not provided by this plugin.
    mergePdfs: () =>
      Effect.gen(function* () {
        const unsupported = UploadistaError.fromCode(
          "DOCUMENT_PROCESSING_FAILED",
          {
            cause:
              "unpdf does not support PDF merging. Use @uploadista/flow-documents-pdflib instead.",
          },
        );
        return yield* unsupported.toEffect();
      }),
  }),
);

/** Alias of {@link unpdfDocumentPlugin}; both names refer to the same layer. */
export const UnpdfDocumentPluginLive = unpdfDocumentPlugin;
|
package/src/index.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { unpdfDocumentPlugin, UnpdfDocumentPluginLive } from "./document-plugin";
|
package/tsconfig.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
{
|
|
2
|
+
"extends": "@uploadista/typescript-config/server.json",
|
|
3
|
+
"compilerOptions": {
|
|
4
|
+
"baseUrl": "./",
|
|
5
|
+
"paths": {
|
|
6
|
+
"@/*": ["./src/*"]
|
|
7
|
+
},
|
|
8
|
+
"outDir": "./dist",
|
|
9
|
+
"rootDir": "./src",
|
|
10
|
+
"lib": ["ESNext", "DOM", "DOM.Iterable"],
|
|
11
|
+
"types": []
|
|
12
|
+
},
|
|
13
|
+
"include": ["src"]
|
|
14
|
+
}
|