@plurnk/plurnk-mimetypes-application-pdf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +23 -0
- package/dist/ApplicationPdf.d.ts +6 -0
- package/dist/ApplicationPdf.d.ts.map +1 -0
- package/dist/ApplicationPdf.js +77 -0
- package/dist/ApplicationPdf.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -0
- package/package.json +47 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 PossumTech Laboratories, LLC
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# @plurnk/plurnk-mimetypes-application-pdf
|
|
2
|
+
|
|
3
|
+
`application/pdf` mimetype handler for the [plurnk](https://github.com/plurnk) ecosystem. Binary content; extracts text via [pdfjs-dist](https://www.npmjs.com/package/pdfjs-dist).
|
|
4
|
+
|
|
5
|
+
## install
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
npm i @plurnk/plurnk-mimetypes-application-pdf
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## what it does
|
|
12
|
+
|
|
13
|
+
PDF is a binary mimetype — the package declares `plurnk.binary: true`, and the framework reads files as `Uint8Array` before passing to handler methods.
|
|
14
|
+
|
|
15
|
+
- `validate(content)` parses the PDF via pdfjs-dist's legacy build (Node-compatible); throws on parse failure or on image-only PDFs (scans without OCR — no extractable text means the LLM would get nothing useful).
|
|
16
|
+
- `preview(content, budget)` extracts text page-by-page (joined with `\n\n`), budgeted via the framework's `fitContent`. This is the value-add: the consumer gets readable text content suitable for LLM ingestion.
|
|
17
|
+
- `symbols(content)` is empty (the preview *is* the structural signal for a PDF; a future enhancement could surface the document outline / bookmarks as heading symbols).
|
|
18
|
+
|
|
19
|
+
Salvage pattern from [rummy.web/WebFetcher.js](https://github.com/possumtech/rummy.web): pdfjs configured with `isEvalSupported: false` (no PDF JS execution) and `verbosity: 0` (silences font-warning noise — we read text streams directly, not glyphs).
|
|
20
|
+
|
|
21
|
+
## license
|
|
22
|
+
|
|
23
|
+
MIT.
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import { BaseHandler } from "@plurnk/plurnk-mimetypes";
|
|
2
|
+
/**
 * Handler for the `application/pdf` mimetype. Both methods accept the PDF
 * either as raw bytes or as a string and work on the text extracted from it.
 */
export default class ApplicationPdf extends BaseHandler {
|
|
3
|
+
/**
 * Resolves when text can be extracted from the PDF.
 *
 * @param content PDF data, as bytes or as a string.
 * @throws When the document cannot be parsed, or when the extracted text is
 *   empty or whitespace-only (image-only scan, or encrypted/protected
 *   document).
 */
validate(content: string | Uint8Array): Promise<void>;
|
|
4
|
+
/**
 * Resolves to the document's extracted text, fitted to `budget`.
 *
 * @param content PDF data, as bytes or as a string.
 * @param budget Size limit forwarded to the framework's `fitContent`.
 * @returns The budgeted text, or an empty string when extraction fails.
 */
preview(content: string | Uint8Array, budget: number): Promise<string>;
|
|
5
|
+
}
|
|
6
|
+
//# sourceMappingURL=ApplicationPdf.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ApplicationPdf.d.ts","sourceRoot":"","sources":["../src/ApplicationPdf.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAc,MAAM,0BAA0B,CAAC;AAqBnE,MAAM,CAAC,OAAO,OAAO,cAAe,SAAQ,WAAW;IAC7C,QAAQ,CAAC,OAAO,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAQrD,OAAO,CAAC,OAAO,EAAE,MAAM,GAAG,UAAU,EAAE,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;CAU/E"}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import { BaseHandler, fitContent } from "@plurnk/plurnk-mimetypes";
|
|
2
|
+
// application/pdf handler. Binary mimetype — receives Uint8Array content.
|
|
3
|
+
// Validates the PDF parses cleanly and extracts text via pdfjs-dist; the
|
|
4
|
+
// preview is the joined text content, budgeted via the framework's
|
|
5
|
+
// fitContent.
|
|
6
|
+
//
|
|
7
|
+
// Salvage pattern from rummy.web/WebFetcher.js:
|
|
8
|
+
// - pdfjs-dist legacy build (Node-compatible)
|
|
9
|
+
// - isEvalSupported:false (no PDF JS execution)
|
|
10
|
+
// - verbosity:0 (silences "standardFontDataUrl not provided" noise;
|
|
11
|
+
// we read text streams directly and don't render glyphs)
|
|
12
|
+
// - Pages joined with "\n\n"
|
|
13
|
+
// - Image-only PDFs (scans without OCR) parse cleanly but produce no
|
|
14
|
+
// text — surfaced as a validate error, not an empty preview
|
|
15
|
+
//
|
|
16
|
+
// symbols() stays empty — PDFs have no exposed structural outline in the
|
|
17
|
+
// duck contract today. (A future enhancement could read the PDF's
|
|
18
|
+
// document outline / bookmarks as heading symbols, but the simpler
|
|
19
|
+
// "extracted text as preview" path covers the LLM-consumption use case
|
|
20
|
+
// cleanly.)
|
|
21
|
+
/**
 * Handler for the `application/pdf` mimetype.
 *
 * Both methods normalise their input with `toBytes` and obtain the document
 * text from `extractAllText`; they differ only in how an extraction failure
 * or an empty result is reported to the caller.
 */
export default class ApplicationPdf extends BaseHandler {
    /**
     * Checks that the PDF yields some text.
     *
     * @param content PDF data as a `Uint8Array` or a string.
     * @throws {Error} When the extracted text is empty or whitespace-only.
     *   A rejection from `extractAllText` (e.g. a parse failure) propagates
     *   unchanged.
     */
    async validate(content) {
        const extracted = await extractAllText(toBytes(content));
        if (extracted.trim().length === 0) {
            throw new Error("PDF has no extractable text (image-only scan, or encrypted/protected document)");
        }
    }

    /**
     * Produces the budgeted text preview of the PDF.
     *
     * @param content PDF data as a `Uint8Array` or a string.
     * @param budget Limit handed to `fitContent` together with `this.tokenize`
     *   (presumably inherited from `BaseHandler` — confirm there).
     * @returns The fitted text, or `""` when text extraction fails.
     */
    async preview(content, budget) {
        // Conversion happens outside the guarded region, so only extraction
        // failures are turned into an empty preview.
        const pdfBytes = toBytes(content);
        let extracted = null;
        try {
            extracted = await extractAllText(pdfBytes);
        }
        catch {
            // Best-effort: an unreadable PDF simply has no preview.
        }
        if (extracted === null) {
            return "";
        }
        return fitContent(extracted, budget, this.tokenize);
    }
}
|
|
41
|
+
/**
 * Normalises handler input to a `Uint8Array`.
 *
 * - A `Uint8Array` is returned as-is (no copy).
 * - A string whose code units all fit in 0–255 is treated as a latin1
 *   "binary string": each code unit becomes exactly one byte. UTF-8 encoding
 *   such a string would expand every unit in 0x80–0xFF into two bytes and
 *   corrupt the PDF's binary sections.
 * - A string containing any code unit above 0xFF cannot be a binary string,
 *   so it is UTF-8 encoded instead. Callers passing inline string content
 *   for a binary mimetype remain responsible for that encoding choice.
 * - Any other value is passed to `TextEncoder.encode` unchanged.
 *
 * @param content PDF data as a `Uint8Array` or a string.
 * @returns The bytes to parse.
 */
function toBytes(content) {
    if (content instanceof Uint8Array) {
        return content;
    }
    if (typeof content !== "string") {
        return new TextEncoder().encode(content);
    }
    const bytes = new Uint8Array(content.length);
    for (let i = 0; i < content.length; i += 1) {
        const unit = content.charCodeAt(i);
        if (unit > 0xff) {
            // Not representable as one byte per unit: fall back to UTF-8 for
            // the whole string rather than silently truncating code units.
            return new TextEncoder().encode(content);
        }
        bytes[i] = unit;
    }
    return bytes;
}
|
|
49
|
+
/**
 * Extracts the text of every page of a PDF.
 *
 * Each page's text items are joined with a single space, and pages are joined
 * with a blank line ("\n\n"). The document is destroyed before returning,
 * whether extraction succeeds or fails.
 *
 * @param bytes PDF file contents. The array is copied before being handed to
 *   pdf.js, so the caller's data is left intact and can be reused.
 * @returns The concatenated text of all pages (may be empty, e.g. for an
 *   image-only scan).
 * @throws Whatever pdf.js rejects with when the document or a page cannot be
 *   loaded.
 */
async function extractAllText(bytes) {
    const pdfjs = await import("pdfjs-dist/legacy/build/pdf.mjs");
    const params = {
        // pdf.js takes ownership of typed-array `data` (it is transferred to
        // the worker side), which can leave the caller's array detached. A
        // private copy keeps `bytes` usable for a later call with the same
        // content, and also turns a Node `Buffer` into a plain `Uint8Array`.
        data: new Uint8Array(bytes),
        // No evaluation of generated code while parsing.
        isEvalSupported: false,
        // Silence warnings such as "standardFontDataUrl not provided"; only
        // text content is read, glyphs are never rendered.
        verbosity: 0,
    };
    const doc = await pdfjs.getDocument(params).promise;
    try {
        const pages = [];
        // pdf.js page numbers are 1-based.
        for (let i = 1; i <= doc.numPages; i += 1) {
            const page = await doc.getPage(i);
            const tc = await page.getTextContent();
            const pageText = tc.items
                // Items without a `str` field contribute an empty string.
                .map((it) => it.str ?? "")
                .join(" ");
            pages.push(pageText);
            page.cleanup();
        }
        return pages.join("\n\n");
    }
    finally {
        await doc.destroy();
    }
}
|
|
77
|
+
//# sourceMappingURL=ApplicationPdf.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ApplicationPdf.js","sourceRoot":"","sources":["../src/ApplicationPdf.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AAEnE,0EAA0E;AAC1E,yEAAyE;AACzE,mEAAmE;AACnE,cAAc;AACd,EAAE;AACF,gDAAgD;AAChD,gDAAgD;AAChD,kDAAkD;AAClD,sEAAsE;AACtE,6DAA6D;AAC7D,+BAA+B;AAC/B,uEAAuE;AACvE,gEAAgE;AAChE,EAAE;AACF,yEAAyE;AACzE,kEAAkE;AAClE,mEAAmE;AACnE,uEAAuE;AACvE,YAAY;AACZ,MAAM,CAAC,OAAO,OAAO,cAAe,SAAQ,WAAW;IACnD,KAAK,CAAC,QAAQ,CAAC,OAA4B;QACvC,MAAM,KAAK,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC;QAC/B,MAAM,IAAI,GAAG,MAAM,cAAc,CAAC,KAAK,CAAC,CAAC;QACzC,IAAI,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC;YACrB,MAAM,IAAI,KAAK,CAAC,gFAAgF,CAAC,CAAC;QACtG,CAAC;IACL,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,OAA4B,EAAE,MAAc;QACtD,MAAM,KAAK,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC;QAC/B,IAAI,IAAY,CAAC;QACjB,IAAI,CAAC;YACD,IAAI,GAAG,MAAM,cAAc,CAAC,KAAK,CAAC,CAAC;QACvC,CAAC;QAAC,MAAM,CAAC;YACL,OAAO,EAAE,CAAC;QACd,CAAC;QACD,OAAO,UAAU,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,QAAQ,CAAC,CAAC;IACnD,CAAC;CACJ;AAED,SAAS,OAAO,CAAC,OAA4B;IACzC,IAAI,OAAO,YAAY,UAAU;QAAE,OAAO,OAAO,CAAC;IAClD,+EAA+E;IAC/E,+EAA+E;IAC/E,+EAA+E;IAC/E,OAAO,IAAI,WAAW,EAAE,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;AAC7C,CAAC;AAED,KAAK,UAAU,cAAc,CAAC,KAAiB;IAC3C,MAAM,KAAK,GAAG,MAAM,MAAM,CAAC,iCAAiC,CAAC,CAAC;IAC9D,sEAAsE;IACtE,4EAA4E;IAC5E,+DAA+D;IAC/D,MAAM,MAAM,GAAG;QACX,IAAI,EAAE,KAAK;QACX,eAAe,EAAE,KAAK;QACtB,SAAS,EAAE,CAAC;KACuC,CAAC;IACxD,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC;IACpD,IAAI,CAAC;QACD,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC;YACxC,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;YAClC,MAAM,EAAE,GAAG,MAAM,IAAI,CAAC,cAAc,EAAE,CAAC;YACvC,MAAM,QAAQ,GAAG,EAAE,CAAC,KAAK;iBACpB,GAAG,CAAC,CAAC,EAAW,EAAE,EAAE,CAAE,EAAuB,CAAC,GAAG,IAAI,EAAE,CAAC;iBACxD,IAAI,CAAC,GAAG,CAAC,CAAC;YACf,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACrB,IAAI,CAAC,OAAO,EAAE,CAAC;QACnB,CAAC;QACD,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAC9B,CAAC;YAAS,CAAC;Q
ACP,MAAM,GAAG,CAAC,OAAO,EAAE,CAAC;IACxB,CAAC;AACL,CAAC"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,IAAI,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAChE,OAAO,EAAE,OAAO,EAAE,MAAM,qBAAqB,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,IAAI,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAChE,OAAO,EAAE,OAAO,EAAE,MAAM,qBAAqB,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@plurnk/plurnk-mimetypes-application-pdf",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "application/pdf mimetype handler for plurnk-service. Binary content; extracts text via pdfjs-dist.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"license": "MIT",
|
|
7
|
+
"publishConfig": {
|
|
8
|
+
"access": "public"
|
|
9
|
+
},
|
|
10
|
+
"engines": {
|
|
11
|
+
"node": ">=25"
|
|
12
|
+
},
|
|
13
|
+
"plurnk": {
|
|
14
|
+
"kind": "mimetype",
|
|
15
|
+
"binary": true,
|
|
16
|
+
"handlers": [
|
|
17
|
+
{ "name": "application/pdf", "glyph": "📕", "extensions": [".pdf"] }
|
|
18
|
+
]
|
|
19
|
+
},
|
|
20
|
+
"exports": {
|
|
21
|
+
".": {
|
|
22
|
+
"types": "./dist/index.d.ts",
|
|
23
|
+
"default": "./dist/index.js"
|
|
24
|
+
},
|
|
25
|
+
"./package.json": "./package.json"
|
|
26
|
+
},
|
|
27
|
+
"files": [
|
|
28
|
+
"dist/**/*",
|
|
29
|
+
"README.md"
|
|
30
|
+
],
|
|
31
|
+
"scripts": {
|
|
32
|
+
"test:lint": "tsc --noEmit",
|
|
33
|
+
"test:unit": "node --test src/**/*.test.ts",
|
|
34
|
+
"test": "npm run test:lint && npm run test:unit",
|
|
35
|
+
"build:dist": "tsc -p tsconfig.build.json",
|
|
36
|
+
"build": "npm run build:dist",
|
|
37
|
+
"prepare": "npm run build"
|
|
38
|
+
},
|
|
39
|
+
"dependencies": {
|
|
40
|
+
"@plurnk/plurnk-mimetypes": "^0.3.0",
|
|
41
|
+
"pdfjs-dist": "^5.0.0"
|
|
42
|
+
},
|
|
43
|
+
"devDependencies": {
|
|
44
|
+
"@types/node": "^25.8.0",
|
|
45
|
+
"typescript": "^6.0.3"
|
|
46
|
+
}
|
|
47
|
+
}
|