@elizaos/plugin-pdf 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +36 -11
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
// src/services/pdf.ts
|
|
2
|
-
import { Service, ServiceType } from "@elizaos/core";
|
|
2
|
+
import { Service, ServiceType, logger } from "@elizaos/core";
|
|
3
3
|
import pkg from "pdfjs-dist";
|
|
4
4
|
var { getDocument } = pkg;
|
|
5
5
|
var PdfService = class _PdfService extends Service {
|
|
@@ -48,17 +48,42 @@ var PdfService = class _PdfService extends Service {
|
|
|
48
48
|
* @returns {Promise<string>} A Promise that resolves with the text content of the PDF.
|
|
49
49
|
*/
|
|
50
50
|
async convertPdfToText(pdfBuffer) {
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
51
|
+
try {
|
|
52
|
+
const uint8Array = new Uint8Array(pdfBuffer);
|
|
53
|
+
const pdf = await getDocument({ data: uint8Array }).promise;
|
|
54
|
+
const numPages = pdf.numPages;
|
|
55
|
+
const textPages = [];
|
|
56
|
+
for (let pageNum = 1; pageNum <= numPages; pageNum++) {
|
|
57
|
+
const page = await pdf.getPage(pageNum);
|
|
58
|
+
const textContent = await page.getTextContent();
|
|
59
|
+
const pageText = textContent.items.filter(isTextItem).map((item) => item.str).join(" ");
|
|
60
|
+
textPages.push(pageText);
|
|
61
|
+
}
|
|
62
|
+
const rawText = textPages.join("\n");
|
|
63
|
+
return this.cleanUpContent(rawText);
|
|
64
|
+
} catch (error) {
|
|
65
|
+
logger.error(`PdfService: Failed to convert PDF to text - error: ${error}, bufferSize: ${pdfBuffer.length}, runtimeId: ${this.runtime?.agentId || "unknown"}`);
|
|
66
|
+
throw error;
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
/**
|
|
70
|
+
* Cleans up PDF text content by removing problematic characters.
|
|
71
|
+
*
|
|
72
|
+
* @param {string} content - The raw text content from PDF.
|
|
73
|
+
* @returns {string} The cleaned text content.
|
|
74
|
+
*/
|
|
75
|
+
cleanUpContent(content) {
|
|
76
|
+
try {
|
|
77
|
+
const filtered = content.split("").filter((char) => {
|
|
78
|
+
const charCode = char.charCodeAt(0);
|
|
79
|
+
return !(charCode === 0 || charCode >= 1 && charCode <= 8 || charCode >= 11 && charCode <= 12 || charCode >= 14 && charCode <= 31 || charCode === 127);
|
|
80
|
+
}).join("");
|
|
81
|
+
const cleaned = filtered.replace(/[^\S\r\n]+/g, " ").replace(/[ \t]+(\r?\n)/g, "$1").trim();
|
|
82
|
+
return cleaned;
|
|
83
|
+
} catch (error) {
|
|
84
|
+
logger.error(`PdfService: Failed to clean up content - error: ${error}, contentLength: ${content.length}`);
|
|
85
|
+
return content;
|
|
60
86
|
}
|
|
61
|
-
return textPages.join("\n");
|
|
62
87
|
}
|
|
63
88
|
};
|
|
64
89
|
function isTextItem(item) {
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/services/pdf.ts","../src/index.ts"],"sourcesContent":["import { type IAgentRuntime, Service, type ServiceTypeName, ServiceType } from '@elizaos/core';\nimport pkg from 'pdfjs-dist';\nconst { getDocument } = pkg;\nimport type { TextItem, TextMarkedContent } from 'pdfjs-dist/types/src/display/api';\n\n/**\n * Class representing a PDF service that can convert PDF files to text.\n * * @extends Service\n */\nexport class PdfService extends Service {\n static serviceType: ServiceTypeName = ServiceType.PDF;\n capabilityDescription = 'The agent is able to convert PDF files to text';\n\n /**\n * Constructor for creating a new instance of the class.\n *\n * @param {IAgentRuntime} runtime - The runtime object passed to the constructor.\n */\n constructor(runtime: IAgentRuntime) {\n super();\n this.runtime = runtime;\n }\n\n /**\n * Starts the PdfService asynchronously.\n * @param {IAgentRuntime} runtime - The runtime object for the agent.\n * @returns {Promise<PdfService>} A promise that resolves with the PdfService instance.\n */\n static async start(runtime: IAgentRuntime): Promise<PdfService> {\n const service = new PdfService(runtime);\n return service;\n }\n\n /**\n * Stop the PDF service in the given runtime.\n *\n * @param {IAgentRuntime} runtime - The runtime to stop the PDF service in.\n * @returns {Promise<void>} - A promise that resolves once the PDF service is stopped.\n */\n static async stop(runtime: IAgentRuntime) {\n const service = runtime.getService(ServiceType.PDF);\n if (service) {\n await service.stop();\n }\n }\n\n /**\n * Asynchronously stops the process.\n * Does nothing.\n */\n async stop() {\n // do nothing\n }\n\n /**\n * Converts a PDF Buffer to text.\n *\n * @param {Buffer} pdfBuffer - The PDF Buffer to convert to text.\n * @returns {Promise<string>} A Promise that resolves with the text content of the PDF.\n */\n async convertPdfToText(pdfBuffer: Buffer): Promise<string> {\n
|
|
1
|
+
{"version":3,"sources":["../src/services/pdf.ts","../src/index.ts"],"sourcesContent":["import { type IAgentRuntime, Service, type ServiceTypeName, ServiceType, logger } from '@elizaos/core';\nimport pkg from 'pdfjs-dist';\nconst { getDocument } = pkg;\nimport type { TextItem, TextMarkedContent } from 'pdfjs-dist/types/src/display/api';\n\n/**\n * Class representing a PDF service that can convert PDF files to text.\n * * @extends Service\n */\nexport class PdfService extends Service {\n static serviceType: ServiceTypeName = ServiceType.PDF;\n capabilityDescription = 'The agent is able to convert PDF files to text';\n\n /**\n * Constructor for creating a new instance of the class.\n *\n * @param {IAgentRuntime} runtime - The runtime object passed to the constructor.\n */\n constructor(runtime: IAgentRuntime) {\n super();\n this.runtime = runtime;\n }\n\n /**\n * Starts the PdfService asynchronously.\n * @param {IAgentRuntime} runtime - The runtime object for the agent.\n * @returns {Promise<PdfService>} A promise that resolves with the PdfService instance.\n */\n static async start(runtime: IAgentRuntime): Promise<PdfService> {\n const service = new PdfService(runtime);\n return service;\n }\n\n /**\n * Stop the PDF service in the given runtime.\n *\n * @param {IAgentRuntime} runtime - The runtime to stop the PDF service in.\n * @returns {Promise<void>} - A promise that resolves once the PDF service is stopped.\n */\n static async stop(runtime: IAgentRuntime) {\n const service = runtime.getService(ServiceType.PDF);\n if (service) {\n await service.stop();\n }\n }\n\n /**\n * Asynchronously stops the process.\n * Does nothing.\n */\n async stop() {\n // do nothing\n }\n\n /**\n * Converts a PDF Buffer to text.\n *\n * @param {Buffer} pdfBuffer - The PDF Buffer to convert to text.\n * @returns {Promise<string>} A Promise that resolves with the text content of the PDF.\n */\n async convertPdfToText(pdfBuffer: Buffer): Promise<string> {\n try {\n const uint8Array = new Uint8Array(pdfBuffer);\n\n const pdf = await getDocument({ data: uint8Array }).promise;\n const numPages = pdf.numPages;\n\n const textPages: string[] = [];\n\n for (let pageNum = 1; pageNum <= numPages; pageNum++) {\n const page = await pdf.getPage(pageNum);\n const textContent = await page.getTextContent();\n const pageText = textContent.items\n .filter(isTextItem)\n .map((item: TextItem) => item.str)\n .join(' ');\n textPages.push(pageText);\n }\n const rawText = textPages.join('\\n');\n\n return this.cleanUpContent(rawText);\n } catch (error) {\n logger.error(`PdfService: Failed to convert PDF to text - error: ${error}, bufferSize: ${pdfBuffer.length}, runtimeId: ${this.runtime?.agentId || 'unknown'}`);\n throw error;\n }\n }\n\n /**\n * Cleans up PDF text content by removing problematic characters.\n *\n * @param {string} content - The raw text content from PDF.\n * @returns {string} The cleaned text content.\n */\n cleanUpContent(content: string): string {\n try {\n // Filter out null characters and other problematic control characters\n const filtered = content\n .split('')\n .filter(char => {\n const charCode = char.charCodeAt(0);\n // Keep all characters except control characters (0-31 and 127)\n // but preserve tab (9), newline (10), and carriage return (13)\n return !(charCode === 0 ||\n (charCode >= 1 && charCode <= 8) ||\n (charCode >= 11 && charCode <= 12) ||\n (charCode >= 14 && charCode <= 31) ||\n charCode === 127);\n })\n .join('');\n\n const cleaned = filtered\n // Collapse spaces and tabs but preserve newlines\n .replace(/[^\\S\\r\\n]+/g, ' ')\n // Trim trailing spaces at end of lines\n .replace(/[ \\t]+(\\r?\\n)/g, '$1')\n // Trim whitespace from start and end\n .trim();\n\n return cleaned;\n } catch (error) {\n logger.error(`PdfService: Failed to clean up content - error: ${error}, contentLength: ${content.length}`);\n // Return original content if cleanup fails\n return content;\n }\n }\n}\n\n// Type guard function\n/**\n * Check if the input is a TextItem.\n *\n * @param item - The input item to check.\n * @returns A boolean indicating if the input is a TextItem.\n */\nfunction isTextItem(item: TextItem | TextMarkedContent): item is TextItem {\n return 'str' in item;\n}\n","import type { Plugin } from \"@elizaos/core\";\n\nimport { PdfService } from \"./services/pdf\";\n\nexport const pdfPlugin: Plugin = {\n name: \"pdf\",\n description: \"Plugin for PDF reading and processing\",\n services: [PdfService],\n actions: [],\n};\n\nexport default pdfPlugin;\n"],"mappings":";AAAA,SAA6B,SAA+B,aAAa,cAAc;AACvF,OAAO,SAAS;AAChB,IAAM,EAAE,YAAY,IAAI;AAOjB,IAAM,aAAN,MAAM,oBAAmB,QAAQ;AAAA,EACtC,OAAO,cAA+B,YAAY;AAAA,EAClD,wBAAwB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOxB,YAAY,SAAwB;AAClC,UAAM;AACN,SAAK,UAAU;AAAA,EACjB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOA,aAAa,MAAM,SAA6C;AAC9D,UAAM,UAAU,IAAI,YAAW,OAAO;AACtC,WAAO;AAAA,EACT;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAQA,aAAa,KAAK,SAAwB;AACxC,UAAM,UAAU,QAAQ,WAAW,YAAY,GAAG;AAClD,QAAI,SAAS;AACX,YAAM,QAAQ,KAAK;AAAA,IACrB;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAM,OAAO;AAAA,EAEb;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAQA,MAAM,iBAAiB,WAAoC;AACzD,QAAI;AACF,YAAM,aAAa,IAAI,WAAW,SAAS;AAE3C,YAAM,MAAM,MAAM,YAAY,EAAE,MAAM,WAAW,CAAC,EAAE;AACpD,YAAM,WAAW,IAAI;AAErB,YAAM,YAAsB,CAAC;AAE7B,eAAS,UAAU,GAAG,WAAW,UAAU,WAAW;AACpD,cAAM,OAAO,MAAM,IAAI,QAAQ,OAAO;AACtC,cAAM,cAAc,MAAM,KAAK,eAAe;AAC9C,cAAM,WAAW,YAAY,MAC1B,OAAO,UAAU,EACjB,IAAI,CAAC,SAAmB,KAAK,GAAG,EAChC,KAAK,GAAG;AACX,kBAAU,KAAK,QAAQ;AAAA,MACzB;AACA,YAAM,UAAU,UAAU,KAAK,IAAI;AAEnC,aAAO,KAAK,eAAe,OAAO;AAAA,IACpC,SAAS,OAAO;AACd,aAAO,MAAM,sDAAsD,KAAK,iBAAiB,UAAU,MAAM,gBAAgB,KAAK,SAAS,WAAW,SAAS,EAAE;AAC7J,YAAM;AAAA,IACR;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAQA,eAAe,SAAyB;AACtC,QAAI;AAEF,YAAM,WAAW,QACd,MAAM,EAAE,EACR,OAAO,UAAQ;AACd,cAAM,WAAW,KAAK,WAAW,CAAC;AAGlC,eAAO,EAAE,aAAa,KACnB,YAAY,KAAK,YAAY,KAC7B,YAAY,MAAM,YAAY,MAC9B,YAAY,MAAM,YAAY,MAC/B,aAAa;AAAA,MACjB,CAAC,EACA,KAAK,EAAE;AAEV,YAAM,UAAU,SAEb,QAAQ,eAAe,GAAG,EAE1B,QAAQ,kBAAkB,IAAI,EAE9B,KAAK;AAER,aAAO;AAAA,IACT,SAAS,OAAO;AACd,aAAO,MAAM,mDAAmD,KAAK,oBAAoB,QAAQ,MAAM,EAAE;AAEzG,aAAO;AAAA,IACT;AAAA,EACF;AACF;AASA,SAAS,WAAW,MAAsD;AACxE,SAAO,SAAS;AAClB;;;ACpIO,IAAM,YAAoB;AAAA,EAC/B,MAAM;AAAA,EACN,aAAa;AAAA,EACb,UAAU,CAAC,UAAU;AAAA,EACrB,SAAS,CAAC;AACZ;AAEA,IAAO,gBAAQ;","names":[]}
|